In [None]:
import fasttext

def save_vectors_as_text(model, path):
    """Write the model's word vectors to ``path`` in the text ``.vec`` format.

    ``model.save_model`` always writes fastText's binary format, whatever the
    file extension, so the text file has to be produced explicitly. The layout
    is a header line ``<vocab_size> <dim>`` followed by one
    ``<word> <v1> ... <vdim>`` line per vocabulary word.
    """
    vocabulary = model.get_words()
    with open(path, 'w', encoding='utf-8') as vec_file:
        vec_file.write(f"{len(vocabulary)} {model.get_dimension()}\n")
        for word in vocabulary:
            values = ' '.join(f"{value:.6f}" for value in model.get_word_vector(word))
            vec_file.write(f"{word} {values}\n")


# Train unsupervised skip-gram embeddings on the corpus.
model = fasttext.train_unsupervised('corpus.txt', model='skipgram', dim=64, ws=3)

# Binary model (reloadable with fasttext.load_model).
model.save_model('fasttext_model.bin')
# Plain-text word vectors; save_model() would have written a binary file here.
save_vectors_as_text(model, 'fasttext_model.vec')

In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
def get_random_word_vectors(model, num_samples=20, seed=None):
    """Draw random vocabulary words from a FastText model with their vectors.

    Args:
        model: Object exposing ``get_words()`` and ``get_word_vector(word)``.
        num_samples: Maximum number of words to draw. It is capped at the
            vocabulary size, so a small vocabulary yields fewer words instead
            of raising.
        seed: Optional seed for a reproducible draw. ``None`` (default) uses
            the global ``random`` state, as before.

    Returns:
        A ``(sampled_words, vectors)`` tuple: the list of sampled words and a
        NumPy array built from their vectors, in the same order.

    Raises:
        ValueError: If ``num_samples`` is negative.
    """
    if num_samples < 0:
        raise ValueError("num_samples must be non-negative")

    # random.sample needs a sequence; copy so the model's list is never touched.
    vocabulary = list(model.get_words())

    # Never ask for more words than exist (random.sample would raise).
    sample_size = min(num_samples, len(vocabulary))

    # A dedicated generator keeps a seeded draw from disturbing global state.
    rng = random if seed is None else random.Random(seed)
    sampled_words = rng.sample(vocabulary, sample_size)

    vectors = np.array([model.get_word_vector(word) for word in sampled_words])

    return sampled_words, vectors

In [None]:
def visualize_embeddings_2D(words, vectors, dimension='tsne'):
    """Project word vectors to 2D with t-SNE or PCA and plot them with labels.

    Args:
        words: Sequence of labels, one per entry of ``vectors``.
        vectors: Array-like with one embedding row per word.
        dimension: Reduction method, ``'tsne'`` (default) or ``'pca'``.

    Raises:
        ValueError: If ``dimension`` is not ``'tsne'`` or ``'pca'``, if
            ``words`` and ``vectors`` have different lengths, or if fewer
            than two points are supplied.
    """
    # Validate everything up front so bad arguments fail with a clear message
    # before any reduction or plotting work starts.
    if dimension not in ('tsne', 'pca'):
        raise ValueError("Invalid dimension specified. Use 'tsne' or 'pca'.")

    n_points = len(words)
    if n_points != len(vectors):
        raise ValueError(
            f"words and vectors must have the same length "
            f"(got {n_points} and {len(vectors)})."
        )
    if n_points < 2:
        raise ValueError("At least two word vectors are required for a 2D projection.")

    if dimension == 'tsne':
        # t-SNE requires perplexity to be smaller than the number of samples.
        perplexity = min(30, n_points - 1)
        reducer = TSNE(n_components=2, perplexity=perplexity, random_state=0)
    else:
        reducer = PCA(n_components=2, random_state=0)
    reduced_vectors = reducer.fit_transform(vectors)

    # Scatter plot of the projected points.
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.scatter(reduced_vectors[:, 0], reduced_vectors[:, 1])

    # Annotate each point with the corresponding word.
    for (x_coord, y_coord), word in zip(reduced_vectors, words):
        ax.annotate(word, (x_coord, y_coord), fontsize=9)

    ax.set_title(f'2D Visualization of Word Vectors ({dimension.upper()})')
    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')
    ax.grid(True)
    plt.show()

In [None]:
# Load a saved fastText model from disk and rebind `model` to it.
# NOTE(review): this path is not the file written by the training cell
# ('fasttext_model.bin'); presumably it is a separately trained 32-dimensional
# model. Confirm the file exists before running.
model = fasttext.load_model("models/fasttext_32model.bin")

In [None]:
# Sample 20 random vocabulary words from the loaded model and plot them with t-SNE.
words, vectors = get_random_word_vectors(model, num_samples=20)
visualize_embeddings_2D(words, vectors, dimension='tsne')

In [None]:
def load_vec_file(path, num_samples=20):
    """Load a random sample of word vectors from a text-format ``.vec`` file.

    The expected layout is a header line ``<vocab_size> <dim>`` followed by one
    ``<word> <v1> ... <vdim>`` line per word.

    Args:
        path: Path to the ``.vec`` text file.
        num_samples: Maximum number of words to return; capped at the number
            of vectors found in the file.

    Returns:
        A ``(words, vectors)`` tuple: the list of sampled words and a NumPy
        array with one row per sampled word.

    Raises:
        ValueError: If the header line is not ``<vocab_size> <dim>``.
    """
    all_words, all_vectors = [], []
    with open(path, encoding='utf-8') as vec_file:
        header = vec_file.readline().split()
        if len(header) != 2 or not all(field.isdigit() for field in header):
            raise ValueError(f"{path} does not start with a '<vocab_size> <dim>' header.")
        dim = int(header[1])

        for line in vec_file:
            fields = line.rstrip().split(' ')
            if len(fields) <= dim:
                continue  # blank or truncated line
            # The last `dim` fields are the vector; whatever precedes is the word.
            all_words.append(' '.join(fields[:-dim]))
            all_vectors.append(np.array(fields[-dim:], dtype=np.float32))

    sample_size = min(num_samples, len(all_words))
    chosen = random.sample(range(len(all_words)), sample_size)
    return [all_words[i] for i in chosen], np.array([all_vectors[i] for i in chosen])


# Sample 20 random words from the exported text vectors and plot them.
words, vectors = load_vec_file('models/fasttext_32model.vec', num_samples=20)
visualize_embeddings_2D(words, vectors)

## Notas

- fasttext.train_unsupervised('data/isc_sentences.txt', model='skipgram', dim=100, ws=3, lr=0.1, epoch=10, minCount=1)
   - Progress: 100.0% words/sec/thread:   59718 lr: -0.000162 avg.loss:  2.933889 ETA:   0h 0m 0s
- fasttext.train_unsupervised('data/isc_sentences.txt', model='skipgram', dim=32, ws=3, lr=0.1, epoch=10, minCount=1)
   - Progress: 100.0% words/sec/thread:   59890 lr: -0.000275 avg.loss:  2.932989 ETA:   0h 0m 0s
- fasttext.train_unsupervised('data/isc_sentences.txt', model='skipgram', dim=64, ws=3, lr=0.1, epoch=10, minCount=1)
   - Progress: 100.0% words/sec/thread:   59352 lr:  0.000000 avg.loss:  2.936122 ETA:   0h 0m 0s
- fasttext.train_unsupervised('data/isc_sentences.txt', model='skipgram', dim=16, ws=3, lr=0.1, epoch=10, minCount=1)
   - Progress: 100.0% words/sec/thread:   59578 lr:  0.000000 avg.loss:  2.876869 ETA:   0h 0m 0s
- fasttext.train_unsupervised('data/isc_sentences.txt', model='skipgram', dim=16, ws=3, lr=0.05, epoch=10, minCount=1)
   - Progress: 100.0% words/sec/thread:   59471 lr:  0.000000 avg.loss:  3.276145 ETA:   0h 0m 0s

- fasttext.train_unsupervised('data/isc_sentences.txt', model='cbow', dim=100, ws=3, lr=0.1, epoch=10, minCount=1)
   - Progress: 100.0% words/sec/thread:   58820 lr:  0.000000 avg.loss:  3.199011 ETA:   0h 0m 0s
- fasttext.train_unsupervised('data/isc_sentences.txt', model='cbow', dim=32, ws=3, lr=0.1, epoch=10, minCount=1)
   - Progress: 100.0% words/sec/thread:   59630 lr: -0.000108 avg.loss:  3.182995 ETA:   0h 0m 0s
- fasttext.train_unsupervised('data/isc_sentences.txt', model='cbow', dim=64, ws=3, lr=0.1, epoch=10, minCount=1)
   - Progress: 100.0% words/sec/thread:   59718 lr: -0.000181 avg.loss:  3.207726 ETA:   0h 0m 0s
- fasttext.train_unsupervised('data/isc_sentences.txt', model='cbow', dim=16, ws=3, lr=0.1, epoch=10, minCount=1)
   - Progress: 100.0% words/sec/thread:   59288 lr: -0.000107 avg.loss:  3.229246 ETA:   0h 0m 0s
- fasttext.train_unsupervised('data/isc_sentences.txt', model='cbow', dim=16, ws=3, lr=0.05, epoch=10, minCount=1)
   - Progress: 100.0% words/sec/thread:   59624 lr: -0.000054 avg.loss:  3.624382 ETA:   0h 0m 0s

- fasttext.train_unsupervised('data/isc_sentences.txt', model='cbow', dim=100, ws=5, lr=0.1, epoch=50, minCount=1)
   - Progress: 100.0% words/sec/thread:  149359 lr:  0.000000 avg.loss:  1.709613 ETA:   0h 0m 0s
- fasttext.train_unsupervised('data/isc_sentences.txt', model='cbow', dim=32, ws=5, lr=0.1, epoch=50, minCount=1)
   - Progress: 100.0% words/sec/thread:  296740 lr:  0.000000 avg.loss:  1.673889 ETA:   0h 0m 0s
- fasttext.train_unsupervised('data/isc_sentences.txt', model='cbow', dim=64, ws=5, lr=0.1, epoch=50, minCount=1)
   - Progress: 100.0% words/sec/thread:  149585 lr:  0.000000 avg.loss:  1.661034 ETA:   0h 0m 0s
- fasttext.train_unsupervised('data/isc_sentences.txt', model='cbow', dim=16, ws=5, lr=0.1, epoch=50, minCount=1)
   - Progress: 100.0% words/sec/thread:  295987 lr:  0.000000 avg.loss:  1.800488 ETA:   0h 0m 0s
- fasttext.train_unsupervised('data/isc_sentences.txt', model='cbow', dim=16, ws=5, lr=0.05, epoch=50, minCount=1)
   - Progress: 100.0% words/sec/thread:  294007 lr:  0.000000 avg.loss:  2.609519 ETA:   0h 0m 0s

- fasttext.train_unsupervised('data/isc_sentences.txt', model='cbow', dim=32, ws=7, lr=0.1, epoch=50, minCount=1)
   - Progress: 100.0% words/sec/thread:  296381 lr:  0.000000 avg.loss:  1.644967 ETA:   0h 0m 0s
- fasttext.train_unsupervised('data/isc_sentences.txt', model='cbow', dim=64, ws=7, lr=0.1, epoch=50, minCount=1)
   - Progress: 100.0% words/sec/thread:  149516 lr:  0.000000 avg.loss:  1.670575 ETA:   0h 0m 0s

**Configuración final**
- Modelo: cbow
- Dimensión: 32
- Window size: 7

**Configuración alternativa**
- Modelo: cbow
- Dimensión: 64
- Window size: 5

**Entrenamiento**
- Epochs: 50
- Learning rate: 0.1
- Min count: 1

**Observaciones**
- cbow es mejor que skipgram para el tamaño de datos que se tiene
- Las mejores configuraciones son con dimensiones de 32 y 64. Las dimensiones de 16 y 100 no son tan buenas.
- El window size de 7 es mejor con dimensiones más pequeñas y el window size de 5 es mejor con dimensiones más grandes, porque con dimensiones más grandes ya se tiene más información de contexto.