# Create the FastText Embedding Matrix for the vocabulary of our dataset

In [23]:
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)

import nltk
from nltk.tokenize import word_tokenize
from gensim.models import FastText
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.express as px

In [5]:
# Load the preprocessed dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/preprocessed_data.csv')

### 1. Tokenize the text

Download the Punkt tokenizer. It is very effective at tokenizing Spanish text because it relies on punctuation and capitalization cues, which are highly informative in Spanish:

In [4]:
# Fetch the NLTK 'punkt' tokenizer data needed by word_tokenize (one-time download per machine).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aaron\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [11]:
# Lower-case and tokenize every description into a list of word tokens.
# The descriptions are Spanish, so the Spanish Punkt model is requested explicitly:
# word_tokenize otherwise defaults to language='english' for sentence splitting.
sentences = [word_tokenize(rev.lower(), language='spanish') for rev in df.Description]

In [12]:
# Sanity check: show the first ten tokens of the first tokenized description.
sentences[0][:10]

['alquiler', 'de', 'piso', 'en', 'avenida', 'de', 'ramón', 'y', 'cajal', '.']

### 2. Create the FastText Embedding Matrix

Here we have changed the `vector_size` several times in order to make the model easier to explain.

In [13]:
# Train the FastText embeddings on the tokenized descriptions.
# NOTE(review): per the gensim docs, a fixed seed only gives fully reproducible
# vectors when training with a single worker thread — confirm whether that matters here.
model = FastText(
    sentences,
    sg=1,             # training algorithm switch (1 = skip-gram per the gensim docs)
    vector_size=128,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=3,      # frequency cut-off for vocabulary words
    epochs=100,       # passes over the corpus
    seed=42,
    workers=18,       # training threads
)

### 3. Plot the Embedding Matrix to see the words in the vector space

In [16]:
def plot_top_similar(query_word, model, limit=10, color=('maroon', 'blue')):
    """Plot a 2-D t-SNE projection of a word and its nearest neighbours.

    Parameters
    ----------
    query_word : str
        Word whose neighbourhood is visualised.
    model : object
        Trained embedding model exposing ``model.wv`` with ``most_similar``
        and list-based item lookup (as used with the FastText model above).
    limit : int, default 10
        Number of most-similar words to request.
    color : sequence of two str, default ('maroon', 'blue')
        Colours for the query word and for the similar words, in that order.

    Returns
    -------
    The Plotly figure produced by ``px.scatter`` (not shown automatically).

    Raises
    ------
    ValueError
        If no similar words are returned, since t-SNE needs at least two points.
    """
    neighbours = [word for word, _similarity in model.wv.most_similar(query_word, topn=limit)]
    labels = [query_word] + neighbours
    types = ['Query Word'] + ['Similar Words'] * len(neighbours)

    # One batched lookup instead of growing the array with np.append per word.
    vectors = np.asarray(model.wv[labels])
    n_samples = vectors.shape[0]
    if n_samples < 2:
        raise ValueError(
            f'Need at least one similar word to plot "{query_word}"; got limit={limit}.'
        )

    # scikit-learn requires perplexity < n_samples. The default (30) is larger
    # than the handful of points plotted here, so cap it to the sample count.
    perplexity = float(min(30, n_samples - 1))
    vectors_tsne = TSNE(
        n_components=2, random_state=42, init='pca', perplexity=perplexity
    ).fit_transform(vectors)

    vectors_tsne_df = pd.DataFrame({
        'X': vectors_tsne[:, 0],
        'Y': vectors_tsne[:, 1],
        'label': labels,
        'Type': types
    })

    fig = px.scatter(
        vectors_tsne_df, x='X', y='Y', text='label', color='Type', size_max=60,
        color_discrete_map={'Query Word': color[0], 'Similar Words': color[1]}
    )
    # Draw each label in the same colour as its marker, offset to the top right.
    fig.for_each_trace(lambda t: t.update(textfont_color=t.marker.color, textposition='top right'))
    fig.update_layout(
        height=800,
        title_text=f't-SNE visualization for Top {limit} Similar Words to "{query_word}"'
    )

    return fig