### Libraries

In [19]:
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)

import nltk
from nltk.tokenize import word_tokenize
from gensim.models import FastText
from sklearn.manifold import TSNE
import plotly.express as px

## Create the FastText Embeddings for our dataset

In [11]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/aaron/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
df = pd.read_csv('../data/Big_AHR_Spanish.csv')

In [13]:
sentences = [word_tokenize(rev.lower()) for rev in df.text_content]

In [14]:
sentences

[['excelente',
  'y',
  'personal',
  'amable',
  'un',
  'hotel',
  'muy',
  'bueno',
  '.',
  'el',
  'personal',
  'fue',
  'muy',
  'amable',
  'y',
  'profesional',
  '.',
  'nos',
  'gustaban',
  'desayuno',
  'mucho',
  'también',
  '.',
  'el',
  'habitación',
  'cómoda',
  'y',
  'limpia',
  '.',
  'volvimos',
  'a',
  'próxima',
  'vez',
  'en',
  'seville',
  'por',
  'seguro',
  '.',
  'lo',
  'recomiendo',
  '!',
  '!',
  'y',
  'también',
  'ubicación',
  'muy',
  'cerca',
  'del',
  'centro',
  ':',
  ')',
  ')',
  ')',
  ')'],
 ['céntrico',
  'muy',
  'buen',
  'hotel',
  'al',
  'nivel',
  'de',
  'lo',
  'esperado',
  ',',
  'habitación',
  'junior',
  'suite',
  ',',
  'no',
  'muy',
  'grande',
  'pero',
  'con',
  'todo',
  'lo',
  'deseable',
  'para',
  'una',
  'estancia',
  'confortable',
  ',',
  'trato',
  'del',
  'personal',
  'agradable',
  ',',
  'ubicación',
  'muy',
  'buena',
  'para',
  'hacer',
  'turismo',
  ',',
  'super',
  'céntrico',
  ',',
  'i

In [17]:
model = FastText(sentences, vector_size=128, window=5, min_count=3, workers=18, epochs=100, seed=42, sg=1)

In [20]:
def plot_top_similar(query_word, model, limit=10, color=['maroon', 'blue']):
    embed_dim = model.wv.vectors.shape[1]
    vectors = np.empty((0, embed_dim), dtype='f')
    labels = [query_word]
    types = ['Query Word']
    
    vectors = np.append(vectors, model.wv.__getitem__([query_word]), axis=0)
    
    similar_words = model.wv.most_similar(query_word, topn=limit)
    for word, similarity in similar_words:
        vector = model.wv.__getitem__([word])
        labels.append(word)
        types.append('Similar Words')
        vectors = np.append(vectors, vector, axis=0)
        
    vectors_tsne = TSNE(n_components=2, random_state=42, init='pca').fit_transform(vectors)
    vectors_tsne_df = pd.DataFrame({
        'X': [x for x in vectors_tsne[:, 0]],
        'Y': [y for y in vectors_tsne[:, 1]],
        'label': labels,
        'Type': types
    })
    
    fig = px.scatter(vectors_tsne_df, x='X', y='Y', text='label', color='Type', size_max=60, color_discrete_map={'Query Word': color[0], 'Similar Words': color[1]})
    fig.for_each_trace(lambda t: t.update(textfont_color=t.marker.color, textposition='top right'))
    fig.update_layout(
        height=800,
        title_text=f't-SNE visualization for Top {limit} Similar Words to "{query_word}"'
    )
    
    return fig

In [23]:
hotel_fig = plot_top_similar("hotel", model, 30)

In [24]:
hotel_fig.show()

In [25]:
restaurant_fig = plot_top_similar("restaurante", model, 30)

In [26]:
restaurant_fig.show()

In [27]:
comodo_fig = plot_top_similar("comodo", model, 30)

In [28]:
comodo_fig.show()

#### Save the model

In [29]:
model.save("../embeddings/fastText_model")

## Load the dataset

In [30]:
model_loaded = FastText.load("../embeddings/fastText_model")

In [32]:
# test that it works succesfully
hotel_fig = plot_top_similar("hotel", model_loaded, 30)

In [33]:
hotel_fig.show()

In [41]:
# Perform the embeddings
query_word = "hotel"
limit = 10

embed_dim = model.wv.vectors.shape[1]
vectors = np.empty((0, embed_dim), dtype='f')
labels = [query_word]
types = ['Query Word']

hotel_embeddings = np.append(vectors, model.wv.__getitem__([query_word]), axis=0)


In [42]:
hotel_embeddings.shape

(1, 128)