# Importing Libraries

In [None]:
import pandas as pd
import numpy as np

# Tokenization
import nltk
# NOTE(review): recent NLTK releases may look for the 'punkt_tab' resource
# instead of 'punkt' when tokenizing — confirm against the installed version.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from gensim.models import FastText, Word2Vec

# Dimensionality reduction
from sklearn.decomposition import PCA

# Saving the model
import pickle

# Visualization
import plotly.express as px

# Citirea Setului de Date

In [None]:
# Load the prepared reviews CSV straight from GitHub into a DataFrame.
url = 'https://raw.githubusercontent.com/DanielaManate/SentimentAnalysis-TopicModeling/master/Data/Input/3.input_data_prepped_bow.csv'
reviews = pd.read_csv(url)
# Display the first two rows as a quick sanity check of the load.
reviews.head(2)

In [None]:
# Make sure every entry of the prepared-text column is a plain string before
# it is tokenized. Missing values are replaced with '' first: a bare
# astype(str) would turn them into the literal text 'nan', which would then
# be tokenized and trained on as if it were a real word.
reviews['text_prep_lim'] = reviews['text_prep_lim'].fillna('').astype(str)

In [None]:
# Tokenize every prepared review; corpus holds one token list per DataFrame row.
corpus = [word_tokenize(doc) for doc in reviews['text_prep_lim']]
# Peek at the first two tokenized reviews.
corpus[:2]

# FastText
- size = 60 -> numarul de dimensiuni in care o sa fie reprezentat vectorial fiecare cuvant
- min_count = 3 -> ignoram cuvintele care au o frecventa < min_count
- window = 10 -> numarul de cuvinte vecine considerate



In [None]:
# Number of tokenized reviews in the corpus
# (the original author's note records 9365 elements for their run).
len(corpus)

In [None]:
# (rows, columns) of the reviews DataFrame; the row count should match
# len(corpus), since corpus was built with one entry per row.
reviews.shape

In [None]:
# Create an untrained FastText model: 60-dimensional word vectors, a context
# window of 10, and words occurring fewer than 3 times are ignored.
# NOTE(review): gensim >= 4.0 is understood to name this argument
# `vector_size` rather than `size` — confirm against the installed version.
ft_model = FastText(size=60, window=10, min_count=3)

# Build the vocabulary from the tokenized reviews.
ft_model.build_vocab(corpus)

# Train for 5 passes over the corpus. Kept as the final expression of the
# cell so the notebook still echoes whatever train() returns.
ft_model.train(corpus, epochs=5, total_examples=len(corpus))

In [None]:
# Vector representation of the word 'order' in the FastText model
ft_model.wv['order']

In [None]:
# Length of the vector for 'order' — should equal the size=60 the model was created with.
len(ft_model.wv['order'])

In [None]:
# The 5 words the FastText model ranks as most similar to 'great'
ft_model.wv.most_similar('great', topn=5)

In [None]:
# Similarity score the FastText model assigns to the pair 'wrong' / 'order'
ft_model.wv.similarity('wrong', 'order')

## Salvarea modelului

In [None]:
# Persist the trained FastText model to 'ft_model.pkl' with pickle.
# The file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, rather than being left open with no reference to
# close it.
with open('ft_model.pkl', 'wb') as model_file:
    pickle.dump(ft_model, model_file)

In [None]:
# ft_model2 = pickle.load(open('ft_model.pkl', 'rb'))

## Vizualizarea reprezentarilor vectoriale FastText

In [None]:
# Probe words for the embedding plots, assembled from four groups of three.
# The resulting order matters: it is reused for the point labels in the plots.
lista_cuvinte = (
    ['food', 'pizza', 'pasta']
    + ['place', 'restaurant', 'service']
    + ['good', 'great', 'delicious']
    + ['terrible', 'awful', 'horrible']
)

In [None]:
# Look up the FastText vectors of all probe words at once;
# presumably one row per word, in the order of lista_cuvinte — the plots below rely on that.
X = ft_model.wv[lista_cuvinte]
# X

In [None]:
# Reduce the FastText word vectors to 2 principal components for a 2-D plot;
# random_state is fixed for reproducibility.
pca_ft_2 = PCA(n_components=2, random_state=42).fit_transform(X)
# Display the projected coordinates.
pca_ft_2

In [None]:
# 2-D scatter of the FastText PCA projection, each point labelled with its word.
fig = px.scatter(
    text=lista_cuvinte,
    x=pca_ft_2[:, 0],
    y=pca_ft_2[:, 1],
)
# Place the labels above the markers and enlarge the label font.
fig.update_traces(textfont_size=14, textposition='top center')
fig.show()

In [None]:
# Reduce the FastText word vectors to 3 principal components for a 3-D plot;
# random_state is fixed for reproducibility.
pca_ft_3 = PCA(n_components=3, random_state=42).fit_transform(X)
# Display the projected coordinates.
pca_ft_3

In [None]:
# 3-D scatter of the FastText PCA projection, each point labelled with its word.
fig = px.scatter_3d(
    text=lista_cuvinte,
    x=pca_ft_3[:, 0],
    y=pca_ft_3[:, 1],
    z=pca_ft_3[:, 2],
)
# Place the labels above the markers and enlarge the label font.
fig.update_traces(textfont_size=14, textposition='top center')
fig.show()

# Word2Vec

In [None]:
# Create an untrained Word2Vec model with the same settings as the FastText
# model above: size=60, window=10, min_count=3.
# NOTE(review): gensim >= 4.0 is understood to name this argument
# `vector_size` rather than `size` — confirm against the installed version.
wv_model = Word2Vec(size=60, window=10, min_count=3)

# Build the vocabulary from the tokenized reviews.
wv_model.build_vocab(corpus)

# Train for 5 passes over the corpus. Kept as the final expression of the
# cell so the notebook still echoes whatever train() returns.
wv_model.train(corpus, epochs=5, total_examples=len(corpus))

In [None]:
# Vector representation of the word 'order' in the Word2Vec model
wv_model.wv['order']

In [None]:
# The 30 words the Word2Vec model ranks as most similar to 'great'
wv_model.wv.most_similar('great', topn=30)

In [None]:
# Nume de persoane apar similare cu great (Karol, Susan, Steven, Ned)

# ('karol', 0.8677303791046143),
#  ('susan', 0.8649597764015198),
#  ('hookah', 0.857241153717041),
#  ('steven', 0.8558700680732727),
#  ('ned', 0.8550676107406616),
#  ('impeccable', 0.8520554304122925),
#  ('seth', 0.8516987562179565),

In [None]:
# For every row of the reviews DataFrame: True/False depending on whether
# text_prep_lim contains 'susan' (the pattern may match anywhere in the text,
# so longer words that include 'susan' also count).
reviews['text_prep_lim'].str.contains('susan')

In [None]:
# Independent copy of just the reviews whose prepared text contains 'susan',
# followed by how many such reviews there are.
df_exemplu = reviews.loc[reviews['text_prep_lim'].str.contains('susan')].copy()
print('Numarul de recenzii care contin cuvantul', df_exemplu.shape[0])

In [None]:
# Mean of the 'rating' column over the reviews selected into df_exemplu.
print('Rating-ul mediu din recenziile care contin cuvantul', df_exemplu['rating'].mean())

In [None]:
# Number of distinct 'rest_id' values among the reviews selected into df_exemplu.
print('Restaurantele unice din recenziile care contin cuvantul', df_exemplu['rest_id'].nunique())

In [None]:
# List the 'text' column of the selected reviews for manual reading
# (presumably the original, unprocessed review text — verify against the CSV).
df_exemplu['text'].to_list()

In [None]:
# Similarity score the Word2Vec model assigns to the pair 'wrong' / 'order'
wv_model.wv.similarity('wrong', 'order')

In [None]:
# Similarity score the Word2Vec model assigns to the pair 'fast' / 'order'
wv_model.wv.similarity('fast', 'order')

In [None]:
# The 30 words the Word2Vec model ranks as most similar to 'wait'
wv_model.wv.most_similar('wait', topn=30)

## Salvarea Modelului

In [None]:
# Persist the trained Word2Vec model to 'wv_model.pkl' with pickle.
# The file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, rather than being left open with no reference to
# close it.
with open('wv_model.pkl', 'wb') as model_file:
    pickle.dump(wv_model, model_file)

# Visualization

In [None]:
# Echo the probe-word list before it is reused for the Word2Vec plots.
lista_cuvinte

In [None]:
# Look up the Word2Vec vectors of all probe words at once;
# presumably one row per word, in the order of lista_cuvinte — the plots below rely on that.
X_wv = wv_model.wv[lista_cuvinte]
# Display the first entry (the one for lista_cuvinte[0]) as a spot check.
X_wv[0]

In [None]:
# Reduce the Word2Vec word vectors to 2 principal components for a 2-D plot;
# random_state is fixed for reproducibility.
pca_wv_2 = PCA(n_components=2, random_state=42).fit_transform(X_wv)
# Display the projected coordinates.
pca_wv_2

In [None]:
# Reduce the Word2Vec word vectors to 3 principal components for a 3-D plot;
# random_state is fixed for reproducibility.
pca_wv_3 = PCA(n_components=3, random_state=42).fit_transform(X_wv)
# Display the projected coordinates.
pca_wv_3

In [None]:
# 2-D scatter of the Word2Vec PCA projection, each point labelled with its word.
fig = px.scatter(
    text=lista_cuvinte,
    x=pca_wv_2[:, 0],
    y=pca_wv_2[:, 1],
)
# Place the labels above the markers and enlarge the label font.
fig.update_traces(textfont_size=14, textposition='top center')
fig.show()

In [None]:
# 3-D scatter of the Word2Vec PCA projection, each point labelled with its word.
fig = px.scatter_3d(
    text=lista_cuvinte,
    x=pca_wv_3[:, 0],
    y=pca_wv_3[:, 1],
    z=pca_wv_3[:, 2],
)
# Place the labels above the markers and enlarge the label font.
fig.update_traces(textfont_size=14, textposition='top center')
fig.show()

In [None]:
# Second probe list: five people/role words followed by five food words.
# The resulting order matters: it is reused for the point labels in the plot.
lista2_cuvinte = (
    ['waiter', 'server', 'staff', 'owner', 'chef']
    + ['burger', 'fries', 'tacos', 'cake', 'pizza']
)

In [None]:
# Look up the Word2Vec vectors for the second probe list;
# presumably one row per word, in the order of lista2_cuvinte.
X_wv2 = wv_model.wv[lista2_cuvinte]

In [None]:
# Reduce the second set of Word2Vec vectors to 3 principal components for a
# 3-D plot. random_state=42 is passed here as well, matching every other PCA
# call in this notebook, so the projection does not depend on which solver
# scikit-learn happens to pick.
pca_list2cuv_3 = PCA(n_components=3, random_state=42).fit_transform(X_wv2)

In [None]:
# 3-D scatter of the PCA projection for the second probe list,
# each point labelled with its word.
fig = px.scatter_3d(
    text=lista2_cuvinte,
    x=pca_list2cuv_3[:, 0],
    y=pca_list2cuv_3[:, 1],
    z=pca_list2cuv_3[:, 2],
)
# Place the labels above the markers and enlarge the label font.
fig.update_traces(textfont_size=14, textposition='top center')
fig.show()

In [None]:
# New DataFrame holding a copy of every review whose prepared text contains 'pizza'.
df_pizza = reviews.loc[reviews['text_prep_lim'].str.contains('pizza')].copy()
print('Numarul de recenzii care contin cuvantul pizza:', df_pizza.shape[0])

# Same selection for 'burger'.
df_burger = reviews.loc[reviews['text_prep_lim'].str.contains('burger')].copy()
print('Numarul de recenzii care contin cuvantul burger:', df_burger.shape[0])

In [None]:
# Descriptive statistics of 'rating' for the reviews selected into df_pizza.
print(
    'Statisticile descriptive pentru rating pentru recenziile care contin cuvantul pizza',
    df_pizza['rating'].describe(),
)

In [None]:
# Descriptive statistics of 'rating' for the reviews selected into df_burger.
print(
    'Statisticile descriptive pentru rating pentru recenziile care contin cuvantul burger',
    df_burger['rating'].describe(),
)

Alte resurse:
https://amitness.com/2020/06/fasttext-embeddings/
