# Importing Libraries

In [None]:
import pandas as pd
import numpy as np

# Impartirea setului de date in train & test
from sklearn.model_selection import train_test_split

# Pentru alg. de clasificare fol. Log. Reg.
from sklearn.linear_model import LogisticRegression

# Evaluarea Performantei
from sklearn.metrics import classification_report

# Tokenization
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Embeddings
from gensim.models import FastText, Word2Vec

# Salvarea modelului
import pickle

# Citirea datelor

In [None]:
# Load the pre-processed reviews table straight from the course repository
# and preview its first two rows.
url = 'https://github.com/berinde/curs-analiza-datelor-complexe/blob/main/data/input/3.input_data_prepped_bow.csv?raw=True'
reviews = pd.read_csv(filepath_or_buffer=url)
reviews.head(n=2)

In [None]:
# Display the (rows, columns) dimensions of the reviews table.
reviews.shape

In [None]:
# Fetch the bag-of-words matrix (parquet file) from the course repository.
# "dtm" presumably stands for document-term matrix.
url = 'https://github.com/berinde/curs-analiza-datelor-complexe/blob/main/data/input/dtm_1_bow.parquet?raw=True'
dtm_bow = pd.read_parquet(path=url)

In [None]:
# Display the (rows, columns) dimensions of the BOW matrix.
dtm_bow.shape

In [None]:
# Preview the first two rows of the BOW matrix.
dtm_bow.head(2)

In [None]:
# Fetch the TF-IDF matrix (parquet file) from the course repository and
# preview its first two rows.
url = 'https://github.com/berinde/curs-analiza-datelor-complexe/blob/main/data/input/dtm_2_tfidf.parquet?raw=True'
dtm_tf_idf = pd.read_parquet(path=url)
dtm_tf_idf.head(n=2)

# Clasificare folosind reprezentarea BOW

## BOW - impartirea setului de date in set de training & test

In [None]:
# Split the BOW features and the `positive` labels 80/20 into train and test
# parts; the fixed random_state makes the split reproducible.
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    dtm_bow, reviews['positive'], train_size=0.8, random_state=42)

In [None]:
# Sizes of the four pieces produced by the split, printed on one line.
print(*map(len, (X_train_bow, X_test_bow, y_train_bow, y_test_bow)))

In [None]:
# Display the labels (`positive` column) of the rows held out for testing.
y_test_bow

In [None]:
# Mean of the training labels; assuming `positive` is coded 0/1, this is the
# share of positive reviews in the training set.
y_train_bow.mean()

In [None]:
# Mean of the test labels; assuming `positive` is coded 0/1, this is the
# share of positive reviews in the test set (compare with the training share).
y_test_bow.mean()

## BOW + Logistic Regression

In [None]:
# Fit a logistic regression with default hyper-parameters on the BOW training
# data, then echo the fitted estimator.
m1_bow = LogisticRegression().fit(X_train_bow, y_train_bow)
m1_bow

In [None]:
# Predict a label for every row of the test set and show the resulting array.
ypred1_bow = m1_bow.predict(X=X_test_bow)
ypred1_bow

In [None]:
# Number of predictions produced for the test set.
len(ypred1_bow)

In [None]:
# Accuracy on both splits, followed by the classification report for the
# test-set predictions.
acc_train_bow = m1_bow.score(X_train_bow, y_train_bow)
acc_test_bow = m1_bow.score(X_test_bow, y_test_bow)
report_bow = classification_report(y_test_bow, ypred1_bow)

print('Acuratetea pe setul de training:', acc_train_bow)
print('Acuratetea pe setul de test:', acc_test_bow)
print('Classification Report pe setul de test\n', report_bow)

In [None]:
# Persist the fitted BOW model to m1_bow.pkl. The context manager closes the
# file as soon as the dump finishes, so the data is flushed to disk
# deterministically; a bare open(...) passed to pickle.dump leaves the handle
# open until the garbage collector happens to reclaim it.
with open('m1_bow.pkl', 'wb') as model_file:
    pickle.dump(m1_bow, model_file)

* P = positive <-> reviews['positive']==1
* N = negative <-> reviews['positive']==0
* TP = recenzii prezise ca pozitive care sunt intr-adevar pozitive
* TN = recenzii prezise ca negative care sunt intr-adevar negative
* FP = recenzii prezise ca pozitive, dar care sunt de fapt negative
* FN = recenzii prezise ca negative, dar care sunt de fapt pozitive
* Precision = $\frac{TP}{TP+FP}$: din toate cazurile pe care le-am prezis ca pozitive, cate sunt de fapt pozitive?
* Recall = $\frac{TP}{TP+FN}$: din toate cazurile care sunt pozitive, cate le-am prezis corect?




# Clasificare folosind TF-IDF

## TF-IDF - impartirea setului de date in set de training & test

In [None]:
# Split the TF-IDF features and the `positive` labels 80/20 into train and
# test parts, using the same seed as the BOW split.
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    dtm_tf_idf, reviews['positive'], train_size=0.8, random_state=42)

In [None]:
# Sizes of the four pieces produced by the TF-IDF split, printed on one line.
print(*map(len, (X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf)))

In [None]:
# Preview the first two rows of the TF-IDF training features.
X_train_tfidf.head(2)

In [None]:
# Display the training labels (`positive` column) for the TF-IDF split.
y_train_tfidf

## TF-IDF + Logistic Regression

In [None]:
# Fit a logistic regression with default hyper-parameters on the TF-IDF
# training data, then echo the fitted estimator.
m2_tfidf = LogisticRegression().fit(X_train_tfidf, y_train_tfidf)
m2_tfidf

In [None]:
# Predict a label for every row of the TF-IDF test set and show the result.
ypred2_tfidf = m2_tfidf.predict(X=X_test_tfidf)
ypred2_tfidf

In [None]:
# Number of predictions produced for the TF-IDF test set.
len(ypred2_tfidf)

In [None]:
# Accuracy on both splits, followed by the classification report for the
# test-set predictions of the TF-IDF model.
acc_train_tfidf = m2_tfidf.score(X_train_tfidf, y_train_tfidf)
acc_test_tfidf = m2_tfidf.score(X_test_tfidf, y_test_tfidf)
report_tfidf = classification_report(y_test_tfidf, ypred2_tfidf)

print('Acuratetea pe setul de training:', acc_train_tfidf)
print('Acuratetea pe setul de test:', acc_test_tfidf)
print('Classification Report pe setul de test\n', report_tfidf)

In [None]:
# Persist the fitted TF-IDF model to m2_tfidf.pkl. The context manager closes
# the file as soon as the dump finishes, so the data is flushed to disk
# deterministically; a bare open(...) passed to pickle.dump leaves the handle
# open until the garbage collector happens to reclaim it.
with open('m2_tfidf.pkl', 'wb') as model_file:
    pickle.dump(m2_tfidf, model_file)

# Clasificare folosind FastText

## Reprezentarea FastText

In [None]:
# Disabled cell: downloads a pickled FastText model and looks up the vector
# for the word 'restaurant' — presumably an alternative to training the
# model in this notebook.
# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so only enable this for a file obtained from a trusted source.
# !wget -O trained_ft.pkl https://github.com/berinde/curs-analiza-datelor-complexe/blob/main/data/input/trained_ft.pkl?raw=true
# repr_fasttext = pickle.load(open('trained_ft.pkl', 'rb'))
# repr_fasttext.wv['restaurant']

In [None]:
# Tokenise every entry of `text_prep_lim` (cast to str first) with NLTK's
# word_tokenize; the result is a list holding one token list per review.
corpus = [word_tokenize(text) for text in reviews['text_prep_lim'].astype(str)]

In [None]:
# FastText model with 60-dimensional vectors and a context window of 10;
# min_count=3 is the minimum word frequency passed to the model.
ft_model = FastText(vector_size=60, window=10, min_count=3)

# First pass over the corpus builds the vocabulary, second one trains the
# embeddings for 5 epochs.
ft_model.build_vocab(corpus_iterable=corpus)
ft_model.train(corpus_iterable=corpus, epochs=5, total_examples=len(corpus))

Reprezentarile FastText sunt la nivel de cuvant. Pentru a crea o reprezentare vectoriala pentru intreaga recenzie, putem calcula media reprezentarilor vectoriale ale cuvintelor care alcatuiesc recenzia.

In [None]:
# Show the token lists of the first two reviews.
corpus[0:2]

In [None]:
# List of words (tokens) in the first review
doc1 = corpus[0]
doc1

In [None]:
# Vector representations of each word in doc1
ft_model.wv[doc1]

In [None]:
# Average the word vectors of doc1 along axis 0, i.e. one mean value for each
# of the 60 embedding dimensions.
repr_doc1 = ft_model.wv[doc1].mean(axis=0)
repr_doc1

In [None]:
# Length of the averaged vector; expected to equal the embedding size (60).
len(repr_doc1)

In [None]:
# Build one vector per review by averaging the FastText vectors of its tokens.
#
# The rows are collected in a list and stacked once at the end. Detecting the
# first row with `doc == corpus[0]` compares token contents rather than
# position, so any later review identical to the first one would restart the
# matrix and silently drop all rows gathered so far; stacking inside the loop
# also copies the whole matrix on every iteration.
doc_vectors = []
for doc in corpus:
    if doc:
        doc_vectors.append(np.mean(ft_model.wv[doc], axis=0))
    else:
        # A review without tokens has nothing to average; use a zero vector so
        # the matrix keeps exactly one row per review and stays aligned with
        # the labels. float32 matches the dtype gensim uses for its vectors.
        doc_vectors.append(
            np.zeros(ft_model.wv.vector_size, dtype=np.float32))

# Always 2-D: (number of reviews, embedding size), even for a single review.
repr_recenzie = np.vstack(doc_vectors)

In [None]:
# Shape of the review matrix; expected to be (number of reviews, 60).
repr_recenzie.shape

`repr_recenzie` contine, pentru fiecare recenzie in parte, reprezentarea vectoriala obtinuta prin agregarea (media) vectorilor cuvintelor sale.

In [None]:
# Show the aggregated vectors of the first two reviews.
repr_recenzie[0:2]

## FastText - impartirea setului de date in set de training & test

In [None]:
# Split the averaged FastText vectors and the `positive` labels 80/20 into
# train and test parts, using the same seed as the other splits.
X_train_ft, X_test_ft, y_train_ft, y_test_ft = train_test_split(
    repr_recenzie,
    reviews['positive'],
    random_state=42,
    train_size=0.8,
)

In [None]:
# Shape of the FastText training feature matrix.
X_train_ft.shape

In [None]:
# Shape of the FastText test feature matrix.
X_test_ft.shape

## FastText + Logistic Regression

In [None]:
# Fit a logistic regression with default hyper-parameters on the averaged
# FastText vectors, then echo the fitted estimator.
m3_ft = LogisticRegression().fit(X_train_ft, y_train_ft)
m3_ft

In [None]:
# Predict a label for every row of the FastText test set.
ypred3_ft = m3_ft.predict(X=X_test_ft)

In [None]:
# Performance evaluation: accuracy on both splits, followed by the
# classification report for the test-set predictions of the FastText model.
acc_train_ft = m3_ft.score(X_train_ft, y_train_ft)
acc_test_ft = m3_ft.score(X_test_ft, y_test_ft)
report_ft = classification_report(y_test_ft, ypred3_ft)

print('Acuratetea pe setul de training', acc_train_ft)
print('Acuratetea pe setul de test', acc_test_ft)
print('Classification Report pe setul de test \n', report_ft)