# Importing Libraries

In [1]:
import pandas as pd
import numpy as np

# Splitting the dataset into train & test
from sklearn.model_selection import train_test_split

# Logistic Regression is the classification algorithm used in this notebook
from sklearn.linear_model import LogisticRegression

# Performance evaluation
from sklearn.metrics import classification_report

# Tokenization: download the 'punkt' NLTK package, then import the tokenizer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Saving the trained models
import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mariaberinde/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Citirea datelor

In [2]:
# Read the prepared reviews CSV from the course repository (`?raw=True` requests the raw file)
url = 'https://github.com/berinde/curs-analiza-datelor-complexe/blob/main/data/input/3.input_data_prepped_bow.csv?raw=True'
reviews = pd.read_csv(url)
# Preview the first two rows
reviews.head(2)

Unnamed: 0,rest_id,text,rating,char_count,positive,text_prep,text_prep_tokens,word_len_prep,text_prep_lim,text_prep_lim_tokens,word_len_prep_lim
0,yGMCl0vYigshkXiZFIDTNw,We arrived for our reservation at 7:15pm. The...,4,302,1,arrived reservation pm seat -PRON- right time ...,"['arrived', 'reservation', 'pm', 'seat', '-PRO...",27,arrived reservation pm seat right time restura...,"['arrived', 'reservation', 'pm', 'seat', 'righ...",25
1,yGMCl0vYigshkXiZFIDTNw,We received amazing service again. The food wa...,5,111,1,receive amazing service food cook right waitre...,"['receive', 'amazing', 'service', 'food', 'coo...",10,receive amazing service food cook right waitre...,"['receive', 'amazing', 'service', 'food', 'coo...",9


In [3]:
# (rows, columns) of the reviews table
reviews.shape

(9365, 11)

In [4]:
# Read the bag-of-words document-term matrix (parquet) from the course repository
url = 'https://github.com/berinde/curs-analiza-datelor-complexe/blob/main/data/input/dtm_1_bow.parquet?raw=True'
dtm_bow = pd.read_parquet(url)

In [5]:
# (rows, columns) of the BOW matrix; the row count should match `reviews`
dtm_bow.shape

(9365, 6000)

In [6]:
# Preview the first two rows of the BOW matrix
dtm_bow.head(2)

Unnamed: 0,aaabar,aaron,abacus,ability,able,absolute,absolutely,absurd,abundance,abundant,...,zero,zesty,zillion,zipps,ziti,zoe,zone,zucchini,zuchinni,zupa
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Read the TF-IDF document-term matrix (parquet) from the course repository
url = 'https://github.com/berinde/curs-analiza-datelor-complexe/blob/main/data/input/dtm_2_tfidf.parquet?raw=True'
dtm_tf_idf = pd.read_parquet(url)
# Preview the first two rows
dtm_tf_idf.head(2)

Unnamed: 0,aaabar,aaron,abacus,ability,able,absolute,absolutely,absurd,abundance,abundant,...,zesty,zillion,zipps,ziti,zoe,zone,zucchini,zuchinni,zupa,zupas
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Clasificare folosind reprezentarea BOW

## BOW - impartirea setului de date in set de training & test

In [8]:
# 80/20 train/test split of the BOW features against the `positive` label.
# random_state is fixed so the split is reproducible.
# NOTE(review): rows of dtm_bow are presumably aligned with rows of `reviews` — confirm.
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    dtm_bow,
    reviews['positive'],
    train_size=0.8,
    random_state=42
    )

In [9]:
# Sanity check: feature and label splits must have matching lengths
print(len(X_train_bow), len(X_test_bow), len(y_train_bow), len(y_test_bow))

7492 1873 7492 1873


In [10]:
# Display the test labels (index = original row position in `reviews`)
y_test_bow

1791    0
3884    0
8677    0
9132    0
7726    0
       ..
4640    1
5620    1
6739    1
7454    1
6242    1
Name: positive, Length: 1873, dtype: int64

In [11]:
# Mean of the `positive` label in the training split (share of 1s, given 0/1 labels)
y_train_bow.mean()

0.6987453283502403

In [12]:
# Mean of the `positive` label in the test split, for comparison with the training split
y_test_bow.mean()

0.6892685531233316

## BOW + Logistic Regression

In [13]:
# Fit a logistic regression on the BOW training features.
# NOTE(review): default hyperparameters are used; confirm the solver converges
# on this feature set (no fit output is recorded in the notebook).
m1_bow = LogisticRegression()
m1_bow.fit(X_train_bow, y_train_bow)

In [14]:
# Generate predictions on the test set
ypred1_bow = m1_bow.predict(X_test_bow)
ypred1_bow

array([0, 0, 0, ..., 1, 1, 1])

In [15]:
# One prediction per test row
len(ypred1_bow)

1873

In [16]:
# Accuracy on the training and test splits (score() is evaluated on each split)
print('Acuratetea pe setul de training:', m1_bow.score(X_train_bow, y_train_bow))
print('Acuratetea pe setul de test:', m1_bow.score(X_test_bow, y_test_bow))

# Per-class precision / recall / F1 on the test split, using the predictions computed earlier
print('Classification Report pe setul de test\n',
      classification_report(y_test_bow, ypred1_bow)
      )

Acuratetea pe setul de training: 0.9963961558996263
Acuratetea pe setul de test: 0.9353977576081153
Classification Report pe setul de test
               precision    recall  f1-score   support

           0       0.90      0.89      0.90       582
           1       0.95      0.96      0.95      1291

    accuracy                           0.94      1873
   macro avg       0.93      0.92      0.92      1873
weighted avg       0.94      0.94      0.94      1873



In [17]:
# Persist the fitted BOW model to disk. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
with open('m1_bow.pkl', 'wb') as model_file:
    pickle.dump(m1_bow, model_file)

* P = positive <-> reviews['positive']==1
* N = negative <-> reviews['positive']==0
* TP = recenziile prezise ca pozitive, sunt pozitive
* TN = recenziile prezise ca negative, sunt negative
* FP = recenziile prezise ca pozitive, sunt negative
* FN = recenziile prezise ca negative, sunt pozitive
* Precision = $\frac{TP}{TP+FP}$, din toate cazurile pe care le-am prezis ca pozitive, cate sunt de fapt pozitive?
* Recall = $\frac{TP}{TP+FN}$, din toate cazurile care sunt pozitive, cate le-am prezis corect?




# Clasificare folosind TF-IDF

## TF-IDF - impartirea setului de date in set de training & test

In [18]:
# 80/20 train/test split of the TF-IDF features against the `positive` label.
# The parameters (train_size, random_state) match the BOW split.
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    dtm_tf_idf,
    reviews['positive'],
    train_size=0.8,
    random_state=42
)

In [19]:
# Sanity check: feature and label splits must have matching lengths
print(len(X_train_tfidf), len(X_test_tfidf),
      len(y_train_tfidf), len(y_test_tfidf))

7492 1873 7492 1873


In [20]:
# Preview the first two training rows of the TF-IDF features
X_train_tfidf.head(2)

Unnamed: 0,aaabar,aaron,abacus,ability,able,absolute,absolutely,absurd,abundance,abundant,...,zesty,zillion,zipps,ziti,zoe,zone,zucchini,zuchinni,zupa,zupas
4707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Display the training labels (index = original row position in `reviews`)
y_train_tfidf

4707    1
957     1
3569    1
3820    1
6041    1
       ..
5734    1
5191    1
5390    1
860     1
7270    0
Name: positive, Length: 7492, dtype: int64

## TF-IDF + Logistic Regression

In [22]:
# Fit a logistic regression on the TF-IDF training features.
# NOTE(review): default hyperparameters are used; confirm the solver converges.
m2_tfidf = LogisticRegression()
m2_tfidf.fit(X_train_tfidf, y_train_tfidf)

In [23]:
# Predictions on the test set
ypred2_tfidf = m2_tfidf.predict(X_test_tfidf)
ypred2_tfidf

array([1, 0, 0, ..., 1, 1, 1])

In [24]:
# One prediction per test row
len(ypred2_tfidf)

1873

In [25]:
# Accuracy on the training and test splits (score() is evaluated on each split)
print('Acuratetea pe setul de training:', m2_tfidf.score(X_train_tfidf,
                                                         y_train_tfidf))
print('Acuratetea pe setul de test:', m2_tfidf.score(X_test_tfidf,
                                                     y_test_tfidf))
# Per-class precision / recall / F1 on the test split, using the predictions computed earlier
print('Classification Report pe setul de test\n',
      classification_report(y_test_tfidf, ypred2_tfidf)
      )

Acuratetea pe setul de training: 0.9615589962626802
Acuratetea pe setul de test: 0.9418045915643353
Classification Report pe setul de test
               precision    recall  f1-score   support

           0       0.94      0.87      0.90       582
           1       0.94      0.97      0.96      1291

    accuracy                           0.94      1873
   macro avg       0.94      0.92      0.93      1873
weighted avg       0.94      0.94      0.94      1873



In [None]:
# Persist the fitted TF-IDF model to disk. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
with open('m2_tfidf.pkl', 'wb') as model_file:
    pickle.dump(m2_tfidf, model_file)

# Clasificare folosind FastText

## Reprezentarea FastText

In [None]:
!wget -O trained_ft.pkl https://github.com/berinde/curs-analiza-datelor-complexe/blob/main/data/input/trained_ft.pkl?raw=true

In [None]:
# Load the FastText model that was downloaded to trained_ft.pkl.
# NOTE(review): pickle.load can execute arbitrary code during deserialisation, and this
# file comes from a network download — only load it if the source is trusted.
# The context manager closes the file handle once loading finishes or fails.
with open('trained_ft.pkl', 'rb') as ft_file:
    repr_fasttext = pickle.load(ft_file)

In [None]:
# Look up the vector stored for the word 'restaurant'
repr_fasttext.wv['restaurant']

Reprezentarile FastText sunt calculate pentru fiecare cuvant in parte. Pentru a crea o reprezentare vectoriala pentru intreaga recenzie, putem calcula media reprezentarilor vectoriale ale cuvintelor care alcatuiesc recenzia.

In [None]:
# Tokenize every prepared review: cast to str, split into word tokens with
# word_tokenize, and collect the result as a list of token lists.
corpus = (
    reviews['text_prep_lim']
    .astype(str)
    .apply(word_tokenize)
    .to_list()
)

In [None]:
# Token lists of the first two reviews
corpus[0:2]

In [None]:
# List of words in the first review
doc1 = corpus[0]
doc1

In [None]:
# Vector representations for each word in doc1
repr_fasttext.wv[doc1]

In [None]:
# Mean across the word vectors of doc1, one value per embedding dimension
# (60 dimensions according to the original note — TODO confirm)
repr_doc1 = np.mean(repr_fasttext.wv[doc1], axis=0)
repr_doc1

In [None]:
# Number of components in the averaged review vector
len(repr_doc1)

In [None]:
# Build the review-level representation: one row per review, each row being the
# mean (over the review's tokens) of the FastText word vectors.
#
# All rows are collected first and stacked once. Detecting the first iteration with
# `doc == corpus[0]` compares token lists by value, so a later review with the same
# tokens as the first would restart the accumulation and drop every row gathered so
# far; a single np.vstack over the full list avoids that and the quadratic re-copying
# of the array on every iteration.
# NOTE(review): a review with no tokens is not handled specially here — confirm none exist.
repr_recenzie = np.vstack(
    [np.mean(repr_fasttext.wv[doc], axis=0) for doc in corpus]
)

In [None]:
# Expect one row per review
repr_recenzie.shape

`repr_recenzie` contine, pentru fiecare recenzie in parte, reprezentarea vectoriala obtinuta prin agregarea (media) reprezentarilor cuvintelor sale.

In [None]:
# Averaged vectors of the first two reviews
repr_recenzie[0:2]

## FastText - impartirea setului de date in set de training & test

In [None]:
# 80/20 train/test split of the averaged FastText vectors against the `positive` label.
# The parameters (train_size, random_state) match the BOW and TF-IDF splits.
X_train_ft, X_test_ft, y_train_ft, y_test_ft = train_test_split(
    repr_recenzie,
    reviews['positive'],
    train_size=0.8,
    random_state=42)

In [None]:
# Shape of the FastText training features
X_train_ft.shape

In [None]:
# Shape of the FastText test features
X_test_ft.shape

## FastText + Logistic Regression

In [None]:
# Fit a logistic regression on the averaged FastText training vectors.
# NOTE(review): default hyperparameters are used; confirm the solver converges.
m3_ft = LogisticRegression()
m3_ft.fit(X_train_ft, y_train_ft)

In [None]:
# Predictions on the test set
ypred3_ft = m3_ft.predict(X_test_ft)

In [None]:
# Performance evaluation

# Accuracy on the training and test splits (score() is evaluated on each split)
print('Acuratetea pe setul de training',
      m3_ft.score(X_train_ft, y_train_ft))
print('Acuratetea pe setul de test',
      m3_ft.score(X_test_ft, y_test_ft))
# Per-class precision / recall / F1 on the test split, using the predictions computed earlier
print('Classification Report pe setul de test \n',
      classification_report(y_test_ft, ypred3_ft))