# Importing Libraries

In [None]:
!pip install ktrain

In [None]:
import pandas as pd
import numpy as np

# Classification Report, Train Test Split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Lexicon
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the 'vader_lexicon' NLTK resource used by the VADER analyzer in the
# lexicon-based sentiment section of this notebook.
nltk.download('vader_lexicon')

# TF-IDF + LR
from gensim.parsing.preprocessing import preprocess_documents
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# BERT
import ktrain
from ktrain import text

# Citirea datelor

In [None]:
# Load the reviews dataset directly from the course's GitHub repository
# (the `?raw=True` query makes GitHub serve the CSV file itself).
url = (
    'https://github.com/berinde/curs-analiza-datelor-complexe'
    '/blob/main/data/input/1.input_data.csv?raw=True'
)
reviews = pd.read_csv(filepath_or_buffer=url)
print(reviews.shape)
reviews.head(n=2)

In [None]:
# Number of reviews per distinct rating value in the raw data.
reviews['rating'].value_counts()

In [None]:
# Exclude the reviews rated exactly 3 and renumber the remaining rows from 0
# so later positional/index-based operations line up.
reviews = reviews.loc[reviews['rating'].ne(3)].reset_index(drop=True)
print(reviews.shape)
reviews.head(n=2)

In [None]:
# Rating distribution again, now that the rating-3 rows have been removed.
reviews['rating'].value_counts()

In [None]:
# Add the binary target column 'positive':
#   0 -> rating below 3, 1 -> every other rating.
reviews['positive'] = np.where(reviews['rating'].lt(3), 0, 1)
# Class balance of the new target.
reviews['positive'].value_counts()

# Sentiment Analysis using VADER Lexicon

In [None]:
# Build the analyzer ONCE and reuse it for every review. Constructing
# SentimentIntensityAnalyzer inside the lambda created a new instance (and
# re-loaded the VADER lexicon) for each row, which is needlessly slow.
vader_analyzer = SentimentIntensityAnalyzer()

# 'compound' is the single aggregated polarity score returned by VADER for a
# text; it is stored per review in the new 'vader_score' column.
reviews['vader_score'] = reviews['text'].apply(
    lambda review_text: vader_analyzer.polarity_scores(review_text)['compound']
)
reviews.head(2)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the VADER scores.
reviews['vader_score'].describe()

In [None]:
# Turn the continuous VADER score into a binary prediction:
# scores >= 0 (including exactly 0) are labelled positive (1), the rest 0.
reviews['vader_class'] = (reviews['vader_score'] >= 0).astype(int)
reviews.head(n=2)

In [None]:
# Precision / recall / F1 of the lexicon-based prediction against the
# rating-derived ground truth, computed over ALL reviews (no train/test split
# is needed because VADER is not trained on this data).
print(
    classification_report(
        y_true=reviews['positive'],
        y_pred=reviews['vader_class'],
    )
)

# Sentiment Analysis using TF-IDF + Logistic Regression
Reference (gensim preprocessing documentation): https://radimrehurek.com/gensim/parsing/preprocessing.html#gensim.parsing.preprocessing.stem_text


In [None]:
# Run gensim's default preprocessing pipeline over every review text; the
# result holds one list of processed tokens per review.
# NOTE(review): per the gensim docs the default filters include lowercasing,
# stripping tags/punctuation/digits, stop-word removal and stemming — confirm
# against the installed gensim version.
corpus = preprocess_documents(reviews['text'])

In [None]:
# Peek at the preprocessed form of the first two reviews.
corpus[0:2]

In [None]:
# Sanity check: number of preprocessed documents.
len(corpus)

In [None]:
# Glue each document's tokens back into one space-separated string, the input
# format expected by the TF-IDF vectorizer below.
corpus_final = list(map(" ".join, corpus))
corpus_final[:2]

In [None]:
# Store the preprocessed text next to the original text, one string per review.
reviews['text_prep'] = corpus_final
reviews.head(2)

In [None]:
# Keep only the top 6000 words for the TF-IDF representation
# (top 6000 ranked by how frequently they occur across the corpus).
# NOTE(review): the vectorizer is fitted on ALL reviews before the train/test
# split further down, so vocabulary/IDF statistics also see the test rows —
# consider fitting on the training portion only.
vec = TfidfVectorizer(max_features=6000)
tf_idf = vec.fit_transform(raw_documents=reviews['text_prep'])

# The selected vocabulary (the 6000 retained words), in column order.
cuvinte = vec.get_feature_names_out()

In [None]:
# Convert the TF-IDF matrix into a DataFrame with one column per word.
# `toarray()` materialises a dense (n_reviews x n_words) array.
dtm_tf_idf = pd.DataFrame(data=tf_idf.toarray(), columns=cuvinte)
print(dtm_tf_idf.shape)
dtm_tf_idf.head(n=2)

In [None]:
# Report the overall value range of the TF-IDF representation: reduce each
# column first, then reduce the per-column results to a single number.
tfidf_min_value = dtm_tf_idf.min().min()
print('Valoarea minima din reprezentarea TF IDF:', tfidf_min_value)
tfidf_max_value = dtm_tf_idf.max().max()
print('Valoarea maxima din reprezentarea TF IDF:', tfidf_max_value)

In [None]:
# Hold out 20% of the reviews for testing; the fixed seed makes the split
# reproducible across runs.
tfidf_split = train_test_split(
    dtm_tf_idf,
    reviews['positive'],
    train_size=0.8,
    random_state=42,
)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = tfidf_split
print(X_train_tfidf.shape, X_test_tfidf.shape)

In [None]:
# Preview the first two training rows of the TF-IDF feature matrix.
X_train_tfidf.head(2)

In [None]:
# Logistic regression with scikit-learn's default hyper-parameters, trained on
# the TF-IDF features of the training split.
model_lr = LogisticRegression()
model_lr.fit(X=X_train_tfidf, y=y_train_tfidf)

In [None]:
# Predicted class labels for the held-out test set.
ypred_test_tfidf = model_lr.predict(X=X_test_tfidf)

In [None]:
# Accuracy on the training split (what `score` returns for a classifier) ...
train_accuracy = model_lr.score(X_train_tfidf, y_train_tfidf)
print('Acuratetea pe setul de training', train_accuracy)

# ... followed by the full precision/recall/F1 report on the test split.
test_report = classification_report(y_test_tfidf, ypred_test_tfidf)
print('Performanta Modelului pe setul de test:\n', test_report)

# Sentiment Analysis using BERT

In [None]:
# Split the raw text + label frame for the BERT pipeline.
# Only one array is passed: the labels already travel inside the frame, so the
# previous second argument produced two outputs that were discarded into `_`
# (which also rebinds IPython's last-output variable `_`).
# train_size and random_state match the TF-IDF split, so both models should be
# evaluated on the same held-out reviews.
data_train, data_test = train_test_split(
    reviews[['text', 'positive']],
    train_size=0.8,
    random_state=42,
)
print(data_train.shape)
print(data_test.shape)

In [None]:
# Convert both frames into BERT-ready inputs. ktrain returns the transformed
# (features, labels) pairs for the training and validation data plus the
# fitted preprocessor, which is reused when building the model.
(X_train, y_train), (X_test, y_test), preprocessor = text.texts_from_df(
    train_df=data_train,
    val_df=data_test,
    text_column='text',
    label_columns='positive',
    preprocess_mode='bert',
    # Maximum sequence length considered for each review. The original note
    # says "words"; with BERT preprocessing this is presumably counted in
    # tokens — TODO confirm.
    maxlen=128,
)

In [None]:
# Build the BERT text classifier; the preprocessor ties the model to the
# input encoding produced in the previous step.
model_bert = text.text_classifier(
    name='bert',
    preproc=preprocessor,
    train_data=(X_train, y_train),
)

In [None]:
# Wrap the model together with its training and validation data in a ktrain
# Learner, the object that drives training and evaluation.
learner = ktrain.get_learner(
    model_bert,
    train_data=(X_train, y_train),
    val_data=(X_test, y_test),
)

In [None]:
# Fine-tune for 2 epochs with ktrain's one-cycle schedule, using 3e-5 as the
# learning rate passed to the schedule.
learner.fit_onecycle(lr=3e-5, epochs=2)

In [None]:
# Evaluate the fine-tuned model on the validation data given to the learner
# (the held-out test split).
learner.validate()