## Pre-processing

In [None]:
from google.colab import drive

# Mount Google Drive and build the dataset paths from the same mount point.
# The paths used to be relative ('drive/MyDrive/...'), which only resolved
# while the working directory happened to be /content; anchoring them to the
# mount point keeps them valid regardless of the current directory.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

DATA_DIR = f'{MOUNT_POINT}/MyDrive/TextAnalytics'
filename1 = f'{DATA_DIR}/training_set_1.csv'
filename2 = f'{DATA_DIR}/training_set_2.csv'
filename3 = f'{DATA_DIR}/test_set.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the two training sets and the test set, in that order, from the CSV
# paths stored in filename1, filename2 and filename3.
df_train1, df_train2, df_test = (
    pd.read_csv(csv_path) for csv_path in (filename1, filename2, filename3)
)

# Preview the first two rows of training set 1.
df_train1.head(2)

Unnamed: 0.1,Unnamed: 0,review_rating,date,year,review_title,body,product_title,product_rating,language,english,pos_review,body_tok,body_tok_ngrams,body_lem_ngrams
0,34840,1,"July 12, 2018",2018,One Star,"I hate this, Idont like, is the worse phone in...",Nokia 3310 3G - Unlocked Single SIM Feature Ph...,3.4,en,1,0,"['hate', 'idont', 'like', 'worse', 'phone', 'w...","['hate', 'idont', 'like', 'worse', 'phone', 'w...","['LEM_hate', 'LEM_idont', 'LEM_like', 'LEM_bad..."
1,39787,5,"July 12, 2018",2018,Amazing value for what it does,This phone has surprised me beyond measure. Th...,ASUS ZenFone Max Plus ZB570TL-MT67-3G32G-BL - ...,3.5,en,1,1,"['phone', 'surprised', 'beyond', 'measure', 't...","['phone', 'surprised', 'beyond', 'measure', 't...","['LEM_phone', 'LEM_surprise', 'LEM_beyond', 'L..."


In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag

# Download the NLTK resources requested by this notebook (same order as before).
for resource in ('wordnet', 'averaged_perceptron_tagger', 'punkt', 'stopwords'):
    nltk.download(resource)

# English stopwords and the WordNet lemmatizer shared by create_lemmas.
stopword_list = stopwords.words('english')
wnl = WordNetLemmatizer()

# Lookup from the first letter of a POS tag to a WordNet part of speech;
# any letter other than J, V or R falls back to noun.
tag_map = defaultdict(
    lambda: wordnet.NOUN,
    J=wordnet.ADJ,
    V=wordnet.VERB,
    R=wordnet.ADV,
)


def create_lemmas(series):
  """Turn each text of `series` into a space-separated string of prefixed lemmas.

  Every text is lower-cased and tokenized with `word_tokenize`; tokens found in
  `stopword_list` or that are not purely alphanumeric are dropped. The remaining
  tokens are POS-tagged, lemmatized with `wnl` using the WordNet part of speech
  looked up in `tag_map` (by the first letter of the tag), and prefixed with
  'LEM_'.

  Args:
    series: iterable of texts, typically a pandas Series. Missing values
      (None/NaN) are treated as empty text; other non-string values are
      converted with `str`.

  Returns:
    A pandas Series with one lemma string per input element. When `series` is
    a pandas Series its index is preserved, so the result can be assigned back
    to the originating DataFrame without misaligning rows.
  """
  # Set membership is O(1); scanning the stopword list for every token is not.
  stop_words = set(stopword_list)

  new_observations = []

  for text in series:
    # Missing or non-string cells have no .lower(); normalize them first.
    if not isinstance(text, str):
      text = '' if pd.isna(text) else str(text)

    tokens = [
        token for token in word_tokenize(text.lower())
        if token not in stop_words and token.isalnum()
    ]

    lemmas = [
        'LEM_' + wnl.lemmatize(token, tag_map[tag[0]])
        for token, tag in pos_tag(tokens)
    ]
    new_observations.append(" ".join(lemmas))

  # Keep the caller's index; a fresh RangeIndex would misalign (and yield NaN)
  # when the result is assigned to a DataFrame whose index is not 0..n-1.
  index = series.index if isinstance(series, pd.Series) else None
  return pd.Series(new_observations, index=index)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Add a 'body_lem' column with the lemmatized review body to each dataset
# (test set first, then the two training sets).
for frame in (df_test, df_train1, df_train2):
    frame['body_lem'] = create_lemmas(frame['body'])

In [None]:
# Display the lemmatized bodies of the test set.
df_test.loc[:, 'body_lem']

0                              LEM_come LEM_ac LEM_charger
1        LEM_16 LEM_year LEM_old LEM_love LEM_phone LEM...
2        LEM_would LEM_give LEM_5 LEM_start LEM_phone L...
3        LEM_device LEM_hit LEM_main LEM_point LEM_want...
4        LEM_advertise LEM_unlocked LEM_phone LEM_good ...
                               ...                        
19995    LEM_really LEM_liked LEM_good LEM_condition LE...
19996    LEM_purchase LEM_phone LEM_buyspry LEM_little ...
19997                                  LEM_happy LEM_phone
19998    LEM_phone LEM_month LEM_great LEM_bright LEM_s...
19999    LEM_phone LEM_call LEM_speaker LEM_hear LEM_gr...
Name: body_lem, Length: 20000, dtype: object

In [None]:
# Keep only the feature column (X) and the target column (Y)

selected_columns = ['body_lem', 'review_rating']

train1 = df_train1[selected_columns]
train2 = df_train2[selected_columns]
test = df_test[selected_columns]

# Preview the first four rows of training set 1.
train1.head(4)

Unnamed: 0,body_lem,review_rating
0,LEM_hate LEM_idont LEM_like LEM_bad LEM_phone ...,1
1,LEM_phone LEM_surprise LEM_beyond LEM_measure ...,5
2,LEM_realize LEM_much LEM_really LEM_missed LEM...,5
3,LEM_nice LEM_phone LEM_wish LEM_charge LEM_por...,3


In [None]:
# Preview the first four rows of training set 2.
train2.iloc[:4]

Unnamed: 0,body_lem,review_rating
0,LEM_perfect LEM_size,5
1,LEM_great LEM_product LEM_money LEM_work LEM_f...,5
2,LEM_perfect LEM_month LEM_use,5
3,LEM_phone LEM_really LEM_minor LEM_sign LEM_we...,4


In [None]:
# Features and targets of the two training sets
x_train1, y_train1 = train1['body_lem'], train1['review_rating']
x_train2, y_train2 = train2['body_lem'], train2['review_rating']

# Features and targets of the test set
x_test, y_test = test['body_lem'], test['review_rating']

# Show the number of rows in every split, in the order
# (x_train1, y_train1, x_train2, y_train2, x_test, y_test).
tuple(
    len(split)
    for split in (x_train1, y_train1, x_train2, y_train2, x_test, y_test)
)

(10000, 10000, 10000, 10000, 20000, 20000)

In [None]:
# Helper that prints the evaluation metrics

def model_evaluation(real_v, pred_v):
    """Print accuracy, classification report and confusion matrix.

    Args:
        real_v: ground-truth labels.
        pred_v: predicted labels, in the same order as `real_v`.

    Returns:
        None; the three metrics are written to standard output.
    """
    # Each metric is computed right before it is printed, so the order of
    # output (and of any warning raised by a metric) matches the call order.
    accuracy = accuracy_score(real_v, pred_v)
    print(f"Accuracy score: {accuracy}")

    print("Classification report:")
    report = classification_report(real_v, pred_v)
    print(report)

    cm = confusion_matrix(real_v, pred_v)
    print(f"Confusion matrix \n {cm}")

## SVM con CountVectorizer

#### Training set 1

In [None]:
# Feature extraction: counts of 1- to 3-grams, with min_df=5 as the
# document-frequency cut-off, learned on training set 1
vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=5).fit(x_train1)

# Vectorize the training and test texts with the fitted vocabulary
training1_features, test_features = (
    vectorizer.transform(texts) for texts in (x_train1, x_test)
)

# Train an SVC with its default settings
learner = SVC()
classifier1 = learner.fit(training1_features, y_train1)

# Predict the test ratings
predictions1 = classifier1.predict(test_features)

# Report the metrics on the test set
model_evaluation(y_test, predictions1)

Accuracy sore: 0.69395
Classification report:
              precision    recall  f1-score   support

           1       0.59      0.78      0.67      3595
           2       0.00      0.00      0.00      1021
           3       0.33      0.01      0.02      1375
           4       0.71      0.01      0.02      2550
           5       0.73      0.96      0.83     11459

    accuracy                           0.69     20000
   macro avg       0.47      0.35      0.31     20000
weighted avg       0.64      0.69      0.60     20000

Confusion matrix 
 [[ 2816     0    11     3   765]
 [  625     0     5     1   390]
 [  567     1    16     5   786]
 [  319     1    10    30  2190]
 [  433     0     6     3 11017]]


#### Training set 2

In [None]:
# Feature extraction: counts of 1- to 3-grams, with min_df=5 as the
# document-frequency cut-off
vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=5)

# Learn the vocabulary on training set 2 and vectorize it in one step,
# then vectorize the test set with the same vocabulary
training2_features = vectorizer.fit_transform(x_train2)
test_features = vectorizer.transform(x_test)

# Train an SVC with its default settings
learner = SVC()
classifier2 = learner.fit(training2_features, y_train2)

# Predict the test ratings
predictions2 = classifier2.predict(test_features)

# Report the metrics on the test set
model_evaluation(y_test, predictions2)

Accuracy sore: 0.68685
Classification report:
              precision    recall  f1-score   support

           1       0.62      0.72      0.66      3595
           2       0.00      0.00      0.00      1021
           3       0.33      0.00      0.00      1375
           4       0.56      0.01      0.01      2550
           5       0.71      0.97      0.82     11459

    accuracy                           0.69     20000
   macro avg       0.44      0.34      0.30     20000
weighted avg       0.61      0.69      0.59     20000

Confusion matrix 
 [[ 2576     0     0     2  1017]
 [  541     0     0     3   477]
 [  494     0     1     3   877]
 [  242     0     2    15  2291]
 [  310     0     0     4 11145]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM con TF-IDF

#### Training set 1

In [None]:
# Feature extraction: TF-IDF weights over 1- to 3-grams, with min_df=5 as the
# document-frequency cut-off, learned on training set 1
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=5).fit(x_train1)

# Vectorize the training and test texts with the fitted vocabulary
training1_features, test_features = (
    vectorizer.transform(texts) for texts in (x_train1, x_test)
)

# Train an SVC with its default settings
learner = SVC()
classifier1 = learner.fit(training1_features, y_train1)

# Predict the test ratings
predictions1 = classifier1.predict(test_features)

# Report the metrics on the test set
model_evaluation(y_test, predictions1)

Accuracy sore: 0.7067
Classification report:
              precision    recall  f1-score   support

           1       0.58      0.87      0.70      3595
           2       0.33      0.01      0.02      1021
           3       0.55      0.03      0.05      1375
           4       0.42      0.04      0.07      2550
           5       0.76      0.95      0.84     11459

    accuracy                           0.71     20000
   macro avg       0.53      0.38      0.33     20000
weighted avg       0.65      0.71      0.62     20000

Confusion matrix 
 [[ 3117     6    12    11   449]
 [  719     8     4    12   278]
 [  650     5    36    37   647]
 [  377     2    11    92  2068]
 [  507     3     3    65 10881]]


#### Training set 2

In [None]:
# Feature extraction: TF-IDF weights over 1- to 3-grams, with min_df=5 as the
# document-frequency cut-off
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=5)

# Learn vocabulary and IDF weights on training set 2 and vectorize it in one
# step, then vectorize the test set with the same fitted vectorizer
training2_features = vectorizer.fit_transform(x_train2)
test_features = vectorizer.transform(x_test)

# Train an SVC with its default settings
learner = SVC()
classifier2 = learner.fit(training2_features, y_train2)

# Predict the test ratings
predictions2 = classifier2.predict(test_features)

# Report the metrics on the test set
model_evaluation(y_test, predictions2)

Accuracy sore: 0.7062
Classification report:
              precision    recall  f1-score   support

           1       0.62      0.83      0.71      3595
           2       0.14      0.00      0.00      1021
           3       0.56      0.01      0.01      1375
           4       0.43      0.02      0.03      2550
           5       0.74      0.97      0.84     11459

    accuracy                           0.71     20000
   macro avg       0.50      0.36      0.32     20000
weighted avg       0.63      0.71      0.61     20000

Confusion matrix 
 [[ 2971     8     3     5   608]
 [  642     2     0     9   368]
 [  567     1    10    26   771]
 [  271     3     3    45  2228]
 [  341     0     2    20 11096]]


## SVM con TF-IDF e Grid Search

L'approccio con TF-IDF ha portato ai risultati migliori. Cerchiamo di migliorare il modello con il tuning degli iperparametri.

#### Training set 1

In [None]:
# Hyperparameter grid explored by the search (RBF kernel only)
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

# Feature extraction: TF-IDF weights over 1- to 3-grams, with min_df=5 as the
# document-frequency cut-off, learned on training set 1
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 3))
training1_features = vectorizer.fit_transform(x_train1)
test_features = vectorizer.transform(x_test)

# NOTE(review): the vectorizer is fitted on the whole training set before the
# cross-validation split, so each validation fold has already contributed to
# the vocabulary and IDF weights. CV scores are therefore slightly optimistic;
# wrapping vectorizer + SVC in a Pipeline inside GridSearchCV would remove this.

# Grid search with cross-validation. n_jobs=-1 spreads the candidate fits over
# all available cores instead of running them one after another.
learner = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, n_jobs=-1)
classifier1 = learner.fit(training1_features, y_train1)

# Report which configuration was selected; refit=True means the predictions
# below come from this configuration retrained on the full training set.
print(f"Best parameters: {classifier1.best_params_}")
print(f"Best mean cross-validation score: {classifier1.best_score_:.4f}")

# Predict the test ratings
predictions1 = classifier1.predict(test_features)

# Report the metrics on the test set
model_evaluation(y_test, predictions1)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.497 total time=  25.6s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.494 total time=  25.5s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.497 total time=  25.7s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.498 total time=  25.7s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.496 total time=  26.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.486 total time=  22.2s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.487 total time=  22.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.487 total time=  22.3s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.487 total time=  22.1s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.486 total time=  22.4s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.486 total time=  20.9s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

#### Training set 2

In [None]:
# Hyperparameter grid explored by the search (RBF kernel only)
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

# Feature extraction: TF-IDF weights over 1- to 3-grams, with min_df=5 as the
# document-frequency cut-off, learned on training set 2
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 3))
training2_features = vectorizer.fit_transform(x_train2)
test_features = vectorizer.transform(x_test)

# NOTE(review): the vectorizer is fitted on the whole training set before the
# cross-validation split, so each validation fold has already contributed to
# the vocabulary and IDF weights. CV scores are therefore slightly optimistic;
# wrapping vectorizer + SVC in a Pipeline inside GridSearchCV would remove this.

# Grid search with cross-validation. n_jobs=-1 spreads the candidate fits over
# all available cores instead of running them one after another.
learner = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, n_jobs=-1)
classifier2 = learner.fit(training2_features, y_train2)

# Report which configuration was selected; refit=True means the predictions
# below come from this configuration retrained on the full training set.
print(f"Best parameters: {classifier2.best_params_}")
print(f"Best mean cross-validation score: {classifier2.best_score_:.4f}")

# Predict the test ratings
predictions2 = classifier2.predict(test_features)

# Report the metrics on the test set
model_evaluation(y_test, predictions2)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.533 total time=  27.5s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.532 total time=  27.3s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.533 total time=  27.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.533 total time=  26.9s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.533 total time=  27.3s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.533 total time=  22.7s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.533 total time=  22.4s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.533 total time=  22.2s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.533 total time=  22.2s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.533 total time=  22.4s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.533 total time=  20.9s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf