## Pre-processing

In [1]:
from google.colab import drive
drive.mount('/content/drive')

filename1 = 'drive/MyDrive/TextAnalytics/training_set_1.csv'
filename2 = 'drive/MyDrive/TextAnalytics/training_set_2.csv'
filename3 = 'drive/MyDrive/TextAnalytics/test_set.csv'

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV

In [3]:
df_train1 = pd.read_csv(filename1)
df_train2 = pd.read_csv(filename2)
df_test = pd.read_csv(filename3)

df_train1.head(2)

Unnamed: 0.1,Unnamed: 0,review_rating,date,year,review_title,body,product_title,product_rating,language,english,pos_review,body_tok,body_tok_ngrams,body_lem_ngrams
0,27931,2,"September 13, 2018",2018,Bad battery,I’m happy with the way the phone looks but upo...,"Apple iPhone 6S, 64GB, Rose Gold - For AT&T / ...",3.6,en,1,0,"['happy', 'way', 'phone', 'looks', 'upon', 'op...","['happy', 'way', 'phone', 'looks', 'upon', 'op...","['LEM_happy', 'LEM_way', 'LEM_phone', 'LEM_loo..."
1,18619,1,"June 18, 2018",2018,Very bad experience and Amazon didn't help me ...,the brand itself is not a problem. the problem...,Samsung Galaxy S7 SM-G930A AT&T Unlocked Smart...,3.1,en,1,0,"['brand', 'problem', 'problem', 'seller', 'pho...","['brand', 'problem', 'problem', 'seller', 'pho...","['LEM_brand', 'LEM_problem', 'LEM_problem', 'L..."


In [4]:
# Selezione X e Y

train1 = df_train1[['body', 'review_rating']]
train2 = df_train2[['body', 'review_rating']]
test = df_test[['body', 'review_rating']]

train1.head(4)

Unnamed: 0,body,review_rating
0,I’m happy with the way the phone looks but upo...,2
1,the brand itself is not a problem. the problem...,1
2,"After reading a lot of the reviews, which I to...",3
3,arrived on time and was in prefect shape (coul...,5


In [5]:
train2.head(4)

Unnamed: 0,body,review_rating
0,Getting hot after 1 hour use. Though I return ...,1
1,Would not charge,1
2,Purchased this item as a gift for my daughter....,2
3,Always get a good product from this company!,5


In [6]:
# Preparazione due training set
x_train1 = train1['body']
x_train2 = train2['body']
y_train1 = train1['review_rating']
y_train2 = train2['review_rating']

# Preparazione test set
x_test = test['body']
y_test = test['review_rating']

len(x_train1),len(y_train1),len(x_train2),len(y_train2),len(x_test),len(y_test)

(10000, 10000, 10000, 10000, 20000, 20000)

In [7]:
# Funzione per metriche di Evaluation

def model_evaluation(real_v, pred_v):
    print(f"Accuracy score: {accuracy_score(real_v, pred_v)}")
    print("Classification report:")
    print(classification_report(real_v, pred_v))
    cm = confusion_matrix(real_v, pred_v)
    print (f"Confusion matrix \n {cm}")

## SVM con CountVectorizer

#### Training set 1

In [8]:
# Preprocessing
vectorizer = CountVectorizer(min_df = 5, ngram_range = (1,3)) # scelta n degli n-grams
vectorizer.fit(x_train1)

# Training
training1_features = vectorizer.transform(x_train1)
test_features = vectorizer.transform(x_test)
learner = SVC()
classifier1 = learner.fit(training1_features, y_train1)

# Predizioni
predictions1 = classifier1.predict(test_features)

# Evaluation
model_evaluation(y_test, predictions1)

Accuracy score: 0.6907
Classification report:
              precision    recall  f1-score   support

           1       0.58      0.76      0.66      3709
           2       0.50      0.00      0.00      1018
           3       0.27      0.02      0.04      1348
           4       0.47      0.01      0.02      2518
           5       0.73      0.96      0.83     11407

    accuracy                           0.69     20000
   macro avg       0.51      0.35      0.31     20000
weighted avg       0.63      0.69      0.60     20000

Confusion matrix 
 [[ 2834     1     7     1   866]
 [  639     1    11     2   365]
 [  582     0    31    13   722]
 [  350     0    38    32  2098]
 [  443     0    28    20 10916]]


#### Training set 2

In [9]:
# Preprocessing
vectorizer = CountVectorizer(min_df = 5, ngram_range = (1,3)) # scelta n degli n-grams
vectorizer.fit(x_train2)

# Training
training2_features = vectorizer.transform(x_train2)
test_features = vectorizer.transform(x_test)
learner = SVC()
classifier2 = learner.fit(training2_features, y_train2)

# Predizioni
predictions2 = classifier2.predict(test_features)

# Evaluation
model_evaluation(y_test, predictions2)

Accuracy score: 0.6873
Classification report:
              precision    recall  f1-score   support

           1       0.62      0.71      0.66      3709
           2       1.00      0.00      0.00      1018
           3       0.56      0.01      0.01      1348
           4       0.36      0.00      0.01      2518
           5       0.71      0.97      0.82     11407

    accuracy                           0.69     20000
   macro avg       0.65      0.34      0.30     20000
weighted avg       0.65      0.69      0.59     20000

Confusion matrix 
 [[ 2634     0     1     2  1072]
 [  574     2     3     2   437]
 [  498     0     9     7   834]
 [  265     0     3    10  2240]
 [  309     0     0     7 11091]]


## SVM con TF-IDF

#### Training set 1

In [10]:
# Preprocessing
vectorizer = TfidfVectorizer(min_df=5,ngram_range=(1, 3)) # scelta n degli n-grams
vectorizer.fit(x_train1)

# Training
training1_features = vectorizer.transform(x_train1)
test_features = vectorizer.transform(x_test)
learner = SVC()
classifier1 = learner.fit(training1_features, y_train1)

# Predizioni
predictions1 = classifier1.predict(test_features)

# Evaluation
model_evaluation(y_test, predictions1)

Accuracy score: 0.71785
Classification report:
              precision    recall  f1-score   support

           1       0.61      0.89      0.72      3709
           2       0.00      0.00      0.00      1018
           3       0.31      0.02      0.04      1348
           4       0.45      0.04      0.07      2518
           5       0.77      0.96      0.85     11407

    accuracy                           0.72     20000
   macro avg       0.43      0.38      0.34     20000
weighted avg       0.63      0.72      0.63     20000

Confusion matrix 
 [[ 3290     5     6     3   405]
 [  738     0    10     8   262]
 [  650     1    30    37   630]
 [  335     1    26    93  2063]
 [  373     0    25    65 10944]]


#### Training set 2

In [11]:
# Preprocessing
vectorizer = TfidfVectorizer(min_df=5,ngram_range=(1, 3)) # scelta n degli n-grams
vectorizer.fit(x_train2)

# Training
training2_features = vectorizer.transform(x_train2)
test_features = vectorizer.transform(x_test)
learner = SVC()
classifier2 = learner.fit(training2_features, y_train2)

# Predizioni
predictions2 = classifier2.predict(test_features)

# Evaluation
model_evaluation(y_test, predictions2)

Accuracy score: 0.7161
Classification report:
              precision    recall  f1-score   support

           1       0.64      0.85      0.73      3709
           2       0.29      0.00      0.00      1018
           3       0.48      0.01      0.02      1348
           4       0.50      0.02      0.04      2518
           5       0.74      0.97      0.84     11407

    accuracy                           0.72     20000
   macro avg       0.53      0.37      0.33     20000
weighted avg       0.65      0.72      0.62     20000

Confusion matrix 
 [[ 3147     2     1     6   553]
 [  687     2     2     6   321]
 [  572     1    16    17   742]
 [  250     0    12    48  2208]
 [  275     2     2    19 11109]]


## SVM con TF-IDF e Grid Search

L'approccio con TF-IDF ha portato ai risultati migliori. Cerchiamo di migliorare il modello con il tuning degli iperparametri.

#### Training set 1

In [12]:
# Definisco parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
 
# Preprocessing
vectorizer = TfidfVectorizer(min_df=5,ngram_range=(1, 3)) # scelta n degli n-grams
vectorizer.fit(x_train1)

# Training
training1_features = vectorizer.transform(x_train1)
test_features = vectorizer.transform(x_test)

learner = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) # aggiungo grid search
classifier1 = learner.fit(training1_features, y_train1)

# Predizioni
predictions1 = classifier1.predict(test_features)

# Evaluation
model_evaluation(y_test, predictions1)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.499 total time=  54.3s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.500 total time=  54.2s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.502 total time=  56.2s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.499 total time=  56.3s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.501 total time=  53.4s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.497 total time=  47.5s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.497 total time=  47.5s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.497 total time=  48.9s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.496 total time=  48.9s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.497 total time=  46.5s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.497 total time=  44.7s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

#### Training set 2

In [13]:
# Definisco parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
 
# Preprocessing
vectorizer = TfidfVectorizer(min_df=5,ngram_range=(1, 3)) # scelta n degli n-grams
vectorizer.fit(x_train2)

# Training
training2_features = vectorizer.transform(x_train2)
test_features = vectorizer.transform(x_test)

learner = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) # aggiungo grid search
classifier2 = learner.fit(training2_features, y_train2)

# Predizioni
predictions2 = classifier2.predict(test_features)

# Evaluation
model_evaluation(y_test, predictions2)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.526 total time= 1.1min
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.526 total time= 1.1min
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.526 total time= 1.2min
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.526 total time= 1.2min
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.526 total time= 1.1min
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.526 total time=  55.3s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.526 total time=  56.4s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.526 total time=  56.9s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.526 total time=  58.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.526 total time=  56.3s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.526 total time=  51.5s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf