## Pre-processing

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive

drive.mount('/content/drive')

# Locations of the two training sets and of the test set.
# NOTE(review): these are relative paths; they presumably rely on the working
# directory being /content — confirm.
filename1, filename2, filename3 = (
    'drive/MyDrive/TextAnalytics/training_set_1.csv',
    'drive/MyDrive/TextAnalytics/training_set_2.csv',
    'drive/MyDrive/TextAnalytics/test_set.csv',
)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# Merge the four dataframes into a single one

filename1 = 'drive/MyDrive/TextAnalytics/training_set_1.csv'
filename2 = 'drive/MyDrive/TextAnalytics/training_set_2.csv'
filename3 = 'drive/MyDrive/TextAnalytics/test_set.csv'
filename4 = 'drive/MyDrive/TextAnalytics/remaining_set.csv'

df_train1 = pd.read_csv(filename1)
df_train2 = pd.read_csv(filename2)
df_test = pd.read_csv(filename3)
df_remaining = pd.read_csv(filename4)

frames = [df_train1, df_train2, df_test, df_remaining]
# ignore_index=True gives the merged frame a fresh 0..n-1 index; without it the
# row labels of each CSV are kept and the merged index contains duplicates,
# which makes any later label-based lookup (df.loc[...]) ambiguous.
df = pd.concat(frames, ignore_index=True)

# Keep only the columns used in the rest of the notebook
df = df[['body', 'body_tok', 'pos_review', 'review_rating']]

len(df)

53337

In [None]:
# Pick the feature column (X) and the target column (y), then split them
# into train and test sets

X, y = df['body'], df['pos_review']

x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Helper that prints the evaluation metrics

def model_evaluation(real_v, pred_v):
    """Print accuracy, classification report and confusion matrix.

    Args:
        real_v: Ground-truth labels.
        pred_v: Predicted labels, compared against ``real_v``.

    Returns:
        None. Every metric is written to standard output.
    """
    # The label previously read "Accuracy sore" (typo).
    print(f"Accuracy score: {accuracy_score(real_v, pred_v)}")
    print("Classification report:")
    print(classification_report(real_v, pred_v))
    cm = confusion_matrix(real_v, pred_v)
    print(f"Confusion matrix \n {cm}")

## SVM con CountVectorizer

In [None]:
# Preprocessing: raw term counts over 1- to 3-grams (ngram_range sets the n-gram sizes)
vectorizer = CountVectorizer(min_df = 5, ngram_range = (1,3))

# Training
# fit_transform learns the vocabulary and vectorizes x_train in a single pass,
# instead of processing the training corpus once in fit() and again in transform().
training_features = vectorizer.fit_transform(x_train)
test_features = vectorizer.transform(x_test)
learner = SVC()
classifier = learner.fit(training_features, y_train)

# Predictions
predictions = classifier.predict(test_features)

# Evaluation
model_evaluation(y_test, predictions)

Accuracy sore: 0.8895580047721849
Classification report:
              precision    recall  f1-score   support

           0       0.88      0.76      0.82      5692
           1       0.89      0.95      0.92     11910

    accuracy                           0.89     17602
   macro avg       0.89      0.86      0.87     17602
weighted avg       0.89      0.89      0.89     17602

Confusion matrix 
 [[ 4341  1351]
 [  593 11317]]


## SVM con TF-IDF

In [None]:
# Preprocessing: TF-IDF weights over 1- to 3-grams (ngram_range sets the n-gram sizes)
vectorizer = TfidfVectorizer(min_df=5,ngram_range=(1, 3))

# Training
# fit_transform learns vocabulary and IDF weights and vectorizes x_train in a
# single pass, instead of processing the training corpus once in fit() and
# again in transform().
training_features = vectorizer.fit_transform(x_train)
test_features = vectorizer.transform(x_test)
learner = SVC()
classifier = learner.fit(training_features, y_train)

# Predictions
predictions = classifier.predict(test_features)

# Evaluation
model_evaluation(y_test, predictions)

Accuracy sore: 0.9142711055561868
Classification report:
              precision    recall  f1-score   support

           0       0.89      0.84      0.86      5692
           1       0.93      0.95      0.94     11910

    accuracy                           0.91     17602
   macro avg       0.91      0.90      0.90     17602
weighted avg       0.91      0.91      0.91     17602

Confusion matrix 
 [[ 4785   907]
 [  602 11308]]


## SVM con TF-IDF e Grid Search

L'approccio con TF-IDF ha portato ai risultati migliori. Cerchiamo di migliorare il modello con il tuning degli iperparametri.

In [None]:
# Define the hyperparameter grid explored by the search
param_grid = {'C': [0.1, 1, 10],
              'gamma': [1, 0.1, 0.01],
              'kernel': ['rbf']}

# Preprocessing: TF-IDF weights over 1- to 3-grams (ngram_range sets the n-gram sizes)
vectorizer = TfidfVectorizer(min_df=5,ngram_range=(1, 3))

# Training
# fit_transform learns vocabulary and IDF weights and vectorizes x_train in a
# single pass, instead of processing the training corpus once in fit() and
# again in transform().
# NOTE(review): the vectorizer is fitted on the whole training set before the
# cross-validation below, so every validation fold contributes to the
# vocabulary/IDF statistics. Wrapping vectorizer + SVC in a Pipeline and
# searching over that would keep each fold fully held out — consider it.
training_features = vectorizer.fit_transform(x_train)
test_features = vectorizer.transform(x_test)

# Grid search over SVC; refit=True retrains the best candidate on all training features
learner = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
classifier = learner.fit(training_features, y_train)

# Predictions
predictions = classifier.predict(test_features)

# Evaluation
model_evaluation(y_test, predictions)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.779 total time= 8.3min
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.783 total time= 8.2min
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.783 total time= 8.2min
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.785 total time= 8.3min
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.786 total time= 8.2min
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.690 total time= 7.7min
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.689 total time= 7.9min
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.687 total time= 7.7min
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.689 total time= 7.8min
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.688 total time= 7.7min
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.680 total time= 8.4min
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf;,