<a href="https://colab.research.google.com/github/amirhossein1376/ml-final-project/blob/main/ml_final_project_ph1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Libraries**

In [None]:
#Notebook
from google.colab import drive
from IPython.display import display

#NLP
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('words')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

from gensim.models import KeyedVectors

#ML
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, plot_confusion_matrix
from sklearn.neural_network import MLPClassifier

import matplotlib.pyplot as plt  

import joblib
#from warnings import simplefilter
#from sklearn.exceptions import ConvergenceWarning
#simplefilter("ignore", category=ConvergenceWarning)

#Other
from collections import Counter
import re
import json
import string

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Mount Drive**

In [None]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


**Config**

In [None]:
# All inputs and outputs share one folder on the mounted Google Drive.
_ML_DIR = "/content/gdrive/MyDrive/ML/"

config = dict(
    train_dataset_path=_ML_DIR + "dataset.csv",
    word2vec_path=_ML_DIR + "word2vec-google-news-300.gz",
    contractions_path=_ML_DIR + "contractions.json",
    model_save_path=_ML_DIR,
)

**Read Data**

In [None]:
# Load the full labelled dataset; later cells read its `comment` and `sentiment` columns.
all_df = pd.read_csv(config["train_dataset_path"])

In [None]:
# Load the contraction -> expansion table (a JSON object) used by the advanced preprocessor.
with open(config["contractions_path"], encoding="utf-8") as f:
    english_contractions = dict(json.load(f))

# One regex that matches any contraction key:
#  * keys are escaped so characters such as "." or "'" are matched literally; otherwise the
#    pattern could match text that is not a key and the dictionary lookup would raise KeyError;
#  * longer keys come first because regex alternation takes the first alternative that
#    matches, so a short key must not shadow a longer key that starts with it.
contractions_re = re.compile(
    '(%s)' % '|'.join(
        re.escape(key) for key in sorted(english_contractions, key=len, reverse=True)
    )
)

**Convert Labels To Binary Numbers**

In [None]:
# Binary target: "negative" -> 0, every other value -> 1.
# NOTE: not idempotent - re-running this cell on the already converted column yields all 1s.
all_df['sentiment'] = (all_df['sentiment'] != "negative").astype("int64")

In [None]:
all_df.head()

Unnamed: 0,comment,sentiment
0,"Oh my god, it just doesn't get any worse than ...",0
1,If you're a layman interested in quantum theor...,0
2,It's amazing that this no talent actor Chapa g...,0
3,This must be one of the most overrated Spanish...,0
4,Some critics have compared Chop Shop with the ...,1


**Preprocessing**

In [None]:
class PreprocessorType:
    """Integer identifiers for the three text-preprocessing levels used in the notebook."""

    WITHOUT_PREPROCESSING = 0
    ELEMENTARY_PREPROCESSING = 1
    ADVANCED_PREPROCESSING = 2

    @staticmethod
    def get_name(t):
        """Return the display name of preprocessing level ``t``.

        Declared as a static method so it can be called both on the class
        (``PreprocessorType.get_name(t)``) and on an instance.

        Returns ``None`` when ``t`` is not one of the known levels.
        """
        if t == PreprocessorType.WITHOUT_PREPROCESSING:
            return "Without Preprocessing"
        if t == PreprocessorType.ELEMENTARY_PREPROCESSING:
            return "Elementary Preprocessing"
        if t == PreprocessorType.ADVANCED_PREPROCESSING:
            return "Advanced Preprocessing"
        return None
    
class Preprocessor:
    """Cleans a pandas Series of texts at one ``PreprocessorType`` level.

    * ``WITHOUT_PREPROCESSING``    - texts are returned unchanged (as a copy).
    * ``ELEMENTARY_PREPROCESSING`` - lower-case, delete ASCII punctuation, tokenize,
      keep purely alphabetic tokens.
    * ``ADVANCED_PREPROCESSING``   - additionally strips HTML markup, expands
      contractions, masks e-mail addresses and URLs, removes English stop words and,
      unless disabled, lemmatizes and stems every remaining token.

    The advanced level relies on the notebook-level ``contractions_re`` and
    ``english_contractions`` objects. The Series being transformed is kept in
    ``self.data`` and overwritten by every call to ``process``.
    """

    def __init__(self, preprocess_type):
        """Store the level and create the NLTK helpers used by the advanced level."""
        self.preprocess_type = preprocess_type
        self.stop_words = stopwords.words('english')
        self.porter = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

    def process_with_elementary_preprocessing(self):
        """Turn every text in ``self.data`` into a list of lower-case alphabetic tokens."""
        # Compile once instead of rebuilding the pattern string for every row.
        punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))

        self.data = self.data.apply(lambda x: x.lower())
        # Punctuation is deleted, not replaced by a space, so "don't" becomes "dont".
        self.data = self.data.apply(lambda x: punctuation_re.sub('', x))
        self.data = self.data.apply(lambda x : word_tokenize(x))
        # Drop every token that is not made of letters only (numbers, mixed tokens, ...).
        self.data = self.data.apply(lambda tkns : [word for word in tkns if word.isalpha()])

    def process_with_advanced_preprocessing(self, without_stem_lemma):
        """Run the advanced pipeline on ``self.data``, leaving a list of tokens per text.

        Args:
            without_stem_lemma: when true, only stop words are removed; otherwise each
                kept token is lemmatized (WordNet) and then stemmed (Porter).
        """
        # Keep only the text content of any HTML markup.
        self.data = self.data.apply(lambda txt : BeautifulSoup(txt, "lxml").text)
        # NOTE(review): contractions are expanded before lower-casing, so a capitalised
        # contraction is only expanded if the table contains that exact spelling.
        self.data = self.data.apply(lambda txt : contractions_re.sub(lambda match : english_contractions[match.group(0)], txt))

        # Mask e-mail addresses and URLs; the elementary step below lower-cases the
        # placeholders and deletes their underscore ("emailadd", "urladd").
        self.data = self.data.apply(lambda x: re.sub(r'[\w.+-]+@[\w-]+\.[\w.-]+', 'EMAIL_ADD',x))
        self.data = self.data.apply(lambda x: re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", 'URL_ADD', x))

        self.process_with_elementary_preprocessing()

        # Membership is tested once per token: use a set (O(1)) rather than the list.
        # Built here, not in __init__, so later edits to self.stop_words are honoured.
        # NOTE(review): punctuation is already gone at this point, so any stop word
        # that contains an apostrophe can never match - confirm this is acceptable.
        stop_words = set(self.stop_words)

        if without_stem_lemma:
            self.data = self.data.apply(lambda tkns : [word for word in tkns if word not in stop_words])
        else:
            self.data = self.data.apply(lambda tkns : [self.porter.stem(self.lemmatizer.lemmatize(word)) for word in tkns if word not in stop_words])

    def process(self, data, without_stem_lemma=False):
        """Preprocess ``data`` according to ``self.preprocess_type``.

        Args:
            data: pandas Series of texts; it is copied, never modified in place.
            without_stem_lemma: forwarded to the advanced pipeline; ignored by the
                other levels.

        Returns:
            A Series of strings. For the elementary and advanced levels each string
            is the kept tokens joined by single spaces; for any other level it is
            the untouched copy of ``data``.
        """
        self.data = data.copy()

        if self.preprocess_type == PreprocessorType.ADVANCED_PREPROCESSING:
            self.process_with_advanced_preprocessing(without_stem_lemma)
        elif self.preprocess_type == PreprocessorType.ELEMENTARY_PREPROCESSING:
            self.process_with_elementary_preprocessing()
        else:
            return self.data

        # Both pipelines leave token lists; join them back into one string per text.
        self.data = self.data.apply(lambda tkns : ' '.join(tkns))
        return self.data

**Make data ready**

In [None]:
# Pre-compute one bag-of-words train/test split per preprocessing level.
# data_arrays maps PreprocessorType -> (X_train, y_train, X_test, y_test).
data_arrays = {}
preprocessing_levels = (
    PreprocessorType.WITHOUT_PREPROCESSING,
    PreprocessorType.ELEMENTARY_PREPROCESSING,
    PreprocessorType.ADVANCED_PREPROCESSING,
)
for pt in preprocessing_levels:
    preprocessor = Preprocessor(pt)
    current_df = all_df.copy()
    current_df["comment"] = preprocessor.process(current_df["comment"])

    # 90/10 split with a fixed seed.
    train_df, test_df = train_test_split(current_df, test_size=0.1, random_state=42)

    # Tokens are whitespace-delimited and case is preserved; the vocabulary is
    # learned from the training part only (min_df=10).
    vectorizer = CountVectorizer(lowercase=False, min_df=10, token_pattern=r"[^\s]+")
    vectorizer.fit(train_df["comment"].tolist())

    X_train = vectorizer.transform(train_df["comment"])
    X_test = vectorizer.transform(test_df["comment"])
    y_train = train_df["sentiment"].tolist()
    y_test = test_df["sentiment"].tolist()

    data_arrays[pt] = (X_train, y_train, X_test, y_test)

In [None]:
class Model:
    """Integer identifiers for the classifiers compared in the notebook."""

    LOGISTIC_REGRESSION = 1
    KNN = 2
    SVM = 3
    MLP = 4

    @staticmethod
    def get_name(m):
        """Return the display name of model id ``m``.

        Declared as a static method so it can be called both on the class
        (``Model.get_name(m)``) and on an instance.

        Returns ``None`` when ``m`` is not one of the known ids.
        """
        if m == Model.LOGISTIC_REGRESSION:
            return "Logistic Regression"
        if m == Model.KNN:
            return "K Nearest Neighbours"
        if m == Model.SVM:
            return "Support Vector Machines"
        if m == Model.MLP:
            return "Multi Layer Perceptron"
        return None
            
def show_grid_res(grid):
    """Summarise a fitted grid search as a table, best-ranked candidate first.

    Keeps only the ``param_*`` columns plus ``mean_test_score`` and
    ``rank_test_score`` of ``grid.cv_results_`` and sorts by rank.
    """
    cv_table = pd.DataFrame(grid.cv_results_)
    summary = cv_table.filter(regex='param_.*|mean_test_score|rank_test_score')
    return summary.sort_values('rank_test_score')

def analysis(labels, predictions):
    """Print the evaluation summary for binary sentiment predictions.

    Prints the per-class classification report, the confusion matrix, the
    accuracy and the F1 score of the positive class (label 1).

    Args:
        labels: true class ids (0 = negative, 1 = positive).
        predictions: predicted class ids, aligned with ``labels``.
    """
    # Pin the label set: without it the classes are inferred from the data, and
    # classification_report raises when only one class is present because
    # target_names always has two entries. The confusion matrix also stays 2x2.
    class_ids = [0, 1]
    print("Classification Report : \n", classification_report(labels, predictions, labels=class_ids, target_names=["negative", "positive"]))
    print("Confusion Matrix : \n", confusion_matrix(labels, predictions, labels=class_ids))
    print("Accuracy : \n", accuracy_score(labels, predictions))
    print("F1 : \n", f1_score(labels, predictions))

# **3.1**

In [None]:
# Hyper-parameter grids for section 3.1, keyed by Model id.
settings = {
    Model.LOGISTIC_REGRESSION : {
        "C" : [0.01, 0.1, 1, 10],
        "tol" : [1e-6, 1e-5, 1e-4]
    },
    Model.KNN : {
        "n_neighbors" : [50, 100, 200, 350, 500],
        "weights" : ["uniform", "distance"],
    },
    # LinearSVC needs C > 0, and the l1 penalty is only supported for the primal
    # problem (dual=False). The previous grid (C=0, l1 with the default dual=True)
    # made those candidates fail and score NaN, so the grid is split in two so
    # that every candidate is a valid combination.
    Model.SVM : [
        {
            "penalty" : ["l2"],
            "C" : [0.001, 0.01, 0.1, 1, 10]
        },
        {
            "penalty" : ["l1"],
            "dual" : [False],
            "C" : [0.001, 0.01, 0.1, 1, 10]
        },
    ]
}

## ***Logistic Regression***

### ***Without preprocessing***

In [None]:
# Grid-search logistic regression on the unprocessed bag-of-words features.
X_train, y_train, X_test, y_test = data_arrays[PreprocessorType.WITHOUT_PREPROCESSING]
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=2000),
    param_grid=settings[Model.LOGISTIC_REGRESSION],
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  9.3min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=2000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10],
                         'tol': [1e-06, 1e-05, 0.0001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
show_grid_res(grid)

Unnamed: 0,param_C,param_tol,mean_test_score,rank_test_score
3,0.1,1e-06,0.889284,1
4,0.1,1e-05,0.889284,1
5,0.1,0.0001,0.889284,1
6,1.0,1e-06,0.884,4
7,1.0,1e-05,0.884,4
8,1.0,0.0001,0.884,4
0,0.01,1e-06,0.878765,7
1,0.01,1e-05,0.878765,7
2,0.01,0.0001,0.878765,7
9,10.0,1e-06,0.877086,10


In [None]:
# Training-split metrics of the refit best estimator: a gauge of overfitting, not of generalisation.
grid_predictions = grid.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.97      0.97      0.97     20274
    positive       0.97      0.97      0.97     20226

    accuracy                           0.97     40500
   macro avg       0.97      0.97      0.97     40500
weighted avg       0.97      0.97      0.97     40500

Confusion Matrix : 
 [[19649   625]
 [  527 19699]]
Accuracy : 
 0.9715555555555555
F1 : 
 0.9715906288532676


In [None]:
# Held-out test metrics of the refit best estimator.
grid_predictions = grid.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.90      0.88      0.89      2225
    positive       0.89      0.90      0.89      2275

    accuracy                           0.89      4500
   macro avg       0.89      0.89      0.89      4500
weighted avg       0.89      0.89      0.89      4500

Confusion Matrix : 
 [[1961  264]
 [ 219 2056]]
Accuracy : 
 0.8926666666666667
F1 : 
 0.894885745375408


### ***With Elementary preprocessing***

In [None]:
# Grid-search logistic regression on the elementary-preprocessed features.
X_train, y_train, X_test, y_test = data_arrays[PreprocessorType.ELEMENTARY_PREPROCESSING]
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=2000),
    param_grid=settings[Model.LOGISTIC_REGRESSION],
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  8.2min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=2000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10],
                         'tol': [1e-06, 1e-05, 0.0001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
show_grid_res(grid)

Unnamed: 0,param_C,param_tol,mean_test_score,rank_test_score
3,0.1,1e-06,0.892938,1
4,0.1,1e-05,0.892938,1
5,0.1,0.0001,0.892938,1
0,0.01,1e-06,0.886642,4
1,0.01,1e-05,0.886642,4
2,0.01,0.0001,0.886642,4
6,1.0,1e-06,0.883877,7
7,1.0,1e-05,0.883877,7
8,1.0,0.0001,0.883877,7
9,10.0,1e-06,0.874222,10


In [None]:
# Training-split metrics of the refit best estimator: a gauge of overfitting, not of generalisation.
grid_predictions = grid.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.96      0.96      0.96     20274
    positive       0.96      0.97      0.96     20226

    accuracy                           0.96     40500
   macro avg       0.96      0.96      0.96     40500
weighted avg       0.96      0.96      0.96     40500

Confusion Matrix : 
 [[19441   833]
 [  707 19519]]
Accuracy : 
 0.9619753086419753
F1 : 
 0.9620484006111686


In [None]:
# Held-out test metrics of the refit best estimator.
grid_predictions = grid.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.90      0.88      0.89      2225
    positive       0.89      0.90      0.89      2275

    accuracy                           0.89      4500
   macro avg       0.89      0.89      0.89      4500
weighted avg       0.89      0.89      0.89      4500

Confusion Matrix : 
 [[1961  264]
 [ 221 2054]]
Accuracy : 
 0.8922222222222222
F1 : 
 0.8944045286305247


### ***With Advanced preprocessing***

In [None]:
# Grid-search logistic regression on the advanced-preprocessed features.
X_train, y_train, X_test, y_test = data_arrays[PreprocessorType.ADVANCED_PREPROCESSING]
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=2000),
    param_grid=settings[Model.LOGISTIC_REGRESSION],
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.3min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=2000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10],
                         'tol': [1e-06, 1e-05, 0.0001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
show_grid_res(grid)

Unnamed: 0,param_C,param_tol,mean_test_score,rank_test_score
3,0.1,1e-06,0.883136,1
4,0.1,1e-05,0.883136,1
5,0.1,0.0001,0.883136,1
0,0.01,1e-06,0.87842,4
1,0.01,1e-05,0.87842,4
2,0.01,0.0001,0.87842,4
6,1.0,1e-06,0.873333,7
7,1.0,1e-05,0.873333,7
8,1.0,0.0001,0.873333,7
9,10.0,1e-06,0.858667,10


In [None]:
# Training-split metrics of the refit best estimator: a gauge of overfitting, not of generalisation.
grid_predictions = grid.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.95      0.94      0.95     20274
    positive       0.94      0.95      0.95     20226

    accuracy                           0.95     40500
   macro avg       0.95      0.95      0.95     40500
weighted avg       0.95      0.95      0.95     40500

Confusion Matrix : 
 [[19113  1161]
 [  984 19242]]
Accuracy : 
 0.947037037037037
F1 : 
 0.9472051982574023


In [None]:
# Held-out test metrics of the refit best estimator.
grid_predictions = grid.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.90      0.88      0.89      2225
    positive       0.88      0.91      0.90      2275

    accuracy                           0.89      4500
   macro avg       0.89      0.89      0.89      4500
weighted avg       0.89      0.89      0.89      4500

Confusion Matrix : 
 [[1955  270]
 [ 212 2063]]
Accuracy : 
 0.8928888888888888
F1 : 
 0.8953993055555555


## ***KNN***

### ***Without preprocessing***

In [None]:
# Grid-search k-nearest-neighbours on the unprocessed bag-of-words features.
X_train, y_train, X_test, y_test = data_arrays[PreprocessorType.WITHOUT_PREPROCESSING]
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=settings[Model.KNN],
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 10.4min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [50, 100, 200, 350, 500],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
show_grid_res(grid)

Unnamed: 0,param_n_neighbors,param_weights,mean_test_score,rank_test_score
3,100,distance,0.64558,1
5,200,distance,0.64484,2
1,50,distance,0.644519,3
2,100,uniform,0.643432,4
0,50,uniform,0.643111,5
4,200,uniform,0.640815,6
7,350,distance,0.639975,7
9,500,distance,0.638395,8
6,350,uniform,0.635309,9
8,500,uniform,0.633111,10


In [None]:
# Training-split metrics of the refit best estimator.
# NOTE(review): if the selected model uses weights='distance', each training point is
# presumably its own zero-distance neighbour, which would make a perfect training score
# expected rather than informative - confirm before interpreting it.
grid_predictions = grid.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       1.00      1.00      1.00     20274
    positive       1.00      1.00      1.00     20226

    accuracy                           1.00     40500
   macro avg       1.00      1.00      1.00     40500
weighted avg       1.00      1.00      1.00     40500

Confusion Matrix : 
 [[20274     0]
 [    0 20226]]
Accuracy : 
 1.0
F1 : 
 1.0


In [None]:
# Held-out test metrics of the refit best estimator.
grid_predictions = grid.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.75      0.44      0.55      2225
    positive       0.61      0.85      0.71      2275

    accuracy                           0.65      4500
   macro avg       0.68      0.65      0.63      4500
weighted avg       0.68      0.65      0.63      4500

Confusion Matrix : 
 [[ 973 1252]
 [ 331 1944]]
Accuracy : 
 0.6482222222222223
F1 : 
 0.7106561871687077


### ***With Elementary preprocessing***

In [None]:
# Grid-search k-nearest-neighbours on the elementary-preprocessed features.
X_train, y_train, X_test, y_test = data_arrays[PreprocessorType.ELEMENTARY_PREPROCESSING]
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=settings[Model.KNN],
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 10.4min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [50, 100, 200, 350, 500],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
show_grid_res(grid)

Unnamed: 0,param_n_neighbors,param_weights,mean_test_score,rank_test_score
1,50,distance,0.673358,1
3,100,distance,0.673037,2
5,200,distance,0.671827,3
0,50,uniform,0.670815,4
2,100,uniform,0.669654,5
4,200,uniform,0.668395,6
7,350,distance,0.667728,7
6,350,uniform,0.663235,8
9,500,distance,0.662815,9
8,500,uniform,0.658321,10


In [None]:
# Training-split metrics of the refit best estimator.
# NOTE(review): if the selected model uses weights='distance', each training point is
# presumably its own zero-distance neighbour, which would make a perfect training score
# expected rather than informative - confirm before interpreting it.
grid_predictions = grid.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       1.00      1.00      1.00     20274
    positive       1.00      1.00      1.00     20226

    accuracy                           1.00     40500
   macro avg       1.00      1.00      1.00     40500
weighted avg       1.00      1.00      1.00     40500

Confusion Matrix : 
 [[20274     0]
 [    0 20226]]
Accuracy : 
 1.0
F1 : 
 1.0


In [None]:
# Held-out test metrics of the refit best estimator.
grid_predictions = grid.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.76      0.54      0.63      2225
    positive       0.65      0.83      0.73      2275

    accuracy                           0.69      4500
   macro avg       0.70      0.69      0.68      4500
weighted avg       0.70      0.69      0.68      4500

Confusion Matrix : 
 [[1202 1023]
 [ 387 1888]]
Accuracy : 
 0.6866666666666666
F1 : 
 0.7281141534901658


### ***With Advanced preprocessing***

In [None]:
# Grid-search k-nearest-neighbours on the advanced-preprocessed features.
X_train, y_train, X_test, y_test = data_arrays[PreprocessorType.ADVANCED_PREPROCESSING]
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=settings[Model.KNN],
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  6.3min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [50, 100, 200, 350, 500],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
show_grid_res(grid)

Unnamed: 0,param_n_neighbors,param_weights,mean_test_score,rank_test_score
2,100,uniform,0.718667,1
3,100,distance,0.718543,2
5,200,distance,0.718222,3
4,200,uniform,0.716,4
7,350,distance,0.713383,5
6,350,uniform,0.711037,6
0,50,uniform,0.708494,7
1,50,distance,0.708444,8
9,500,distance,0.702642,9
8,500,uniform,0.699802,10


In [None]:
# Training-split metrics of the refit best estimator: a gauge of overfitting, not of generalisation.
grid_predictions = grid.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.85      0.59      0.70     20274
    positive       0.69      0.90      0.78     20226

    accuracy                           0.74     40500
   macro avg       0.77      0.74      0.74     40500
weighted avg       0.77      0.74      0.74     40500

Confusion Matrix : 
 [[11934  8340]
 [ 2049 18177]]
Accuracy : 
 0.7434814814814815
F1 : 
 0.777742121815031


In [None]:
# Held-out test metrics of the refit best estimator.
grid_predictions = grid.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.84      0.54      0.66      2225
    positive       0.67      0.90      0.77      2275

    accuracy                           0.72      4500
   macro avg       0.75      0.72      0.71      4500
weighted avg       0.75      0.72      0.71      4500

Confusion Matrix : 
 [[1208 1017]
 [ 235 2040]]
Accuracy : 
 0.7217777777777777
F1 : 
 0.7651912978244562


## ***SVM***

### ***Without preprocessing***

In [None]:
# Grid-search a linear SVM on the unprocessed bag-of-words features.
X_train, y_train, X_test, y_test = data_arrays[PreprocessorType.WITHOUT_PREPROCESSING]
# random_state pins LinearSVC's internal data shuffling so reruns give the same scores
# (42 matches the seed already used for the train/test split).
grid = GridSearchCV(LinearSVC(max_iter=5000, random_state=42),settings[Model.SVM],scoring='accuracy',n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  6.7min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=5000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0, 0.001, 0.01, 0.1, 1, 10],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
show_grid_res(grid)

Unnamed: 0,param_C,param_penalty,mean_test_score,rank_test_score
5,0.01,l2,0.888321,1
3,0.001,l2,0.882,2
7,0.1,l2,0.87558,3
9,1.0,l2,0.865062,4
11,10.0,l2,0.863235,5
0,0.0,l1,,6
1,0.0,l2,,7
2,0.001,l1,,8
4,0.01,l1,,9
6,0.1,l1,,10


In [None]:
# Training-split metrics of the refit best estimator: a gauge of overfitting, not of generalisation.
grid_predictions = grid.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.98      0.97      0.97     20274
    positive       0.97      0.98      0.97     20226

    accuracy                           0.97     40500
   macro avg       0.97      0.97      0.97     40500
weighted avg       0.97      0.97      0.97     40500

Confusion Matrix : 
 [[19693   581]
 [  466 19760]]
Accuracy : 
 0.9741481481481481
F1 : 
 0.9741908447753099


In [None]:
# Held-out test metrics of the refit best estimator.
grid_predictions = grid.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.90      0.88      0.89      2225
    positive       0.89      0.90      0.89      2275

    accuracy                           0.89      4500
   macro avg       0.89      0.89      0.89      4500
weighted avg       0.89      0.89      0.89      4500

Confusion Matrix : 
 [[1964  261]
 [ 230 2045]]
Accuracy : 
 0.8908888888888888
F1 : 
 0.8928181619733683


### ***With Elementary preprocessing***

In [None]:
# Grid-search a linear SVM on the elementary-preprocessed features.
X_train, y_train, X_test, y_test = data_arrays[PreprocessorType.ELEMENTARY_PREPROCESSING]
# random_state pins LinearSVC's internal data shuffling so reruns give the same scores
# (42 matches the seed already used for the train/test split).
grid = GridSearchCV(LinearSVC(max_iter=5000, random_state=42),settings[Model.SVM],scoring='accuracy',n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  4.9min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=5000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0, 0.001, 0.01, 0.1, 1, 10],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
show_grid_res(grid)

Unnamed: 0,param_C,param_penalty,mean_test_score,rank_test_score
5,0.01,l2,0.891975,1
3,0.001,l2,0.888123,2
7,0.1,l2,0.876963,3
9,1.0,l2,0.862667,4
11,10.0,l2,0.857753,5
0,0.0,l1,,6
1,0.0,l2,,7
2,0.001,l1,,8
4,0.01,l1,,9
6,0.1,l1,,10


In [None]:
# Training-split metrics of the refit best estimator: a gauge of overfitting, not of generalisation.
grid_predictions = grid.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.97      0.96      0.96     20274
    positive       0.96      0.97      0.96     20226

    accuracy                           0.96     40500
   macro avg       0.96      0.96      0.96     40500
weighted avg       0.96      0.96      0.96     40500

Confusion Matrix : 
 [[19513   761]
 [  659 19567]]
Accuracy : 
 0.9649382716049383
F1 : 
 0.9649849583271687


In [None]:
# Held-out test metrics of the refit best estimator.
grid_predictions = grid.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.90      0.88      0.89      2225
    positive       0.88      0.90      0.89      2275

    accuracy                           0.89      4500
   macro avg       0.89      0.89      0.89      4500
weighted avg       0.89      0.89      0.89      4500

Confusion Matrix : 
 [[1952  273]
 [ 226 2049]]
Accuracy : 
 0.8891111111111111
F1 : 
 0.8914509462693061


### ***With Advanced preprocessing***

In [None]:
# Grid-search a linear SVM on the advanced-preprocessed features.
X_train, y_train, X_test, y_test = data_arrays[PreprocessorType.ADVANCED_PREPROCESSING]
# random_state pins LinearSVC's internal data shuffling so reruns give the same scores
# (42 matches the seed already used for the train/test split).
grid = GridSearchCV(LinearSVC(max_iter=5000, random_state=42),settings[Model.SVM],scoring='accuracy',n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.4min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=5000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0, 0.001, 0.01, 0.1, 1, 10],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
show_grid_res(grid)

Unnamed: 0,param_C,param_penalty,mean_test_score,rank_test_score
5,0.01,l2,0.882889,1
3,0.001,l2,0.879407,2
7,0.1,l2,0.868593,3
9,1.0,l2,0.847407,4
11,10.0,l2,0.838642,5
0,0.0,l1,,6
1,0.0,l2,,7
2,0.001,l1,,8
4,0.01,l1,,9
6,0.1,l1,,10


In [None]:
# Training-split metrics of the refit best estimator: a gauge of overfitting, not of generalisation.
grid_predictions = grid.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.95      0.95      0.95     20274
    positive       0.95      0.95      0.95     20226

    accuracy                           0.95     40500
   macro avg       0.95      0.95      0.95     40500
weighted avg       0.95      0.95      0.95     40500

Confusion Matrix : 
 [[19164  1110]
 [  923 19303]]
Accuracy : 
 0.9498024691358025
F1 : 
 0.9499741627500677


In [None]:
# Held-out test metrics of the refit best estimator.
grid_predictions = grid.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.90      0.87      0.89      2225
    positive       0.88      0.90      0.89      2275

    accuracy                           0.89      4500
   macro avg       0.89      0.89      0.89      4500
weighted avg       0.89      0.89      0.89      4500

Confusion Matrix : 
 [[1946  279]
 [ 219 2056]]
Accuracy : 
 0.8893333333333333
F1 : 
 0.8919739696312363


# **3.2**

In [None]:
# Hyper-parameter grids for the bag-of-words experiments of section 3.2, keyed by model.
# This rebinds the notebook-wide `settings` name used by the earlier searches.
settings = {
    Model.LOGISTIC_REGRESSION : {
        "C" : [0.04, 0.08, 0.16, 0.32, 0.64, 1]
    },
    Model.KNN : {
        "n_neighbors" : [50, 100, 150, 200, 250, 300, 350, 500],
        "weights" : ["uniform", "distance"],
    },
    Model.SVM : {
        "loss" : ["hinge", "squared_hinge"],
        # NOTE(review): the leading 0.05 breaks the otherwise doubling sequence
        # (0.01, 0.02, 0.04, ...); possibly 0.005 was intended -- confirm.
        "C" : [0.05, 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28]
    }
}

## ***Bag Of Words***

In [None]:
# Run the advanced preprocessing over the comments, working on a copy of all_df.
preprocessor = Preprocessor(PreprocessorType.ADVANCED_PREPROCESSING)
current_df = all_df.copy()
current_df["comment"] = preprocessor.process(current_df["comment"])

In [None]:
# Peek at the first preprocessed rows.
current_df.head()

Unnamed: 0,comment,sentiment
0,oh god get wors alway love silli littl scifi b...,0
1,layman interest quantum theori string theori r...,0
2,amaz talent actor chapa got well known star ap...,0
3,must one overr spanish film histori lack subtl...,0
4,critic compar chop shop theatric releas citi g...,1


In [None]:
# Hold out 10% of the preprocessed comments for testing (fixed seed for a repeatable split).
train_df, test_df = train_test_split(current_df, random_state=42, test_size=0.1)

# Bag-of-words counts: tokens are whitespace-delimited runs, case is left as-is,
# and terms seen in fewer than 10 training documents are dropped.
# The vocabulary is learned from the training comments only.
vectorizer = CountVectorizer(
    token_pattern=r"[^\s]+",
    min_df=10,
    lowercase=False,
)
vectorizer.fit(train_df["comment"].tolist())

# Count matrices and plain-list labels for both splits.
X_train, X_test = (vectorizer.transform(frame["comment"]) for frame in (train_df, test_df))
y_train, y_test = (frame["sentiment"].tolist() for frame in (train_df, test_df))

### ***Logistic Regression***

In [None]:
# Accuracy-scored grid search over C for logistic regression on the bag-of-words features.
grid_bow_lr = GridSearchCV(
    estimator=LogisticRegression(max_iter=2000),
    param_grid=settings[Model.LOGISTIC_REGRESSION],
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid_bow_lr.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   53.7s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=2000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.04, 0.08, 0.16, 0.32, 0.64, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
# Tabulate the cross-validation results of the bag-of-words logistic-regression search.
show_grid_res(grid_bow_lr)

Unnamed: 0,param_C,mean_test_score,rank_test_score
1,0.08,0.883951,1
0,0.04,0.883284,2
2,0.16,0.882568,3
3,0.32,0.880444,4
4,0.64,0.876716,5
5,1.0,0.873333,6


In [None]:
# Score the refit best logistic regression on the training split.
grid_predictions = grid_bow_lr.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.95      0.94      0.94     20274
    positive       0.94      0.95      0.94     20226

    accuracy                           0.94     40500
   macro avg       0.94      0.94      0.94     40500
weighted avg       0.94      0.94      0.94     40500

Confusion Matrix : 
 [[19039  1235]
 [ 1046 19180]]
Accuracy : 
 0.943679012345679
F1 : 
 0.9438744125390616


In [None]:
# Same evaluation on the held-out test split.
grid_predictions = grid_bow_lr.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.90      0.88      0.89      2225
    positive       0.89      0.91      0.90      2275

    accuracy                           0.89      4500
   macro avg       0.89      0.89      0.89      4500
weighted avg       0.89      0.89      0.89      4500

Confusion Matrix : 
 [[1960  265]
 [ 212 2063]]
Accuracy : 
 0.894
F1 : 
 0.8963719313491201


### ***KNN***

In [None]:
# Accuracy-scored grid search for KNN on the bag-of-words features.
# NOTE(review): unlike the sibling searches this one is bound to the generic
# name `grid`, replacing the SVM search object from the previous section.
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=settings[Model.KNN],
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  9.9min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [50, 100, 150, 200, 250, 300, 350, 500],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
# Tabulate the cross-validation results of the bag-of-words KNN search.
show_grid_res(grid)

Unnamed: 0,param_n_neighbors,param_weights,mean_test_score,rank_test_score
2,100,uniform,0.718667,1
5,150,distance,0.718617,2
3,100,distance,0.718543,3
4,150,uniform,0.718444,4
7,200,distance,0.718222,5
6,200,uniform,0.716,6
9,250,distance,0.713926,7
11,300,distance,0.713556,8
13,350,distance,0.713383,9
8,250,uniform,0.71242,10


In [None]:
# Score the refit best KNN on the training split.
grid_predictions = grid.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.85      0.59      0.70     20274
    positive       0.69      0.90      0.78     20226

    accuracy                           0.74     40500
   macro avg       0.77      0.74      0.74     40500
weighted avg       0.77      0.74      0.74     40500

Confusion Matrix : 
 [[11934  8340]
 [ 2049 18177]]
Accuracy : 
 0.7434814814814815
F1 : 
 0.777742121815031


In [None]:
# Same evaluation on the held-out test split.
grid_predictions = grid.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.84      0.54      0.66      2225
    positive       0.67      0.90      0.77      2275

    accuracy                           0.72      4500
   macro avg       0.75      0.72      0.71      4500
weighted avg       0.75      0.72      0.71      4500

Confusion Matrix : 
 [[1208 1017]
 [ 235 2040]]
Accuracy : 
 0.7217777777777777
F1 : 
 0.7651912978244562


### ***SVM***

In [None]:
# Accuracy-scored grid search over loss and C for LinearSVC on the bag-of-words features.
grid_bow_svm = GridSearchCV(
    estimator=LinearSVC(max_iter=5000),
    param_grid=settings[Model.SVM],
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid_bow_svm.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  4.8min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=5000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.05, 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64,
                               1.28],
                         'loss': ['hinge', 'squared_hinge']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
# Tabulate the cross-validation results of the bag-of-words SVM search.
show_grid_res(grid_bow_svm)

Unnamed: 0,param_C,param_loss,mean_test_score,rank_test_score
3,0.01,squared_hinge,0.882889,1
4,0.02,hinge,0.882198,2
2,0.01,hinge,0.881506,3
6,0.04,hinge,0.881259,4
5,0.02,squared_hinge,0.881037,5
0,0.05,hinge,0.880444,6
7,0.04,squared_hinge,0.877111,7
8,0.08,hinge,0.877012,8
1,0.05,squared_hinge,0.874914,9
10,0.16,hinge,0.87042,10


In [None]:
# Score the refit best SVM on the training split.
grid_predictions = grid_bow_svm.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.95      0.95      0.95     20274
    positive       0.95      0.95      0.95     20226

    accuracy                           0.95     40500
   macro avg       0.95      0.95      0.95     40500
weighted avg       0.95      0.95      0.95     40500

Confusion Matrix : 
 [[19164  1110]
 [  923 19303]]
Accuracy : 
 0.9498024691358025
F1 : 
 0.9499741627500677


In [None]:
# Same evaluation on the held-out test split.
grid_predictions = grid_bow_svm.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.90      0.87      0.89      2225
    positive       0.88      0.90      0.89      2275

    accuracy                           0.89      4500
   macro avg       0.89      0.89      0.89      4500
weighted avg       0.89      0.89      0.89      4500

Confusion Matrix : 
 [[1946  279]
 [ 219 2056]]
Accuracy : 
 0.8893333333333333
F1 : 
 0.8919739696312363


## ***Word2Vec***

In [None]:
# Re-run the advanced preprocessing on a fresh copy of all_df, this time with
# without_stem_lemma=True -- presumably so tokens stay whole words that can be
# looked up in the pretrained Word2Vec vocabulary (confirm in Preprocessor).
preprocessor = Preprocessor(PreprocessorType.ADVANCED_PREPROCESSING)
current_df = all_df.copy()
current_df["comment"] = preprocessor.process(current_df["comment"], without_stem_lemma=True)

In [None]:
# Peek at the first preprocessed (unstemmed) rows.
current_df.head()

Unnamed: 0,comment,sentiment
0,oh god get worse always love silly little scif...,0
1,layman interested quantum theory string theory...,0
2,amazing talent actor chapa got well known star...,0
3,must one overrated spanish films history lack ...,0
4,critics compared chop shop theatrical releases...,1


In [None]:
# Load the pretrained word vectors (binary word2vec format) from the configured path.
w2v_model = KeyedVectors.load_word2vec_format(config["word2vec_path"], binary=True)

In [None]:
def sentence_2_vector(sentence):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    The sentence is split on single spaces and every token found in the
    global ``w2v_model`` vocabulary contributes its vector to the average;
    unknown tokens are skipped.

    Args:
        sentence: Space-separated text.

    Returns:
        A 1-D float array of length ``w2v_model.vector_size``. If no token
        is in the vocabulary (e.g. an empty comment) the zero vector is
        returned; dividing by a zero count used to yield an all-NaN row,
        which scikit-learn estimators refuse to fit on.
    """
    # Take the dimensionality from the model instead of hard-coding 300.
    vec = np.zeros(w2v_model.vector_size)
    count = 0
    for item in sentence.split(" "):
        if item in w2v_model.vocab:
            vec = vec + w2v_model.word_vec(item)
            count += 1
    if count == 0:
        # Nothing to average: keep the zero vector rather than computing 0/0.
        return vec
    return vec / count

In [None]:
# Embed every comment and stack the per-sentence vectors into an (n_comments, dim) matrix.
vectors = np.vstack(current_df["comment"].apply(sentence_2_vector).to_numpy())

# Replace the text column by the embedding columns W1..W<dim>.
# All feature columns are attached in a single concat: inserting them one by
# one fragments the DataFrame (pandas emits a PerformanceWarning past ~100
# inserts), and taking the width from `vectors` avoids hard-coding 300.
w2v_df = pd.concat(
    [
        current_df.drop(columns="comment"),
        pd.DataFrame(
            vectors,
            index=current_df.index,
            columns=[f"W{i+1}" for i in range(vectors.shape[1])],
        ),
    ],
    axis=1,
)

In [None]:
# Hold out 10% of the embedded comments for testing (fixed seed for a repeatable split).
train_df, test_df = train_test_split(w2v_df, random_state=42, test_size=0.1)

# Features are every column except the label; labels become plain lists.
X_train, X_test = (frame.drop("sentiment", axis=1) for frame in (train_df, test_df))
y_train, y_test = (frame["sentiment"].tolist() for frame in (train_df, test_df))

In [None]:
# Hyper-parameter grids for the Word2Vec experiments, keyed by model.
# Replaces the bag-of-words `settings` defined earlier in section 3.2.
settings = {
    Model.LOGISTIC_REGRESSION : {
        "C" : [0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96]
    },
    Model.KNN : {
        "n_neighbors" : [20, 40, 60, 80, 100, 200],
        "weights" : ["uniform", "distance"],
    },
    Model.SVM : {
        "loss" : ["hinge", "squared_hinge"],
        "C" : [0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96]
    }
}

### ***Logistic Regression***

In [None]:
# Accuracy-scored grid search over C for logistic regression on the Word2Vec features.
grid_w2v_lr = GridSearchCV(
    estimator=LogisticRegression(max_iter=2000),
    param_grid=settings[Model.LOGISTIC_REGRESSION],
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid_w2v_lr.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:  1.3min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=2000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
# Tabulate the cross-validation results of the Word2Vec logistic-regression search.
show_grid_res(grid_w2v_lr)

Unnamed: 0,param_C,mean_test_score,rank_test_score
6,40.96,0.858222,1
5,20.48,0.857926,2
4,10.24,0.857852,3
3,5.12,0.857333,4
2,2.56,0.85637,5
1,1.28,0.854963,6
0,0.64,0.852123,7


In [None]:
# Score the refit best logistic regression on the training split.
grid_predictions = grid_w2v_lr.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.86      0.86      0.86     20274
    positive       0.86      0.87      0.86     20226

    accuracy                           0.86     40500
   macro avg       0.86      0.86      0.86     40500
weighted avg       0.86      0.86      0.86     40500

Confusion Matrix : 
 [[17436  2838]
 [ 2726 17500]]
Accuracy : 
 0.8626172839506173
F1 : 
 0.8628340400354994


In [None]:
# Same evaluation on the held-out test split.
grid_predictions = grid_w2v_lr.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.86      0.86      0.86      2225
    positive       0.86      0.86      0.86      2275

    accuracy                           0.86      4500
   macro avg       0.86      0.86      0.86      4500
weighted avg       0.86      0.86      0.86      4500

Confusion Matrix : 
 [[1907  318]
 [ 313 1962]]
Accuracy : 
 0.8597777777777778
F1 : 
 0.8614709110867179


### ***KNN***

In [None]:
# Accuracy-scored grid search for KNN on the Word2Vec features.
# The recorded run of this cell took about 89 minutes.
grid_w2v_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=settings[Model.KNN],
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid_w2v_knn.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 36.5min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 89.0min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [20, 40, 60, 80, 100, 200],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
# Tabulate the cross-validation results of the Word2Vec KNN search.
show_grid_res(grid_w2v_knn)

Unnamed: 0,param_n_neighbors,param_weights,mean_test_score,rank_test_score
1,20,distance,0.807062,1
3,40,distance,0.806938,2
5,60,distance,0.806247,3
7,80,distance,0.804543,4
9,100,distance,0.803062,5
2,40,uniform,0.80158,6
4,60,uniform,0.80084,7
6,80,uniform,0.799235,8
0,20,uniform,0.799062,9
8,100,uniform,0.797901,10


In [None]:
# Score the refit best KNN on the training split.
# NOTE(review): the top-ranked candidate uses weights="distance", so each
# training point is presumably its own zero-distance neighbour; the perfect
# training scores likely reflect that rather than generalisation.
grid_predictions = grid_w2v_knn.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       1.00      1.00      1.00     20274
    positive       1.00      1.00      1.00     20226

    accuracy                           1.00     40500
   macro avg       1.00      1.00      1.00     40500
weighted avg       1.00      1.00      1.00     40500

Confusion Matrix : 
 [[20274     0]
 [    0 20226]]
Accuracy : 
 1.0
F1 : 
 1.0


In [None]:
# Same evaluation on the held-out test split.
grid_predictions = grid_w2v_knn.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.79      0.83      0.81      2225
    positive       0.83      0.78      0.80      2275

    accuracy                           0.81      4500
   macro avg       0.81      0.81      0.81      4500
weighted avg       0.81      0.81      0.81      4500

Confusion Matrix : 
 [[1851  374]
 [ 492 1783]]
Accuracy : 
 0.8075555555555556
F1 : 
 0.8046028880866427


### ***SVM***

In [None]:
# Accuracy-scored grid search over loss and C for LinearSVC on the Word2Vec features.
grid_w2v_svm = GridSearchCV(
    estimator=LinearSVC(max_iter=5000),
    param_grid=settings[Model.SVM],
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid_w2v_svm.fit(X_train, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed: 10.1min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=5000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96],
                         'loss': ['hinge', 'squared_hinge']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
# Tabulate the cross-validation results of the Word2Vec SVM search.
show_grid_res(grid_w2v_svm)

Unnamed: 0,param_C,param_loss,mean_test_score,rank_test_score
7,5.12,squared_hinge,0.858346,1
12,40.96,hinge,0.858272,2
9,10.24,squared_hinge,0.858247,3
13,40.96,squared_hinge,0.858148,4
10,20.48,hinge,0.858123,5
11,20.48,squared_hinge,0.858123,5
5,2.56,squared_hinge,0.858074,7
3,1.28,squared_hinge,0.857951,8
8,10.24,hinge,0.857901,9
1,0.64,squared_hinge,0.857877,10


In [None]:
# Score the refit best SVM on the training split.
grid_predictions = grid_w2v_svm.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.87      0.86      0.86     20274
    positive       0.86      0.87      0.86     20226

    accuracy                           0.86     40500
   macro avg       0.86      0.86      0.86     40500
weighted avg       0.86      0.86      0.86     40500

Confusion Matrix : 
 [[17414  2860]
 [ 2712 17514]]
Accuracy : 
 0.8624197530864197
F1 : 
 0.8627586206896554


In [None]:
# Same evaluation on the held-out test split.
grid_predictions = grid_w2v_svm.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.86      0.86      0.86      2225
    positive       0.86      0.87      0.86      2275

    accuracy                           0.86      4500
   macro avg       0.86      0.86      0.86      4500
weighted avg       0.86      0.86      0.86      4500

Confusion Matrix : 
 [[1903  322]
 [ 306 1969]]
Accuracy : 
 0.8604444444444445
F1 : 
 0.862461673236969


## ***Save Best Model***

In [None]:
# Persist the refit best estimator of three searches under config["model_save_path"].
# NOTE(review): LR and SVM come from the bag-of-words searches while KNN comes
# from the Word2Vec search, and the fitted CountVectorizer is not saved in this
# cell -- confirm whoever loads these files can rebuild identical features.
joblib.dump(grid_bow_lr.best_estimator_, config["model_save_path"] + 'LR.pkl')
joblib.dump(grid_w2v_knn.best_estimator_, config["model_save_path"] + 'KNN.pkl')
joblib.dump(grid_bow_svm.best_estimator_, config["model_save_path"] + 'SVM.pkl')

['/content/gdrive/MyDrive/ML/SVM.pkl']

# **3.3**

In [None]:
# Grid for the MLP of section 3.3; replaces the earlier `settings` dict.
# Only one architecture is active. The recorded results table further down lists
# the four architectures of the commented-out line, so it appears to come from a
# run with that wider grid.
settings = {
    Model.MLP : {
        "hidden_layer_sizes" : [(450, 100, 50)]
        # "hidden_layer_sizes" : [(100, 100), (100, 200, 50), (450, 100, 50), (400, 500, 400, 200, 100, 50)]
    }
}

In [None]:
# Run the advanced preprocessing (default options) on a fresh copy of all_df.
preprocessor = Preprocessor(PreprocessorType.ADVANCED_PREPROCESSING)
current_df = all_df.copy()
current_df["comment"] = preprocessor.process(current_df["comment"])

In [None]:
# Hold out 10% of the preprocessed comments for testing (fixed seed for a repeatable split).
train_df, test_df = train_test_split(current_df, random_state=42, test_size=0.1)

# Bag-of-words counts: tokens are whitespace-delimited runs, case is left as-is,
# and terms seen in fewer than 10 training documents are dropped.
# The vocabulary is learned from the training comments only.
vectorizer = CountVectorizer(
    token_pattern=r"[^\s]+",
    min_df=10,
    lowercase=False,
)
vectorizer.fit(train_df["comment"].tolist())

# Count matrices and plain-list labels for both splits.
X_train, X_test = (vectorizer.transform(frame["comment"]) for frame in (train_df, test_df))
y_train, y_test = (frame["sentiment"].tolist() for frame in (train_df, test_df))

### ***MLP***

In [None]:
# Accuracy-scored grid search over hidden-layer layouts for an MLP.
# NOTE(review): despite the `d2v` in the name, X_train at this point holds the
# CountVectorizer (bag-of-words) counts built just before this cell.
# NOTE(review): the stored output of this cell shows a bare MLPClassifier repr,
# so it seems to predate this GridSearchCV version -- re-run to refresh it.
grid_d2v_mlp = GridSearchCV(
    estimator=MLPClassifier(max_iter=2000),
    param_grid=settings[Model.MLP],
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid_d2v_mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(450, 100, 50), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=2000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=True)

In [None]:
# Tabulate the cross-validation results of the MLP search.
show_grid_res(grid_d2v_mlp)

Unnamed: 0,param_hidden_layer_sizes,mean_test_score,rank_test_score
2,"(450, 100, 50)",0.875852,1
0,"(100, 100)",0.874543,2
3,"(400, 500, 400, 200, 100, 50)",0.874049,3
1,"(100, 200, 50)",0.873259,4


In [None]:
# Score the fitted MLP on the training split.
grid_predictions = grid_d2v_mlp.predict(X_train)
analysis(y_train, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       1.00      1.00      1.00     20274
    positive       1.00      1.00      1.00     20226

    accuracy                           1.00     40500
   macro avg       1.00      1.00      1.00     40500
weighted avg       1.00      1.00      1.00     40500

Confusion Matrix : 
 [[20274     0]
 [    0 20226]]
Accuracy : 
 1.0
F1 : 
 1.0


In [None]:
# Same evaluation on the held-out test split.
grid_predictions = grid_d2v_mlp.predict(X_test)
analysis(y_test, grid_predictions)

Classification Report : 
               precision    recall  f1-score   support

    negative       0.89      0.87      0.88      2225
    positive       0.88      0.89      0.88      2275

    accuracy                           0.88      4500
   macro avg       0.88      0.88      0.88      4500
weighted avg       0.88      0.88      0.88      4500

Confusion Matrix : 
 [[1940  285]
 [ 252 2023]]
Accuracy : 
 0.8806666666666667
F1 : 
 0.8828278420248746


In [None]:
# Save the MLP search as 'best.pkl' under config["model_save_path"].
# NOTE(review): this pickles the whole `grid_d2v_mlp` object, whereas the
# earlier save cell stores `.best_estimator_` -- confirm which form the loader expects.
joblib.dump(grid_d2v_mlp, config["model_save_path"] + 'best.pkl')

['/content/gdrive/MyDrive/ML/best.pkl']