In [36]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.metrics import accuracy_score, f1_score,roc_auc_score,accuracy_score, confusion_matrix, roc_curve
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import helper_functions as hf
from sklearn.linear_model import LogisticRegressionCV
seed = 123
from sklearn.svm import LinearSVC,SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [3]:
data_train = pd.read_pickle('train_set.pkl')
data_test = pd.read_pickle('test_set.pkl')

In [4]:
X = np.array(data_train.RAW_Text)
y = np.array(data_train.Positively_rated)

In [7]:
X.shape

(18113,)

In [9]:
import spacy
import en_core_web_sm
nlp2 = en_core_web_sm.load()

In [12]:
with nlp2.disable_pipes():
    doc_vectors = np.array([nlp2(text).vector for text in X])

In [13]:
doc_vectors.shape

(18113, 96)

In [9]:
doc_vectors[:1]

array([[-0.06839881,  0.49917546,  0.3466028 , -0.5722529 ,  1.7716508 ,
        -0.03340167,  0.860545  ,  0.00988218,  0.98621637,  0.8129556 ,
        -0.2191528 ,  0.06015332, -0.68227893, -0.2821764 , -0.3924308 ,
        -0.07562292, -1.0181385 , -0.179397  , -0.8134333 , -0.29726607,
         0.36450526,  0.08876623, -0.34658512, -0.35243672, -0.9508774 ,
         1.1564338 , -0.63084066,  0.3920868 ,  0.6826346 , -0.73938125,
         0.9411365 ,  0.38900703, -0.27723703, -0.5213536 ,  0.33099028,
        -0.34149036,  0.11893742, -0.7945201 , -0.9216471 , -0.34951952,
         0.8404143 ,  0.19083022, -0.22636835, -0.90206665,  0.43685365,
        -0.30152026,  0.17588654,  0.19894703, -1.0928265 ,  0.95151967,
         0.98478335, -0.52477807, -0.30216476,  0.5285495 , -1.1856889 ,
         0.5172663 ,  0.90005916,  0.61878115, -0.2041543 ,  0.30676246,
         1.083737  , -0.40366983,  1.5723639 ,  0.48888573,  0.5522548 ,
        -0.33971676,  0.6842907 , -1.1423198 , -0.1

In [6]:
nlp = spacy.load('en_core_web_lg')

In [7]:
with nlp.disable_pipes():
    doc_vectors_lg = np.array([nlp(text).vector for text in X])

In [8]:
doc_vectors_lg.shape

(18113, 300)

Let's try with the model given by en_core_web_lg, with dimension of 300

In [17]:
model = LogisticRegressionCV(Cs = [5,10,20],cv = 5,penalty =  'l2',max_iter = 500,random_state = seed,
         multi_class = 'ovr')

fitted_model,results = hf.model_fit_train_score_skf(model,doc_vectors_lg, y)

In [18]:
print(f"Accuracy: {results['Accuracy_mean']:.3f} \nF1 score: {results['F1_mean']:.3f} \nAUC score: {results['AUC_mean']:.3f}")

Accuracy: 0.864 
F1 score: 0.914 
AUC score: 0.909


In [19]:
fitted_model.C_

array([10])

In [28]:
clf = LinearSVC()
parameters_SVC = {'C':[0.5,0.55,0.6,0.7,0.8], 'penalty':['l1','l2','elasticnet'],'loss':['hinge','squared_hinge'] 
                }
clf_grid = GridSearchCV(estimator = clf, param_grid = parameters_SVC, verbose=1,n_jobs = -1,
                        scoring='f1',cv = 5,
                        return_train_score=True).fit(doc_vectors_lg, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   37.7s finished


In [27]:
clf_grid.best_estimator_

LinearSVC(C=0.6, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [29]:
best_SVC = eval(str(clf_grid.best_estimator_))

In [31]:
fitted_model,results = hf.model_fit_train_score_skf(best_SVC,doc_vectors_lg, y,predict_proba = False)
print(f"Accuracy: {results['Accuracy_mean']:.3f} \nF1 score: {results['F1_mean']:.3f} ")

Accuracy: 0.865 
F1 score: 0.916 


In [37]:
clf = RandomForestClassifier()
parameters_rf = {'n_estimators':[100],'min_samples_leaf':[1,5],'max_features':['int','auto']}
clf_grid = GridSearchCV(estimator = clf, param_grid = parameters_rf, verbose=1,n_jobs = -1,
                        scoring='f1',cv = 5,refit='AUC',
                        return_train_score=True).fit(doc_vectors_lg, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.2min finished


In [38]:
best_rf = eval(str(clf_grid.best_estimator_))

In [39]:
fitted_model,results = hf.model_fit_train_score_skf(best_rf,doc_vectors_lg, y)

In [40]:
print(f"Accuracy: {results['Accuracy_mean']:.3f} \nF1 score: {results['F1_mean']:.3f} \nAUC score: {results['AUC_mean']:.3f}")

Accuracy: 0.820 
F1 score: 0.893 
AUC score: 0.858


Now try to build model with the second model

In [41]:
model = LogisticRegressionCV(Cs = [5,10,20],cv = 5,penalty =  'l2',max_iter = 500,random_state = seed,
         multi_class = 'ovr')

fitted_model,results = hf.model_fit_train_score_skf(model,doc_vectors, y)

In [43]:
print(f"Accuracy: {results['Accuracy_mean']:.3f} \nF1 score: {results['F1_mean']:.3f} \nAUC score: {results['AUC_mean']:.3f}")

Accuracy: 0.806 
F1 score: 0.883 
AUC score: 0.805


In [44]:
fitted_model.C_

array([10])

Well it is just not as good as the pre-trained model as our dataset is too small

Lastly we will do model performance evaluation on the 4 best model selected base on validation score