In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score,roc_auc_score,accuracy_score, confusion_matrix, roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from helper_functions import model_eval
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.decomposition import TruncatedSVD
# Shared random_state value passed to the estimators below so runs are repeatable.
seed = 123

In [2]:
# Load the pre-split train / test frames.
# NOTE: read_pickle executes pickle under the hood; only load files you produced yourself.
data_train, data_test = (
    pd.read_pickle(path) for path in ('train_set.pkl', 'test_set.pkl')
)

In [5]:
# Pull the text column and the label column out of each frame as NumPy arrays.
X_train, y_train = (
    np.array(data_train[column]) for column in ("RAW_Text", "Positively_rated")
)
X_test, y_test = (
    np.array(data_test[column]) for column in ("RAW_Text", "Positively_rated")
)

We will evaluate the performance of four models on the test data set.

The four best models so far are:
* TF-IDF (unigrams + bigrams) with logistic regression (LR), C = 5
* TF-IDF (unigrams + bigrams) with linear SVC, C = 0.5
* TF-IDF (unigrams + bigrams) + LSA with LR, C = 10
* TF-IDF (unigrams + bigrams) + LSA with linear SVC, C = 1

Model 1:

In [6]:
# TF-IDF over unigrams and bigrams. Integer min_df / max_df are absolute
# document counts, so both thresholds are tied to the size of the fitting corpus.
vect = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=15396,
)
# Vocabulary and IDF weights are learned on the training texts only;
# the test texts are projected with the already-fitted vectorizer.
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)

In [10]:
# Model 1: logistic regression on the raw TF-IDF features.
model1 = LogisticRegression(
    C=5,
    max_iter=500,
    random_state=seed,
)

In [11]:
# model_eval comes from the project's helper_functions module; it returns a
# (model, metrics) pair. Presumably it fits on the train split and scores on
# the test split -- confirm against the helper.
fitted_model, results = model_eval(
    model1, X_train_vect, y_train, X_test_vect, y_test
)

In [13]:
# Report the three metrics for model 1, one per line, to three decimals.
print(
    f"Accuracy: {results['Accuracy']:.3f} \n"
    f"F1 score: {results['F1']:.3f} \n"
    f"AUC score: {results['AUC']:.3f}"
)

Accuracy: 0.894 
F1 score: 0.933 
AUC score: 0.943


Model 2:

In [14]:
# Model 2: linear SVM on the raw TF-IDF features.
# random_state is pinned to the notebook-wide `seed`: with dual=True the solver
# shuffles the data during coordinate descent, so leaving it as None lets the
# fitted coefficients (and the reported metrics) drift between runs.
model2 = LinearSVC(
    C=0.5,
    class_weight=None,
    dual=True,
    fit_intercept=True,
    intercept_scaling=1,
    loss='squared_hinge',
    max_iter=1000,
    multi_class='ovr',
    penalty='l2',
    random_state=seed,
    tol=0.0001,
    verbose=0,
)

In [17]:
# Same helper as for model 1, with predict_proba switched off; the metrics
# printed for this model are accuracy and F1 only.
fitted_model, results = model_eval(
    model2, X_train_vect, y_train, X_test_vect, y_test,
    predict_proba=False,
)

In [19]:
# Report accuracy and F1 for model 2, to three decimals.
print(
    f"Accuracy: {results['Accuracy']:.3f} \n"
    f"F1 score: {results['F1']:.3f} "
)

Accuracy: 0.896 
F1 score: 0.933 


Model 3:

In [23]:
# LSA step: reduce the TF-IDF matrix to 300 components with a seeded randomized SVD.
SVD = TruncatedSVD(
    n_components=300,
    algorithm='randomized',
    n_iter=5,
    random_state=seed,
    tol=0.0,
)

In [25]:
# Fit the decomposition on the training matrix only, then reuse the same
# components to project the test matrix.
X_train_vect_SVD = SVD.fit_transform(X_train_vect)
X_test_vect_SVD = SVD.transform(X_test_vect)

In [26]:
# Model 3: logistic regression on the LSA-reduced features.
model3 = LogisticRegression(
    C=10,
    max_iter=500,
    random_state=seed,
)

In [32]:
# Evaluate model 3 on the LSA-reduced train / test matrices.
fitted_model, results = model_eval(
    model3, X_train_vect_SVD, y_train, X_test_vect_SVD, y_test
)

In [33]:
# Report the three metrics for model 3, one per line, to three decimals.
print(
    f"Accuracy: {results['Accuracy']:.3f} \n"
    f"F1 score: {results['F1']:.3f} \n"
    f"AUC score: {results['AUC']:.3f}"
)

Accuracy: 0.881 
F1 score: 0.924 
AUC score: 0.931


Model 4:

In [31]:
# Model 4: linear SVM on the LSA-reduced features.
# random_state is pinned to the notebook-wide `seed`: with dual=True the solver
# shuffles the data during coordinate descent, so leaving it as None lets the
# fitted coefficients (and the reported metrics) drift between runs.
model4 = LinearSVC(
    C=1,
    class_weight=None,
    dual=True,
    fit_intercept=True,
    intercept_scaling=1,
    loss='squared_hinge',
    max_iter=1000,
    multi_class='ovr',
    penalty='l2',
    random_state=seed,
    tol=0.0001,
    verbose=0,
)

In [35]:
# Evaluate model 4 on the LSA-reduced matrices, with predict_proba switched off;
# the metrics printed for this model are accuracy and F1 only.
fitted_model, results = model_eval(
    model4, X_train_vect_SVD, y_train, X_test_vect_SVD, y_test,
    predict_proba=False,
)

In [36]:
# Report accuracy and F1 for model 4, to three decimals.
print(
    f"Accuracy: {results['Accuracy']:.3f} \n"
    f"F1 score: {results['F1']:.3f} "
)

Accuracy: 0.882 
F1 score: 0.925 


We choose Model 4 (TF-IDF + LSA with linear SVC) as a balance between computational expense and classification accuracy.

In [26]:
# Full cleaned data set, used to refit the chosen pipeline.
# NOTE: read_pickle executes pickle under the hood; only load files you produced yourself.
df = pd.read_pickle(
    'cleaned_df.pkl'
)

In [27]:
# Everything except the last five rows is used to fit the final pipeline.
X, y = (
    np.array(df[column])[:-5] for column in ("RAW_Text", "Positively_rated")
)

In [28]:
# Number of documents used to fit the final pipeline (the last five rows are excluded).
len(X)

22636

In [30]:
# The last five rows, which were excluded from X / y, serve as a small demo sample.
sample_x, sample_y = (
    np.array(df[column])[-5:] for column in ("RAW_Text", "Positively_rated")
)

In [57]:
# Labels of the five held-back sample rows.
sample_y

array([1, 0, 0, 0, 1], dtype=int64)

In [31]:
# Refit the TF-IDF step on X with the same settings as during evaluation.
# NOTE(review): max_df is an absolute document count chosen earlier in the
# notebook; confirm it is still the intended threshold for this corpus size.
vect = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=15396,
)
Vectorizer = vect.fit(X)
X_new = Vectorizer.transform(X)

In [32]:
# Refit the 300-component LSA projection on the TF-IDF matrix of X.
SVD = TruncatedSVD(
    n_components=300,
    algorithm='randomized',
    n_iter=5,
    random_state=seed,
    tol=0.0,
)
LSA = SVD.fit(X_new)
X_new_LSA = LSA.transform(X_new)

In [33]:
# Final classifier: same configuration as model 4.
# random_state is pinned to the notebook-wide `seed`: with dual=True the solver
# shuffles the data during coordinate descent, so leaving it as None means the
# pickled model cannot be reproduced exactly by re-running the notebook.
final_model = LinearSVC(
    C=1,
    class_weight=None,
    dual=True,
    fit_intercept=True,
    intercept_scaling=1,
    loss='squared_hinge',
    max_iter=1000,
    multi_class='ovr',
    penalty='l2',
    random_state=seed,
    tol=0.0001,
    verbose=0,
)

In [34]:
# Train the final classifier on the LSA-reduced features of X.
fitted_final = final_model.fit(
    X_new_LSA,
    y,
)

Pickle the fitted objects for later use.

In [55]:
# Artefacts to persist, keyed by the file stem they are written under.
# Both lists are derived from one mapping so names and objects stay aligned.
_artifacts = {
    'Vectorizer': Vectorizer,
    'LSA': LSA,
    'model': fitted_final,
    'sample': sample_x,
}
keys_to_pickle = list(_artifacts)
To_pickle = list(_artifacts.values())

In [36]:
import pickle 

In [56]:
# Write each artefact to output/<name>.obj.
# The context manager closes the file even if pickle.dump raises, which the
# manual open()/close() pair did not guarantee.
for name, item in zip(keys_to_pickle, To_pickle):
    with open(f'output/{name}.obj', 'wb') as file:
        pickle.dump(item, file)

In [38]:
# Import kept so that any other cell relying on the name keeps working.
from collections import defaultdict

# Container for the reloaded artefacts, keyed by artefact name.
# A plain dict is used: defaultdict() without a default_factory behaves exactly
# like a dict (missing keys raise KeyError) but suggests otherwise to the reader.
model = {}

In [39]:
# Read every artefact back from output/<name>.obj into `model`.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
for name in keys_to_pickle:
    with open(f'output/{name}.obj', 'rb') as file:
        model[name] = pickle.load(file)

In [101]:
# Hand-written negative review, appended after the five sample texts.
# NOTE(review): this rebinds X_test, which earlier in the notebook held the
# test-set texts; cells above that read X_test must not be re-run after this one.
text1 = "it is crap and disappointing, worst item ever"
X_test = np.append(
    sample_x,
    text1,
)

In [102]:
# Vectorize the demo texts with the reloaded TF-IDF vectorizer.
text_new = model['Vectorizer'].transform(
    X_test
)

In [103]:
# Project the demo texts into the LSA space.
# The TF-IDF rows are recomputed from X_test instead of being read back from
# `text_new`, so re-running this cell never feeds already-reduced vectors into
# the SVD a second time.
text_new = model['LSA'].transform(model['Vectorizer'].transform(X_test))

In [104]:
# Predict all demo rows, then display the label for the last one
# (the hand-written review appended at the end).
sample_predictions = model['model'].predict(text_new)
sample_predictions[-1]

0