In [32]:
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
import string
from ast import literal_eval
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

import xgboost as xgb
import copy

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load preprocessed data and grab relevant features

In [2]:
data = pd.read_csv('preproc_Reviews.csv')
ppdata = data.loc[:,['Sentiment', 'CleanAll']]
ppdata.head()

Unnamed: 0,Sentiment,CleanAll
0,1,"['good', 'qualiti', 'dog', 'food', 'bought', '..."
1,-1,"['advertis', 'product', 'arriv', 'label', 'jum..."
2,1,"['delight', 'say', 'confect', 'around', 'centu..."
3,-1,"['cough', 'medicin', 'look', 'secret', 'ingred..."
4,1,"['great', 'taffi', 'great', 'taffi', 'great', ..."


# Convert sentiment from [-1,1] to XGBoost required [0,1]

In [3]:
ppdata['Sentiment'] = ppdata['Sentiment'].apply(lambda x: 0 if x == -1 else 1)
ppdata.head()

Unnamed: 0,Sentiment,CleanAll
0,1,"['good', 'qualiti', 'dog', 'food', 'bought', '..."
1,0,"['advertis', 'product', 'arriv', 'label', 'jum..."
2,1,"['delight', 'say', 'confect', 'around', 'centu..."
3,0,"['cough', 'medicin', 'look', 'secret', 'ingred..."
4,1,"['great', 'taffi', 'great', 'taffi', 'great', ..."


# Split data into train and test sets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(ppdata['CleanAll'], ppdata['Sentiment'], test_size = 0.10)

# Run data through sklearn countvectorizer

In [5]:
cv = CountVectorizer(binary=True)
cv.fit(ppdata['CleanAll'])
X_train = cv.transform(X_train)
X_test = cv.transform(X_test)

CountVectorizer(binary=True)

In [6]:
xgb_train = xgb.DMatrix(X_train, y_train)
xgb_test = xgb.DMatrix(X_test, y_test)

# Set params and train xgboost

In [7]:
param = {}

In [8]:
xgb_model = xgb.train(param, xgb_train)

In [None]:
with open('xgb_default.pickle', 'wb') as f:
    pickle.dump(xgb_model, f)

In [None]:
with open('xgb_default.pickle', 'rb') as f:
    xgb_model = pickle.load(f)

In [9]:
y_pred = xgb_model.predict(xgb_test)
grid_list = [copy.deepcopy(y_pred) for _ in range(11)]

In [10]:
for index, preds in enumerate(grid_list):
    threshold = index * 0.1
    for pred_index, number in enumerate(y_pred):
        grid_list[index][pred_index] = 0 if number < threshold else 1

In [28]:
for index, preds in enumerate(grid_list):
    print(f'========================== Threshold: {round(index * 0.1, 2)} ===========================')
    print(f'Accuracy: {round(accuracy_score(y_test, preds), 3)}')
    print(f'Precision: {round(precision_score(y_test, preds), 3)}')
    print(f'Recall: {round(recall_score(y_test, preds), 3)}')
    print(f'F1: {round(f1_score(y_test, preds), 3)}')

Accuracy: 0.857
Precision: 0.857
Recall: 1.0
F1: 0.923
Accuracy: 0.858
Precision: 0.858
Recall: 1.0
F1: 0.924
Accuracy: 0.861
Precision: 0.861
Recall: 1.0
F1: 0.925
Accuracy: 0.866
Precision: 0.865
Recall: 1.0
F1: 0.927
Accuracy: 0.874
Precision: 0.872
Recall: 0.999
F1: 0.931
Accuracy: 0.885
Precision: 0.887
Recall: 0.993
F1: 0.937
Accuracy: 0.892
Precision: 0.899
Recall: 0.984
F1: 0.94
Accuracy: 0.892
Precision: 0.914
Recall: 0.964
F1: 0.938
Accuracy: 0.801
Precision: 0.956
Recall: 0.804
F1: 0.873
Accuracy: 0.529
Precision: 0.979
Recall: 0.46
F1: 0.626
Accuracy: 0.145
Precision: 1.0
Recall: 0.002
F1: 0.003


In [35]:
print(classification_report(y_test, grid_list[6]))

              precision    recall  f1-score   support

           0       0.78      0.34      0.48      8159
           1       0.90      0.98      0.94     48687

    accuracy                           0.89     56846
   macro avg       0.84      0.66      0.71     56846
weighted avg       0.88      0.89      0.87     56846

