# Import the necessary libraries

In [1]:
# Extend the module search path with the shared helper directory so that
# project-local modules can be imported by the cells below
# (presumably this is where the `utils` module lives — TODO confirm).
import sys
sys.path.append("../../Utils/")

In [2]:
from sklearn.externals import joblib
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score
import utils
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

# Load the data

In [3]:
# Locations of the pickled inputs and of the directory that receives every artifact
# written by this notebook (figures, model, predictions, metric files).
# NOTE(review): the two input paths are absolute and machine-specific — adjust them
# before running the notebook on another machine.
X_path = '/home/imad/Desktop/Yelp_vectorized_data/Baseline/Restaurants_X.pkl'
y_path = '/home/imad/Desktop/Yelp_vectorized_data/Baseline/Restaurants_y.pkl'
# NOTE(review): nothing in this notebook creates the output directory — presumably it must already exist.
output_path = './Output/'

In [4]:
# Load a sparse matrix of the text (reviews)
# NOTE(review): joblib.load unpickles the file, so only load files from a trusted source.
X = joblib.load(X_path)
# Shown as the cell output: (number of rows, number of columns)
X.shape

(2927730, 917127)

In [5]:
# Load the labels (ratings)
# NOTE(review): joblib.load unpickles the file, so only load files from a trusted source.
y = joblib.load(y_path)
# Shown as the cell output; the length should match the number of rows of X
y.shape

(2927730,)

In [6]:
# Hold out 30% of the samples for testing. Stratifying on y keeps the rating
# distribution the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [7]:
# Display the summary repr of the held-out feature matrix (shape, dtype, stored elements)
X_test

<878319x917127 sparse matrix of type '<type 'numpy.int64'>'
	with 42186261 stored elements in Compressed Sparse Row format>

# Remove 3-star ratings and re-label the training dataset

In [None]:
# Positional row numbers (not index labels) of the neutral, 3-star reviews in the
# training split; the same positions are later removed from both X_train and y_train.
neutral_ratings_ids = list(np.asarray(y_train == 3).nonzero()[0])
len(neutral_ratings_ids)

In [None]:
# Drop the neutral rows from the training feature matrix.
# NOTE(review): delete_from_csr is a project helper that is not shown here — presumably it
# returns a new CSR matrix without the given row positions; confirm against the utils module.
X_train = utils.delete_from_csr(X_train, row_indices=neutral_ratings_ids)

In [None]:
# Sanity check: the row count should have dropped by len(neutral_ratings_ids)
X_train.shape

In [None]:
# Remove the same positions from the labels so that X_train and y_train stay aligned.
# np.delete returns a new NumPy array, so y_train is a plain ndarray from here on.
y_train = np.delete(y_train.tolist(), neutral_ratings_ids)

In [None]:
# Collapse the remaining star ratings into binary sentiment labels:
#   1 or 2 stars -> -1 (negative), every other value -> 1 (positive).
# The mapping is applied in one vectorized pass instead of a Python loop over every label,
# and it is written back in place so y_train keeps its array object and dtype.
#
# Guard against running this cell twice: 1 is both a raw rating and an output label, so a
# second pass would silently swap the two classes (1 -> -1 and -1 -> 1).
if (y_train == -1).any():
    raise RuntimeError('y_train is already re-labelled; re-run the notebook from the train/test split')
y_train[:] = np.where((y_train == 1) | (y_train == 2), -1, 1)

In [None]:
# Sanity check: the length should equal the number of rows of X_train
y_train.shape

# Re-label in the test dataset:
## 1, 2 -> -1
## 4, 5 -> 1
## 3 -> 0

In [None]:
# Map the test-set star ratings onto the three evaluation classes in one vectorized pass:
#   1 or 2 stars -> -1 (negative), 4 or 5 stars -> 1 (positive), anything else -> 0 (neutral).
# A new Series with the original index and name is built, instead of reading and writing
# each label individually, which is slow and writes into a Series derived from the split.
#
# Guard against running this cell twice: the output labels overlap the raw ratings, so a
# second pass would silently corrupt them (1 -> -1 and -1 -> 0).
if (y_test == -1).any():
    raise RuntimeError('y_test is already re-labelled; re-run the notebook from the train/test split')
y_test = pd.Series(
    np.where(y_test.isin([1, 2]), -1, np.where(y_test.isin([4, 5]), 1, 0)),
    index=y_test.index,
    name=y_test.name,
)

In [None]:
# Number of test reviews per class after re-labelling (-1, 0, 1)
y_test.value_counts()

In [None]:
# Rating Distribution
# np.unique returns the distinct labels in ascending order, so the counts line up with
# the ('Negative', 'Positive') wedge labels used below.
values, counts = np.unique(y_train, return_counts=True)
colors = ['gold', 'yellowgreen']

# Plot
# The font size is a global matplotlib setting and stays in effect for later figures.
mpl.rcParams['font.size'] = 20.0
f = plt.figure(figsize=(16, 9))
plt.pie(x=counts, labels=['Negative', 'Positive'], colors=colors, autopct='%1.2f%%')
# Equal aspect ratio so the pie is drawn as a circle
plt.axis('equal')
plt.show()
f.savefig(output_path + 'Distribution.pdf')

# MNB Smoothing-Parameter Optimization

In [None]:
# Untuned multinomial Naive Bayes classifier with default parameters;
# the bare name below displays its repr as the cell output.
MNB = MultinomialNB()
MNB

In [None]:
# Tune the smoothing parameter alpha over the grid produced by np.arange(0.1, 1.1, 0.1),
# ranking the candidates by their cross-validated weighted F1 score.
params = dict(alpha=np.arange(0.1, 1.1, 0.1))
grid = GridSearchCV(estimator=MNB, param_grid=params, scoring='f1_weighted')
grid.fit(X_train, y_train)

In [None]:
# Display the estimator configured with the best-scoring alpha found by the search
grid.best_estimator_

In [None]:
# Per-candidate cross-validation results of the grid search.
# `grid_scores_` was deprecated in scikit-learn 0.18 and removed in 0.20; `cv_results_`
# carries the same information (parameters, mean and std of the test score) plus the rank.
pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

In [None]:
# Continue with the tuned model. The grid search above does not override `refit`, so with
# scikit-learn's default (refit=True) best_estimator_ is already fitted on all of X_train.
MNB = grid.best_estimator_

# Cross-validation

In [None]:
# 5-fold cross-validated weighted F1 of the tuned model on the training data;
# the resulting array (one score per fold) is displayed as the cell output.
MNB_scores = cross_val_score(
    estimator=MNB,
    X=X_train,
    y=y_train,
    scoring='f1_weighted',
    cv=5,
)
MNB_scores

# Save the trained model in a pickle file

In [None]:
# Persist the tuned classifier next to the other notebook outputs
joblib.dump(MNB, output_path + 'MNB_r_r.pkl')

# Apply the model on the test data

In [None]:
# Hard class predictions and per-class probabilities for the held-out reviews.
# predict_proba returns one column per class, in the order of MNB.classes_.
MNB_pred = MNB.predict(X_test)
MNB_pred_prob = MNB.predict_proba(X_test)
print('Predictions: {}'.format(MNB_pred))
# Rounded to 2 decimals for display only; MNB_pred_prob itself is left unchanged
print('Predictions Probabilities: {}'.format(MNB_pred_prob.round(2)))

# Choose a range of probabilities where the model is confused

In [None]:
# Absolute gap between the two class probabilities of every test sample.
# A small gap means the model is undecided between the two classes.
# Computed column-wise in one vectorized expression instead of a Python loop over every row.
diff = abs(MNB_pred_prob[:, 0] - MNB_pred_prob[:, 1])
diff_sr = pd.Series(diff)
# Summary statistics of the gap, displayed as the cell output
diff_sr.describe()

In [None]:
# Box plot of the probability gap, exported as PDF
p = diff_sr.plot(kind='box', figsize=(16, 9), label='Diff_Prob_Box')
p.figure.savefig(output_path + 'Diff_Prob_Box.pdf')

In [None]:
# Histogram (50 bins) of the probability gap; the figure is also exported as PDF
p = diff_sr.hist(bins=50, figsize=(16,9), grid=False)
p.set_xlabel('The Distribution of Probability Difference')
p.figure.savefig(output_path + 'Diff_Prob_Hist.pdf')

In [None]:
# A prediction is treated as ambiguous when the gap between the two class probabilities,
# |p(-1|x) - p(1|x)|, is smaller than the threshold. The two probabilities sum to 1, so a
# gap below 0.6 means the winning class has a probability below 0.8.
AMBIGUITY_THRESHOLD = 0.6
# Positions of the ambiguous test samples, found with one vectorized comparison
ix_ambig = np.where(abs(MNB_pred_prob[:, 0] - MNB_pred_prob[:, 1]) < AMBIGUITY_THRESHOLD)[0]
print(len(ix_ambig))

In [None]:
# Overwrite every ambiguous prediction with the neutral class 0 (i.e. 3-star)
MNB_pred[ix_ambig] = 0

In [None]:
# Number of test predictions per class after the ambiguous ones were set to 0
pd.Series(MNB_pred).value_counts()

In [None]:
# Persist the final three-class predictions next to the other notebook outputs
joblib.dump(MNB_pred, output_path + 'Predictions.pkl')

# MNB Results

In [None]:
# Overall accuracy of the three-class predictions, rounded to 2 decimals
MNB_accuracy = round(accuracy_score(y_test, MNB_pred), 2)
print('Model Accuracy: {}'.format(MNB_accuracy))
# Text mode: the value is written as a str, which a file opened in binary mode ('wb')
# rejects with a TypeError on Python 3.
with open(output_path + 'Accuracy.txt', 'w') as f:
    f.write(str(MNB_accuracy))

In [None]:
# Support-weighted average of the per-class F1 scores, rounded to 2 decimals
MNB_f1 = round(f1_score(y_test, MNB_pred, average='weighted'), 2)
print('Model F1-Score: {}'.format(MNB_f1))
# Text mode: the value is written as a str, which a file opened in binary mode ('wb')
# rejects with a TypeError on Python 3.
with open(output_path + 'F1.txt', 'w') as f:
    f.write(str(MNB_f1))

In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
# The class order is fixed explicitly so the matrix is always 3x3 in
# (negative, neutral, positive) order, even if a class is missing from y_test or MNB_pred.
MNB_CM = confusion_matrix(y_test, MNB_pred, labels=[-1, 0, 1])
print('Confusion Matrix:\n{}'.format(MNB_CM))

In [None]:
# Use a smaller global font size for the confusion-matrix figure, then delegate the
# drawing to the project helper.
# NOTE(review): cm_plot is not shown here — presumably it normalizes the matrix, labels the
# axes with the given class names (which should follow the row/column order of MNB_CM) and
# saves the figure to the given path; confirm against the utils module.
mpl.rcParams['font.size'] = 13.0
utils.cm_plot(MNB_CM, ['Negative', 'Neutral', 'Positive'], 'MNB Normalized Confusion Matrix', output_path + 'MNB_CM.pdf')

In [None]:
# Per-class precision, recall, F1 and support for the three classes (-1, 0, 1)
print(classification_report(y_test, MNB_pred))