In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
import sklearn.linear_model as linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
import json
import gensim

%matplotlib inline

# Step 0 - Preprocessing

In [2]:

reviews = {"stars":[], "text":[]}
ids = set()

# ################## get businesses located in Boston####################
b = open('business.json', 'r', encoding='utf-8')
el = b.readline().rstrip('\n')
while el:
    e = json.loads(el)
    if e["city"] == "Boston":
        ids.add(e["business_id"])
    el = b.readline().rstrip('\n')
b.close()
#############################################################

#############get the reviews of the businesses in Boston in 2020#####
r = open('reviews.json', 'r', encoding='utf-8')
el = r.readline().rstrip('\n')
while el:
    e = json.loads(el)
    if e["business_id"] in ids and e["date"].split('-')[0] == '2020':
        reviews["stars"].append(1 if e["stars"] > 3 else 0)
        reviews["text"].append(e["text"])
    el = r.readline().rstrip('\n')
r.close()
################################################################

reviews_df = pd.DataFrame(reviews)
display(reviews_df.head())

Unnamed: 0,stars,text
0,0,Chill dive bar that was around the corner from...
1,1,Definitely a worthy place to stay. The room ...
2,0,Once upon Stella was a great restaurant. It wa...
3,1,The spicy calamari is not to be missed! As are...
4,1,It provided great peace of mind to know that w...


# Apply KFold and initialize the classifiers

In [3]:

tfidf = TfidfVectorizer(analyzer = 'word', stop_words = 'english', max_features=300)

lr_clf = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000)
svm_clf = svm.SVC()
knn = KNeighborsClassifier(n_neighbors=100)

kf = KFold(n_splits=5, shuffle=True, random_state=2021)
folds = kf.split(reviews_df)

 # Helpers

In [4]:

#################classifiers################

def LR(X_train, y_train, X_test):
    lr_clf.fit(X_train, y_train)
    return lr_clf.predict(X_test)

def SVM(X_train, y_train, X_test):
    svm_clf.fit(X_train, y_train)
    return svm_clf.predict(X_test)

def K_NN(X_train, y_train, X_test):
    knn.fit(X_train, y_train)
    return knn.predict(X_test)

#################metrics####################

def get_confusion_matrix(real_classes, predicted_classes):
    return confusion_matrix(real_classes, predicted_classes)

def get_metrics(m):
    
    #total accuracy
    a = (m[0][0] + m[1][1])/(m[0][0] + m[1][0] + m[0][1] + m[1][1])
    
    #class 0 precision + recall + f1
    p0 = m[0][0]/(m[0][0] + m[1][0])
    r0 = m[0][0]/(m[0][0] + m[0][1])
    f10 = 2*(p0 * r0)/(p0 + r0)
    
    #class 1 precision + recall + f1
    p1 = m[1][1]/(m[1][1] + m[0][1])
    r1 = m[1][1]/(m[1][1] + m[1][0])
    f11 = 2*(p1 * r1)/(p1 + r1)
    return ([p0, p1, r0, r1, f10, f11], a)


# Step 1a, 5-fold cross validation using TfIdf Vectorizer

In [5]:

real_classes = np.array([])
predicted_lr = np.array([])
predicted_svm = np.array([])
predicted_knn = np.array([])

for train, test in folds:
    
    train_data = reviews_df.iloc[train]
    test_data = reviews_df.iloc[test]
    
    X_train, X_test = tfidf.fit_transform(train_data['text']), tfidf.fit_transform(test_data['text'])
    y_train, y_test = train_data['stars'], test_data["stars"]
    
    lr_pred = LR(X_train, y_train, X_test)
    svm_pred = SVM(X_train, y_train, X_test)
    knn_pred = K_NN(X_train, y_train, X_test)
    
    real_classes = np.append(real_classes, y_test)
    predicted_lr = np.append(predicted_lr, lr_pred)
    predicted_svm = np.append(predicted_svm, svm_pred)
    predicted_knn = np.append(predicted_knn, knn_pred)

#calculate the confusion matrices
m1 = get_confusion_matrix(real_classes, predicted_svm)
m2 = get_confusion_matrix(real_classes, predicted_lr)
m3 = get_confusion_matrix(real_classes, predicted_knn)

#get the metrics of the classifiers
metrics_svm = get_metrics(m1)
metrics_lr = get_metrics(m2)
metrics_knn = get_metrics(m3)

print('confusion matrices: ')
print('SVM: ')
print(m1)
print('-------------------------------------------------------')

print('LR: ')
print(m2)
print('-------------------------------------------------------')

print('KNN: ')
print(m3)
print('-------------------------------------------------------')

print('Metrics per classifier: ')
print('-------------------------------------------------------')

print('Mean Precision, Recall, F1 scores per class for each Classifier:')
stats = [metrics_svm[0], metrics_lr[0], metrics_knn[0]]
df = pd.DataFrame(stats, index=['SVM', 'LR', 'KNN'], columns = ['Precision1', 'Precision2', 'Recall1', 'Recall2', 'F11', 'F12'])

print('Mean Accuracies of each Classifier across all 5 folds')
print('-------------------------------------------------------')
print('LR: ', metrics_lr[1])
print('SVM: ', metrics_svm[1])
print('KNN: ', metrics_knn[1])
print('-------------------------------------------------------\n')

confusion matrices: 
SVM: 
[[ 5319  4750]
 [ 3872 18075]]
-------------------------------------------------------
LR: 
[[ 5288  4781]
 [ 3844 18103]]
-------------------------------------------------------
KNN: 
[[ 3029  7040]
 [ 1919 20028]]
-------------------------------------------------------
Metrics per classifier: 
-------------------------------------------------------
Mean Precision, Recall, F1 scores per class for each Classifier:


Unnamed: 0,Precision1,Precision2,Recall1,Recall2,F11,F12
SVM,0.578718,0.791895,0.528255,0.823575,0.552336,0.807424
LR,0.579063,0.791077,0.525176,0.824851,0.550805,0.807611
KNN,0.612167,0.739914,0.300824,0.912562,0.403409,0.817219


Mean Accuracies of each Classifier across all 5 folds
-------------------------------------------------------
LR:  0.7306034482758621
SVM:  0.7306971514242878
KNN:  0.720171164417791
-------------------------------------------------------



## Notes
The metrics were calculated using exclusively the mean Confusion Matrix across all 5 folds for each classifier. We calculate the actual classes as well as the predictions for each fold and then combine them with the rest, to finally get the total mean confusion matrix. The metrics are calculated based on the true positives / negatives as well as the false positives / negatives for each class. Based on the metrics we observe that the 3 classifiers have similar accuracy. In addition to tfidf and the CNN classifier, some experiments were performed to find good numbers from features / neighbors.

# Step 1b Words with biggest/smallest coefficients in the last fold

In [38]:

for train, test in folds:
    pass

train_data = reviews_df.iloc[train]
test_data = reviews_df.iloc[test]

X_train = tfidf.fit_transform(train_data['text'])
y_train = train_data['stars']

features = tfidf.get_feature_names_out()

X_test = tfidf.fit_transform(test_data['text'])
y_test = test_data["stars"]
    
LR(X_train, y_train, X_test)
weights = lr_clf.coef_.flatten()

l = list(zip(weights, features))
l.sort(reverse=True, key=lambda x: x[0])

f = list(zip(*l))

print('20 most important words from best to worst: ')
print(f[1][0:20])
print('------------------------------------------------')
print('20 least important words from worst to best: ')
print(f[1][-20:][::-1])


20 most important words from best to worst: 
('amazing', 'delicious', 'perfect', 'great', 'best', 'highly', 'definitely', 'excellent', 'thank', 'wonderful', 'fantastic', 'love', 'favorite', 'perfectly', 'happy', 'awesome', 'professional', 'loved', 'enjoyed', 'friendly')
------------------------------------------------
20 least important words from worst to best: 
('rude', 'money', 'ok', 'told', 'disappointed', 'asked', 'bad', 'wasn', 'don', 'wouldn', 'tasted', 'left', 'pay', 'manager', 'cold', 'didn', 'flavor', '15', 'just', 'better')


## Notes
We see that the best words used by the classifier which have the heaviest weights have a positive meaning and are able to represent a positive review. On the contrary, the words with the smallest weights are either neutral in meaning or are able to describe a negative review.

# Step 2 cross validation using Google Embeddings

In [11]:
import gensim.downloader as api
path = api.load("word2vec-google-news-300", return_path=True)
g_model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

y = np.array(reviews_df["stars"])
reviews = reviews_df["text"].tolist()

X = []
for review in reviews:
    vx = np.zeros(300)
    length = 0
    for w in review.split(' '): 
        if w in g_model:
            length += 1
            vx += g_model[w]
    if length != 0: vx /= length
    X.append(vx)
    
X = np.array(X)

real_classes = np.array([])
predicted_lr = np.array([])
predicted_svm = np.array([])
predicted_knn = np.array([])

for train, test in folds:
    
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]
    
    lr_pred = LR(X_train, y_train, X_test)
    svm_pred = SVM(X_train, y_train, X_test)
    knn_pred = K_NN(X_train, y_train, X_test)
    
    real_classes = np.append(real_classes, y_test)
    predicted_lr = np.append(predicted_lr, lr_pred)
    predicted_svm = np.append(predicted_svm, svm_pred)
    predicted_knn = np.append(predicted_knn, knn_pred)
    
#calculate the confusion matrices
m1 = get_confusion_matrix(real_classes, predicted_svm)
m2 = get_confusion_matrix(real_classes, predicted_lr)
m3 = get_confusion_matrix(real_classes, predicted_knn)

#get the metrics of the classifiers
metrics_svm = get_metrics(m1)
metrics_lr = get_metrics(m2)
metrics_knn = get_metrics(m3)

print('confusion matrices: ')

print('SVM: ')
print(m1)
print('-------------------------------------------------------')

print('LR: ')
print(m2)
print('-------------------------------------------------------')

print('KNN: ')
print(m3)
print('-------------------------------------------------------')

print('Metrics per classifier: ')
print('-------------------------------------------------------')

print('Mean Precision, Recall, F1 scores per class for each Classifier:')
stats = [metrics_svm[0], metrics_lr[0], metrics_knn[0]]
df = pd.DataFrame(stats, index=['SVM', 'LR', 'KNN'], columns = ['Precision1', 'Precision2', 'Recall1', 'Recall2', 'F11', 'F12'])
display(df)

print('Mean Accuracies of each Classifier across all 5 folds')
print('-------------------------------------------------------')
print('LR: ', metrics_lr[1])
print('SVM: ', metrics_svm[1])
print('KNN: ', metrics_knn[1])
print('-------------------------------------------------------\n')


confusion matrices: 
SVM: 
[[ 7781  2288]
 [ 1334 20613]]
-------------------------------------------------------
LR: 
[[ 7542  2527]
 [ 1386 20561]]
-------------------------------------------------------
KNN: 
[[ 6627  3442]
 [ 2684 19263]]
-------------------------------------------------------
Metrics per classifier: 
-------------------------------------------------------
Mean Precision, Recall, F1 scores per class for each Classifier:


Unnamed: 0,Precision1,Precision2,Recall1,Recall2,F11,F12
SVM,0.853648,0.900092,0.772768,0.939217,0.811197,0.919238
LR,0.844758,0.890549,0.749032,0.936848,0.79402,0.913112
KNN,0.711739,0.848403,0.658159,0.877705,0.683901,0.862806


Mean Accuracies of each Classifier across all 5 folds
-------------------------------------------------------
LR:  0.877779860069965
SVM:  0.8868690654672664
KNN:  0.8086581709145427
-------------------------------------------------------



## Notes
From the metrics we see that google embeddings significantly improve the results compared to the simple tfidf vectorizer, with the SVM algorithm having the best accuracy.