In [7]:
import json
import csv
import pandas as pd
import random

import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score



import math
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict

from sklearn.naive_bayes import MultinomialNB

import time

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Data Preprocessing

In [9]:
reviews=pd.read_json('restaurant_reviews.json')

In [10]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5126287 entries, 0 to 5126286
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        int64         
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 352.0+ MB


In [11]:
reviews.dropna(inplace=True) #remove all blank entries, seems like it was already done for us!
reviews.shape

(5126287, 9)

In [None]:
# for computational tractability, we will use ~ 100K train reviews, and ~ 25K test reviews

In [None]:
#reviews=reviews.head(250000)

In [None]:
stars = reviews['stars']
reviews = reviews['text']

In [8]:
#Initialize the count vectorizer tool from nlp toolkit. This will get us word counts
vectorizer = CountVectorizer()
word_counts = vectorizer.fit_transform(reviews)

In [9]:
# scale word count results so our results are not biased
scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(word_counts)

In [10]:
X_train = X[:200000, ]
X_test = X[200000:, ]

star_train=stars[:200000, ]
star_test =stars[200000: , ]

# final variables to be used

X_train: contains normalized word count data for train reviews

star_train: star ratings for train set

X_test: contains normalized word count data for test reviews

star_test: star ratings for test set

## Multinomial Naive Bayes

In [11]:
start = time.time()
nb = MultinomialNB() #initialize a multinomial bayes model, for our 5 classes (1 star to 5 stars)
nb_model=nb.fit(X_train,star_train)
end = time.time()
print(end-start)

0.08050775527954102


In [12]:
start = time.time()
predicted_stars = nb_model.predict(X_test)
end = time.time()
print(end-start)

0.03065013885498047


In [13]:
#evaluate model
nb_model.class_count_

array([19482., 17217., 24773., 54397., 84131.])

In [14]:
print(classification_report(star_test, predicted_stars))

              precision    recall  f1-score   support

           1       0.47      0.54      0.50      5406
           2       0.19      0.37      0.26      4275
           3       0.23      0.31      0.26      5991
           4       0.38      0.33      0.35     12868
           5       0.70      0.53      0.60     21460

    accuracy                           0.44     50000
   macro avg       0.39      0.42      0.40     50000
weighted avg       0.49      0.44      0.46     50000



In [15]:
#surprisingly, the model works better without accounting for stopwords. We believe this may be due to the short nature of the reviews

## Support Vector Machines (Linear Boundary)

In [16]:
start = time.time()
svm=LinearSVC()
svm.fit(X_train, star_train)
end = time.time()
print(end-start)

320.9903781414032




In [17]:
start = time.time()
predicted_stars = svm.predict(X_test)
end = time.time()
print(end-start)

0.0410771369934082


In [18]:
#svm.class_count_

In [19]:
print(classification_report(star_test, predicted_stars))

              precision    recall  f1-score   support

           1       0.53      0.57      0.55      5406
           2       0.28      0.24      0.26      4275
           3       0.29      0.29      0.29      5991
           4       0.40      0.39      0.39     12868
           5       0.68      0.71      0.69     21460

    accuracy                           0.52     50000
   macro avg       0.44      0.44      0.44     50000
weighted avg       0.51      0.52      0.52     50000



## MultiClass Logistic Regression

In [20]:
lm = LogisticRegression(multi_class='multinomial', solver='saga')

In [21]:
start=time.time()
lm_model=lm.fit(X_train,star_train)
end=time.time()
print(end-start)

29.61316680908203




In [22]:
start=time.time()
predicted_stars = lm_model.predict(X_test)
end=time.time()
print(end-start)

0.031584978103637695


In [23]:
#lm_model.class_count_

In [24]:
print(classification_report(star_test, predicted_stars))

              precision    recall  f1-score   support

           1       0.70      0.70      0.70      5406
           2       0.46      0.25      0.32      4275
           3       0.44      0.27      0.34      5991
           4       0.48      0.44      0.46     12868
           5       0.70      0.87      0.77     21460

    accuracy                           0.62     50000
   macro avg       0.56      0.51      0.52     50000
weighted avg       0.59      0.62      0.59     50000



## KNN

In [25]:
knn = KNeighborsClassifier(n_neighbors=10)

In [26]:
start=time.time()
knn.fit(X_train, star_train)
end=time.time()
print(end-start)

0.05590987205505371


In [27]:
start=time.time()
predicted_stars = knn.predict(X_test)
end=time.time()
print(end-start)

2519.298425912857


In [28]:
#knn.class_count_

In [29]:
print(classification_report(star_test, predicted_stars))

              precision    recall  f1-score   support

           1       0.54      0.21      0.31      5406
           2       0.26      0.04      0.07      4275
           3       0.25      0.06      0.09      5991
           4       0.29      0.26      0.28     12868
           5       0.50      0.79      0.61     21460

    accuracy                           0.44     50000
   macro avg       0.37      0.27      0.27     50000
weighted avg       0.40      0.44      0.38     50000



## Random Forests

In [30]:
rf = RandomForestClassifier()

In [31]:
start=time.time()
rf_model=rf.fit(X_train, star_train) 
end=time.time()
print(end-start)

1005.274337053299


In [32]:
start=time.time()
predicted_stars = rf_model.predict(X_test)
end=time.time()
print(end-start)

3.1193583011627197


In [33]:
#rf_model.class_count_

In [34]:
print(classification_report(star_test, predicted_stars))

              precision    recall  f1-score   support

           1       0.71      0.54      0.61      5406
           2       0.48      0.05      0.09      4275
           3       0.44      0.08      0.14      5991
           4       0.39      0.29      0.34     12868
           5       0.57      0.92      0.71     21460

    accuracy                           0.54     50000
   macro avg       0.52      0.38      0.38     50000
weighted avg       0.52      0.54      0.48     50000

