# Loading JSON data

In [1]:
import json

# Load the review records from disk. The encoding is pinned explicitly so the
# read does not depend on the platform's default text encoding.
# NOTE(review): assumes reviews.json is UTF-8 (the standard JSON encoding) - confirm.
with open('reviews.json', 'r', encoding='utf-8') as fin:
    data = json.load(fin)

# Number of top-level entries that were loaded.
len(data)

14517

# Data preprocessing

### Balance the data (equal numbers of negative and positive reviews)

In [2]:
# Keep every review whose rating is 3 or lower, preserving file order.
low_rated = [review for review in data if review['rating'] <= 3]

# Number of reviews with a rating below or equal to 3.
n = len(low_rated)
print(n)

# Add at most n reviews rated above 3 (the first ones encountered, in file
# order) so that neither group outnumbers the other.
high_rated = [review for review in data if review['rating'] > 3][:n]
new_data = low_rated + high_rated

1414


In [3]:
# Shuffle the data, since all the low-rated reviews were appended first.
# A dedicated generator with a fixed seed yields the same order on every fresh
# run and leaves the global `random` state untouched.
import random
random.Random(42).shuffle(new_data)

### Ratings above 3 are considered positive and others are considered negative

In [4]:
import pandas as pd

# Build the frame, derive the label from the rating (3 or lower -> NEGATIVE,
# anything else -> POSITIVE), then discard the raw rating column.
df = (
    pd.DataFrame(new_data)
    .assign(
        sentiment=lambda frame: frame['rating'].apply(
            lambda rating: "NEGATIVE" if rating <= 3 else "POSITIVE"
        )
    )
    .drop(columns=['rating'])
)

# Preview the first rows.
df.head()

Unnamed: 0,text,sentiment
0,"I still love eating here at rositas, the food...",POSITIVE
1,Absolutely fantastic!!! The staff was amazing ...,POSITIVE
2,After not having the greatest experience the f...,POSITIVE
3,"Wow, what a wonderful coffee shop! If your loo...",POSITIVE
4,Came here on a Monday for dinner with my frien...,NEGATIVE


### Vectorize the text data

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: one column per vocabulary term, one row per review.
# NOTE(review): the vocabulary is learned from every row, i.e. before any
# train/test split is made - confirm this is acceptable for the evaluation.
vectorizer = CountVectorizer()
term_counts = vectorizer.fit_transform(df['text'])

# Dense copy of the count matrix as the feature array; labels are the sentiment column.
X = term_counts.toarray()
y = df['sentiment']

### Train/Test Split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; random_state fixes the seed used by the split.
split_parts = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Training the models

In [7]:
# Estimator classes for the six classifiers being compared.
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# Construct the classifiers.
clf_svm = svm.SVC()                                      # SVM
clf_neigh = KNeighborsClassifier(n_neighbors=3)          # KNeighbors
clf_deci = DecisionTreeClassifier(random_state=0)        # DecisionTree
clf_rndfr = RandomForestClassifier(random_state=0)       # RandomForest
clf_nb = GaussianNB()                                    # NaiveBayes
clf_mlp = MLPClassifier(random_state=1, max_iter=300)    # MLP

# Fit the first five on the training split, one after another.
for estimator in (clf_svm, clf_neigh, clf_deci, clf_rndfr, clf_nb):
    estimator.fit(X_train, y_train)

# The MLP is fitted last as a bare expression so the cell displays the fitted estimator.
clf_mlp.fit(X_train, y_train)

MLPClassifier(max_iter=300, random_state=1)

# Testing the models

### Accuracy of models

In [8]:
# Score every fitted classifier on the held-out split and print one line per model.
accuracy_report = (
    ("SVM Accuracy : ", clf_svm),
    ("KNeighbors Accuracy : ", clf_neigh),
    ("DecisionTree Accuracy : ", clf_deci),
    ("RandomForest Accuracy : ", clf_rndfr),
    ("NaiveBayes Accuracy : ", clf_nb),
    ("MLP Accuracy : ", clf_mlp),
)
for caption, classifier in accuracy_report:
    print(caption, classifier.score(X_test, y_test))

SVM Accuracy :  0.7879858657243817
KNeighbors Accuracy :  0.5606595995288575
DecisionTree Accuracy :  0.71849234393404
RandomForest Accuracy :  0.7974087161366313
NaiveBayes Accuracy :  0.6725559481743227
MLP Accuracy :  0.7915194346289752


### F1 Scores

In [9]:
from sklearn.metrics import f1_score

# Per-class F1 (average=None) on the held-out split; each printed pair follows
# the order given in class_order.
class_order = ["POSITIVE", "NEGATIVE"]
f1_report = (
    ("SVM F1 Score : ", clf_svm),
    ("KNeighbors F1 Score : ", clf_neigh),
    ("DecisionTree F1 Score : ", clf_deci),
    ("RandomForest F1 Score : ", clf_rndfr),
    ("NaiveBayes F1 Score : ", clf_nb),
    ("MLP F1 Score : ", clf_mlp),
)
for caption, classifier in f1_report:
    per_class_f1 = f1_score(y_test, classifier.predict(X_test), average=None, labels=class_order)
    print(caption, per_class_f1)

SVM F1 Score :  [0.77832512 0.79683973]
KNeighbors F1 Score :  [0.64976526 0.4107425 ]
DecisionTree F1 Score :  [0.72933182 0.70674847]
RandomForest F1 Score :  [0.79425837 0.80046404]
NaiveBayes F1 Score :  [0.71516393 0.61495845]
MLP F1 Score :  [0.7920094  0.79102715]
