In [113]:
#import the needed libraries
import pandas as pd
import numpy as np
import sys
import csv
from datetime import datetime
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from collections import Counter


In [114]:
#read text column and store it to text
#read stars column and store it to star

text = pd.read_csv('yelp.csv', usecols=['text'])
star = pd.read_csv('yelp.csv', usecols=['stars'])

In [115]:
# exploring the two columns
display(pd.concat([text, star], axis = 1))

Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,5
1,I have no idea why some people give bad review...,5
2,love the gyro plate. Rice is so good and I als...,4
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5
4,General Manager Scott Petello is a good egg!!!...,5
5,"Quiessence is, simply put, beautiful. Full wi...",4
6,Drop what you're doing and drive here. After I...,5
7,"Luckily, I didn't have to travel far to make m...",4
8,Definitely come for Happy hour! Prices are ama...,4
9,Nobuo shows his unique talents with everything...,5


In [116]:
#check if there is any null values to fix them

display(text.isnull().sum(), star.isnull().sum())

text    0
dtype: int64

stars    0
dtype: int64

In [117]:
# need to store the dataset to a list to ease the processing
texts=[]
stars=[]

for i in range(len(text)):
    texts.append(text['text'][i])

for i in range(len(star)):
    stars.append(star['stars'][i])


    

In [118]:
#check if the list is empty or not
display(len(texts))

10000

In [119]:
display(len(stars))

10000

In [120]:
# in order to avoid any bais in rating reviews system we need to train the dataset in a balanced data

def balance_dataset(x,y):
    freq = Counter(y)
    
    # the least rating will be the max number we need for the other ratings
    
    max = freq.most_common()[-1][1]
    
    added_num = {count: 0 for count in freq.keys()}
    
    new_y=[]
    new_x=[]
    
    for i, yy in enumerate(y):
        if added_num[yy] < max:
            new_y.append(yy)
            new_x.append(x[i])
            added_num[yy]+=1
    
    return new_x, new_y

In [121]:
# show the count of each star 
display(Counter(stars))


Counter({1: 749, 2: 927, 3: 1461, 4: 3526, 5: 3337})

In [122]:
# as we can see above the dataset are not balanced:

# we have varies number of rating stars. the majority were rated as 4 or 5 stars.

# we need to balance the data, so our model will not be bias.

# the goal is to see what star rating were scoring the least. in our case, 1 has the least number which is 749

# so, we have to unify the number of all the star rating 

In [123]:
balanced_x, balanced_y = balance_dataset(texts, stars)

In [124]:
# display the count of each star after it has been balanced
display(Counter(balanced_y))


Counter({1: 749, 2: 749, 3: 749, 4: 749, 5: 749})

In [125]:
# vectorize the texts by usig n-grams then calculate the TF-IDF
n=3

vectorizer = TfidfVectorizer(ngram_range=(1, n))

# fit the vectorizer

vectors = vectorizer.fit_transform(balanced_x)

In [126]:
display(vectors)

<3745x601306 sparse matrix of type '<class 'numpy.float64'>'
	with 1272034 stored elements in Compressed Sparse Row format>

In [174]:
#split the dataset for traning
X_train, X_test, y_train, y_test = train_test_split(vectors, balanced_y, test_size=0.2, random_state=60)

In [175]:
# use the RandomForestClassifier classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [176]:
# use the SVM classifier


svm_classifier = LinearSVC()
svm_classifier.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [177]:
pred = svm_classifier.predict(X_test)
pred_clf = clf.predict(X_test)

In [180]:
print("condider all the stars rating")

print("")


print("Accuracy: SVM Classifier")
display(accuracy_score(y_test, pred))


labels = np.unique(y_test)
matrix =confusion_matrix(y_test, pred, labels=labels)

print("confusion_matrix: SVM Classifier")


pd.DataFrame(matrix, index=labels, columns=labels)

condider all the stars rating

Accuracy: SVM Classifier


0.5046728971962616

confusion_matrix: SVM Classifier


Unnamed: 0,1,2,3,4,5
1,114,21,6,2,7
2,63,46,32,7,8
3,21,26,57,30,18
4,10,2,28,61,43
5,8,6,4,29,100


In [160]:
print("Accuracy: Random Forest Classifier")
display(accuracy_score(y_test, pred_clf))


labels = np.unique(y_test)
matrix =confusion_matrix(y_test, pred_clf, labels=labels)

print("confusion_matrix: Random Forest Classifier")


pd.DataFrame(matrix, index=labels, columns=labels)

Accuracy: Random Forest Classifier


0.32309746328437916

confusion_matrix: Random Forest Classifier


Unnamed: 0,1,2,3,4,5
1,81,25,18,7,19
2,52,28,40,20,16
3,31,25,45,24,27
4,28,18,24,24,50
5,21,14,19,29,64


In [161]:
# now we will preform multiple approaches to train the model on to see which approach gives better performance

# 1. consider star rating 1 & 2 as neigative and 4 & 5 as positive  and 3 as neutral 
# 2. consider star rating 1 & 2 as neigative and 3 &4 & 5 as positive 
# 3. consider star rating 1 & 2 & 3 as neigative and 4 & 5 as positive
# 4. consider star rating 1 & 2 as neigative and 4 & 5 as positive  and get rid of rating star 3

In [162]:
keep_rating = set([1,2,3,4,5])

keep_rating_train = [i for i, y in enumerate(y_train) if y in keep_rating]
keep_rating_test = [i for i, y in enumerate(y_test) if y in keep_rating]




In [172]:
print(" 1. consider star rating 1 & 2 as neigative and 4 & 5 as positive  and 3 as neutral ")
print("")

X_train_1 = X_train[keep_rating_train, :]
y_train_1 = [y_train[i] for i in keep_rating_train]
y_train_1 = ["n" if (y == 1 or y == 2) else ("p" if (y == 4 or y == 5) else "non") for y in y_train_1]

X_test_1 = X_test[keep_rating_test, :]
y_test_1 = [y_test[i] for i in keep_rating_test]
y_test_1 = ["n" if (y == 1 or y == 2) else ("p" if (y == 4 or y == 5) else "non") for y in y_test_1]

svm_classifier.fit(X_train_1, y_train_1)
pred = svm_classifier.predict(X_test_1)

print("Accuracy: SVM Classifier")
display(accuracy_score(y_test_1, pred))


labels = np.unique(y_test_1)
matrix =confusion_matrix(y_test_1, pred, labels=labels)

print("confusion_matrix: SVM Classifier")


pd.DataFrame(matrix, index=labels, columns=labels)








 1. consider star rating 1 & 2 as neigative and 4 & 5 as positive  and 3 as neutral 

Accuracy: SVM Classifier


0.8844221105527639

confusion_matrix: SVM Classifier


Unnamed: 0,n,p
n,271,35
p,34,257


In [164]:
#clf = RandomForestClassifier()
clf.fit(X_train_1, y_train_1)


pred_clf = clf.predict(X_test_1)
print("Accuracy: Random Forest Classifier")
display(accuracy_score(y_test_1, pred_clf))


labels = np.unique(y_test_1)
matrix =confusion_matrix(y_test_1, pred_clf, labels=labels)

print("confusion_matrix: Random Forest Classifier")


pd.DataFrame(matrix, index=labels, columns=labels)

Accuracy: Random Forest Classifier


0.57543391188251

confusion_matrix: Random Forest Classifier


Unnamed: 0,n,non,p
n,221,4,81
non,74,8,70
p,82,7,202


In [165]:
print("  2. consider star rating 1 & 2 as neigative and 3 &4 & 5 as positive ")
print("")


#X_train_1 = X_train[keep_rating_train, :]
y_train_1 = [y_train[i] for i in keep_rating_train]
y_train_1 = ["n" if (y == 1 or y == 2) else "p" for y in y_train_1]

X_test_1 = X_test[keep_rating_test, :]
y_test_1 = [y_test[i] for i in keep_rating_test]
y_test_1 = ["n" if (y == 1 or y == 2) else "p" for y in y_test_1]

svm_classifier.fit(X_train_1, y_train_1)
pred = svm_classifier.predict(X_test_1)

print("Accuracy: SVM Classifier")
display(accuracy_score(y_test_1, pred))


labels = np.unique(y_test_1)
matrix =confusion_matrix(y_test_1, pred, labels=labels)

print("confusion_matrix: SVM Classifier")


pd.DataFrame(matrix, index=labels, columns=labels)

Accuracy: SVM Classifier


0.8304405874499332

confusion_matrix: SVM Classifier


Unnamed: 0,n,p
n,209,97
p,30,413


In [166]:
clf.fit(X_train_1, y_train_1)


pred_clf = clf.predict(X_test_1)
print("Accuracy: Random Forest Classifier")
display(accuracy_score(y_test_1, pred_clf))


labels = np.unique(y_test_1)
matrix =confusion_matrix(y_test_1, pred_clf, labels=labels)

print("confusion_matrix: Random Forest Classifier")


pd.DataFrame(matrix, index=labels, columns=labels)

Accuracy: Random Forest Classifier


0.6915887850467289

confusion_matrix: Random Forest Classifier


Unnamed: 0,n,p
n,138,168
p,63,380


In [167]:
print("  3. consider star rating 1 & 2 & 3 as neigative and 4 & 5 as positive")
print("")



#X_train_1 = X_train[keep_rating_train, :]
y_train_1 = [y_train[i] for i in keep_rating_train]
y_train_1 = ["n" if (y == 1 or y == 2 or y == 3) else "p" for y in y_train_1]

X_test_1 = X_test[keep_rating_test, :]
y_test_1 = [y_test[i] for i in keep_rating_test]
y_test_1 = ["n" if (y == 1 or y == 2 or y == 3) else "p" for y in y_test_1]

svm_classifier.fit(X_train_1, y_train_1)
pred = svm_classifier.predict(X_test_1)

print("Accuracy: SVM Classifier")
display(accuracy_score(y_test_1, pred))


labels = np.unique(y_test_1)
matrix =confusion_matrix(y_test_1, pred, labels=labels)

print("confusion_matrix: SVM Classifier")


pd.DataFrame(matrix, index=labels, columns=labels)

Accuracy: SVM Classifier


0.8437917222963952

confusion_matrix: SVM Classifier


Unnamed: 0,n,p
n,439,19
p,98,193


In [168]:
clf.fit(X_train_1, y_train_1)


pred_clf = clf.predict(X_test_1)
print("Accuracy: Random Forest Classifier")
display(accuracy_score(y_test_1, pred_clf))


labels = np.unique(y_test_1)
matrix =confusion_matrix(y_test_1, pred_clf, labels=labels)

print("confusion_matrix: Random Forest Classifier")


pd.DataFrame(matrix, index=labels, columns=labels)

Accuracy: Random Forest Classifier


0.6929238985313751

confusion_matrix: Random Forest Classifier


Unnamed: 0,n,p
n,426,32
p,198,93


In [169]:
print("  4. consider star rating 1 & 2 as neigative and 4 & 5 as positive  and get rid of rating star 3")
print("")
keep_rating = set([1,2,4,5])

keep_rating_train = [i for i, y in enumerate(y_train) if y in keep_rating]
keep_rating_test = [i for i, y in enumerate(y_test) if y in keep_rating]

X_train_1 = X_train[keep_rating_train, :]
y_train_1 = [y_train[i] for i in keep_rating_train]
y_train_1 = ["n" if (y == 1 or y == 2) else "p" for y in y_train_1]

X_test_1 = X_test[keep_rating_test, :]
y_test_1 = [y_test[i] for i in keep_rating_test]
y_test_1 = ["n" if (y == 1 or y == 2) else "p" for y in y_test_1]

svm_classifier.fit(X_train_1, y_train_1)
pred = svm_classifier.predict(X_test_1)

print("Accuracy: SVM Classifier")
display(accuracy_score(y_test_1, pred))


labels = np.unique(y_test_1)
matrix =confusion_matrix(y_test_1, pred, labels=labels)

print("confusion_matrix: SVM Classifier")


pd.DataFrame(matrix, index=labels, columns=labels)

Accuracy: SVM Classifier


0.8844221105527639

confusion_matrix: SVM Classifier


Unnamed: 0,n,p
n,271,35
p,34,257


In [170]:
clf.fit(X_train_1, y_train_1)


pred_clf = clf.predict(X_test_1)
print("Accuracy: Random Forest Classifier")
display(accuracy_score(y_test_1, pred_clf))


labels = np.unique(y_test_1)
matrix =confusion_matrix(y_test_1, pred_clf, labels=labels)

print("confusion_matrix: Random Forest Classifier")


pd.DataFrame(matrix, index=labels, columns=labels)

Accuracy: Random Forest Classifier


0.7386934673366834

confusion_matrix: Random Forest Classifier


Unnamed: 0,n,p
n,249,57
p,99,192
