In [109]:
import pandas as pd
import numpy as np
import enchant
import nltk
from nltk.corpus import stopwords
import pickle
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

In [81]:
data = pd.read_csv('/Users/beibeifeng/Desktop/School/2020fall/5067 NLP/Project/data/train_reviews.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150232 entries, 0 to 150231
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   business_id  150232 non-null  object 
 1   cool         150232 non-null  int64  
 2   date         150232 non-null  object 
 3   funny        150232 non-null  float64
 4   review_id    150232 non-null  object 
 5   stars        150232 non-null  float64
 6   text         150232 non-null  object 
 7   useful       150232 non-null  float64
 8   user_id      150232 non-null  object 
dtypes: float64(3), int64(1), object(5)
memory usage: 10.3+ MB


In [82]:
# randomly select 10,000 samples from the original data
selected_data = data.sample(30000, random_state = 0).reset_index()

In [83]:
# create a binary response for each review, review with stars > 3 is assigned as "high", otherwise is assigned as"low"
binary = []
for i in selected_data.stars:
    if i >= 3:
        binary.append("high")
    elif i < 3:
        binary.append("low")
binary[:10]

['high', 'low', 'low', 'high', 'high', 'low', 'high', 'high', 'high', 'high']

In [84]:
selected_data['binary'] = binary

In [85]:
selected_data.head()

Unnamed: 0,index,business_id,cool,date,funny,review_id,stars,text,useful,user_id,binary
0,97248,WunR7VclAddvbCnc-97jzg,0,2014-07-23,0.0,eedlPRo631GSaNX_DaphkQ,4.0,"Yes, believe it or not there is really good So...",0.0,tgFn7JadPQJ8aYdwIRi8aA,high
1,127659,6vNMmkttsHkW1THWiP50xg,0,2015-06-17,1.0,EDD8yhtd3LQKO95CyOl_MQ,2.0,First tip the whole groupon thing is a sham as...,1.0,rLpq9zNFLgyVYQrN8Gn-Zg,low
2,16213,EWmwbOm_4UhOtvLaBzHpPA,3,2010-08-27,4.0,FDDFJlUc6kIM_ZTg_23Xsw,2.0,"The Skinny: overcooked pasta, wilted iceberg a...",3.0,yXYuW-2Q0X7f0a9MHyERgw,low
3,121599,db12Hn9hdoE-Ne4_NsVKSw,0,2015-04-20,0.0,FEe6cMH54x7Vgn_NXl3mEA,3.0,I really wanted to love this place. I really d...,5.0,wjejB1QIPsnFOVUmNjaHaA,high
4,119819,IpLvJEjb4AN8eWq5NE2TVA,0,2015-04-02,0.0,ZdvF50_PGujN1g1Juur_lw,4.0,We flew on Delta in and out of PSHIA. As an Ai...,1.0,yiHXyCxPenNg5ZWCXF95qQ,high


In [86]:
selected_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   index        30000 non-null  int64  
 1   business_id  30000 non-null  object 
 2   cool         30000 non-null  int64  
 3   date         30000 non-null  object 
 4   funny        30000 non-null  float64
 5   review_id    30000 non-null  object 
 6   stars        30000 non-null  float64
 7   text         30000 non-null  object 
 8   useful       30000 non-null  float64
 9   user_id      30000 non-null  object 
 10  binary       30000 non-null  object 
dtypes: float64(3), int64(2), object(6)
memory usage: 2.5+ MB


In [87]:
selected_data.binary.value_counts()

high    25514
low      4486
Name: binary, dtype: int64

# Text Preprocessing

In [88]:
selected_data.text[0]

"Yes, believe it or not there is really good Soul Food in Ahwatukee.  Love the breaded catfish, the fried chicken, and especially the greens.  Lots of flavor to them, and they're not drenched in butter.  The mac and cheese is good.  The cornbread is a bit of a disappointment, as is the sauteed cabbage, but by and large the food is very good, and the owners really friendly and proud of what they're doing."

In [89]:
# cleaning text, i.e. remove punctuation, change all characters in lower case
def clean_text(str_in):
    import re
    tmp = re.sub("[^A-z ]+", "", str_in.lower())
    return tmp

selected_data['clean_review'] = selected_data.text.apply(clean_text)

In [90]:
selected_data.clean_review[0]

'yes believe it or not there is really good soul food in ahwatukee  love the breaded catfish the fried chicken and especially the greens  lots of flavor to them and theyre not drenched in butter  the mac and cheese is good  the cornbread is a bit of a disappointment as is the sauteed cabbage but by and large the food is very good and the owners really friendly and proud of what theyre doing'

In [91]:
# check if words are in english dictionary
def check_en(var):
    d = enchant.Dict("en_US")
    tmp = var.split() #tokenize
    tmp_list = list()
    for word in tmp:
        if d.check(word):
            tmp_list.append(word)
    tmp = ' '.join(tmp_list)
    return tmp

selected_data['clean_review'] = selected_data.clean_review.apply(check_en)

In [92]:
selected_data.clean_review[0]

'yes believe it or not there is really good soul food in love the breaded catfish the fried chicken and especially the greens lots of flavor to them and not drenched in butter the mac and cheese is good the cornbread is a bit of a disappointment as is the sauteed cabbage but by and large the food is very good and the owners really friendly and proud of what doing'

In [93]:
# remove stopwords
def remove_stopwords(var):
    my_sw = (stopwords.words('english'))
    tmp = var.split()
    tmp = [word for word in tmp if my_sw.count(word) == 0]
    tmp = ' '.join(tmp)
    return tmp

selected_data['clean_review'] = selected_data.clean_review.apply(remove_stopwords)

In [94]:
selected_data.clean_review[0]

'yes believe really good soul food love breaded catfish fried chicken especially greens lots flavor drenched butter mac cheese good cornbread bit disappointment sauteed cabbage large food good owners really friendly proud'

In [95]:
# stemming
def my_stem(var):
    from nltk.stem import PorterStemmer
    stem = PorterStemmer()
    tmp = var.split()
    tmp = [stem.stem(word) for word in tmp]
    tmp = ' '.join(tmp)
    return tmp

selected_data['clean_review'] = selected_data.clean_review.apply(my_stem)

In [96]:
selected_data.clean_review[0]

'ye believ realli good soul food love bread catfish fri chicken especi green lot flavor drench butter mac chees good cornbread bit disappoint saute cabbag larg food good owner realli friendli proud'

In [97]:
# tfidf vectorize text
def my_tfidf(df_in):
    tf_idf_vectorizer = TfidfVectorizer()
    tfidf_vec_out = pd.DataFrame(tf_idf_vectorizer.fit_transform(df_in).toarray())
    tfidf_vec_out.columns = tf_idf_vectorizer.get_feature_names()
    return tfidf_vec_out

In [98]:
tfidf_text = my_tfidf(selected_data.clean_review)
tfidf_text

Unnamed: 0,aah,ab,aback,abacu,abalon,abandon,abbey,abbot,abbrevi,abdomen,...,zinnia,zip,zipper,zippi,zither,zombi,zone,zoo,zoom,zucchini
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train Models - Predict Stars

In [105]:
# split data into training/test sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_text, selected_data.stars, test_size=0.20, random_state=42)

In [106]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((24000, 14594), (6000, 14594), (24000,), (6000,))

# Multinomial Naive Bayes

In [107]:
# build model and fit training data
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=.01).fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Training & Test accuracy
print("Training Accuracy is", clf.score(X_train, y_train)*100, "%")
print("Test Accuracy is", clf.score(X_test, y_test)*100, "%")

Training Accuracy is 70.44166666666666 %
Test Accuracy is 48.266666666666666 %


In [110]:
# metrics
metrics = pd.DataFrame(precision_recall_fscore_support(y_test, y_pred, average='weighted'))
metrics.index = ["precision", "recall", "fscore", "support"]
metrics

Unnamed: 0,0
precision,0.479693
recall,0.482667
fscore,0.455023
support,


# Train Models - Predict High/Low

In [111]:
# split data into training/test sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_text, selected_data.binary, test_size=0.20, random_state=42)

In [112]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((24000, 14594), (6000, 14594), (24000,), (6000,))

In [113]:
# build model and fit training data
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=.01).fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Training & Test accuracy
print("Training Accuracy is", clf.score(X_train, y_train)*100, "%")
print("Test Accuracy is", clf.score(X_test, y_test)*100, "%")

Training Accuracy is 91.0625 %
Test Accuracy is 88.35 %


In [114]:
# metrics
metrics = pd.DataFrame(precision_recall_fscore_support(y_test, y_pred, average='weighted'))
metrics.index = ["precision", "recall", "fscore", "support"]
metrics

Unnamed: 0,0
precision,0.875144
recall,0.8835
fscore,0.856997
support,
