# Import Libraries

In [1]:
import nltk
from nltk.stem.snowball import SnowballStemmer 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import OneHotEncoder
from nltk.probability import FreqDist
from matplotlib import cm
from wordcloud import WordCloud
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
from keras import models
from keras import layers
from keras import optimizers

XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ['dlopen(/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


# Obtain Data

Read Yelp review data from csv

In [None]:
# Load the Yelp review dataset from the working directory; later cells read
# its 'Category' and 'Review' columns.
df = pd.read_csv('yelp_reviews.csv')
# Preview the first rows as a quick sanity check of the load.
df.head()

# Scrub Data: Pre-Processing

Create numerical dummy column for review class

In [None]:
# Integer class id for each cuisine label found in the 'Category' column.
class_map = {'chinese': 0, 'japanese': 1, 'indpak': 2}

# Series.map does a strict per-element lookup: a label missing from class_map
# becomes NaN instead of silently staying a string inside a "numeric" column
# (which is what replace() would do).
class_ids = df['Category'].map(class_map)
unmapped = sorted(df.loc[class_ids.isna(), 'Category'].astype(str).unique())
assert not unmapped, f'Category values missing from class_map: {unmapped}'
df['Class'] = class_ids.astype(int)

# Peek at a slice of rows (presumably around a boundary between two
# categories -- TODO confirm against the csv ordering).
df.iloc[1195:1205, :]

Create list of tokenized chinese restaurant reviews

In [None]:
# Reviews labelled Class 0 (chinese), flattened into a single list of word tokens.
is_chinese = df['Class'] == 0
chinese_reviews = list(df.loc[is_chinese, 'Review'].values)
chinese_tokens = [
    token
    for review_text in chinese_reviews
    for token in nltk.word_tokenize(review_text)
]

Create list of tokenized japanese restaurant reviews

In [None]:
# Reviews labelled Class 1 (japanese), flattened into a single list of word tokens.
is_japanese = df['Class'] == 1
japanese_reviews = list(df.loc[is_japanese, 'Review'].values)
japanese_tokens = [
    token
    for review_text in japanese_reviews
    for token in nltk.word_tokenize(review_text)
]

Create list of tokenized indian restaurant reviews

In [None]:
# Reviews labelled Class 2 (indpak), flattened into a single list of word tokens.
is_indian = df['Class'] == 2
indian_reviews = list(df.loc[is_indian, 'Review'].values)
indian_tokens = [
    token
    for review_text in indian_reviews
    for token in nltk.word_tokenize(review_text)
]

Remove stopwords from chinese, japanese and indian token lists

In [None]:
# Stop-word list: NLTK's English list plus punctuation, contraction fragments
# and the cuisine names themselves (which would otherwise trivially identify
# each class). It stays a plain list because the tf-idf cell passes it to
# TfidfVectorizer(stop_words=...).
# NOTE(review): 'gym', 'barber' and 'barbers' look like leftovers from a
# different dataset; kept so the list handed to the vectorizer is unchanged.
stop_words = stopwords.words('english')
stop_words.extend(['.','i',",",'!','s','gym','barber','barbers', '...',"'s","n't",'(',')',"'m",'ve',"I've", 'chinese',
                  'japanese', 'indian', 'china', 'japan', 'india'])

# Lower-cased set for O(1) membership tests. Tokens are compared in lower case
# because the stop words are lower case while review tokens are not: without
# this, "The", "I" or "Chinese" pass the filter and are only lower-cased later
# by the stemmer, so they re-enter the frequency counts as stop words.
stop_word_set = {stop_word.lower() for stop_word in stop_words}

filtered_chinese_tokens = [word for word in chinese_tokens if word.lower() not in stop_word_set]
filtered_japanese_tokens = [word for word in japanese_tokens if word.lower() not in stop_word_set]
filtered_indian_tokens = [word for word in indian_tokens if word.lower() not in stop_word_set]

Stem chinese, japanese and indian token lists

In [None]:
# Reduce every filtered token to its English Snowball stem, one list per cuisine.
stemmer = SnowballStemmer('english')
stemmed_chinese = list(map(stemmer.stem, filtered_chinese_tokens))
stemmed_japanese = list(map(stemmer.stem, filtered_japanese_tokens))
stemmed_indian = list(map(stemmer.stem, filtered_indian_tokens))

# Explore Data

Create lists of 25 most frequent words and word counts for chinese, japanese and indian classes

In [None]:
# Token frequency tables for each cuisine's stemmed tokens.
chinese_freq = FreqDist(stemmed_chinese)
japanese_freq = FreqDist(stemmed_japanese)
indian_freq = FreqDist(stemmed_indian)

# Split each top-25 (token, count) ranking into parallel word / count lists.
chinese_top = chinese_freq.most_common(25)
chinese_freq_words = [token for token, count in chinese_top]
chinese_freq_counts = [count for token, count in chinese_top]

japanese_top = japanese_freq.most_common(25)
japanese_freq_words = [token for token, count in japanese_top]
japanese_freq_counts = [count for token, count in japanese_top]

indian_top = indian_freq.most_common(25)
indian_freq_words = [token for token, count in indian_top]
indian_freq_counts = [count for token, count in indian_top]

Create frequency distribution bar graphs for 25 most frequent words for chinese, japanese and indian classes

In [None]:
# Bar colours sampled from the reversed viridis colormap. The colormap is
# looked up through pyplot rather than the imported `cm` name so this cell
# does not depend on `cm` still referring to matplotlib.cm when it is re-run.
color = plt.get_cmap('viridis_r')(np.linspace(.4, .8, 30))
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(10, 25))

# One panel per cuisine: (title label, top-25 tokens, their counts).
panels = [
    ('Chinese', chinese_freq_words, chinese_freq_counts),
    ('Japanese', japanese_freq_words, japanese_freq_counts),
    ('Indian', indian_freq_words, indian_freq_counts),
]
for axis, (cuisine, words, counts) in zip(ax, panels):
    axis.bar(words, counts, color=color)
    # bar() already labels the categorical ticks with the tokens; only rotate
    # them. Calling set_xticklabels() here would attach fixed labels to a
    # non-fixed locator, which matplotlib warns about.
    axis.tick_params(axis='x', labelrotation=90)
    axis.set_title(f'Frequency Distribution of Top 25 {cuisine} Restaurant Review Tokens')
    axis.set_xlabel('Tokens')
    axis.set_ylabel('Frequency')

Create word clouds for 25 most frequent words for chinese, japanese and indian classes

In [None]:
# token -> count lookups built from the parallel top-25 lists.
chinese_freq_zipped = {word: count for word, count in zip(chinese_freq_words, chinese_freq_counts)}
japanese_freq_zipped = {word: count for word, count in zip(japanese_freq_words, japanese_freq_counts)}
indian_freq_zipped = {word: count for word, count in zip(indian_freq_words, indian_freq_counts)}

# One word cloud per cuisine, sized by token frequency.
chinese_wordcloud = WordCloud(colormap='Spectral').generate_from_frequencies(chinese_freq_zipped)
japanese_wordcloud = WordCloud(colormap='Spectral').generate_from_frequencies(japanese_freq_zipped)
indian_wordcloud = WordCloud(colormap='Spectral').generate_from_frequencies(indian_freq_zipped)

# Stack the three clouds vertically and strip the (meaningless) pixel tick labels.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(10, 30))
clouds_by_cuisine = (
    ('Chinese', chinese_wordcloud),
    ('Japanese', japanese_wordcloud),
    ('Indian', indian_wordcloud),
)
for axis, (cuisine, cloud) in zip(ax, clouds_by_cuisine):
    axis.imshow(cloud)
    axis.set_title(cuisine + ' Restaurant Reviews Top 25 Tokens Wordcloud')
    axis.set_xticklabels([])
    axis.set_yticklabels([])

# Model Data

Split Yelp review data into x and y variables

In [None]:
# Features are the raw review texts; targets are the integer class ids.
x, y = (df[column].values for column in ('Review', 'Class'))

Perform train test split on Yelp review data

In [None]:
# Hold out 20% of the reviews for testing and train on the remaining 80%.
# The cell previously passed train_size=.2, which fits every model on only a
# fifth of the data and evaluates on the other four fifths.
# NOTE(review): if the small training set was intentional, revert to train_size=.2.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11, test_size=.2)

Vectorize Yelp review data using tf-idf

In [None]:
# Learn the tf-idf vocabulary / weights on the training reviews only, then
# apply the same fitted transform to the held-out reviews.
vectorizer = TfidfVectorizer(stop_words=stop_words)
tfidf_x_train = vectorizer.fit_transform(x_train)
tfidf_x_test = vectorizer.transform(x_test)

Train baseline Dummy Classifier for Yelp review data

In [None]:
# Baseline model: its weighted F1 is the floor the real classifiers should beat.
dummy = DummyClassifier(random_state=78)
dummy.fit(tfidf_x_train, y_train)

y_train_pred_dummy = dummy.predict(tfidf_x_train)
y_test_pred_dummy = dummy.predict(tfidf_x_test)

dummy_train_f1 = f1_score(y_train, y_train_pred_dummy, average='weighted')
dummy_test_f1 = f1_score(y_test, y_test_pred_dummy, average='weighted')
print('Train F1 Score: ', dummy_train_f1)
print('Test F1 Score: ', dummy_test_f1)

Perform GridSearchCV on Naive Bayes to find optimal alpha value

In [None]:
# Candidate smoothing values: 0.001, 0.002, ..., 9.999 (9,999 settings).
nb_dict = {'alpha': np.arange(1, 10000) / 1000}

# 5-fold cross-validated search over alpha, scored by weighted F1, using all cores.
gs_nb = GridSearchCV(
    MultinomialNB(),
    nb_dict,
    scoring='f1_weighted',
    n_jobs=-1,
    cv=5,
    verbose=1,
)
gs_nb.fit(tfidf_x_train, y_train)
gs_nb.best_params_

Train Naive Bayes model based on optimal alpha value from GridsearchCV

In [None]:
# Naive Bayes with a hard-coded smoothing value (per this cell's heading, the
# alpha the grid search selected).
nb = MultinomialNB(alpha=.54)
nb.fit(tfidf_x_train, y_train)

y_train_pred = nb.predict(tfidf_x_train)
y_test_pred = nb.predict(tfidf_x_test)

# Report weighted F1 on both splits to expose over- or under-fitting.
for split_name, truth, predicted in (
    ('Train', y_train, y_train_pred),
    ('Test', y_test, y_test_pred),
):
    print(split_name + ' F1 Score: ', f1_score(truth, predicted, average='weighted'))

Create Confusion Matrix for Naive Bayes model

In [None]:
# Confusion matrix for the Naive Bayes test predictions, shown as the share of
# all test reviews in each (actual, predicted) cell.
# Stored under its own name: assigning to `cm` would shadow the
# `from matplotlib import cm` import used by the bar-chart cell.
# labels=[0, 1, 2] pins the row/column order to class_map
# (0 = chinese, 1 = japanese, 2 = indpak) even if a class is absent from y_test.
nb_conf_mat = confusion_matrix(y_test, y_test_pred, labels=[0, 1, 2])
cuisine_labels = ['Chinese', 'Japanese', 'Indian']

fig, ax = plt.subplots(figsize=(5,5))
matrix = sns.heatmap(nb_conf_mat / nb_conf_mat.sum(), annot=True, ax=ax, cmap='Blues', fmt='.1%')
matrix.set_title('Naive Bayes Confusion Matrix')
matrix.set_xlabel('Predicted')
# Three classes need three tick labels; the previous two-item
# ['Barber','Gym'] list did not match the 3x3 matrix.
matrix.set_xticklabels(cuisine_labels)
matrix.set_ylabel('Actual')
matrix.set_yticklabels(cuisine_labels)

Perform GridSearchCV on Random Forest to find optimal hyperparameters

In [None]:
# Search space: tree depth 25-29, leaf size 1-29, both split criteria
# (5 * 29 * 2 = 290 combinations, each cross-validated 5 times).
rf_dict = {'max_depth': range(25,30), 'min_samples_leaf': range(1,30), 'criterion': ['gini','entropy']}

# Seed the forest so the search (and therefore best_params_) is reproducible
# across kernel restarts; an unseeded forest can pick a different winner on
# every run. 11 is the same seed value the train/test split uses.
gs_rf = GridSearchCV(
    RandomForestClassifier(random_state=11),
    rf_dict,
    scoring='f1_weighted',
    n_jobs=-1,
    cv=5,
    verbose=1,
)
gs_rf.fit(tfidf_x_train, y_train)
gs_rf.best_params_

Train Random Forest Classifier based on GridsearchCV results

In [None]:
# Random forest with hard-coded hyperparameters (per this cell's heading, the
# values the grid search selected). random_state is fixed so the reported
# scores are reproducible on Restart & Run All.
rf = RandomForestClassifier(criterion='gini', max_depth=29, min_samples_leaf=1, random_state=11)
rf.fit(tfidf_x_train, y_train)

y_train_pred_rf = rf.predict(tfidf_x_train)
y_test_pred_rf = rf.predict(tfidf_x_test)

# Weighted F1 on both splits to expose over- or under-fitting.
print('Train F1 Score: ', f1_score(y_train, y_train_pred_rf, average='weighted'))
print('Test F1 Score: ', f1_score(y_test, y_test_pred_rf, average='weighted'))

Create Confusion Matrix for Random Forest model

In [None]:
# Confusion matrix for the Random Forest test predictions, shown as the share
# of all test reviews in each (actual, predicted) cell.
# Stored under its own name: assigning to `cm` would shadow the
# `from matplotlib import cm` import used by the bar-chart cell.
# labels=[0, 1, 2] pins the row/column order to class_map
# (0 = chinese, 1 = japanese, 2 = indpak) even if a class is absent from y_test.
rf_conf_mat = confusion_matrix(y_test, y_test_pred_rf, labels=[0, 1, 2])
cuisine_labels = ['Chinese', 'Japanese', 'Indian']

fig, ax = plt.subplots(figsize=(5,5))
matrix = sns.heatmap(rf_conf_mat / rf_conf_mat.sum(), annot=True, ax=ax, cmap='Blues', fmt='.1%')
matrix.set_title('Random Forest Confusion Matrix')
matrix.set_xlabel('Predicted')
# Three classes need three tick labels; the previous two-item
# ['Barber','Gym'] list did not match the 3x3 matrix.
matrix.set_xticklabels(cuisine_labels)
matrix.set_ylabel('Actual')
matrix.set_yticklabels(cuisine_labels)

Train Gradient Boost model

In [None]:
# Gradient boosting with a hard-coded minimum impurity decrease per split.
# random_state is fixed so the reported scores are reproducible on
# Restart & Run All.
gb = GradientBoostingClassifier(min_impurity_decrease=.0037, random_state=11)
gb.fit(tfidf_x_train, y_train)

y_train_pred_gb = gb.predict(tfidf_x_train)
y_test_pred_gb = gb.predict(tfidf_x_test)

# Weighted F1 on both splits to expose over- or under-fitting.
print('Train F1 Score: ', f1_score(y_train, y_train_pred_gb, average='weighted'))
print('Test F1 Score: ', f1_score(y_test, y_test_pred_gb, average='weighted'))

Create Confusion Matrix for Gradient Boost model

In [None]:
# Confusion matrix for the Gradient Boost test predictions, shown as the share
# of all test reviews in each (actual, predicted) cell.
# Stored under its own name: assigning to `cm` would shadow the
# `from matplotlib import cm` import used by the bar-chart cell.
# labels=[0, 1, 2] pins the row/column order to class_map
# (0 = chinese, 1 = japanese, 2 = indpak) even if a class is absent from y_test.
gb_conf_mat = confusion_matrix(y_test, y_test_pred_gb, labels=[0, 1, 2])
cuisine_labels = ['Chinese', 'Japanese', 'Indian']

fig, ax = plt.subplots(figsize=(5,5))
matrix = sns.heatmap(gb_conf_mat / gb_conf_mat.sum(), annot=True, ax=ax, cmap='Blues', fmt='.1%')
matrix.set_title('Gradient Boost Confusion Matrix')
matrix.set_xlabel('Predicted')
# Three classes need three tick labels; the previous two-item
# ['Barber','Gym'] list did not match the 3x3 matrix.
matrix.set_xticklabels(cuisine_labels)
matrix.set_ylabel('Actual')
matrix.set_yticklabels(cuisine_labels)

Gridsearch xgboost model

In [None]:
# Search the minimum loss reduction required to split a node, over
# 0.0001 ... 0.0099. XGBoost's name for this knob is `gamma` (alias
# `min_split_loss`); `min_impurity_decrease` is a scikit-learn tree parameter
# that XGBoost does not use, so searching over it fits the same model at
# every grid point.
# NOTE(review): the range was carried over unchanged from the old parameter;
# gamma works on the loss scale, so the useful range may differ.
xgb_dict = {'gamma': [step / 10000 for step in range(1, 100)]}

# 5-fold cross-validated search scored by weighted F1, using all cores.
gs_xgb = GridSearchCV(
    XGBClassifier(),
    xgb_dict,
    scoring='f1_weighted',
    n_jobs=-1,
    cv=5,
    verbose=1,
)
gs_xgb.fit(tfidf_x_train, y_train)
gs_xgb.best_params_

Build and validate top performing xgboost model

In [None]:
# XGBoost model with a hard-coded minimum split loss. `gamma` is XGBoost's
# parameter for this; the previous `min_impurity_decrease` keyword is a
# scikit-learn name that XGBoost does not use, so it had no effect on the model.
# NOTE(review): 0.1 is the value previously passed under the old name --
# confirm it against the grid-search result.
xgb = XGBClassifier(gamma=0.1)
xgb.fit(tfidf_x_train, y_train)

y_train_pred_xgb = xgb.predict(tfidf_x_train)
y_test_pred_xgb = xgb.predict(tfidf_x_test)

# Weighted F1 on both splits to expose over- or under-fitting.
print('Train F1 Score: ', f1_score(y_train, y_train_pred_xgb, average='weighted'))
print('Test F1 Score: ', f1_score(y_test, y_test_pred_xgb, average='weighted'))

Train voting classifier model

In [None]:
# Soft-voting ensemble of the Naive Bayes, Random Forest and Gradient Boost
# models: class probabilities are averaged using the weights below.
# NOTE(review): the weights are presumably each model's own F1 score -- confirm.
ensemble_members = [('nb', nb), ('rf', rf), ('gb', gb)]
member_weights = [.615, .594, .581]
vc = VotingClassifier(estimators=ensemble_members, voting='soft', weights=member_weights)
vc.fit(tfidf_x_train, y_train)

y_train_pred_vc = vc.predict(tfidf_x_train)
y_test_pred_vc = vc.predict(tfidf_x_test)

vc_train_f1 = f1_score(y_train, y_train_pred_vc, average='weighted')
vc_test_f1 = f1_score(y_test, y_test_pred_vc, average='weighted')
print('Train F1 Score: ', vc_train_f1)
print('Test F1 Score: ', vc_test_f1)

Create Confusion Matrix for Voting Classifier model

In [None]:
# Confusion matrix for the Voting Classifier test predictions, shown as the
# share of all test reviews in each (actual, predicted) cell.
# Stored under its own name: assigning to `cm` would shadow the
# `from matplotlib import cm` import used by the bar-chart cell.
# labels=[0, 1, 2] pins the row/column order to class_map
# (0 = chinese, 1 = japanese, 2 = indpak) even if a class is absent from y_test.
vc_conf_mat = confusion_matrix(y_test, y_test_pred_vc, labels=[0, 1, 2])
cuisine_labels = ['Chinese', 'Japanese', 'Indian']

fig, ax = plt.subplots(figsize=(5,5))
matrix = sns.heatmap(vc_conf_mat / vc_conf_mat.sum(), annot=True, ax=ax, cmap='Blues', fmt='.1%')
matrix.set_title('Voting Classifier Confusion Matrix')
matrix.set_xlabel('Predicted')
# Three classes need three tick labels; the previous two-item
# ['Barber','Gym'] list did not match the 3x3 matrix.
matrix.set_xticklabels(cuisine_labels)
matrix.set_ylabel('Actual')
matrix.set_yticklabels(cuisine_labels)