In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import enchant
import pickle
from sklearn.metrics import precision_recall_fscore_support
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc, f1_score

# 1 Random Forest & MLP

## 1.1 Load Data

In [4]:
data = pd.read_csv('train_reviews.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150232 entries, 0 to 150231
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   business_id  150232 non-null  object 
 1   cool         150232 non-null  int64  
 2   date         150232 non-null  object 
 3   funny        150232 non-null  float64
 4   review_id    150232 non-null  object 
 5   stars        150232 non-null  float64
 6   text         150232 non-null  object 
 7   useful       150232 non-null  float64
 8   user_id      150232 non-null  object 
dtypes: float64(3), int64(1), object(5)
memory usage: 10.3+ MB


In [5]:
# randomly select 10,000 samples from the original data
selected_data = data.sample(10000, random_state = 0).reset_index()

In [6]:
# create a binary response for each review, review with stars > 3 is assigned as "high", otherwise is assigned as"low"
binary = []
for i in selected_data.stars:
    if i >= 3:
        binary.append("high")
    elif i < 3:
        binary.append("low")
binary[:10]

['high', 'low', 'low', 'high', 'high', 'low', 'high', 'high', 'high', 'high']

In [7]:
selected_data['binary'] = binary

In [8]:
selected_data.head()

Unnamed: 0,index,business_id,cool,date,funny,review_id,stars,text,useful,user_id,binary
0,97248,WunR7VclAddvbCnc-97jzg,0,2014-07-23,0.0,eedlPRo631GSaNX_DaphkQ,4.0,"Yes, believe it or not there is really good So...",0.0,tgFn7JadPQJ8aYdwIRi8aA,high
1,127659,6vNMmkttsHkW1THWiP50xg,0,2015-06-17,1.0,EDD8yhtd3LQKO95CyOl_MQ,2.0,First tip the whole groupon thing is a sham as...,1.0,rLpq9zNFLgyVYQrN8Gn-Zg,low
2,16213,EWmwbOm_4UhOtvLaBzHpPA,3,2010-08-27,4.0,FDDFJlUc6kIM_ZTg_23Xsw,2.0,"The Skinny: overcooked pasta, wilted iceberg a...",3.0,yXYuW-2Q0X7f0a9MHyERgw,low
3,121599,db12Hn9hdoE-Ne4_NsVKSw,0,2015-04-20,0.0,FEe6cMH54x7Vgn_NXl3mEA,3.0,I really wanted to love this place. I really d...,5.0,wjejB1QIPsnFOVUmNjaHaA,high
4,119819,IpLvJEjb4AN8eWq5NE2TVA,0,2015-04-02,0.0,ZdvF50_PGujN1g1Juur_lw,4.0,We flew on Delta in and out of PSHIA. As an Ai...,1.0,yiHXyCxPenNg5ZWCXF95qQ,high


In [9]:
selected_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   index        10000 non-null  int64  
 1   business_id  10000 non-null  object 
 2   cool         10000 non-null  int64  
 3   date         10000 non-null  object 
 4   funny        10000 non-null  float64
 5   review_id    10000 non-null  object 
 6   stars        10000 non-null  float64
 7   text         10000 non-null  object 
 8   useful       10000 non-null  float64
 9   user_id      10000 non-null  object 
 10  binary       10000 non-null  object 
dtypes: float64(3), int64(2), object(6)
memory usage: 859.5+ KB


In [10]:
selected_data.binary.value_counts()

high    8485
low     1515
Name: binary, dtype: int64

## 1.2 Text Preprocessing

In [11]:
selected_data.text[0]

"Yes, believe it or not there is really good Soul Food in Ahwatukee.  Love the breaded catfish, the fried chicken, and especially the greens.  Lots of flavor to them, and they're not drenched in butter.  The mac and cheese is good.  The cornbread is a bit of a disappointment, as is the sauteed cabbage, but by and large the food is very good, and the owners really friendly and proud of what they're doing."

In [12]:
# cleaning text, i.e. remove punctuation, change all characters in lower case
def clean_text(str_in):
    import re
    tmp = re.sub("[^A-z ]+", "", str_in.lower())
    return tmp

selected_data['clean_review'] = selected_data.text.apply(clean_text)

In [13]:
selected_data.clean_review[0]

'yes believe it or not there is really good soul food in ahwatukee  love the breaded catfish the fried chicken and especially the greens  lots of flavor to them and theyre not drenched in butter  the mac and cheese is good  the cornbread is a bit of a disappointment as is the sauteed cabbage but by and large the food is very good and the owners really friendly and proud of what theyre doing'

In [14]:
# check if words are in english dictionary
def check_en(var):
    d = enchant.Dict("en_US")
    tmp = var.split() #tokenize
    tmp_list = list()
    for word in tmp:
        if d.check(word):
            tmp_list.append(word)
    tmp = ' '.join(tmp_list)
    return tmp

selected_data['clean_review'] = selected_data.clean_review.apply(check_en)

In [15]:
selected_data.clean_review[0]

'yes believe it or not there is really good soul food in love the breaded catfish the fried chicken and especially the greens lots of flavor to them and not drenched in butter the mac and cheese is good the cornbread is a bit of a disappointment as is the sauteed cabbage but by and large the food is very good and the owners really friendly and proud of what doing'

In [16]:
# remove stopwords
def remove_stopwords(var):
    my_sw = (stopwords.words('english'))
    tmp = var.split()
    tmp = [word for word in tmp if my_sw.count(word) == 0]
    tmp = ' '.join(tmp)
    return tmp

selected_data['clean_review'] = selected_data.clean_review.apply(remove_stopwords)

In [17]:
selected_data.clean_review[0]

'yes believe really good soul food love breaded catfish fried chicken especially greens lots flavor drenched butter mac cheese good cornbread bit disappointment sauteed cabbage large food good owners really friendly proud'

In [18]:
# stemming
def my_stem(var):
    from nltk.stem import PorterStemmer
    stem = PorterStemmer()
    tmp = var.split()
    tmp = [stem.stem(word) for word in tmp]
    tmp = ' '.join(tmp)
    return tmp

selected_data['clean_review'] = selected_data.clean_review.apply(my_stem)

In [19]:
selected_data.clean_review[0]

'ye believ realli good soul food love bread catfish fri chicken especi green lot flavor drench butter mac chees good cornbread bit disappoint saute cabbag larg food good owner realli friendli proud'

In [20]:
# tfidf vectorize text
def my_tfidf(df_in):
    tf_idf_vectorizer = TfidfVectorizer()
    tfidf_vec_out = pd.DataFrame(tf_idf_vectorizer.fit_transform(df_in).toarray())
    tfidf_vec_out.columns = tf_idf_vectorizer.get_feature_names()
    return tfidf_vec_out

In [21]:
tfidf_text = my_tfidf(selected_data.clean_review)
tfidf_text

Unnamed: 0,aah,ab,aback,abacu,abalon,abandon,abbrevi,abdomen,abdomin,abhor,...,zinger,zingi,zip,zippi,zither,zombi,zone,zoo,zoom,zucchini
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# save object
def save_df(df_in, path_in, name_in):
    import pickle
    try:
        pickle.dump(df_in, open(path_in + name_in + ".pkl", "wb"))
    except:
        print ("SAVE FAILED")
        pass

# open object
def open_df(path_in, name_in):
    import pickle
    try:
        df_out = pickle.load(open(path_in + name_in + ".pkl", "rb"))
    except:
        print ("OPEN FAILED")
        pass
    return df_out

In [23]:
# save tfidf_text
save_df(tfidf_text, 'saved_object/', 'tfidf_text')

SAVE FAILED


In [24]:
# load saved tfidf_text
tfidf_text = open_df('saved_object/', 'tfidf_text')
tfidf_text

OPEN FAILED


UnboundLocalError: local variable 'df_out' referenced before assignment

In [25]:
def pca_fun(df_in):
    my_pca = PCA(n_components=0.95)
    dim_reduced_data = my_pca.fit_transform(df_in)
    print ("explained variance: " + str(np.sum(my_pca.explained_variance_ratio_)))
    
    return dim_reduced_data

In [26]:
pca_data = pca_fun(tfidf_text)

explained variance: 0.950024768821888


In [27]:
# save pca_data
save_df(pca_data, 'saved_object/', 'pca_data')

SAVE FAILED


In [28]:
# load saved pca_data
pca_data = open_df('saved_object/', 'pca_data')
pca_data

OPEN FAILED


UnboundLocalError: local variable 'df_out' referenced before assignment

In [None]:
pca_data.shape

# Train Models - Predict Stars

In [None]:
# split data into training/test sets
X_train, X_test, y_train, y_test = train_test_split(pca_data, selected_data.stars, test_size=0.20, random_state=42)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape
X_train

## Random Forests

In [None]:
# Cross Validation
my_model = RandomForestClassifier(random_state=0)

parameters = {'n_estimators':[10, 100], 'max_depth': [25, 50]}

my_grid = GridSearchCV(my_model, parameters, cv=5)
my_grid.fit(X_train, y_train)

In [None]:
# best parameters
my_grid.best_params_

In [None]:
# build model using best parameters and fit training data
my_model_optimal = RandomForestClassifier(**my_grid.best_params_, random_state=0)
my_model_optimal.fit(X_train, y_train)   
y_pred = my_model_optimal.predict(X_test)

In [None]:
# Training & Test accuracy
print("Training Accuracy is", my_model_optimal.score(X_train, y_train)*100, "%")
print("Test Accuracy is", my_model_optimal.score(X_test, y_test)*100, "%")

In [None]:
# metrics
metrics = pd.DataFrame(precision_recall_fscore_support(y_test, y_pred, average='weighted'))
metrics.index = ["precision", "recall", "fscore", "support"]
metrics

## MLP

Trying different hidden_layer_sizes

In [None]:
# build MLP classifier
clf = MLPClassifier(random_state=1, max_iter=300, hidden_layer_sizes = (200, 100, 5))
clf.fit(X_train, y_train)

In [None]:
# Training & Test accuracy
print("Training Accuracy is", clf.score(X_train, y_train)*100, "%")
print("Test Accuracy is", clf.score(X_test, y_test)*100, "%")

In [None]:
# build MLP classifier
clf = MLPClassifier(random_state=1, max_iter=300, hidden_layer_sizes = (200, 5))
clf.fit(X_train, y_train)

In [None]:
# Training & Test accuracy
print("Training Accuracy is", clf.score(X_train, y_train)*100, "%")
print("Test Accuracy is", clf.score(X_test, y_test)*100, "%")

In [None]:
# metrics
y_pred = clf.predict(X_test)
metrics = pd.DataFrame(precision_recall_fscore_support(y_test, y_pred, average='weighted'))
metrics.index = ["precision", "recall", "fscore", "support"]
metrics

In [None]:
# build MLP classifier
clf = MLPClassifier(random_state=1, max_iter=300, hidden_layer_sizes = (100, 5))
clf.fit(X_train, y_train)

In [None]:
# Training & Test accuracy
print("Training Accuracy is", clf.score(X_train, y_train)*100, "%")
print("Test Accuracy is", clf.score(X_test, y_test)*100, "%")

# Train Models - Predict High/Low

In [None]:
# split data into training/test sets
X_train, X_test, y_train, y_test = train_test_split(pca_data, selected_data.binary, test_size=0.20, random_state=42)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Random Forests

In [None]:
# Cross Validation
my_model = RandomForestClassifier(random_state=0)

parameters = {'n_estimators':[10, 100], 'max_depth': [25, 50]}

my_grid = GridSearchCV(my_model, parameters, cv=5)
my_grid.fit(X_train, y_train)

In [None]:
# best parameters
my_grid.best_params_

In [None]:
# build model using best parameters and fit training data
my_model_optimal = RandomForestClassifier(**my_grid.best_params_, random_state=0)

my_model_optimal.fit(X_train, y_train)   
y_pred = my_model_optimal.predict(X_test)

In [None]:
# Training & Test accuracy
print("Training Accuracy is", my_model_optimal.score(X_train, y_train)*100, "%")
print("Test Accuracy is", my_model_optimal.score(X_test, y_test)*100, "%")

In [None]:
# metrics
metrics = pd.DataFrame(precision_recall_fscore_support(y_test, y_pred, average='weighted'))
metrics.index = ["precision", "recall", "fscore", "support"]
metrics

## MLP

Trying different hidden_layer_sizes

In [None]:
# build MLP classifier
clf = MLPClassifier(random_state=1, max_iter=300, hidden_layer_sizes = (10, 5))
clf.fit(X_train, y_train)

In [None]:
# Training & Test accuracy
print("Training Accuracy is", clf.score(X_train, y_train)*100, "%")
print("Test Accuracy is", clf.score(X_test, y_test)*100, "%")

In [None]:
# metrics
y_pred = clf.predict(X_test)
metrics = pd.DataFrame(precision_recall_fscore_support(y_test, y_pred, average='weighted'))
metrics.index = ["precision", "recall", "fscore", "support"]
metrics

In [None]:
# build MLP classifier
clf = MLPClassifier(random_state=1, max_iter=300, hidden_layer_sizes = (5, 5))
clf.fit(X_train, y_train)

In [None]:
# Training & Test accuracy
print("Training Accuracy is", clf.score(X_train, y_train)*100, "%")
print("Test Accuracy is", clf.score(X_test, y_test)*100, "%")

In [None]:
# metrics
y_pred = clf.predict(X_test)
metrics = pd.DataFrame(precision_recall_fscore_support(y_test, y_pred, average='weighted'))
metrics.index = ["precision", "recall", "fscore", "support"]
metrics

In [None]:
# build MLP classifier
clf = MLPClassifier(random_state=1, max_iter=300, hidden_layer_sizes = (20, 5))
clf.fit(X_train, y_train)

In [None]:
# Training & Test accuracy
print("Training Accuracy is", clf.score(X_train, y_train)*100, "%")
print("Test Accuracy is", clf.score(X_test, y_test)*100, "%")

In [None]:
# metrics
y_pred = clf.predict(X_test)
metrics = pd.DataFrame(precision_recall_fscore_support(y_test, y_pred, average='weighted'))
metrics.index = ["precision", "recall", "fscore", "support"]
metrics