### Machine Learning Project Code

In [1]:
import nltk
import pandas as pd
import numpy as np
import sklearn 
import sklearn.metrics
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from imblearn.over_sampling import SMOTE

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nylaennels/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nylaennels/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
raw_train = pd.read_csv('/Users/nylaennels/Desktop/train.csv')
dev = pd.read_csv('/Users/nylaennels/Desktop/dev.csv')

In [6]:
raw_train.head()

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,0,923,0,3.0,1,2014-12-08,The food at snack is a selection of popular Gr...
1,1,924,0,3.0,1,2013-05-16,This little place in Soho is wonderful. I had ...
2,2,925,0,4.0,1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,3,926,0,4.0,1,2011-07-28,This is a beautiful quaint little restaurant o...
4,4,927,0,4.0,1,2010-11-01,Snack is great place for a casual sit down lu...


#### Process Text

In [7]:
tokens=[]
for i in range(len(raw_train)):
    tokens.append(word_tokenize(raw_train['review'][i]))

Getting rid of stop words

In [9]:
stop_words = stopwords.words('english')
tokens_filt = tokens.copy()

In [11]:
for w in range(len(tokens_filt)): #w = review index
    count=list(range(len(tokens_filt[w])))
    for i in count: #i = word index
        if tokens_filt[w][i].lower() in stop_words:
            tokens_filt[w].remove(tokens_filt[w][i])
            del count[-1]
        else:
            pass

In [12]:
punct = ['?','.','!','(',')',',','...',';','''''','""']

Removing punctuation

In [13]:
for w in range(len(tokens_filt)): #w = review index
    count=list(range(len(tokens_filt[w])))
    for i in count: #i = word index
        if tokens_filt[w][i] in punct:
            tokens_filt[w].remove(tokens_filt[w][i])
            del count[-1]
        else:
            pass

Stemming words

In [14]:
ps = PorterStemmer()

In [15]:
for w in range(len(tokens_filt)): #w = review index
    count=list(range(len(tokens_filt[w])))
    for i in count: #i = word index
        tokens_filt[w][i] = (ps.stem(tokens_filt[w][i]))
        del count[-1]

#### Vectorize

In [16]:
#List of unlinked words (for vectorizer)
unlinked_train = []
for w in range(len(tokens_filt)): #w = review index
    count=list(range(len(tokens_filt[w])))
    for i in count: #i = word index
        unlinked_train.append(tokens_filt[w][i])
        del count[-1]

In [17]:
targets=raw_train['label']

In [18]:
raw_train['cleaned_review'] = tokens_filt

In [19]:
df=pd.DataFrame(raw_train['cleaned_review'])

In [20]:
# Covnert column from list type to str type
for i in range(len(df)): 
    df['cleaned_review'][i]=" ".join(df['cleaned_review'][i])

In [21]:
# Initialize tf-idf vectorizer
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(df['cleaned_review'])

In [22]:
test_dta = dev['review'] #use dev file as test data

In [23]:
test_vectors = vectorizer.transform(test_dta)

#### Initialize baseline model & make predictions

In [145]:
# Train a classifier 
sgd = SGDClassifier(loss='log',penalty= 'l2', max_iter= 50,
                                             alpha= 0.00001,fit_intercept= True)
# Fit the classifier 
sgd.fit(train_vectors,targets)

SGDClassifier(alpha=1e-05, loss='log', max_iter=50)

In [146]:
predics = sgd.predict(test_vectors)
predics_prob = sgd.predict_proba(test_vectors)

In [147]:
test_targets = dev['label']

#### Metrics - BEFORE UPSAMPLING

In [149]:
# Isolate the prediction probabilites of the postive class
predics_prob_ones = []
for i in range(len(predics_prob)):
    predics_prob_ones.append(predics_prob[i][1])

In [None]:
predics_prob_ones = np.array(predics_prob_ones)

In [None]:
# Accuracy, AUC-ROC, AP
acc_raw = sklearn.metrics.accuracy_score(test_targets, predics, normalize=True)
roc_raw = sklearn.metrics.roc_auc_score(test_targets,predics_prob_ones)
ap_raw = sklearn.metrics.average_precision_score(test_targets,predics_prob_ones) 

In [159]:
metrics_raw = [acc_raw,roc_raw,ap_raw]
metrics_raw = pd.DataFrame(metrics_raw, index=['accuracy','AUC-ROC','AP'], columns=['Baseline_Metrics'])
metrics_raw

Unnamed: 0,Baseline_Metrics
accuracy,0.89824
AUC-ROC,0.714675
AP,0.207655


--------------------------------------------------------------------------------------------------------------------

#### Up-sample - SMOTE

In [160]:
prep_df = raw_train.drop(columns=['ex_id','user_id','prod_id','rating','date'])

In [161]:
upsamp = SMOTE(random_state=5)
samp_train,samp_targ = upsamp.fit_resample(train_vectors,targets)

#### Predict using up-sampled data

In [162]:
#samp_train_vectors = vectorizer.transform(samp_df['cleaned_review'])
sgd.fit(samp_train,samp_targ)
samp_predics = sgd.predict(test_vectors)

In [163]:
#Using predict_proba for probability scores for metrics
y_score = sgd.predict_proba(test_vectors)

#### Metrics - AFTER SMOTE UPSAMPLING

In [166]:
# Isolate probablity predictions for the positive class
predics_prob_1 = []
for i in range(len(y_score)):
    predics_prob_1.append(y_score[i][1])

In [None]:
acc_SMOTE = sklearn.metrics.accuracy_score(test_targets, samp_predics, normalize=True)
roc_SMOTE = sklearn.metrics.roc_auc_score(test_targets,predics_prob_1)
ap_SMOTE = sklearn.metrics.average_precision_score(test_targets,predics_prob_1)

In [222]:
metrics_upsamp = [acc_SMOTE,roc_SMOTE,ap_SMOTE]
metric_up = pd.DataFrame(metrics_upsamp, columns=['Upsampled_no_tuning'], index=['accuracy','AUC-ROC','AP'])

In [223]:
pd.concat([metrics_raw,metric_up],axis=1,sort=False)

Unnamed: 0,Baseline_Metrics,Upsampled_no_tuning
accuracy,0.89824,0.662648
AUC-ROC,0.714675,0.69354
AP,0.207655,0.188034


---------------------------------------------------------------------------------------------------------------------

#### Hyperparameter Tuning - using SMOTE upsampled data

First attempt

In [109]:
max_iter_grid = [25,100,250,500,750,1000,1250,1500,2000]
alpha_grid = [0.000001,0.00005,0.0001,0.0005,0.001,0.005,0.01,0.1,0.2]

In [110]:
tun_iter = []
tun_alpha = []
tun_acc = []
tun_roc = []
tun_ap = []

for i in range(0,9):
    sgd_tun = SGDClassifier(loss='log',penalty='l2',max_iter=max_iter_grid[i], alpha=alpha_grid[i],fit_intercept=True)
    sgd_tun.fit(samp_train,samp_targ)
    predics_tun = sgd_tun.predict(test_vectors)
    proba_tun = sgd_tun.predict_proba(test_vectors)
    
    tun_iter.append(max_iter_grid[i])
    tun_alpha.append(alpha_grid[i])
    
    predics_prob_tun = []
   
    for i in range(len(proba_tun)):
        predics_prob_tun.append(proba_tun[i][1])
    
    tun_acc.append(sklearn.metrics.accuracy_score(test_targets, predics_tun, normalize=True))
    tun_roc.append(sklearn.metrics.roc_auc_score(test_targets,predics_prob_tun))
    tun_ap.append(sklearn.metrics.average_precision_score(test_targets,predics_prob_tun))

Metrics from first attempt

In [111]:
metrics_df_tun = list(zip(tun_iter,tun_alpha,tun_acc,tun_roc,tun_ap))
metrics_df_tun = pd.DataFrame(metrics_df_tun,columns=['max_iter','alpha','accuracy','AUC-ROC','AP'] )

In [113]:
metrics_df_tun

Unnamed: 0,max_iter,alpha,accuracy,AUC-ROC,AP
0,25,1e-06,0.631605,0.683375,0.17751
1,100,5e-05,0.719416,0.677819,0.178257
2,250,0.0001,0.740047,0.666983,0.172305
3,500,0.0005,0.766023,0.641739,0.160054
4,750,0.001,0.769642,0.633799,0.157008
5,1000,0.005,0.781168,0.624681,0.155411
6,1250,0.01,0.786736,0.622881,0.154769
7,1500,0.1,0.806197,0.620745,0.1539
8,2000,0.2,0.898435,0.621742,0.154708


Further tuning below....

In [104]:
max_iter=[25,100,250,500,750,1000,1250,1300,1500,2000]
alpha=[0.000001,0.00005,0.00001,0.0001, 0.0005,0.001,0.005,0.01,0.1,0.2]
#loss=['hinge','log','squared_loss']
#penalty=['l1','l2']

In [105]:
tun_iter = []
tun_alpha = []
tun_acc = []
tun_roc = []
tun_ap = []

for i in range(len(max_iter)):
    for j in range(len(alpha)):
        sgd_tun = SGDClassifier(loss='log',penalty='l2',max_iter=max_iter[i], alpha=alpha[j],fit_intercept=True)
        sgd_tun.fit(samp_train,samp_targ)
        predics_tun = sgd_tun.predict(test_vectors)
        proba_tun = sgd_tun.predict_proba(test_vectors)
    
        tun_iter.append(max_iter[i])
        tun_alpha.append(alpha[j])
    
        predics_prob_tun = []
        print(max_iter[i],alpha[j])
        for t in range(len(proba_tun)):
            predics_prob_tun.append(proba_tun[t][1])
    
        tun_acc.append(sklearn.metrics.accuracy_score(test_targets, predics_tun, normalize=True))
        tun_roc.append(sklearn.metrics.roc_auc_score(test_targets,predics_prob_tun))
        tun_ap.append(sklearn.metrics.average_precision_score(test_targets,predics_prob_tun))

25 1e-06
25 5e-05
25 1e-05
25 0.0001
25 0.0005
25 0.001
25 0.005
25 0.01
25 0.1
25 0.2
100 1e-06
100 5e-05
100 1e-05
100 0.0001
100 0.0005
100 0.001
100 0.005
100 0.01
100 0.1
100 0.2
250 1e-06
250 5e-05
250 1e-05
250 0.0001
250 0.0005
250 0.001
250 0.005
250 0.01
250 0.1
250 0.2
500 1e-06
500 5e-05
500 1e-05
500 0.0001
500 0.0005
500 0.001
500 0.005
500 0.01
500 0.1
500 0.2
750 1e-06
750 5e-05
750 1e-05
750 0.0001
750 0.0005
750 0.001
750 0.005
750 0.01
750 0.1
750 0.2
1000 1e-06
1000 5e-05
1000 1e-05
1000 0.0001
1000 0.0005
1000 0.001
1000 0.005
1000 0.01
1000 0.1
1000 0.2
1250 1e-06
1250 5e-05
1250 1e-05
1250 0.0001
1250 0.0005
1250 0.001
1250 0.005
1250 0.01
1250 0.1
1250 0.2
1300 1e-06
1300 5e-05
1300 1e-05
1300 0.0001
1300 0.0005
1300 0.001
1300 0.005
1300 0.01
1300 0.1
1300 0.2
1500 1e-06
1500 5e-05
1500 1e-05
1500 0.0001
1500 0.0005
1500 0.001
1500 0.005
1500 0.01
1500 0.1
1500 0.2
2000 1e-06
2000 5e-05
2000 1e-05
2000 0.0001
2000 0.0005
2000 0.001
2000 0.005
2000 0.01
2000 0.1

Metrics from further tuning (top results)

In [107]:
new_metric_df = list(zip(tun_iter,tun_alpha,tun_acc,tun_roc,tun_ap))
new_metric_df = pd.DataFrame(new_metric_df,columns=['max_iter','alpha','accuracy','AUC-ROC','AP'])

In [116]:
new_metric_df.sort_values(by='AUC-ROC', ascending=False).head()

Unnamed: 0,max_iter,alpha,accuracy,AUC-ROC,AP
52,1000,1e-05,0.66134,0.694041,0.188746
22,250,1e-05,0.664207,0.693745,0.188357
72,1300,1e-05,0.662119,0.693689,0.188043
12,100,1e-05,0.660672,0.69363,0.188491
62,1250,1e-05,0.660588,0.693601,0.188275


Best hyperparameters and metrics from tuning

In [196]:
sgd_tun = SGDClassifier(loss='log',penalty= 'l2', max_iter=1000 ,alpha=0.000010,fit_intercept= True)
sgd_tun.fit(samp_train,samp_targ)
new_predics = sgd_tun.predict(test_vectors)
new_proba = sgd_tun.predict_proba(test_vectors)

In [197]:
new_proba_ones = []
for t in range(len(new_proba)):
    new_proba_ones.append(new_proba[t][1])

In [193]:
best_tuning = pd.DataFrame(new_metric_df.iloc[52])
print("Best hyperparameters and metrics after tuning:")
best_tuning

Best hyperparameters and metrics after tuning:


Unnamed: 0,95
max_iter,2000.0
alpha,0.001
accuracy,0.76967
AUC-ROC,0.633853
AP,0.157045


Metrics on tuned model trained WITHOUT upsampling

In [209]:
sgd_tun.fit(train_vectors,targets)

SGDClassifier(alpha=1e-05, loss='log')

In [211]:
tun_unsamp_predics = sgd_tun.predict(test_vectors)

In [212]:
predics_prob = sgd_tun.predict_proba(test_vectors)

In [213]:
predics_probz_tun = []
   
for i in range(len(predics_prob)):
    predics_probz_tun.append(predics_prob[i][1])

In [214]:
acc_tun_unsamp = sklearn.metrics.accuracy_score(test_targets, new_predics, normalize=True)
roc_tun_unsamp = sklearn.metrics.roc_auc_score(test_targets,predics_probz_tun)
ap_tun_unsamp = sklearn.metrics.average_precision_score(test_targets,predics_probz_tun)

In [220]:
metric_tun_unsamp = [acc_tun_unsamp,roc_tun_unsamp,ap_tun_unsamp]
metric_tun_unsamp = pd.DataFrame(metric_tun_unsamp, index=['accuracy','AUC-ROC','AP'],columns=['Tuning_not_sampled'])

Combine all metrics 

In [253]:
tuning_met = [0.769670,0.633853,0.157045] #copied values from best_tuning table 
best_tuning_metric = pd.DataFrame(tuning_met,index=['accuracy','AUC-ROC','AP'],columns=['Tuning_sampled'])

In [255]:
pd.concat([metrics_raw,metric_up,metric_tun_unsamp,best_tuning_metric],axis=1,sort=False)

Unnamed: 0,Baseline_Metrics,Upsampled_no_tuning,Tuning_not_sampled,Tuning_sampled
accuracy,0.89824,0.662648,0.665405,0.76967
AUC-ROC,0.714675,0.69354,0.715131,0.633853
AP,0.207655,0.188034,0.208533,0.157045


## TO DO if time ..... (must re-run notebook):

#### Upsample - Random Over Sampler

In [None]:
from imblearn.over_sampling import RandomOverSampler, RandomUnderSampler

In [None]:
ros = RandomOverSampler(random_state=5)
train_ros, targ_ros = ros.fit_resample(train_vectors,targets)

In [None]:
sgd_tun.fit(train_ros,targ_ros) 
ros_predics = sgd_tun.predict(test_vectors) 
y_scor = sgd_tun.predict_proba(test_vectors)

In [None]:
y_scor_ones = []
for i in range(len(y_scor)):
        y_scor_ones.append(y_scor[i][1])

In [None]:
ros_tun_acc = sklearn.metrics.accuracy_score(test_targets, ros_predics, normalize=True)
ros_tun_roc = sklearn.metrics.roc_auc_score(test_targets,y_scor_ones)
ros_tun_ap = sklearn.metrics.average_precision_score(test_targets,y_scor_ones)

#### Downsample - Random Under Sampler

In [None]:
rus = RandomUnderSampler(random_state=5) 
train_rus, targ_rus = rus.fit_resample(train_vectors,targets)

In [None]:
sgd_tun.fit(train_rus,targ_rus) 
rus_predics = sgd_tun.predict(test_vectors) 
y_scor = sgd_tun.predict_proba(test_vectors) 

In [None]:
y_scor_ones = []
for i in range(len(y_scor)):
        y_scor_ones.append(y_scor[i][1])

In [None]:
rus_tun_acc = sklearn.metrics.accuracy_score(test_targets, rus_predics, normalize=True)
rus_tun_roc = sklearn.metrics.roc_auc_score(test_targets,y_scor_ones)
rus_tun_ap = sklearn.metrics.average_precision_score(test_targets,y_scor_ones)

#### Add 'dev' data to training + redo preprocessing

In [None]:
# Hold out a small portion of 'dev' to evaluate model with added data
from sklearn.model_selection import train_test_split
dev_train, dev_test = train_test_split(dev, test_size=0.2, random_state=5)

In [None]:
dev_test.head()

In [None]:
raw_train2 = pd.raw_train.drop('cleaned_review')
raw_train2.head()

In [None]:
comb_train = pd.concat([raw_train2,dev_train],axis=0, ignore_index=True)

--------------------------------------------------------------------------------

In [None]:
tokenz=[]
for i in range(len(comb_train)):
    tokenz.append(word_tokenize(comb_train['review'][i]))

In [None]:
tokenz_filt = tokenz.copy()

In [None]:
#Remove stop words
for w in range(len(tokenz_filt)): #w = review index
    count=list(range(len(tokenz_filt[w])))
    for i in count: #i = word index
        if tokenz_filt[w][i].lower() in stop_words:
            tokenz_filt[w].remove(tokenz_filt[w][i])
            del count[-1]
        else:
            pass

In [None]:
#Remove punctuation
for w in range(len(tokenz_filt)): #w = review index
    count=list(range(len(tokenz_filt[w])))
    for i in count: #i = word index
        if tokenz_filt[w][i] in punct:
            tokenz_filt[w].remove(tokenz_filt[w][i])
            del count[-1]
        else:
            pass

In [None]:
#Stemming
for w in range(len(tokenz_filt)): #w = review index
    count=list(range(len(tokenz_filt[w])))
    for i in count: #i = word index
        tokenz_filt[w][i] = (ps.stem(tokenz_filt[w][i]))
        del count[-1]

#### Re-Vectorize 

In [None]:
comb_targets=comb_train['label']

In [None]:
raw_train2['cleaned_review'] = tokenz_filt
df2=pd.DataFrame(raw_train2['cleaned_review'])

In [None]:
for i in range(len(df2)): 
    df2['cleaned_review'][i]=" ".join(df2['cleaned_review'][i])

In [None]:
comb_train_vectors = vectorizer.fit_transform(df2['cleaned_review'])
comb_test_vectors = vectorizer.transform(dev_test['review'])

#### Predictions / Metrics

In [None]:
# Fit the classifier 
sgd_tun.fit(comb_train_vectors,comb_targets) 
comb_predics = sgd_tun.predict(comb_test_vectors) 
comb_predics_prob = sgd_tun.predict_proba(comb_test_vectors) 

In [None]:
# Isolate probability scores for positive class
y_ones = []
for i in range(len(comb_predics_prob)):
        y_ones.append(comb_predics_prob[i][1])

In [None]:
test_targets = dev_test['label']

In [None]:
comb_acc = sklearn.metrics.accuracy_score(test_targets, comb_predics, normalize=True)
comb_roc = sklearn.metrics.roc_auc_score(test_targets,y_ones)
comb_ap = sklearn.metrics.average_precision_score(test_targets,y_ones)

#### Add remaining portion (20%) of 'dev' to training set

Must repeat previous steps

In [None]:
final_comb = pd.concat([comb_train,dev_test],axis=0, ignore_index=True)