### Machine Learning Project Code

In [7]:
import nltk
import pandas as pd
import numpy as np
import sklearn 
import sklearn.metrics
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from imblearn.over_sampling import SMOTE

In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nylaennels/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nylaennels/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
raw_train = pd.read_csv('/Users/nylaennels/Desktop/train.csv')
dev = pd.read_csv('/Users/nylaennels/Desktop/dev.csv')

In [5]:
raw_train.head()

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,0,923,0,3.0,1,2014-12-08,The food at snack is a selection of popular Gr...
1,1,924,0,3.0,1,2013-05-16,This little place in Soho is wonderful. I had ...
2,2,925,0,4.0,1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,3,926,0,4.0,1,2011-07-28,This is a beautiful quaint little restaurant o...
4,4,927,0,4.0,1,2010-11-01,Snack is great place for a casual sit down lu...


#### Process Text

In [37]:
tokens=[]
for i in range(len(raw_train)):
    tokens.append(word_tokenize(raw_train['review'][i]))

Getting rid of stop words

In [38]:
stop_words = stopwords.words('english')
tokens_filt = tokens.copy()

In [39]:
for w in range(len(tokens_filt)): #w = review index
    count=list(range(len(tokens_filt[w])))
    for i in count: #i = word index
        if tokens_filt[w][i].lower() in stop_words:
            tokens_filt[w].remove(tokens_filt[w][i])
            del count[-1]
        else:
            pass

In [40]:
punct = ['?','.','!','(',')',',','...',';','''''','""']

Removing punctuation

In [41]:
for w in range(len(tokens_filt)): #w = review index
    count=list(range(len(tokens_filt[w])))
    for i in count: #i = word index
        if tokens_filt[w][i] in punct:
            tokens_filt[w].remove(tokens_filt[w][i])
            del count[-1]
        else:
            pass

Stemming words

In [14]:
ps = PorterStemmer()

In [42]:
for w in range(len(tokens_filt)): #w = review index
    count=list(range(len(tokens_filt[w])))
    for i in count: #i = word index
        tokens_filt[w][i] = (ps.stem(tokens_filt[w][i]))
        del count[-1]

#### Vectorize

In [None]:
#List of unlinked words (for vectorizer)
#unlinked_train = []
#for w in range(len(tokens_filt)): #w = review index
#    count=list(range(len(tokens_filt[w])))
#    for i in count: #i = word index
#        unlinked_train.append(tokens_filt[w][i])
#        del count[-1]

In [44]:
targets=raw_train['label']

In [45]:
raw_train['cleaned_review'] = tokens_filt

In [46]:
df=pd.DataFrame(raw_train['cleaned_review'])

In [47]:
# Covnert column from list type to str type
for i in range(len(df)): 
    df['cleaned_review'][i]=" ".join(df['cleaned_review'][i])

In [51]:
# Initialize tf-idf vectorizer
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(df['cleaned_review'])

In [52]:
test_dta = dev['review'] #use dev file as test data

In [53]:
test_vectors = vectorizer.transform(test_dta)

#### Initialize baseline model & make predictions

In [54]:
# Train a classifier 
sgd = SGDClassifier(loss='log',penalty= 'l2', max_iter= 50,
                                             alpha= 0.00001,fit_intercept= True)
# Fit the classifier 
sgd.fit(train_vectors,targets)

SGDClassifier(alpha=1e-05, loss='log', max_iter=50)

In [55]:
predics = sgd.predict(test_vectors)
predics_prob = sgd.predict_proba(test_vectors)

In [56]:
test_targets = dev['label']

#### Metrics - BEFORE UPSAMPLING

In [57]:
# Isolate the prediction probabilites of the postive class
predics_prob_ones = []
for i in range(len(predics_prob)):
    predics_prob_ones.append(predics_prob[i][1])

In [58]:
predics_prob_ones = np.array(predics_prob_ones)

In [59]:
# Accuracy, AUC-ROC, AP
acc_raw = sklearn.metrics.accuracy_score(test_targets, predics, normalize=True)
roc_raw = sklearn.metrics.roc_auc_score(test_targets,predics_prob_ones)
ap_raw = sklearn.metrics.average_precision_score(test_targets,predics_prob_ones) 

In [60]:
metrics_raw = [acc_raw,roc_raw,ap_raw]
metrics_raw = pd.DataFrame(metrics_raw, index=['accuracy','AUC-ROC','AP'], columns=['Baseline_Metrics'])
metrics_raw

Unnamed: 0,Baseline_Metrics
accuracy,0.898185
AUC-ROC,0.714909
AP,0.208361


--------------------------------------------------------------------------------------------------------------------

#### Up-sample - SMOTE

In [61]:
prep_df = raw_train.drop(columns=['ex_id','user_id','prod_id','rating','date'])

In [62]:
upsamp = SMOTE(random_state=5)
samp_train,samp_targ = upsamp.fit_resample(train_vectors,targets)

#### Predict using up-sampled data

In [63]:
#samp_train_vectors = vectorizer.transform(samp_df['cleaned_review'])
sgd.fit(samp_train,samp_targ)
samp_predics = sgd.predict(test_vectors)

In [64]:
#Using predict_proba for probability scores for metrics
y_score = sgd.predict_proba(test_vectors)

#### Metrics - AFTER SMOTE UPSAMPLING

In [65]:
# Isolate probablity predictions for the positive class
predics_prob_1 = []
for i in range(len(y_score)):
    predics_prob_1.append(y_score[i][1])

In [66]:
acc_SMOTE = sklearn.metrics.accuracy_score(test_targets, samp_predics, normalize=True)
roc_SMOTE = sklearn.metrics.roc_auc_score(test_targets,predics_prob_1)
ap_SMOTE = sklearn.metrics.average_precision_score(test_targets,predics_prob_1)

In [67]:
metrics_upsamp = [acc_SMOTE,roc_SMOTE,ap_SMOTE]
metric_up = pd.DataFrame(metrics_upsamp, columns=['Upsampled_no_tuning'], index=['accuracy','AUC-ROC','AP'])

In [68]:
pd.concat([metrics_raw,metric_up],axis=1,sort=False)

Unnamed: 0,Baseline_Metrics,Upsampled_no_tuning
accuracy,0.898185,0.669386
AUC-ROC,0.714909,0.69345
AP,0.208361,0.18806


---------------------------------------------------------------------------------------------------------------------

#### Hyperparameter Tuning - using SMOTE upsampled data

First attempt

In [69]:
max_iter_grid = [25,100,250,500,750,1000,1250,1500,2000]
alpha_grid = [0.000001,0.00005,0.0001,0.0005,0.001,0.005,0.01,0.1,0.2]

In [70]:
tun_iter = []
tun_alpha = []
tun_acc = []
tun_roc = []
tun_ap = []

for i in range(0,9):
    sgd_tun = SGDClassifier(loss='log',penalty='l2',max_iter=max_iter_grid[i], alpha=alpha_grid[i],fit_intercept=True)
    sgd_tun.fit(samp_train,samp_targ)
    predics_tun = sgd_tun.predict(test_vectors)
    proba_tun = sgd_tun.predict_proba(test_vectors)
    
    tun_iter.append(max_iter_grid[i])
    tun_alpha.append(alpha_grid[i])
    
    predics_prob_tun = []
   
    for i in range(len(proba_tun)):
        predics_prob_tun.append(proba_tun[i][1])
    
    tun_acc.append(sklearn.metrics.accuracy_score(test_targets, predics_tun, normalize=True))
    tun_roc.append(sklearn.metrics.roc_auc_score(test_targets,predics_prob_tun))
    tun_ap.append(sklearn.metrics.average_precision_score(test_targets,predics_prob_tun))

Metrics from first attempt

In [71]:
metrics_df_tun = list(zip(tun_iter,tun_alpha,tun_acc,tun_roc,tun_ap))
metrics_df_tun = pd.DataFrame(metrics_df_tun,columns=['max_iter','alpha','accuracy','AUC-ROC','AP'] )

In [72]:
metrics_df_tun

Unnamed: 0,max_iter,alpha,accuracy,AUC-ROC,AP
0,25,1e-06,0.633916,0.683958,0.17847
1,100,5e-05,0.717133,0.678036,0.178543
2,250,0.0001,0.7391,0.667096,0.172355
3,500,0.0005,0.765521,0.641784,0.160083
4,750,0.001,0.769893,0.633815,0.157024
5,1000,0.005,0.78192,0.624712,0.155435
6,1250,0.01,0.797121,0.623296,0.155077
7,1500,0.1,0.856145,0.621044,0.154104
8,2000,0.2,0.825547,0.620626,0.153859


Further tuning below....

In [73]:
max_iter=[25,100,250,500,750,1000,1250,1300,1500,2000]
alpha=[0.000001,0.00005,0.00001,0.0001, 0.0005,0.001,0.005,0.01,0.1,0.2]
#loss=['hinge','log','squared_loss']
#penalty=['l1','l2']

In [74]:
tun_iter = []
tun_alpha = []
tun_acc = []
tun_roc = []
tun_ap = []

for i in range(len(max_iter)):
    for j in range(len(alpha)):
        sgd_tun = SGDClassifier(loss='log',penalty='l2',max_iter=max_iter[i], alpha=alpha[j],fit_intercept=True)
        sgd_tun.fit(samp_train,samp_targ)
        predics_tun = sgd_tun.predict(test_vectors)
        proba_tun = sgd_tun.predict_proba(test_vectors)
    
        tun_iter.append(max_iter[i])
        tun_alpha.append(alpha[j])
    
        predics_prob_tun = []
        print(max_iter[i],alpha[j])
        for t in range(len(proba_tun)):
            predics_prob_tun.append(proba_tun[t][1])
    
        tun_acc.append(sklearn.metrics.accuracy_score(test_targets, predics_tun, normalize=True))
        tun_roc.append(sklearn.metrics.roc_auc_score(test_targets,predics_prob_tun))
        tun_ap.append(sklearn.metrics.average_precision_score(test_targets,predics_prob_tun))

25 1e-06
25 5e-05
25 1e-05
25 0.0001
25 0.0005
25 0.001
25 0.005
25 0.01
25 0.1
25 0.2
100 1e-06
100 5e-05
100 1e-05
100 0.0001
100 0.0005
100 0.001
100 0.005
100 0.01
100 0.1
100 0.2
250 1e-06
250 5e-05
250 1e-05
250 0.0001
250 0.0005
250 0.001
250 0.005
250 0.01
250 0.1
250 0.2
500 1e-06
500 5e-05
500 1e-05
500 0.0001
500 0.0005
500 0.001
500 0.005
500 0.01
500 0.1
500 0.2
750 1e-06
750 5e-05
750 1e-05
750 0.0001
750 0.0005
750 0.001
750 0.005
750 0.01
750 0.1
750 0.2
1000 1e-06
1000 5e-05
1000 1e-05
1000 0.0001
1000 0.0005
1000 0.001
1000 0.005
1000 0.01
1000 0.1
1000 0.2
1250 1e-06
1250 5e-05
1250 1e-05
1250 0.0001
1250 0.0005
1250 0.001
1250 0.005
1250 0.01
1250 0.1
1250 0.2
1300 1e-06
1300 5e-05
1300 1e-05
1300 0.0001
1300 0.0005
1300 0.001
1300 0.005
1300 0.01
1300 0.1
1300 0.2
1500 1e-06
1500 5e-05
1500 1e-05
1500 0.0001
1500 0.0005
1500 0.001
1500 0.005
1500 0.01
1500 0.1
1500 0.2
2000 1e-06
2000 5e-05
2000 1e-05
2000 0.0001
2000 0.0005
2000 0.001
2000 0.005
2000 0.01
2000 0.1

Metrics from further tuning (top results)

In [75]:
new_metric_df = list(zip(tun_iter,tun_alpha,tun_acc,tun_roc,tun_ap))
new_metric_df = pd.DataFrame(new_metric_df,columns=['max_iter','alpha','accuracy','AUC-ROC','AP'])

In [76]:
new_metric_df.sort_values(by='AUC-ROC', ascending=False).head()

Unnamed: 0,max_iter,alpha,accuracy,AUC-ROC,AP
62,1250,1e-05,0.662982,0.693747,0.188325
72,1300,1e-05,0.661869,0.693692,0.187992
2,25,1e-05,0.663539,0.693656,0.188214
92,2000,1e-05,0.664681,0.693654,0.188103
12,100,1e-05,0.657999,0.693593,0.188132


Best hyperparameters and metrics from tuning

In [77]:
sgd_tun = SGDClassifier(loss='log',penalty= 'l2', max_iter=1000 ,alpha=0.000010,fit_intercept= True)
sgd_tun.fit(samp_train,samp_targ)
new_predics = sgd_tun.predict(test_vectors)
new_proba = sgd_tun.predict_proba(test_vectors)

In [78]:
new_proba_ones = []
for t in range(len(new_proba)):
    new_proba_ones.append(new_proba[t][1])

In [79]:
best_tuning = pd.DataFrame(new_metric_df.iloc[0])
print("Best hyperparameters and metrics after tuning:")
best_tuning

Best hyperparameters and metrics after tuning:


Unnamed: 0,0
max_iter,25.0
alpha,1e-06
accuracy,0.641545
AUC-ROC,0.683979
AP,0.177926


Metrics on tuned model trained WITHOUT upsampling

In [80]:
sgd_tun.fit(train_vectors,targets)

SGDClassifier(alpha=1e-05, loss='log')

In [81]:
tun_unsamp_predics = sgd_tun.predict(test_vectors)

In [82]:
predics_prob = sgd_tun.predict_proba(test_vectors)

In [83]:
predics_probz_tun = []
   
for i in range(len(predics_prob)):
    predics_probz_tun.append(predics_prob[i][1])

In [84]:
acc_tun_unsamp = sklearn.metrics.accuracy_score(test_targets, new_predics, normalize=True)
roc_tun_unsamp = sklearn.metrics.roc_auc_score(test_targets,predics_probz_tun)
ap_tun_unsamp = sklearn.metrics.average_precision_score(test_targets,predics_probz_tun)

In [85]:
metric_tun_unsamp = [acc_tun_unsamp,roc_tun_unsamp,ap_tun_unsamp]
metric_tun_unsamp = pd.DataFrame(metric_tun_unsamp, index=['accuracy','AUC-ROC','AP'],columns=['Tuning_not_sampled'])

Combine all metrics 

In [86]:
tuning_met = [0.769670,0.633853,0.157045] #copied values from best_tuning table 
best_tuning_metric = pd.DataFrame(tuning_met,index=['accuracy','AUC-ROC','AP'],columns=['Tuning_sampled'])

In [87]:
pd.concat([metrics_raw,metric_up,metric_tun_unsamp,best_tuning_metric],axis=1,sort=False)

Unnamed: 0,Baseline_Metrics,Upsampled_no_tuning,Tuning_not_sampled,Tuning_sampled
accuracy,0.898185,0.669386,0.654714,0.76967
AUC-ROC,0.714909,0.69345,0.714954,0.633853
AP,0.208361,0.18806,0.208065,0.157045


### First Gradescope Submission

#### EVALUATE BEST MODEL ON TEST SET

Import test set

In [88]:
test_set = pd.read_csv('/Users/nylaennels/Desktop/test_no_label.csv')

In [89]:
test_set.head()

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,6,929,0,4.0,,2009-08-25,Let me start with a shout-out to everyone who ...
1,9,932,0,5.0,,2014-05-09,Stopped in for lunch today and couldn't believ...
2,14,937,0,4.0,,2014-10-15,"Tiny little place, but very good food. Pastits..."
3,22,945,0,5.0,,2014-04-10,Food was delicious and service was great. Good...
4,23,946,0,5.0,,2014-03-29,Awesome hole in the wall place to grab a quick...


In [18]:
test_reviews = test_set['review']
test_reviews.head()

0    Let me start with a shout-out to everyone who ...
1    Stopped in for lunch today and couldn't believ...
2    Tiny little place, but very good food. Pastits...
3    Food was delicious and service was great. Good...
4    Awesome hole in the wall place to grab a quick...
Name: review, dtype: object

Vectorize test set + predict

In [90]:
final_test_vectors = vectorizer.transform(test_reviews)

In [91]:
sgd_tun.fit(train_vectors,targets) #best model according to ROC-AUC and AP

SGDClassifier(alpha=1e-05, loss='log')

In [92]:
final_predictions = sgd_tun.predict(final_test_vectors)
final_proba = sgd_tun.predict_proba(final_test_vectors)

In [93]:
final_yscore = []
for t in range(len(final_proba)):
    final_yscore.append(final_proba[t][1])

In [94]:
final_yscore_df = pd.DataFrame(final_yscore,columns=['probability_score'])

In [None]:
#final_yscore_df.head()

Export to CSV

In [None]:
#final_yscore_df.to_csv('/Users/nylaennels/Desktop/predictions.csv', index = False)

--------------------------------------------------------------------------------------------------------------------

### Second Gradescope Submission

#### Add 'dev' data to training data $\rightarrow$ re-train and predict

In [95]:
raw_train2 = pd.DataFrame(raw_train['review'])
dev_int = pd.DataFrame(dev['review'])
final_train = pd.concat([raw_train2,dev_int],axis=0, ignore_index=True) #stacks dfs in order

In [98]:
#final_train.head()

In [96]:
raw_train2_targets = raw_train['label']
dev_int_targets = dev['label']
final_train_targets = pd.concat([raw_train2_targets,dev_int_targets],axis=0,ignore_index=True)

In [None]:
#len(final_train_targets)

In [None]:
#len(final_train)

Re-do text preprocessing 

In [23]:
tokenz=[]
for i in range(len(final_train)):
    tokenz.append(word_tokenize(final_train['review'][i]))

In [24]:
tokenz_filt = tokenz.copy()

In [25]:
#Remove stop words
for w in range(len(tokenz_filt)): #w = review index
    count=list(range(len(tokenz_filt[w])))
    for i in count: #i = word index
        if tokenz_filt[w][i].lower() in stop_words:
            tokenz_filt[w].remove(tokenz_filt[w][i])
            del count[-1]
        else:
            pass

In [26]:
#Remove punctuation
for w in range(len(tokenz_filt)): #w = review index
    count=list(range(len(tokenz_filt[w])))
    for i in count: #i = word index
        if tokenz_filt[w][i] in punct:
            tokenz_filt[w].remove(tokenz_filt[w][i])
            del count[-1]
        else:
            pass

In [27]:
#Stemming
for w in range(len(tokenz_filt)): #w = review index
    count=list(range(len(tokenz_filt[w])))
    for i in count: #i = word index
        tokenz_filt[w][i] = (ps.stem(tokenz_filt[w][i]))
        del count[-1]

Re-vectorize new training data and corresponding targets

In [28]:
final_train['cleaned_review'] = tokenz_filt
df2=pd.DataFrame(final_train['cleaned_review'])

In [29]:
for i in range(len(df2)): 
    df2['cleaned_review'][i]=" ".join(df2['cleaned_review'][i])

In [31]:
comb_train_vectors = vectorizer.fit_transform(df2['cleaned_review']) #run above 'transform' test vectors

In [None]:
#VIEW
#results = pd.DataFrame(comb_train_vectors.toarray(), columns=vectorizer.get_feature_names())
#results

Final predictions

In [33]:
sgd_tun.fit(comb_train_vectors,final_train_targets) 
comb_predics = sgd_tun.predict(final_test_vectors) 
comb_predics_proba = sgd_tun.predict_proba(final_test_vectors) 

In [34]:
# Isolate probability scores for positive class
final_y_ones = []
for i in range(len(comb_predics_proba)):
        final_y_ones.append(comb_predics_proba[i][1])

Export final predictions to CSV

In [35]:
final_yscore_ = pd.DataFrame(final_y_ones,columns=['probability_score'])
final_yscore_.head()

Unnamed: 0,probability_score
0,0.048973
1,0.140689
2,0.194288
3,0.081263
4,0.077449


In [36]:
#final_yscore_.to_csv('/Users/nylaennels/Desktop/predictions.csv', index = False)

---------------------------------------------------------------------------------------------------------------------