In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import random
import string
import operator
from sklearn import preprocessing
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer
pd.options.mode.chained_assignment = None  # default='warn'

# Binary Bag-of-Words
## Yelp

In [2]:
########### Question 1 ###########
# Convert both datasets.

## Binary bag-of-words
# Yelp: each file is parsed as tab-separated columns without a header row.
# Column 0 is used as the review text and column 1 as the label below.
data1 = pd.read_csv("hwk3_datasets/yelp-train.txt", sep="\t", header=None)
data2 = pd.read_csv("hwk3_datasets/yelp-valid.txt", sep="\t",  header=None)
data3 = pd.read_csv("hwk3_datasets/yelp-test.txt", sep="\t",  header=None)

# Normalise the text column: drop every ASCII punctuation character, then
# lowercase.  The whole column is reassigned in one step instead of writing
# through chained indexing (data[0][i] = ...), which pandas does not guarantee
# to write back into the frame, and instead of looping over Series.count()
# rows, which leaves out missing values.
exclude = set(string.punctuation)
strip_punctuation = str.maketrans('', '', string.punctuation)
for frame in (data1, data2, data3):
    frame[0] = frame[0].str.translate(strip_punctuation).str.lower()

yelp_train = data1.values
yelp_valid = data2.values
yelp_test = data3.values

# Count how often each whitespace-separated token occurs in the training text.
pre_list_train = {}
for review in yelp_train[:, 0]:
    for word in review.split():
        pre_list_train[word] = pre_list_train.get(word, 0) + 1

# (word, count) pairs, most frequent first.  sorted() is stable, so words with
# equal counts keep the order in which they were first seen.
sorted_list_train = sorted(pre_list_train.items(), key=operator.itemgetter(1), reverse=True)

# Vocabulary: the 10000 most frequent training words.  Slicing (rather than
# indexing positions 0..9999) avoids an IndexError when fewer distinct words exist.
top_list_train = [word for word, count in sorted_list_train[:10000]]
    
# # scikit binary bag-of-words vectorizer
# vectorizer = CountVectorizer(analyzer = 'word', tokenizer = None, preprocessor = None, stop_words = None, max_features = 10000)
# vectors_train = vectorizer.fit_transform(yelp_train[:,0])
# vectors_valid = vectorizer.transform(yelp_valid[:,0])
# vectors_test = vectorizer.transform(yelp_test[:,0])

In [3]:
# Create the binary bag-of-words representation for train, validation, test.
# Row i describes review i; column j is 1 when top_list_train[j] occurs in it.
vectors_train = np.zeros(shape=(yelp_train.shape[0],10000),dtype=int)
vectors_valid = np.zeros(shape=(yelp_valid.shape[0],10000),dtype=int)
vectors_test = np.zeros(shape=(yelp_test.shape[0],10000),dtype=int)

# Word -> column lookup, so each review costs one dict lookup per token
# instead of 10000 list-membership scans.
word_to_column = {word: column for column, word in enumerate(top_list_train[:10000])}

for vectors, reviews in ((vectors_train, yelp_train),
                         (vectors_valid, yelp_valid),
                         (vectors_test, yelp_test)):
    for i in range(reviews.shape[0]):
        # split() with no argument tokenises the same way the vocabulary was
        # counted; split(' ') would produce empty tokens and keep other
        # whitespace glued to words, so those words could never match.
        for word in set(reviews[i][0].split()):
            column = word_to_column.get(word)
            if column is not None:
                vectors[i, column] = 1

In [4]:
# Target labels (y): column 1 of every split, converted to strings.
c_train, c_valid, c_test = (
    split[:, 1].astype(str) for split in (yelp_train, yelp_valid, yelp_test)
)

In [5]:
# save dataset

# Word -> 1-based vocabulary id (rank by training frequency).  A dict lookup
# replaces the repeated list.index() scans over the vocabulary.
word_to_id = {word: rank for rank, word in enumerate(top_list_train, start=1)}

# vocab: one "word<TAB>id<TAB>training count" line per vocabulary word.
# Slicing avoids an IndexError when fewer than 10000 distinct words exist.
with open("hwk3_datasets/converted_data/yelp-vocab.txt","w",encoding='utf-8') as f:
    for rank, (word, count) in enumerate(sorted_list_train[:10000], start=1):
        f.write(word+"\t"+str(rank)+"\t"+str(count)+"\n")

# datasets: one "token token ... token<TAB>label" line per review, where a
# token is the vocabulary id of a known word or the word itself otherwise.
converted_splits = (
    ("hwk3_datasets/converted_data/yelp-train.txt", yelp_train, c_train),      # train
    ("hwk3_datasets/converted_data/yelp-valid.txt", yelp_valid, c_valid),      # validation
    ("hwk3_datasets/converted_data/yelp-test.txt", yelp_test, c_test),         # test
)
for path, reviews, labels in converted_splits:
    # `with` closes the file even if a write fails part-way through.
    with open(path,"w",encoding='utf-8') as f:
        for row, label in zip(reviews, labels):
            # split() tokenises the same way the vocabulary was counted.
            tokens = [str(word_to_id.get(word, word)) for word in row[0].split()]
            # Joining emits every token exactly once, separated by single
            # spaces and with no trailing space.  The previous valid/test
            # loops wrote any token equal to the last one twice with no
            # separator (ids 5, 3, 5 became "55 3 55"), and the train loop
            # left a trailing space, so the three files were inconsistent.
            f.write(" ".join(tokens) + "\t" + label + "\n")

In [6]:
########### Question 2 ###########
# Random classifier: predict an integer drawn uniformly from 1..5 per review
# (randint's upper bound, 6, is exclusive).
# A dedicated seeded generator makes the reported baseline repeatable on every
# run without altering NumPy's global random state; one vectorised draw per
# split replaces the per-element Python loops.
random_rng = np.random.RandomState(0)
random_results_train = random_rng.randint(1, 6, size=c_train.shape[0]).astype(str)
random_results_valid = random_rng.randint(1, 6, size=c_valid.shape[0]).astype(str)
random_results_test = random_rng.randint(1, 6, size=c_test.shape[0]).astype(str)

f1_random_train = metrics.f1_score(c_train, random_results_train, average='macro')
f1_random_valid = metrics.f1_score(c_valid, random_results_valid, average='macro')
f1_random_test = metrics.f1_score(c_test, random_results_test, average='macro')
print("F1 score for random classifier on training dataset: ", f1_random_train)
print("F1 score for random classifier on validation dataset: ", f1_random_valid)
print("F1 score for random classifier on test dataset: ", f1_random_test)

F1 score for random classifier on training dataset:  0.181391642388
F1 score for random classifier on validation dataset:  0.196443621976
F1 score for random classifier on test dataset:  0.188403069368


In [7]:
# Majority classifier: every review receives the most frequent training label.
majority_list = {}
for label in yelp_train[:, 1]:
    majority_list[label] = majority_list.get(label, 0) + 1

# max() over the keys keeps the first label that reaches the highest count.
majority = max(majority_list, key=majority_list.get)

# One constant prediction per review, as strings (the c_* label arrays are
# built with astype(str) as well).
majority_results_train, majority_results_valid, majority_results_test = (
    np.squeeze(np.full((split.shape[0], 1), majority)).astype(str)
    for split in (yelp_train, yelp_valid, yelp_test)
)

# NOTE(review): this baseline is scored with average='micro' while the other
# classifier cells use average='macro' -- confirm that is intentional.
f1_majority_train, f1_majority_valid, f1_majority_test = (
    metrics.f1_score(truth, predicted, average='micro')
    for truth, predicted in ((c_train, majority_results_train),
                             (c_valid, majority_results_valid),
                             (c_test, majority_results_test))
)
print("Majority class: ", majority)
print("F1 score for majority classifier on training dataset: ", f1_majority_train)
print("F1 score for majority classifier on validation dataset: ", f1_majority_valid)
print("F1 score for majority classifier on test dataset: ", f1_majority_test)

Majority class:  4
F1 score for majority classifier on training dataset:  0.352571428571
F1 score for majority classifier on validation dataset:  0.356
F1 score for majority classifier on test dataset:  0.351


In [8]:
# Naive Bayes multiclass classifier (Bernoulli model on the binary features)
from sklearn.naive_bayes import BernoulliNB

num = np.arange(0.001,0.01,0.001)  # candidate alpha (smoothing) values
y_pred_list = []
f1_list = []  # validation macro-F1 per candidate, in the same order as num
for i in range(len(num)):
    # binarize=None: the features are used as given, without thresholding
    nb_clf = BernoulliNB(alpha=num[i],binarize=None)
    nb_clf.fit(vectors_train,c_train)
    test = nb_clf.predict(vectors_train)
    y_pred_list.append(nb_clf.predict(vectors_valid))
    # score once and reuse the value for both the log line and the selection
    f1_list.append(metrics.f1_score(c_valid, y_pred_list[i], average='macro'))
    print("alpha: ", num[i], " F1_train: ", metrics.f1_score(c_train,test,average='macro'), " F1_valid: ", f1_list[i])

# choosing optimal hyperparameter based on the validation F1 metric
# (the first candidate wins when several share the best score)
alpha_opt = num[f1_list.index(max(f1_list))]
print("Optimal hyperparameter for Naive Bayes: ", alpha_opt)

# f1 metric with best hyperparameter; the model is configured exactly like the
# ones compared above (binarize=None) so the refit matches the selected model
nb_clf = BernoulliNB(alpha=alpha_opt,binarize=None)
nb_clf.fit(vectors_train,c_train)
y_pred_train = (nb_clf.predict(vectors_train))
y_pred_valid = (nb_clf.predict(vectors_valid))
y_pred_test = (nb_clf.predict(vectors_test))

f1_nb_train = metrics.f1_score(c_train, y_pred_train, average='macro')
f1_nb_valid = metrics.f1_score(c_valid,y_pred_valid, average='macro')
f1_nb_test = metrics.f1_score(c_test, y_pred_test,average='macro')

print("F1 score for Naive Bayes w/ optimal hyperparameter on training dataset: ", f1_nb_train)
print("F1 score for Naive Bayes w/ optimal hyperparameter on validation dataset: ", f1_nb_valid)
print("F1 score for Naive Bayes w/ optimal hyperparameter on test dataset: ", f1_nb_test)

alpha:  0.001  F1_train:  0.784823208367  F1_valid:  0.357769954585
alpha:  0.002  F1_train:  0.781606400127  F1_valid:  0.360143650098
alpha:  0.003  F1_train:  0.779424995491  F1_valid:  0.363287823578
alpha:  0.004  F1_train:  0.77804363117  F1_valid:  0.371852273074
alpha:  0.005  F1_train:  0.777003654568  F1_valid:  0.375269310287
alpha:  0.006  F1_train:  0.775676135214  F1_valid:  0.376461283229
alpha:  0.007  F1_train:  0.773467945461  F1_valid:  0.37804555183
alpha:  0.008  F1_train:  0.77231374774  F1_valid:  0.378195567626
alpha:  0.009  F1_train:  0.771349809341  F1_valid:  0.376928270044
Optimal hyperparameter for Naive Bayes:  0.008
F1 score for Naive Bayes w/ optimal hyperparameter on training dataset:  0.77231374774
F1 score for Naive Bayes w/ optimal hyperparameter on validation dataset:  0.378195567626
F1 score for Naive Bayes w/ optimal hyperparameter on test dataset:  0.357689749645


In [9]:
# decision trees classifier
from sklearn import tree

depth = np.arange(10,50,2)  # candidate max_depth values: 10, 12, ..., 48
y_pred_list = []
f1_list = []  # validation macro-F1 per candidate, in the same order as depth
for i in range(len(depth)):
    # A fixed random_state makes each fit repeatable, so the tree refit below
    # with the selected depth is the same tree that was scored in this loop.
    dt_clf = tree.DecisionTreeClassifier(max_depth=depth[i], random_state=0)
    dt_clf.fit(vectors_train,c_train)
    y_pred_list.append(dt_clf.predict(vectors_valid))
    test = dt_clf.predict(vectors_train)
    # score once and reuse the value for both the log line and the selection
    f1_list.append(metrics.f1_score(c_valid, y_pred_list[i], average='macro'))
    print("Depth: ", depth[i], " F1_train: ", metrics.f1_score(c_train,test,average='macro'), " F1_valid: ", f1_list[i])

# choosing maximum tree depth using validation set and F1 metric
# (the first candidate wins when several share the best score)
depth_opt = depth[f1_list.index(max(f1_list))]
print("Optimal depth for Decision Tree: ", depth_opt)

# f1 metric for train, validation, test with optimal hyperparameter (max tree depth)
dt_clf = tree.DecisionTreeClassifier(max_depth=depth_opt, random_state=0)
dt_clf.fit(vectors_train,c_train)
y_pred_train = dt_clf.predict(vectors_train)
y_pred_valid = dt_clf.predict(vectors_valid)
y_pred_test = dt_clf.predict(vectors_test)

f1_dt_train = metrics.f1_score(c_train,y_pred_train,average='macro')
f1_dt_valid = metrics.f1_score(c_valid,y_pred_valid,average='macro')
f1_dt_test = metrics.f1_score(c_test,y_pred_test,average='macro')


print("F1 score for Decision Tree w/ optimal depth on training dataset: ", f1_dt_train)
print("F1 score for Decision Tree w/ optimal depth on validation dataset: ", f1_dt_valid)
print("F1 score for Decision Tree w/ optimal depth on testing dataset: ", f1_dt_test)

Depth:  10  F1_train:  0.423428452981  F1_valid:  0.283798756908
Depth:  12  F1_train:  0.509317886134  F1_valid:  0.261656466917
Depth:  14  F1_train:  0.582342605266  F1_valid:  0.280027495475
Depth:  16  F1_train:  0.648080558544  F1_valid:  0.263341495708
Depth:  18  F1_train:  0.700370951386  F1_valid:  0.266445699533
Depth:  20  F1_train:  0.74569138898  F1_valid:  0.281250035928
Depth:  22  F1_train:  0.799531736402  F1_valid:  0.26903527573
Depth:  24  F1_train:  0.834195827204  F1_valid:  0.275214488039
Depth:  26  F1_train:  0.867603951649  F1_valid:  0.274103368565
Depth:  28  F1_train:  0.891560352585  F1_valid:  0.277906201393
Depth:  30  F1_train:  0.908688951382  F1_valid:  0.276708176318
Depth:  32  F1_train:  0.929040223038  F1_valid:  0.29498837925
Depth:  34  F1_train:  0.938913214545  F1_valid:  0.283703989275
Depth:  36  F1_train:  0.947706229736  F1_valid:  0.258619824271
Depth:  38  F1_train:  0.953332483887  F1_valid:  0.274077584421
Depth:  40  F1_train:  0.965

In [10]:
# Linear SVM
from sklearn.svm import LinearSVC

c = np.arange(2**-11,0.1,2**-9)  # candidate values for the regularisation parameter C
#c = [2**-7, 2**-5, 2**-3, 2**-1, 2**1, 2**3, 2**5, 2**7]
y_pred_list = []
f1_list = []  # validation macro-F1 per candidate, in the same order as c
for i in range(len(c)):
    # A fixed random_state makes each fit repeatable, so the model refit below
    # with the selected C matches the one that was scored in this loop.
    svm_clf = LinearSVC(C=c[i], multi_class="ovr", loss='hinge', random_state=0)
    svm_clf.fit(vectors_train,c_train)
    y_pred_list.append(svm_clf.predict(vectors_valid))
    test = svm_clf.predict(vectors_train)
    # score once and reuse the value for both the log line and the selection
    f1_list.append(metrics.f1_score(c_valid, y_pred_list[i], average='macro'))
    print("C: ", c[i], " F1_train: ", metrics.f1_score(c_train,test,average='macro'), " F1_valid: ", f1_list[i])

# choose C by validation F1 (the first candidate wins when several tie)
c_opt = c[f1_list.index(max(f1_list))]
print("Optimal C for Linear SVM: ", c_opt)


# f1 metric for train, validation, test dataset with optimal c
svm_clf=LinearSVC(C=c_opt, multi_class='ovr', loss='hinge', random_state=0)
svm_clf.fit(vectors_train,c_train)

y_pred_train = svm_clf.predict(vectors_train)
y_pred_valid = svm_clf.predict(vectors_valid)
y_pred_test = svm_clf.predict(vectors_test)

f1_svm_train = metrics.f1_score(c_train,y_pred_train,average='macro')
f1_svm_valid = metrics.f1_score(c_valid,y_pred_valid,average='macro')
f1_svm_test = metrics.f1_score(c_test,y_pred_test,average='macro')

# The tuned hyperparameter here is C; the labels previously said "depth",
# carried over from the decision-tree cell.
print("F1 score for Linear SVM w/ optimal C on training dataset: ", f1_svm_train)
print("F1 score for Linear SVM w/ optimal C on validation dataset: ", f1_svm_valid)
print("F1 score for Linear SVM w/ optimal C on testing dataset: ", f1_svm_test)

C:  0.00048828125  F1_train:  0.472105621056  F1_valid:  0.328335366592
C:  0.00244140625  F1_train:  0.644040802553  F1_valid:  0.393762700845
C:  0.00439453125  F1_train:  0.684066141538  F1_valid:  0.412300878437
C:  0.00634765625  F1_train:  0.712892154039  F1_valid:  0.415817051422
C:  0.00830078125  F1_train:  0.741748944064  F1_valid:  0.414952621073
C:  0.01025390625  F1_train:  0.760105065177  F1_valid:  0.419908391179
C:  0.01220703125  F1_train:  0.772837733948  F1_valid:  0.422554585453
C:  0.01416015625  F1_train:  0.782839697436  F1_valid:  0.439759463415
C:  0.01611328125  F1_train:  0.794502051519  F1_valid:  0.438139652496
C:  0.01806640625  F1_train:  0.801801531213  F1_valid:  0.437437104771
C:  0.02001953125  F1_train:  0.808414022847  F1_valid:  0.442961522999
C:  0.02197265625  F1_train:  0.816187927236  F1_valid:  0.444363499903
C:  0.02392578125  F1_train:  0.82327808456  F1_valid:  0.453392613077
C:  0.02587890625  F1_train:  0.831068747024  F1_valid:  0.449565

# Binary Bag-of-Words
## IMDB

In [11]:
# binary bag-of-words
# IMDB: each file is parsed as tab-separated columns without a header row.
# Column 0 is used as the review text and column 1 as the label below.

data4 = pd.read_csv("hwk3_datasets/IMDB-train.txt", sep="\t", header=None)
data5 = pd.read_csv("hwk3_datasets/IMDB-valid.txt", sep="\t",  header=None)
data6 = pd.read_csv("hwk3_datasets/IMDB-test.txt", sep="\t",  header=None)

# Normalise the text column: drop every ASCII punctuation character, then
# lowercase.  The whole column is reassigned in one step instead of writing
# through chained indexing (data[0][i] = ...), which pandas does not guarantee
# to write back into the frame, and instead of looping over Series.count()
# rows, which leaves out missing values.
exclude = set(string.punctuation)
strip_punctuation = str.maketrans('', '', string.punctuation)
for frame in (data4, data5, data6):
    frame[0] = frame[0].str.translate(strip_punctuation).str.lower()

imdb_train = data4.values
imdb_valid = data5.values
imdb_test = data6.values

# Count how often each whitespace-separated token occurs in the training text.
pre_list_train = {}
for review in imdb_train[:, 0]:
    for word in review.split():
        pre_list_train[word] = pre_list_train.get(word, 0) + 1

# (word, count) pairs, most frequent first.  sorted() is stable, so words with
# equal counts keep the order in which they were first seen.
sorted_list_train = sorted(pre_list_train.items(), key=operator.itemgetter(1), reverse=True)

# Vocabulary: the 10000 most frequent training words.  Slicing (rather than
# indexing positions 0..9999) avoids an IndexError when fewer distinct words exist.
top_list_train = [word for word, count in sorted_list_train[:10000]]
    
# # scikit binary bag-of-words vectorizer
# vectorizer = CountVectorizer(analyzer = 'word', tokenizer = None, preprocessor = None, stop_words = None, max_features = 10000)
# vectors_train = vectorizer.fit_transform(imdb_train[:,0])
# vectors_valid = vectorizer.transform(imdb_valid[:,0])
# vectors_test = vectorizer.transform(imdb_test[:,0])

In [12]:
# Create the binary bag-of-words representation for train, validation, test.
# Row i describes review i; column j is 1 when top_list_train[j] occurs in it.
vectors_train = np.zeros(shape=(imdb_train.shape[0],10000),dtype=int)
vectors_valid = np.zeros(shape=(imdb_valid.shape[0],10000),dtype=int)
vectors_test = np.zeros(shape=(imdb_test.shape[0],10000),dtype=int)

# Word -> column lookup, so each review costs one dict lookup per token
# instead of 10000 list-membership scans.
word_to_column = {word: column for column, word in enumerate(top_list_train[:10000])}

for vectors, reviews in ((vectors_train, imdb_train),
                         (vectors_valid, imdb_valid),
                         (vectors_test, imdb_test)):
    for i in range(reviews.shape[0]):
        # split() with no argument tokenises the same way the vocabulary was
        # counted; split(' ') would produce empty tokens and keep other
        # whitespace glued to words, so those words could never match.
        for word in set(reviews[i][0].split()):
            column = word_to_column.get(word)
            if column is not None:
                vectors[i, column] = 1

In [13]:
# Target labels (y): column 1 of every split, converted to strings.
c_train, c_valid, c_test = (
    split[:, 1].astype(str) for split in (imdb_train, imdb_valid, imdb_test)
)

In [14]:
# save dataset

# Word -> 1-based vocabulary id (rank by training frequency).  A dict lookup
# replaces the repeated list.index() scans over the vocabulary.
word_to_id = {word: rank for rank, word in enumerate(top_list_train, start=1)}

# vocab: one "word<TAB>id<TAB>training count" line per vocabulary word.
# Slicing avoids an IndexError when fewer than 10000 distinct words exist.
with open("hwk3_datasets/converted_data/IMDB-vocab.txt","w",encoding='utf-8') as f:
    for rank, (word, count) in enumerate(sorted_list_train[:10000], start=1):
        f.write(word+"\t"+str(rank)+"\t"+str(count)+"\n")

# datasets: one "token token ... token<TAB>label" line per review, where a
# token is the vocabulary id of a known word or the word itself otherwise.
converted_splits = (
    ("hwk3_datasets/converted_data/IMDB-train.txt", imdb_train, c_train),      # train
    ("hwk3_datasets/converted_data/IMDB-valid.txt", imdb_valid, c_valid),      # validation
    ("hwk3_datasets/converted_data/IMDB-test.txt", imdb_test, c_test),         # test
)
for path, reviews, labels in converted_splits:
    # `with` closes the file even if a write fails part-way through.
    with open(path,"w",encoding='utf-8') as f:
        for row, label in zip(reviews, labels):
            # split() tokenises the same way the vocabulary was counted.
            tokens = [str(word_to_id.get(word, word)) for word in row[0].split()]
            # Joining emits every token exactly once, separated by single
            # spaces and with no trailing space.  The previous valid/test
            # loops wrote any token equal to the last one twice with no
            # separator (ids 5, 3, 5 became "55 3 55"), and the train loop
            # left a trailing space, so the three files were inconsistent.
            f.write(" ".join(tokens) + "\t" + label + "\n")

In [15]:
# Random classifier: predict 0 or 1 uniformly at random per review
# (randint's upper bound, 2, is exclusive).
# A dedicated seeded generator makes the reported baseline repeatable on every
# run without altering NumPy's global random state; one vectorised draw per
# split replaces the per-element Python loops.
random_rng = np.random.RandomState(0)
random_results_train = random_rng.randint(0, 2, size=c_train.shape[0]).astype(str)
random_results_valid = random_rng.randint(0, 2, size=c_valid.shape[0]).astype(str)
random_results_test = random_rng.randint(0, 2, size=c_test.shape[0]).astype(str)

f1_random_train = metrics.f1_score(c_train, random_results_train, average='macro')
f1_random_valid = metrics.f1_score(c_valid, random_results_valid, average='macro')
f1_random_test = metrics.f1_score(c_test, random_results_test, average='macro')
print("F1 score for random classifier on training dataset: ", f1_random_train)
print("F1 score for random classifier on validation dataset: ", f1_random_valid)
print("F1 score for random classifier on test dataset: ", f1_random_test)

F1 score for random classifier on training dataset:  0.500799991125
F1 score for random classifier on validation dataset:  0.509285155876
F1 score for random classifier on test dataset:  0.497759971071


In [16]:
# Naive Bayes classifier (Bernoulli model on the binary features)
from sklearn.naive_bayes import BernoulliNB

num = np.arange(0.001,0.01,0.001)  # candidate alpha (smoothing) values
y_pred_list = []
f1_list = []  # validation macro-F1 per candidate, in the same order as num
for i in range(len(num)):
    # binarize=None: the features are used as given, without thresholding
    nb_clf = BernoulliNB(alpha=num[i],binarize=None)
    nb_clf.fit(vectors_train,c_train)
    test = nb_clf.predict(vectors_train)
    y_pred_list.append(nb_clf.predict(vectors_valid))
    # score once and reuse the value for both the log line and the selection
    f1_list.append(metrics.f1_score(c_valid, y_pred_list[i], average='macro'))
    print("alpha: ", num[i], " F1_train: ", metrics.f1_score(c_train,test,average='macro'), " F1_valid: ", f1_list[i])

# choosing optimal hyperparameter based on the validation F1 metric
# (the first candidate wins when several share the best score)
alpha_opt = num[f1_list.index(max(f1_list))]
print("Optimal hyperparameter for Naive Bayes: ", alpha_opt)

# f1 metric with best hyperparameter; the model is configured exactly like the
# ones compared above (binarize=None) so the refit matches the selected model
nb_clf = BernoulliNB(alpha=alpha_opt,binarize=None)
nb_clf.fit(vectors_train,c_train)
y_pred_train = (nb_clf.predict(vectors_train))
y_pred_valid = (nb_clf.predict(vectors_valid))
y_pred_test = (nb_clf.predict(vectors_test))

f1_nb_train = metrics.f1_score(c_train, y_pred_train, average='macro')
f1_nb_valid = metrics.f1_score(c_valid,y_pred_valid, average='macro')
f1_nb_test = metrics.f1_score(c_test, y_pred_test,average='macro')

print("F1 score for Naive Bayes w/ optimal hyperparameter on training dataset: ", f1_nb_train)
print("F1 score for Naive Bayes w/ optimal hyperparameter on validation dataset: ", f1_nb_valid)
print("F1 score for Naive Bayes w/ optimal hyperparameter on test dataset: ", f1_nb_test)

alpha:  0.001  F1_train:  0.872771118478  F1_valid:  0.843077024907
alpha:  0.002  F1_train:  0.872771118478  F1_valid:  0.843077024907
alpha:  0.003  F1_train:  0.872771118478  F1_valid:  0.843175886724
alpha:  0.004  F1_train:  0.872771118478  F1_valid:  0.843075480544
alpha:  0.005  F1_train:  0.872771118478  F1_valid:  0.843075480544
alpha:  0.006  F1_train:  0.872771118478  F1_valid:  0.843175886724
alpha:  0.007  F1_train:  0.872771118478  F1_valid:  0.843175102479
alpha:  0.008  F1_train:  0.872771118478  F1_valid:  0.843075480544
alpha:  0.009  F1_train:  0.872704180328  F1_valid:  0.843075480544
Optimal hyperparameter for Naive Bayes:  0.003
F1 score for Naive Bayes w/ optimal hyperparameter on training dataset:  0.872771118478
F1 score for Naive Bayes w/ optimal hyperparameter on validation dataset:  0.843175886724
F1 score for Naive Bayes w/ optimal hyperparameter on test dataset:  0.835259920588


In [17]:
# decision trees classifier
from sklearn import tree

depth = np.arange(10,50,2)  # candidate max_depth values: 10, 12, ..., 48
y_pred_list = []
f1_list = []  # validation macro-F1 per candidate, in the same order as depth
for i in range(len(depth)):
    # A fixed random_state makes each fit repeatable, so the tree refit below
    # with the selected depth is the same tree that was scored in this loop.
    dt_clf = tree.DecisionTreeClassifier(max_depth=depth[i], random_state=0)
    dt_clf.fit(vectors_train,c_train)
    y_pred_list.append(dt_clf.predict(vectors_valid))
    test = dt_clf.predict(vectors_train)
    # score once and reuse the value for both the log line and the selection
    f1_list.append(metrics.f1_score(c_valid, y_pred_list[i], average='macro'))
    print("Depth: ", depth[i], " F1_train: ", metrics.f1_score(c_train,test,average='macro'), " F1_valid: ", f1_list[i])

# choosing maximum tree depth using validation set and F1 metric
# (the first candidate wins when several share the best score)
depth_opt = depth[f1_list.index(max(f1_list))]
print("Optimal depth for Decision Tree: ", depth_opt)

# f1 metric for train, validation, test with optimal hyperparameter (max tree depth)
dt_clf = tree.DecisionTreeClassifier(max_depth=depth_opt, random_state=0)
dt_clf.fit(vectors_train,c_train)
y_pred_train = dt_clf.predict(vectors_train)
y_pred_valid = dt_clf.predict(vectors_valid)
y_pred_test = dt_clf.predict(vectors_test)

f1_dt_train = metrics.f1_score(c_train,y_pred_train,average='macro')
f1_dt_valid = metrics.f1_score(c_valid,y_pred_valid,average='macro')
f1_dt_test = metrics.f1_score(c_test,y_pred_test,average='macro')


print("F1 score for Decision Tree w/ optimal depth on training dataset: ", f1_dt_train)
print("F1 score for Decision Tree w/ optimal depth on validation dataset: ", f1_dt_valid)
print("F1 score for Decision Tree w/ optimal depth on testing dataset: ", f1_dt_test)

Depth:  10  F1_train:  0.760842843323  F1_valid:  0.711618102202
Depth:  12  F1_train:  0.784775599482  F1_valid:  0.711707475031
Depth:  14  F1_train:  0.813524204257  F1_valid:  0.716593294035
Depth:  16  F1_train:  0.840843998549  F1_valid:  0.720396710935
Depth:  18  F1_train:  0.858351113688  F1_valid:  0.715001355296
Depth:  20  F1_train:  0.875782268953  F1_valid:  0.71496063426
Depth:  22  F1_train:  0.89029714748  F1_valid:  0.715170898613
Depth:  24  F1_train:  0.904044655242  F1_valid:  0.712266288293
Depth:  26  F1_train:  0.917405559996  F1_valid:  0.711291087344
Depth:  28  F1_train:  0.92749226205  F1_valid:  0.709602748865
Depth:  30  F1_train:  0.937768588626  F1_valid:  0.707453211724
Depth:  32  F1_train:  0.948299627218  F1_valid:  0.706237373274
Depth:  34  F1_train:  0.956181883643  F1_valid:  0.709596264882
Depth:  36  F1_train:  0.964387591054  F1_valid:  0.702189068406
Depth:  38  F1_train:  0.969055729724  F1_valid:  0.702159750005
Depth:  40  F1_train:  0.972

In [18]:
# Linear SVM
from sklearn.svm import LinearSVC

c = np.arange(2**-11,0.1,2**-9)  # candidate values for the regularisation parameter C
#c = [2**-7, 2**-5, 2**-3, 2**-1, 2**1, 2**3, 2**5, 2**7]
y_pred_list = []
f1_list = []  # validation macro-F1 per candidate, in the same order as c
for i in range(len(c)):
    # A fixed random_state makes each fit repeatable, so the model refit below
    # with the selected C matches the one that was scored in this loop.
    svm_clf = LinearSVC(C=c[i], multi_class="ovr", loss='hinge', random_state=0)
    svm_clf.fit(vectors_train,c_train)
    y_pred_list.append(svm_clf.predict(vectors_valid))
    test = svm_clf.predict(vectors_train)
    # score once and reuse the value for both the log line and the selection
    f1_list.append(metrics.f1_score(c_valid, y_pred_list[i], average='macro'))
    print("C: ", c[i], " F1_train: ", metrics.f1_score(c_train,test,average='macro'), " F1_valid: ", f1_list[i])

# choose C by validation F1 (the first candidate wins when several tie)
c_opt = c[f1_list.index(max(f1_list))]
print("Optimal C for Linear SVM: ", c_opt)


# f1 metric for train, validation, test dataset with optimal c
svm_clf=LinearSVC(C=c_opt, multi_class='ovr', loss='hinge', random_state=0)
svm_clf.fit(vectors_train,c_train)

y_pred_train = svm_clf.predict(vectors_train)
y_pred_valid = svm_clf.predict(vectors_valid)
y_pred_test = svm_clf.predict(vectors_test)

f1_svm_train = metrics.f1_score(c_train,y_pred_train,average='macro')
f1_svm_valid = metrics.f1_score(c_valid,y_pred_valid,average='macro')
f1_svm_test = metrics.f1_score(c_test,y_pred_test,average='macro')

# The tuned hyperparameter here is C; the labels previously said "depth",
# carried over from the decision-tree cell.
print("F1 score for Linear SVM w/ optimal C on training dataset: ", f1_svm_train)
print("F1 score for Linear SVM w/ optimal C on validation dataset: ", f1_svm_valid)
print("F1 score for Linear SVM w/ optimal C on testing dataset: ", f1_svm_test)

C:  0.00048828125  F1_train:  0.85315962003  F1_valid:  0.834230251778
C:  0.00244140625  F1_train:  0.894304673928  F1_valid:  0.863657285191
C:  0.00439453125  F1_train:  0.909449255167  F1_valid:  0.872372044373
C:  0.00634765625  F1_train:  0.919256417256  F1_valid:  0.873680889182
C:  0.00830078125  F1_train:  0.925256493245  F1_valid:  0.875385998371
C:  0.01025390625  F1_train:  0.930325306499  F1_valid:  0.876391278169
C:  0.01220703125  F1_train:  0.935994738483  F1_valid:  0.87629147772
C:  0.01416015625  F1_train:  0.940529269236  F1_valid:  0.876092266918
C:  0.01611328125  F1_train:  0.944864361017  F1_valid:  0.876892700968
C:  0.01806640625  F1_train:  0.947598356684  F1_valid:  0.876193581875
C:  0.02001953125  F1_train:  0.948864866483  F1_valid:  0.876294446858
C:  0.02197265625  F1_train:  0.950664968665  F1_valid:  0.876794953521
C:  0.02392578125  F1_train:  0.952865292219  F1_valid:  0.875896513933
C:  0.02587890625  F1_train:  0.955065451636  F1_valid:  0.8764983

# Frequency Bag-of-Words
## Yelp

In [19]:
## frequency bag-of-words
# Yelp: each file is parsed as tab-separated columns without a header row.
# Column 0 is used as the review text and column 1 as the label below.
data7 = pd.read_csv("hwk3_datasets/yelp-train.txt", sep="\t", header=None)
data8 = pd.read_csv("hwk3_datasets/yelp-valid.txt", sep="\t",  header=None)
data9 = pd.read_csv("hwk3_datasets/yelp-test.txt", sep="\t",  header=None)

# Normalise the text column: drop every ASCII punctuation character, then
# lowercase.  The whole column is reassigned in one step instead of writing
# through chained indexing (data[0][i] = ...), which pandas does not guarantee
# to write back into the frame, and instead of looping over Series.count()
# rows, which leaves out missing values.
exclude = set(string.punctuation)
strip_punctuation = str.maketrans('', '', string.punctuation)
for frame in (data7, data8, data9):
    frame[0] = frame[0].str.translate(strip_punctuation).str.lower()

yelp_train = data7.values
yelp_valid = data8.values
yelp_test = data9.values

# Count how often each whitespace-separated token occurs in the training text.
pre_list_train = {}
for review in yelp_train[:, 0]:
    for word in review.split():
        pre_list_train[word] = pre_list_train.get(word, 0) + 1

# (word, count) pairs, most frequent first.  sorted() is stable, so words with
# equal counts keep the order in which they were first seen.
sorted_list_train = sorted(pre_list_train.items(), key=operator.itemgetter(1), reverse=True)

# Vocabulary: the 10000 most frequent training words.  Slicing (rather than
# indexing positions 0..9999) avoids an IndexError when fewer distinct words exist.
top_list_train = [word for word, count in sorted_list_train[:10000]]

In [20]:
# Create the frequency bag-of-words representation for train, validation, test.
# Row i holds, for each vocabulary word, its occurrence count in review i
# divided by the total count of vocabulary words in that review.
vectors_train = np.zeros(shape=(yelp_train.shape[0],10000))
vectors_valid = np.zeros(shape=(yelp_valid.shape[0],10000))
vectors_test = np.zeros(shape=(yelp_test.shape[0],10000))

# Word -> column lookup, so each review costs one dict lookup per token
# instead of 10000 list.count() scans.
word_to_column = {word: column for column, word in enumerate(top_list_train[:10000])}

for vectors, reviews in ((vectors_train, yelp_train),
                         (vectors_valid, yelp_valid),
                         (vectors_test, yelp_test)):
    for i in range(reviews.shape[0]):
        # split() with no argument tokenises the same way the vocabulary was
        # counted; split(' ') would produce empty tokens and keep other
        # whitespace glued to words, so those words could never match.
        for word in reviews[i][0].split():
            column = word_to_column.get(word)
            if column is not None:
                vectors[i, column] += 1

        # Rows without any vocabulary word stay all-zero (no division by zero).
        total = np.sum(vectors[i])
        if total != 0:
            vectors[i] = vectors[i] / total

In [21]:
# Target labels (y): column 1 of every split, converted to strings.
c_train, c_valid, c_test = (
    split[:, 1].astype(str) for split in (yelp_train, yelp_valid, yelp_test)
)

In [22]:
# Random classifier: predict an integer drawn uniformly from 1..5 per review
# (randint's upper bound, 6, is exclusive).
# A dedicated seeded generator makes the reported baseline repeatable on every
# run without altering NumPy's global random state; one vectorised draw per
# split replaces the per-element Python loops.
random_rng = np.random.RandomState(0)
random_results_train = random_rng.randint(1, 6, size=c_train.shape[0]).astype(str)
random_results_valid = random_rng.randint(1, 6, size=c_valid.shape[0]).astype(str)
random_results_test = random_rng.randint(1, 6, size=c_test.shape[0]).astype(str)

f1_random_train = metrics.f1_score(c_train, random_results_train, average='macro')
f1_random_valid = metrics.f1_score(c_valid, random_results_valid, average='macro')
f1_random_test = metrics.f1_score(c_test, random_results_test, average='macro')
print("F1 score for random classifier on training dataset: ", f1_random_train)
print("F1 score for random classifier on validation dataset: ", f1_random_valid)
print("F1 score for random classifier on test dataset: ", f1_random_test)

F1 score for random classifier on training dataset:  0.178485535888
F1 score for random classifier on validation dataset:  0.191607412581
F1 score for random classifier on test dataset:  0.185708757633


In [23]:
# Majority classifier: every review receives the most frequent training label.
majority_list = {}
for label in yelp_train[:, 1]:
    majority_list[label] = majority_list.get(label, 0) + 1

# max() over the keys keeps the first label that reaches the highest count.
majority = max(majority_list, key=majority_list.get)

# One constant prediction per review, as strings (the c_* label arrays are
# built with astype(str) as well).
majority_results_train, majority_results_valid, majority_results_test = (
    np.squeeze(np.full((split.shape[0], 1), majority)).astype(str)
    for split in (yelp_train, yelp_valid, yelp_test)
)

# NOTE(review): this baseline is scored with average='micro' while the other
# classifier cells use average='macro' -- confirm that is intentional.
f1_majority_train, f1_majority_valid, f1_majority_test = (
    metrics.f1_score(truth, predicted, average='micro')
    for truth, predicted in ((c_train, majority_results_train),
                             (c_valid, majority_results_valid),
                             (c_test, majority_results_test))
)
print("Majority class: ", majority)
print("F1 score for majority classifier on training dataset: ", f1_majority_train)
print("F1 score for majority classifier on validation dataset: ", f1_majority_valid)
print("F1 score for majority classifier on test dataset: ", f1_majority_test)

Majority class:  4
F1 score for majority classifier on training dataset:  0.352571428571
F1 score for majority classifier on validation dataset:  0.356
F1 score for majority classifier on test dataset:  0.351


In [24]:
# Gaussian Naive Bayes multiclass classifier
from sklearn.naive_bayes import GaussianNB

# GaussianNB is fitted once with its default settings: no hyperparameter
# search is run in this cell, so nothing is selected on the validation set.
gs_clf = GaussianNB()
gs_clf.fit(vectors_train,c_train)
y_pred_train = (gs_clf.predict(vectors_train))
y_pred_valid = (gs_clf.predict(vectors_valid))
y_pred_test = (gs_clf.predict(vectors_test))

f1_nb_train = metrics.f1_score(c_train, y_pred_train, average='macro')
f1_nb_valid = metrics.f1_score(c_valid,y_pred_valid, average='macro')
f1_nb_test = metrics.f1_score(c_test, y_pred_test,average='macro')

# The labels no longer claim an "optimal hyperparameter", since none is tuned.
print("F1 score for Gaussian Naive Bayes on training dataset: ", f1_nb_train)
print("F1 score for Gaussian Naive Bayes on validation dataset: ", f1_nb_valid)
print("F1 score for Gaussian Naive Bayes on test dataset: ", f1_nb_test)

F1 score for Gaussian Naive Bayes w/ optimal hyperparameter on training dataset:  0.787889954289
F1 score for Gaussian Naive Bayes w/ optimal hyperparameter on validation dataset:  0.245619015208
F1 score for Gaussian Naive Bayes w/ optimal hyperparameter on test dataset:  0.247784668019


In [25]:
# decision trees classifier 441700
# Sweeps max_depth, selects the depth with the best macro-F1 on the validation
# split, then reports train / validation / test F1 for that depth.
from sklearn import tree

# Fixed seed for every tree.  Without it two fits at the same depth are not
# guaranteed to produce the same tree, so the re-fitted "optimal" model could
# differ from the one that was actually selected on the validation split.
tree_random_state = 0

depth = np.arange(10, 50, 2)
y_pred_list = []  # validation predictions, one entry per candidate depth
f1_list = []      # validation macro-F1, aligned with `depth`
for i in range(len(depth)):
    dt_clf = tree.DecisionTreeClassifier(max_depth=depth[i], random_state=tree_random_state)
    dt_clf.fit(vectors_train, c_train)
    y_pred_list.append(dt_clf.predict(vectors_valid))
    test = dt_clf.predict(vectors_train)
    # each score is computed once and reused for both the log line and the selection
    f1_train_i = metrics.f1_score(c_train, test, average='macro')
    f1_valid_i = metrics.f1_score(c_valid, y_pred_list[i], average='macro')
    f1_list.append(f1_valid_i)
    print("Depth: ", depth[i], " F1_train: ", f1_train_i, " F1_valid: ", f1_valid_i)

# choosing maximum tree depth using validation set and F1 metric
# (first depth that reaches the best validation score)
depth_opt = depth[f1_list.index(max(f1_list))]
print("Optimal depth for Decision Tree: ", depth_opt)

# f1 metric for train, validation, test with optimal hyperparameter (max tree depth);
# the shared seed makes this re-fit reproduce the tree evaluated in the sweep
dt_clf = tree.DecisionTreeClassifier(max_depth=depth_opt, random_state=tree_random_state)
dt_clf.fit(vectors_train, c_train)
y_pred_train = dt_clf.predict(vectors_train)
y_pred_valid = dt_clf.predict(vectors_valid)
y_pred_test = dt_clf.predict(vectors_test)

f1_dt_train = metrics.f1_score(c_train, y_pred_train, average='macro')
f1_dt_valid = metrics.f1_score(c_valid, y_pred_valid, average='macro')
f1_dt_test = metrics.f1_score(c_test, y_pred_test, average='macro')


print("F1 score for Decision Tree w/ optimal depth on training dataset: ", f1_dt_train)
print("F1 score for Decision Tree w/ optimal depth on validation dataset: ", f1_dt_valid)
print("F1 score for Decision Tree w/ optimal depth on testing dataset: ", f1_dt_test)

Depth:  10  F1_train:  0.436416839611  F1_valid:  0.303011210633
Depth:  12  F1_train:  0.511334183231  F1_valid:  0.306516888356
Depth:  14  F1_train:  0.586155122748  F1_valid:  0.294358449115
Depth:  16  F1_train:  0.662002668151  F1_valid:  0.294218129108
Depth:  18  F1_train:  0.72268986416  F1_valid:  0.322305765646
Depth:  20  F1_train:  0.766843169788  F1_valid:  0.30485241354
Depth:  22  F1_train:  0.821854461205  F1_valid:  0.30273731443
Depth:  24  F1_train:  0.866962028248  F1_valid:  0.294152794107
Depth:  26  F1_train:  0.900996337975  F1_valid:  0.300671039523
Depth:  28  F1_train:  0.924322040227  F1_valid:  0.298544799589
Depth:  30  F1_train:  0.946484377066  F1_valid:  0.296849751147
Depth:  32  F1_train:  0.966239144137  F1_valid:  0.300595007861
Depth:  34  F1_train:  0.982672063495  F1_valid:  0.310778341297
Depth:  36  F1_train:  0.990408964902  F1_valid:  0.308291940038
Depth:  38  F1_train:  0.993350044691  F1_valid:  0.314653308972
Depth:  40  F1_train:  0.996

In [26]:
# Linear SVM
# Sweeps the regularisation strength C, selects the value with the best
# macro-F1 on the validation split, then reports train / validation / test F1.
from sklearn.svm import LinearSVC

# Fixed seed for every fit, so that re-fitting with the selected C reproduces
# the model that was scored during the sweep.
svm_random_state = 0

c = np.arange(2**-9, 0.1, 2**-7)
# alternative coarse grid that was tried: [2**-7, 2**-5, 2**-3, 2**-1, 2**1, 2**3, 2**5, 2**7]
y_pred_list = []  # validation predictions, one entry per candidate C
f1_list = []      # validation macro-F1, aligned with `c`
for i in range(len(c)):
    svm_clf = LinearSVC(C=c[i], multi_class="ovr", loss='hinge', random_state=svm_random_state)
    svm_clf.fit(vectors_train, c_train)
    y_pred_list.append(svm_clf.predict(vectors_valid))
    test = svm_clf.predict(vectors_train)
    # each score is computed once and reused for both the log line and the selection
    f1_train_i = metrics.f1_score(c_train, test, average='macro')
    f1_valid_i = metrics.f1_score(c_valid, y_pred_list[i], average='macro')
    f1_list.append(f1_valid_i)
    print("C: ", c[i], " F1_train: ", f1_train_i, " F1_valid: ", f1_valid_i)

# first C that reaches the best validation score
c_opt = c[f1_list.index(max(f1_list))]
print("Optimal C for Linear SVM: ", c_opt)


# f1 metric for train, validation, test dataset with optimal c
svm_clf = LinearSVC(C=c_opt, multi_class='ovr', loss='hinge', random_state=svm_random_state)
svm_clf.fit(vectors_train, c_train)

y_pred_train = svm_clf.predict(vectors_train)
y_pred_valid = svm_clf.predict(vectors_valid)
y_pred_test = svm_clf.predict(vectors_test)

f1_svm_train = metrics.f1_score(c_train, y_pred_train, average='macro')
f1_svm_valid = metrics.f1_score(c_valid, y_pred_valid, average='macro')
f1_svm_test = metrics.f1_score(c_test, y_pred_test, average='macro')

# the tuned hyperparameter is C (the labels previously said "depth")
print("F1 score for Linear SVM w/ optimal C on training dataset: ", f1_svm_train)
print("F1 score for Linear SVM w/ optimal C on validation dataset: ", f1_svm_valid)
print("F1 score for Linear SVM w/ optimal C on testing dataset: ", f1_svm_test)

C:  0.001953125  F1_train:  0.255555336838  F1_valid:  0.229565940536
C:  0.009765625  F1_train:  0.478004309589  F1_valid:  0.35971884444
C:  0.017578125  F1_train:  0.518311188831  F1_valid:  0.356547063874
C:  0.025390625  F1_train:  0.532497847769  F1_valid:  0.360200779967
C:  0.033203125  F1_train:  0.537627560544  F1_valid:  0.361617057821
C:  0.041015625  F1_train:  0.541079824384  F1_valid:  0.359912042202
C:  0.048828125  F1_train:  0.541416559901  F1_valid:  0.359713257903
C:  0.056640625  F1_train:  0.544148759337  F1_valid:  0.355816207299
C:  0.064453125  F1_train:  0.544131899754  F1_valid:  0.361644335909
C:  0.072265625  F1_train:  0.547074884434  F1_valid:  0.351622633531
C:  0.080078125  F1_train:  0.542546823204  F1_valid:  0.359710623702
C:  0.087890625  F1_train:  0.54834833483  F1_valid:  0.3520599662
C:  0.095703125  F1_train:  0.54781515518  F1_valid:  0.357025015403
Optimal C for Linear SVM:  0.064453125
F1 score for Linear SVM w/ optimal depth on training dat

# Frequency Bag-of-Words
## IMDB

In [27]:
## frequency bag-of-words
# imdb
# Loads the three IMDB splits, normalises the review text and builds the
# vocabulary of the most frequent training words.
from collections import Counter

data10 = pd.read_csv("hwk3_datasets/IMDB-train.txt", sep="\t", header=None)
data11 = pd.read_csv("hwk3_datasets/IMDB-valid.txt", sep="\t",  header=None)
data12 = pd.read_csv("hwk3_datasets/IMDB-test.txt", sep="\t",  header=None)

exclude = set(string.punctuation)


def strip_punctuation_and_lowercase(text):
    """Return `text` without any character found in `exclude`, lowercased."""
    return ''.join(ch for ch in text if ch not in exclude).lower()


# Clean the text column (column 0) of every split.  The whole column is
# replaced in a single assignment rather than through the element-wise chained
# write `frame[0][i] = ...`: pandas flags that form as chained assignment, and
# it does not modify the frame when Copy-on-Write is enabled.
for frame in (data10, data11, data12):
    frame[0] = [strip_punctuation_and_lowercase(text) for text in frame[0]]

imdb_train = data10.values
imdb_valid = data11.values
imdb_test = data12.values

# generating top 10000 words
vocabulary_size = 10000

# word -> number of occurrences over all training reviews (whitespace tokens)
pre_list_train = Counter()
for review in imdb_train[:, 0]:
    pre_list_train.update(review.split())

# sort list: most frequent first; the sort is stable, so words with equal
# counts stay in first-seen order
sorted_list_train = sorted(pre_list_train.items(), key=operator.itemgetter(1), reverse=True)

# find top 10000 words; slicing (rather than indexing 0..9999) avoids an
# IndexError when the corpus holds fewer distinct words than that
top_list_train = [word for word, _ in sorted_list_train[:vocabulary_size]]

In [28]:
# create frequency bag-of-words representation for train, validation, test
def frequency_bag_of_words(texts, vocabulary):
    """Build a row-normalised term-frequency matrix.

    Parameters
    ----------
    texts : sequence of str
        One document per entry.
    vocabulary : sequence of str
        Words that define the columns, in column order.  Assumed to contain no
        duplicates (it is built from dictionary keys in this notebook).

    Returns
    -------
    numpy.ndarray of shape (len(texts), len(vocabulary))
        Entry [i, j] is the number of times vocabulary[j] occurs in texts[i]
        divided by the number of in-vocabulary tokens in texts[i].  Rows for
        documents without any in-vocabulary token stay all zero.
    """
    # word -> column lookup, so each document is scanned once instead of
    # calling list.count() once per vocabulary word
    column_of = {word: col for col, word in enumerate(vocabulary)}
    vectors = np.zeros(shape=(len(texts), len(vocabulary)))
    for row, text in enumerate(texts):
        # split() on any whitespace, the same tokenisation that was used to
        # build the vocabulary (split(' ') leaves newlines glued to words)
        for word in text.split():
            col = column_of.get(word)
            if col is not None:
                vectors[row, col] += 1
        total = vectors[row].sum()
        if total > 0:  # leave an all-zero row untouched rather than divide by zero
            vectors[row] /= total
    return vectors


vectors_train = frequency_bag_of_words(imdb_train[:, 0], top_list_train)
vectors_valid = frequency_bag_of_words(imdb_valid[:, 0], top_list_train)
vectors_test = frequency_bag_of_words(imdb_test[:, 0], top_list_train)

In [29]:
# classes (y values): column 1 of each split, cast to strings
c_train, c_valid, c_test = (
    split[:, 1].astype(str)
    for split in (imdb_train, imdb_valid, imdb_test)
)

In [30]:
# random classifier
# Baseline that draws 0 or 1 uniformly at random for every sample.
# A private, seeded generator keeps the reported scores reproducible across
# runs and leaves NumPy's global random state untouched.
baseline_rng = np.random.RandomState(0)

# one draw per sample (high bound 2 is exclusive), cast to str like the c_* label arrays
random_results_train = baseline_rng.randint(0, 2, size=c_train.shape[0]).astype(str)
random_results_valid = baseline_rng.randint(0, 2, size=c_valid.shape[0]).astype(str)
random_results_test = baseline_rng.randint(0, 2, size=c_test.shape[0]).astype(str)

f1_random_train = metrics.f1_score(c_train, random_results_train, average='macro')
f1_random_valid = metrics.f1_score(c_valid, random_results_valid, average='macro')
f1_random_test = metrics.f1_score(c_test, random_results_test, average='macro')
print("F1 score for random classifier on training dataset: ", f1_random_train)
print("F1 score for random classifier on validation dataset: ", f1_random_valid)
print("F1 score for random classifier on test dataset: ", f1_random_test)

F1 score for random classifier on training dataset:  0.504599627899
F1 score for random classifier on validation dataset:  0.507387684692
F1 score for random classifier on test dataset:  0.50047984335


In [31]:
# Gaussian Naive Bayes multiclass classifier
from sklearn.naive_bayes import GaussianNB

# The model is fitted with its default settings: no hyperparameter is tuned in
# this cell, so the "optimal hyperparameter" wording in the printed labels is
# nominal only.
gs_clf = GaussianNB()
gs_clf.fit(vectors_train, c_train)

# predictions for each split, in train / validation / test order
y_pred_train, y_pred_valid, y_pred_test = [
    gs_clf.predict(features)
    for features in (vectors_train, vectors_valid, vectors_test)
]

# macro-averaged F1 for each split, same order as above
f1_nb_train, f1_nb_valid, f1_nb_test = [
    metrics.f1_score(expected, predicted, average='macro')
    for expected, predicted in (
        (c_train, y_pred_train),
        (c_valid, y_pred_valid),
        (c_test, y_pred_test),
    )
]

print("F1 score for Gaussian Naive Bayes w/ optimal hyperparameter on training dataset: ", f1_nb_train)
print("F1 score for Gaussian Naive Bayes w/ optimal hyperparameter on validation dataset: ", f1_nb_valid)
print("F1 score for Gaussian Naive Bayes w/ optimal hyperparameter on test dataset: ", f1_nb_test)

F1 score for Gaussian Naive Bayes w/ optimal hyperparameter on training dataset:  0.862767587377
F1 score for Gaussian Naive Bayes w/ optimal hyperparameter on validation dataset:  0.759488125712
F1 score for Gaussian Naive Bayes w/ optimal hyperparameter on test dataset:  0.692712527589


In [32]:
# decision trees classifier 441700
# Sweeps max_depth, selects the depth with the best macro-F1 on the validation
# split, then reports train / validation / test F1 for that depth.
from sklearn import tree

# Fixed seed for every tree.  Without it two fits at the same depth are not
# guaranteed to produce the same tree, so the re-fitted "optimal" model could
# differ from the one that was actually selected on the validation split.
tree_random_state = 0

depth = np.arange(10, 50, 2)
y_pred_list = []  # validation predictions, one entry per candidate depth
f1_list = []      # validation macro-F1, aligned with `depth`
for i in range(len(depth)):
    dt_clf = tree.DecisionTreeClassifier(max_depth=depth[i], random_state=tree_random_state)
    dt_clf.fit(vectors_train, c_train)
    y_pred_list.append(dt_clf.predict(vectors_valid))
    test = dt_clf.predict(vectors_train)
    # each score is computed once and reused for both the log line and the selection
    f1_train_i = metrics.f1_score(c_train, test, average='macro')
    f1_valid_i = metrics.f1_score(c_valid, y_pred_list[i], average='macro')
    f1_list.append(f1_valid_i)
    print("Depth: ", depth[i], " F1_train: ", f1_train_i, " F1_valid: ", f1_valid_i)

# choosing maximum tree depth using validation set and F1 metric
# (first depth that reaches the best validation score)
depth_opt = depth[f1_list.index(max(f1_list))]
print("Optimal depth for Decision Tree: ", depth_opt)

# f1 metric for train, validation, test with optimal hyperparameter (max tree depth);
# the shared seed makes this re-fit reproduce the tree evaluated in the sweep
dt_clf = tree.DecisionTreeClassifier(max_depth=depth_opt, random_state=tree_random_state)
dt_clf.fit(vectors_train, c_train)
y_pred_train = dt_clf.predict(vectors_train)
y_pred_valid = dt_clf.predict(vectors_valid)
y_pred_test = dt_clf.predict(vectors_test)

f1_dt_train = metrics.f1_score(c_train, y_pred_train, average='macro')
f1_dt_valid = metrics.f1_score(c_valid, y_pred_valid, average='macro')
f1_dt_test = metrics.f1_score(c_test, y_pred_test, average='macro')


print("F1 score for Decision Tree w/ optimal depth on training dataset: ", f1_dt_train)
print("F1 score for Decision Tree w/ optimal depth on validation dataset: ", f1_dt_valid)
print("F1 score for Decision Tree w/ optimal depth on testing dataset: ", f1_dt_test)

Depth:  10  F1_train:  0.763416074292  F1_valid:  0.704660863579
Depth:  12  F1_train:  0.802531227466  F1_valid:  0.717055500819
Depth:  14  F1_train:  0.826350828889  F1_valid:  0.70427251998
Depth:  16  F1_train:  0.848036816514  F1_valid:  0.710053267183
Depth:  18  F1_train:  0.867548620559  F1_valid:  0.712264779864
Depth:  20  F1_train:  0.886586085129  F1_valid:  0.712910698587
Depth:  22  F1_train:  0.900531035921  F1_valid:  0.712694702326
Depth:  24  F1_train:  0.915733204938  F1_valid:  0.70899295098
Depth:  26  F1_train:  0.927959894057  F1_valid:  0.708235143237
Depth:  28  F1_train:  0.938093209686  F1_valid:  0.707654049496
Depth:  30  F1_train:  0.947396457537  F1_valid:  0.70849019504
Depth:  32  F1_train:  0.955821804396  F1_valid:  0.708417761101
Depth:  34  F1_train:  0.96149944857  F1_valid:  0.708004339302
Depth:  36  F1_train:  0.966238922563  F1_valid:  0.704791122006
Depth:  38  F1_train:  0.969980734302  F1_valid:  0.707986672548
Depth:  40  F1_train:  0.9725

In [33]:
# Linear SVM
# Sweeps the regularisation strength C, selects the value with the best
# macro-F1 on the validation split, then reports train / validation / test F1.
from sklearn.svm import LinearSVC

# Fixed seed for every fit, so that re-fitting with the selected C reproduces
# the model that was scored during the sweep (unseeded, the re-fit scores
# drifted away from the sweep scores for the same C).
svm_random_state = 0

c = np.arange(2**11, 3000, 2**7)
# alternative coarse grid that was tried: [2**-7, 2**-5, 2**-3, 2**-1, 2**1, 2**3, 2**5, 2**7]
y_pred_list = []  # validation predictions, one entry per candidate C
f1_list = []      # validation macro-F1, aligned with `c`
for i in range(len(c)):
    svm_clf = LinearSVC(C=c[i], multi_class="ovr", loss='hinge', random_state=svm_random_state)
    svm_clf.fit(vectors_train, c_train)
    y_pred_list.append(svm_clf.predict(vectors_valid))
    test = svm_clf.predict(vectors_train)
    # each score is computed once and reused for both the log line and the selection
    f1_train_i = metrics.f1_score(c_train, test, average='macro')
    f1_valid_i = metrics.f1_score(c_valid, y_pred_list[i], average='macro')
    f1_list.append(f1_valid_i)
    print("C: ", c[i], " F1_train: ", f1_train_i, " F1_valid: ", f1_valid_i)

# first C that reaches the best validation score
c_opt = c[f1_list.index(max(f1_list))]
print("Optimal C for Linear SVM: ", c_opt)


# f1 metric for train, validation, test dataset with optimal c
svm_clf = LinearSVC(C=c_opt, multi_class='ovr', loss='hinge', random_state=svm_random_state)
svm_clf.fit(vectors_train, c_train)

y_pred_train = svm_clf.predict(vectors_train)
y_pred_valid = svm_clf.predict(vectors_valid)
y_pred_test = svm_clf.predict(vectors_test)

f1_svm_train = metrics.f1_score(c_train, y_pred_train, average='macro')
f1_svm_valid = metrics.f1_score(c_valid, y_pred_valid, average='macro')
f1_svm_test = metrics.f1_score(c_test, y_pred_test, average='macro')

# the tuned hyperparameter is C (the labels previously said "depth")
print("F1 score for Linear SVM w/ optimal C on training dataset: ", f1_svm_train)
print("F1 score for Linear SVM w/ optimal C on validation dataset: ", f1_svm_valid)
print("F1 score for Linear SVM w/ optimal C on testing dataset: ", f1_svm_test)

C:  2048  F1_train:  0.79894655196  F1_valid:  0.750764674521
C:  2176  F1_train:  0.945852078467  F1_valid:  0.841279409754
C:  2304  F1_train:  0.983466560559  F1_valid:  0.872681359278
C:  2432  F1_train:  0.972316941887  F1_valid:  0.861505825367
C:  2560  F1_train:  0.983666321046  F1_valid:  0.873395741033
C:  2688  F1_train:  0.960076087056  F1_valid:  0.851096328202
C:  2816  F1_train:  0.881240974365  F1_valid:  0.79449979524
C:  2944  F1_train:  0.98026294417  F1_valid:  0.87035100608
Optimal C for Linear SVM:  2560
F1 score for Linear SVM w/ optimal depth on training dataset:  0.975522538773
F1 score for Linear SVM w/ optimal depth on validation dataset:  0.864848470873
F1 score for Linear SVM w/ optimal depth on testing dataset:  0.846073089648
