In [1]:
import yelp_dataloader
import numpy as np
import random
from timeit import default_timer as timer
from sklearn import metrics

  from ._conv import register_converters as _register_converters


In [2]:
# Example of how to load Yelp photo features
dir_path = '/home/Afshin/cs231n/cs231n-project/yelp-resnet-feats'
file_name_prefix='yelp_resnet_'
num_batches=5
num_feats=512
start = timer()
meta_feats, photo_feats = yelp_dataloader.read_yelp_photo_feats(dir_path, file_name_prefix, num_batches, num_feats)
end = timer()
print('load time for Yelp features.', (end - start))
print(meta_feats.shape)
print(photo_feats.shape)

/home/Afshin/cs231n/cs231n-project/yelp-resnet-feats/yelp_resnet_0.h5
/home/Afshin/cs231n/cs231n-project/yelp-resnet-feats/yelp_resnet_1.h5
/home/Afshin/cs231n/cs231n-project/yelp-resnet-feats/yelp_resnet_2.h5
/home/Afshin/cs231n/cs231n-project/yelp-resnet-feats/yelp_resnet_3.h5
/home/Afshin/cs231n/cs231n-project/yelp-resnet-feats/yelp_resnet_4.h5
load time for Yelp features. 4.361218080000981
(234842, 11)
(234842, 514)


In [3]:
biz_dir_path = '/home/Afshin/cs231n/cs231n-project/yelp-resnet-biz-feats'
file_name = 'yelp_resnet_biz_feats.h5'
biz_id_feats, biz_id_labels = yelp_dataloader.compute_yelp_biz_feats(meta_feats, photo_feats,\
                                                                     num_feats, biz_dir_path, file_name, export=True)

Computing Yelp biz features out of Yelp photo features...
business features shape =  (2000, 512)
biz id and business features shape =  (2000, 513)
business labels shape =  (2000, 9)
biz id and business labels shape =  (2000, 10)
Exporting Yelp biz features to /home/Afshin/cs231n/cs231n-project/yelp-resnet-biz-feats/yelp_resnet_biz_feats.h5
Yelp biz features exported to /home/Afshin/cs231n/cs231n-project/yelp-resnet-biz-feats/yelp_resnet_biz_feats.h5


In [4]:
# Example of how to read Yelp business features
biz_dir_path = '/home/Afshin/cs231n/cs231n-project/yelp-resnet-biz-feats'
biz_file_name = 'yelp_resnet_biz_feats.h5'
biz_id_feats, biz_id_labels = yelp_dataloader.read_yelp_biz_feats(biz_dir_path, biz_file_name)
biz_feats = biz_id_feats[:, 1:]
biz_labels = biz_id_labels[:, 1:]
print('biz_feats, ', biz_feats.shape)
print('biz_labels, ', biz_labels.shape)

biz_feats,  (2000, 512)
biz_labels,  (2000, 9)


In [5]:
Num_train = 1800
Num_dev = 100
Num_test = 100
X_train = biz_feats[:Num_train]
y_train = biz_labels[:Num_train]
X_dev = biz_feats[Num_train: Num_train + Num_dev]
y_dev = biz_labels[Num_train: Num_train + Num_dev]
X_test = biz_feats[Num_train + Num_dev: Num_train + Num_dev + Num_test]
y_test = biz_labels[Num_train + Num_dev: Num_train + Num_dev + Num_test]

In [6]:
from sklearn.neural_network import MLPClassifier

hidden_sizes = [70, 80, 90, 100, 110, 120, 130, 140]
best_f1, best_size = 0, 0
for size in hidden_sizes:
    resnet_mpl = MLPClassifier(hidden_layer_sizes=(size))
    resnet_mpl.fit(X_train,y_train)
    
    resnet_mpl_train_pred = resnet_mpl.predict(X_train)
    resnet_mpl_dev_pred = resnet_mpl.predict(X_dev)
    f1_train = metrics.f1_score(y_train, resnet_mpl_train_pred, average='macro') 
    f1_dev = metrics.f1_score(y_dev, resnet_mpl_dev_pred, average='macro')  
    print('mpl train f1 = %f for hidden size %d' % (f1_train, size))
    print('mpl dev f1 = %f for hidden size %d' % (f1_dev, size))
    print()
    if f1_dev > best_f1:
        best_f1 = f1_dev
        best_size = size
print('Best mpl dev f1 score = %f Best hidden layer size = %d' % (best_f1, best_size))

mpl train f1 = 0.813235 for hidden size 70
mpl dev f1 = 0.823635 for hidden size 70

mpl train f1 = 0.830007 for hidden size 80
mpl dev f1 = 0.841826 for hidden size 80

mpl train f1 = 0.819605 for hidden size 90
mpl dev f1 = 0.832805 for hidden size 90

mpl train f1 = 0.825253 for hidden size 100
mpl dev f1 = 0.823822 for hidden size 100

mpl train f1 = 0.812314 for hidden size 110
mpl dev f1 = 0.813210 for hidden size 110

mpl train f1 = 0.826561 for hidden size 120
mpl dev f1 = 0.831119 for hidden size 120

mpl train f1 = 0.819057 for hidden size 130
mpl dev f1 = 0.820133 for hidden size 130

mpl train f1 = 0.813584 for hidden size 140
mpl dev f1 = 0.814086 for hidden size 140

Best mpl dev f1 score = 0.841826 Best hidden layer size = 80


In [7]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)
    
dtc_train_pred = dtc.predict(X_train)
dtc_dev_pred = dtc.predict(X_dev)
f1_train = metrics.f1_score(y_train, dtc_train_pred, average='macro') 
f1_dev = metrics.f1_score(y_dev, dtc_dev_pred, average='macro')  
print('dtc train f1 = %f' % f1_train)
print('dtc dev f1 = %f' % f1_dev)

dtc train f1 = 1.000000
dtc dev f1 = 0.728035


In [8]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
    
rfc_train_pred = rfc.predict(X_train)
rfc_dev_pred = rfc.predict(X_dev)
f1_train = metrics.f1_score(y_train, rfc_train_pred, average='macro') 
f1_dev = metrics.f1_score(y_dev, rfc_dev_pred, average='macro')  
print('rfc train f1 = %f ' % f1_train)
print('rfc dev f1 = %f ' % f1_dev)

rfc train f1 = 0.988042 
rfc dev f1 = 0.728095 


In [9]:
from sklearn.neighbors import KNeighborsClassifier

neighbor_sizes = [3, 5, 10, 20, 30]
best_f1, best_size = 0, 0
for size in neighbor_sizes:
    knn = KNeighborsClassifier(n_neighbors=size)
    knn.fit(X_train,y_train)

    knn_train_pred = knn.predict(X_train)
    knn_dev_pred = knn.predict(X_dev)
    f1_train = metrics.f1_score(y_train, knn_train_pred, average='macro') 
    f1_dev = metrics.f1_score(y_dev, knn_dev_pred, average='macro')  
    print('knn train f1 = %f Best number of neighbors %d' % (f1_train, size))
    print('knn dev f1 = %f Best number of neighbors %d' % (f1_dev, size))
    print()
    if f1_dev > best_f1:
        best_f1 = f1_dev
        best_size = size
print('Best knn dev f1 score = %f Best number of neighbors = %d' % (best_f1, best_size))

knn train f1 = 0.876877 Best number of neighbors 3
knn dev f1 = 0.793024 Best number of neighbors 3

knn train f1 = 0.852178 Best number of neighbors 5
knn dev f1 = 0.808191 Best number of neighbors 5

knn train f1 = 0.818541 Best number of neighbors 10
knn dev f1 = 0.793197 Best number of neighbors 10

knn train f1 = 0.807829 Best number of neighbors 20
knn dev f1 = 0.787031 Best number of neighbors 20

knn train f1 = 0.799209 Best number of neighbors 30
knn dev f1 = 0.782799 Best number of neighbors 30

Best knn dev f1 score = 0.808191 Best number of neighbors = 5
