In [1]:
import yelp_dataloader
import numpy as np
import random
from timeit import default_timer as timer
from sklearn import metrics

  from ._conv import register_converters as _register_converters


In [2]:
# Example: load the Yelp photo features through yelp_dataloader, timing the
# read and printing the shapes of the two arrays it returns.
dir_path = '/home/Afshin/cs231n/cs231n-project/yelp-resnet-feats'
file_name_prefix = 'yelp_resnet_'
num_batches, num_feats = 5, 512

start = timer()
meta_feats, photo_feats = yelp_dataloader.read_yelp_photo_feats(
    dir_path,
    file_name_prefix,
    num_batches,
    num_feats,
)
end = timer()

# Wall-clock seconds spent inside the loader call.
print('load time for Yelp features.', end - start)
print(meta_feats.shape, photo_feats.shape, sep='\n')

/home/Afshin/cs231n/cs231n-project/yelp-resnet-feats/yelp_resnet_0.h5
/home/Afshin/cs231n/cs231n-project/yelp-resnet-feats/yelp_resnet_1.h5
/home/Afshin/cs231n/cs231n-project/yelp-resnet-feats/yelp_resnet_2.h5
/home/Afshin/cs231n/cs231n-project/yelp-resnet-feats/yelp_resnet_3.h5
/home/Afshin/cs231n/cs231n-project/yelp-resnet-feats/yelp_resnet_4.h5
load time for Yelp features. 3.5415105640004185
(234842, 11)
(234842, 514)


In [3]:
# Derive business-level features and labels from the photo-level arrays.
# export=True asks the loader to also write the result to
# biz_dir_path/file_name so later sessions can read it back directly.
biz_dir_path = '/home/Afshin/cs231n/cs231n-project/yelp-resnet-biz-feats'
file_name = 'yelp_resnet_biz_feats.h5'
biz_id_feats, biz_id_labels = yelp_dataloader.compute_yelp_biz_feats(
    meta_feats,
    photo_feats,
    num_feats,
    biz_dir_path,
    file_name,
    export=True,
)

Computing Yelp biz features out of Yelp photo features...
business features shape =  (2000, 512)
biz id and business features shape =  (2000, 513)
business labels shape =  (2000, 9)
biz id and business labels shape =  (2000, 10)
Exporting Yelp biz features to /home/Afshin/cs231n/cs231n-project/yelp-resnet-biz-feats/yelp_resnet_biz_feats.h5
Yelp biz features exported to /home/Afshin/cs231n/cs231n-project/yelp-resnet-biz-feats/yelp_resnet_biz_feats.h5


In [4]:
# Example: read the exported business-level arrays back from disk.
biz_dir_path = '/home/Afshin/cs231n/cs231n-project/yelp-resnet-biz-feats'
biz_file_name = 'yelp_resnet_biz_feats.h5'
biz_id_feats, biz_id_labels = yelp_dataloader.read_yelp_biz_feats(
    biz_dir_path, biz_file_name
)

# Drop column 0 from both arrays (presumably the business id, going by the
# `biz_id_*` names -- confirm against yelp_dataloader) to keep only the
# feature / label columns.
biz_feats, biz_labels = biz_id_feats[:, 1:], biz_id_labels[:, 1:]
print('biz_feats, ', biz_feats.shape)
print('biz_labels, ', biz_labels.shape)

biz_feats,  (2000, 512)
biz_labels,  (2000, 9)


In [5]:
# Contiguous train / dev / test partition of the business arrays.
# Rows are taken in stored order; nothing is shuffled here.
Num_train, Num_dev, Num_test = 1800, 100, 100

# Row offsets at which the dev and test partitions begin / end.
dev_start = Num_train
test_start = Num_train + Num_dev
test_stop = Num_train + Num_dev + Num_test

X_train, y_train = biz_feats[:Num_train], biz_labels[:Num_train]
X_dev, y_dev = biz_feats[dev_start:test_start], biz_labels[dev_start:test_start]
X_test, y_test = biz_feats[test_start:test_stop], biz_labels[test_start:test_stop]

In [14]:
from sklearn.neural_network import MLPClassifier

# Hidden-layer width search for an MLP with one hidden layer.
# For every candidate width: fit on the train split, print macro-averaged
# precision / recall / F1 for train and dev, and remember the model with the
# highest dev F1. The winner is then scored once on the test split.
hidden_sizes = [70, 80, 90, 100, 110, 120, 130, 140]
best_p, best_r, best_f1, best_size, best_mpl_model = 0, 0, 0, 0, None
for size in hidden_sizes:
    # (size,) is an actual 1-tuple; (size) is only a parenthesised int.
    # A fixed random_state makes the fit repeatable, so the widths are
    # compared on equal footing and a re-run reproduces the same table
    # (the SVC / LR cells already pin random_state=0).
    mpl = MLPClassifier(hidden_layer_sizes=(size,), random_state=0)
    mpl.fit(X_train, y_train)

    mpl_train_pred = mpl.predict(X_train)
    mpl_dev_pred = mpl.predict(X_dev)

    p_train, r_train, f1_train, _ = metrics.precision_recall_fscore_support(
        y_train, mpl_train_pred, average='macro')
    p_dev, r_dev, f1_dev, _ = metrics.precision_recall_fscore_support(
        y_dev, mpl_dev_pred, average='macro')
    print('mpl train P = %f, train R = %f, train f1 = %f, Hidden size = %d'
          % (p_train, r_train, f1_train, size))
    print('mpl dev P = %f, dev R = %f, dev f1 = %f, Hidden size = %d'
          % (p_dev, r_dev, f1_dev, size))
    print()

    # Always accept the first candidate so best_mpl_model can never be None
    # when it is used below, even if no dev F1 rises above the initial 0.
    if best_mpl_model is None or f1_dev > best_f1:
        best_p = p_dev
        best_r = r_dev
        best_f1 = f1_dev
        best_size = size
        best_mpl_model = mpl

print('Best mpl dev P = %f, dev R = %f, dev f1 = %f, Best hidden size = %d'
      % (best_p, best_r, best_f1, best_size))

# Test metrics are computed only for the model selected on the dev split.
mpl_test_pred = best_mpl_model.predict(X_test)
p_test, r_test, f1_test, _ = metrics.precision_recall_fscore_support(
    y_test, mpl_test_pred, average='macro')
print('Best mpl test P = %f, test R = %f, test f1 = %f, Best hidden size = %d'
      % (p_test, r_test, f1_test, best_size))

mpl train P = 0.851866, train R = 0.780444, train f1 = 0.813485, Hidden size = 70
mpl dev P = 0.870328, dev R = 0.781118, dev f1 = 0.819066, Hidden size = 70

mpl train P = 0.852608, train R = 0.789957, train f1 = 0.816828, Hidden size = 80
mpl dev P = 0.869155, dev R = 0.801593, dev f1 = 0.826855, Hidden size = 80

mpl train P = 0.831060, train R = 0.809128, train f1 = 0.819368, Hidden size = 90
mpl dev P = 0.833291, dev R = 0.809790, dev f1 = 0.817511, Hidden size = 90

mpl train P = 0.853757, train R = 0.816722, train f1 = 0.834179, Hidden size = 100
mpl dev P = 0.854168, dev R = 0.803607, dev f1 = 0.823472, Hidden size = 100

mpl train P = 0.855141, train R = 0.800126, train f1 = 0.825433, Hidden size = 110
mpl dev P = 0.871851, dev R = 0.768503, dev f1 = 0.808839, Hidden size = 110

mpl train P = 0.850565, train R = 0.752446, train f1 = 0.794368, Hidden size = 120
mpl dev P = 0.866454, dev R = 0.748386, dev f1 = 0.796385, Hidden size = 120

mpl train P = 0.866382, train R = 0.7808

In [15]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree baseline with default hyper-parameters.
# random_state is pinned because scikit-learn's tree construction is
# randomised; without a seed the reported dev / test metrics change from
# one run of the notebook to the next.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)

dtc_train_pred = dtc.predict(X_train)
dtc_dev_pred = dtc.predict(X_dev)
dtc_test_pred = dtc.predict(X_test)

# Macro-averaged precision / recall / F1 per split (support entry discarded).
p_train, r_train, f1_train, _ = metrics.precision_recall_fscore_support(
    y_train, dtc_train_pred, average='macro')
p_dev, r_dev, f1_dev, _ = metrics.precision_recall_fscore_support(
    y_dev, dtc_dev_pred, average='macro')
p_test, r_test, f1_test, _ = metrics.precision_recall_fscore_support(
    y_test, dtc_test_pred, average='macro')
print('dtc train P = %f, train R = %f, train f1 = %f'
      % (p_train, r_train, f1_train))
print('dtc dev P = %f, dev R = %f, dev f1 = %f'
      % (p_dev, r_dev, f1_dev))
print('dtc test P = %f, test R = %f, test f1 = %f'
      % (p_test, r_test, f1_test))

dtc train P = 1.000000, train R = 1.000000, train f1 = 1.000000
dtc dev P = 0.753365, dev R = 0.741810, dev f1 = 0.744885
dtc test P = 0.684406, test R = 0.699791, test f1 = 0.689665


In [16]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline with default hyper-parameters.
# A forest is built from randomised trees, so random_state is pinned to make
# the fitted model and the printed metrics reproducible across runs.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)

rfc_train_pred = rfc.predict(X_train)
rfc_dev_pred = rfc.predict(X_dev)
rfc_test_pred = rfc.predict(X_test)

# Macro-averaged precision / recall / F1 per split (support entry discarded).
p_train, r_train, f1_train, _ = metrics.precision_recall_fscore_support(
    y_train, rfc_train_pred, average='macro')
p_dev, r_dev, f1_dev, _ = metrics.precision_recall_fscore_support(
    y_dev, rfc_dev_pred, average='macro')
p_test, r_test, f1_test, _ = metrics.precision_recall_fscore_support(
    y_test, rfc_test_pred, average='macro')
print('rfc train P = %f, train R = %f, train f1 = %f'
      % (p_train, r_train, f1_train))
print('rfc dev P = %f, dev R = %f, dev f1 = %f'
      % (p_dev, r_dev, f1_dev))
print('rfc test P = %f, test R = %f, test f1 = %f'
      % (p_test, r_test, f1_test))

rfc train P = 0.997805, train R = 0.983134, train f1 = 0.990389
rfc dev P = 0.818966, dev R = 0.727840, dev f1 = 0.766702
rfc test P = 0.801292, test R = 0.690629, test f1 = 0.734858


In [17]:
from sklearn.neighbors import KNeighborsClassifier

# Neighbour-count search for k-NN.
# For every candidate k: fit on the train split, print macro-averaged
# precision / recall / F1 for train and dev, and remember the model with the
# highest dev F1. The winner is then scored once on the test split.
neighbor_sizes = [3, 5, 10, 20, 30]
best_p, best_r, best_f1, best_size, best_knn_model = 0, 0, 0, 0, None
for size in neighbor_sizes:
    knn = KNeighborsClassifier(n_neighbors=size)
    knn.fit(X_train, y_train)

    knn_train_pred = knn.predict(X_train)
    knn_dev_pred = knn.predict(X_dev)
    p_train, r_train, f1_train, _ = metrics.precision_recall_fscore_support(
        y_train, knn_train_pred, average='macro')
    p_dev, r_dev, f1_dev, _ = metrics.precision_recall_fscore_support(
        y_dev, knn_dev_pred, average='macro')
    # These two lines report the candidate currently being evaluated, so they
    # are labelled with the current k rather than "Best".
    print('knn train P = %f, train R = %f, train f1 = %f, Number of neighbors = %d'
          % (p_train, r_train, f1_train, size))
    print('knn dev P = %f, dev R = %f, dev f1 = %f, Number of neighbors = %d'
          % (p_dev, r_dev, f1_dev, size))
    print()

    # Always accept the first candidate so best_knn_model can never be None
    # when it is used below, even if no dev F1 rises above the initial 0.
    if best_knn_model is None or f1_dev > best_f1:
        best_p = p_dev
        best_r = r_dev
        best_f1 = f1_dev
        best_size = size
        best_knn_model = knn

print('Best knn dev P = %f, dev R = %f, dev f1 = %f, Best number of neighbors %d'
      % (best_p, best_r, best_f1, best_size))

# Test metrics are computed only for the model selected on the dev split.
knn_test_pred = best_knn_model.predict(X_test)
p_test, r_test, f1_test, _ = metrics.precision_recall_fscore_support(
    y_test, knn_test_pred, average='macro')
print('Best knn test P = %f, test R = %f, test f1 = %f, Best number of neighbors %d'
      % (p_test, r_test, f1_test, best_size))

knn train P = 0.873390, train R = 0.884428, train f1 = 0.876877, Best number of neighbors 3
knn dev P = 0.798777, dev R = 0.800341, dev f1 = 0.793024, Best number of neighbors 3

knn train P = 0.848113, train R = 0.862857, train f1 = 0.852178, Best number of neighbors 5
knn dev P = 0.810346, dev R = 0.819282, dev f1 = 0.808191, Best number of neighbors 5

knn train P = 0.846431, train R = 0.805629, train f1 = 0.818541, Best number of neighbors 10
knn dev P = 0.828447, dev R = 0.779296, dev f1 = 0.793197, Best number of neighbors 10

knn train P = 0.820966, train R = 0.815109, train f1 = 0.807829, Best number of neighbors 20
knn dev P = 0.807994, dev R = 0.796636, dev f1 = 0.787031, Best number of neighbors 20

knn train P = 0.805261, train R = 0.818036, train f1 = 0.799209, Best number of neighbors 30
knn dev P = 0.797535, dev R = 0.800856, dev f1 = 0.782799, Best number of neighbors 30

Best knn dev P = 0.810346, dev R = 0.819282, dev f1 = 0.808191, Best number of neighbors 5
Best knn

In [18]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# One-vs-rest linear SVM. The LinearSVC is wrapped in CalibratedClassifierCV;
# the fitted `svc` is later asked for predict_proba when the ensemble is built.
svc = OneVsRestClassifier(
    CalibratedClassifierCV(
        LinearSVC(random_state=0, C=0.1)
    )
)
svc.fit(X_train, y_train)

# Hard predictions for each split, in train / dev / test order.
svc_train_pred, svc_dev_pred, svc_test_pred = (
    svc.predict(split) for split in (X_train, X_dev, X_test)
)

# Macro-averaged precision / recall / F1; the trailing support entry is
# sliced off instead of being unpacked into a throwaway name.
p_train, r_train, f1_train = metrics.precision_recall_fscore_support(
    y_train, svc_train_pred, average='macro')[:3]
p_dev, r_dev, f1_dev = metrics.precision_recall_fscore_support(
    y_dev, svc_dev_pred, average='macro')[:3]
p_test, r_test, f1_test = metrics.precision_recall_fscore_support(
    y_test, svc_test_pred, average='macro')[:3]

print('svc train P = %f, train R = %f, train f1 = %f' % (p_train, r_train, f1_train))
print('svc dev P = %f, dev R = %f, dev f1 = %f' % (p_dev, r_dev, f1_dev))
print('svc test P = %f, test R = %f, test f1 = %f' % (p_test, r_test, f1_test))

svc train P = 0.895254, train R = 0.870001, train f1 = 0.881883
svc dev P = 0.848925, dev R = 0.799972, dev f1 = 0.818042
svc test P = 0.829445, test R = 0.794420, test f1 = 0.810029


In [19]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# One-vs-rest logistic regression (C=0.1, seeded), fitted on the train split
# and scored with macro-averaged precision / recall / F1 on all three splits.
lr = OneVsRestClassifier(LogisticRegression(random_state=0, C=0.1))
lr.fit(X_train, y_train)

lr_train_pred = lr.predict(X_train)
lr_dev_pred = lr.predict(X_dev)
lr_test_pred = lr.predict(X_test)

# Slice off the support entry; only precision, recall and F1 are reported.
p_train, r_train, f1_train = metrics.precision_recall_fscore_support(
    y_train, lr_train_pred, average='macro'
)[:3]
p_dev, r_dev, f1_dev = metrics.precision_recall_fscore_support(
    y_dev, lr_dev_pred, average='macro'
)[:3]
p_test, r_test, f1_test = metrics.precision_recall_fscore_support(
    y_test, lr_test_pred, average='macro'
)[:3]

print('lr train P = %f, train R = %f, train f1 = %f' % (p_train, r_train, f1_train))
print('lr dev P = %f, dev R = %f, dev f1 = %f' % (p_dev, r_dev, f1_dev))
print('lr test P = %f, test R = %f, test f1 = %f' % (p_test, r_test, f1_test))

lr train P = 0.862098, train R = 0.846905, train f1 = 0.854155
lr dev P = 0.849313, dev R = 0.809673, dev f1 = 0.826096
lr test P = 0.826737, test R = 0.793261, test f1 = 0.808103


In [20]:
# Soft-voting ensemble: take predict_proba from the selected MLP, the SVC
# model and the logistic-regression model, then average the three outputs
# with equal weight, separately for the train and test splits.
ensemble_members = (best_mpl_model, svc, lr)

resnet_mpl_proba_train, resnet_svc_proba_train, resnet_lr_proba_train = (
    member.predict_proba(X_train) for member in ensemble_members
)
ensemble_biz_proba_train = (
    resnet_mpl_proba_train + resnet_svc_proba_train + resnet_lr_proba_train
) / 3

resnet_mpl_proba_test, resnet_svc_proba_test, resnet_lr_proba_test = (
    member.predict_proba(X_test) for member in ensemble_members
)
ensemble_biz_proba_test = (
    resnet_mpl_proba_test + resnet_svc_proba_test + resnet_lr_proba_test
) / 3

In [22]:
# Turn the averaged probabilities into 0/1 predictions: an entry becomes 1.0
# when its ensemble probability is at least 0.40 and 0.0 otherwise. The
# binarised arrays are then scored with macro-averaged precision / recall / F1.
ensemble_biz_pred_train = np.where(ensemble_biz_proba_train >= 0.40, 1., 0.)
p_train, r_train, f1_train = metrics.precision_recall_fscore_support(
    y_train, ensemble_biz_pred_train, average='macro')[:3]
print('ensemble train P = %f, train R = %f, train f1 = %f' % (p_train, r_train, f1_train))

ensemble_biz_pred_test = np.where(ensemble_biz_proba_test >= 0.40, 1., 0.)
p_test, r_test, f1_test = metrics.precision_recall_fscore_support(
    y_test, ensemble_biz_pred_test, average='macro')[:3]
print('ensemble test P = %f, test R = %f, test f1 = %f' % (p_test, r_test, f1_test))

ensemble train P = 0.827673, train R = 0.900757, train f1 = 0.861940
ensemble test P = 0.794791, test R = 0.869366, test f1 = 0.828749
