In [1]:
# Code for performing grid search on neural network parameters
# Haruto Nakai

from gensim.models import KeyedVectors
from gensim.models.keyedvectors import Word2VecKeyedVectors
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import json
import numpy
import math
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import Sequence, to_categorical
from keras import backend as K
import tensorflow as tf
from datetime import datetime
import pandas as pd

Using TensorFlow backend.


In [2]:
# Pretrained GoogleNews word2vec embeddings stored in word2vec binary format.
# NOTE(review): absolute, machine-specific path - adjust for your environment.
W2V_PATH = '/media/hnakai/Storage/wordvectors/GoogleNews-vectors-negative300.bin'
# Passed as gensim's `limit`, which caps how many word vectors are read.
W2V_LIMIT = 200000
google_vecs = KeyedVectors.load_word2vec_format(W2V_PATH, binary=True, limit=W2V_LIMIT)

In [3]:
# A token is a run of word characters, optionally followed by a single
# apostrophe or dash and another run of word characters (e.g. "don't",
# "well-done").  Written as a raw string so the regex backslashes are not
# interpreted as (invalid) string escapes; the pattern itself is unchanged.
tokenizer = RegexpTokenizer(r"\w+(?:(?:'|-)\w+)?")
# NLTK's English stop words, kept in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
#review_tokenize(text)
# Input: text - String containing review
# Output: tokens - Tokenized version of text
#                  All punctuation is removed except for dashes and apostrophes in words
#                  Stop Words removed
def review_tokenize(text):
    """Lower-case a review, split it into tokens and drop stop words.

    text: string containing the review.
    Returns: list of the remaining tokens, in their original order.
    """
    kept = []
    for word in tokenizer.tokenize(text.lower()):
        if word in stop_words:
            continue
        kept.append(word)
    return kept

#review_generator(start, end)
# Is a generator
# Inputs - start: index of the first line to yield, 0 (beginning) by default
#          end: index of the last line to yield (inclusive); read to the end of the file if -1 (default)
# Yields - For each review in the dataset file:
#            tokens - Tokenized version of text
#            stars - Score given by author
def review_generator(start=0, end=-1, path='yelp_dataset_small_sample.json'):
    """Yield (tokens, stars) for reviews stored one JSON object per line.

    start: 0-based index of the first line to yield (default 0).
    end:   0-based index of the last line to yield, inclusive; -1 (default)
           means read to the end of the file.
    path:  dataset file to read; defaults to the sample file this notebook
           has always used.

    Yields: (tokens, stars) where tokens is review_tokenize() applied to the
            line's "text" field and stars is its "stars" field.
    """
    # `with` closes the file once the generator finishes or is closed, instead
    # of leaving the handle open until garbage collection.
    with open(path) as reviews:
        for index, line in enumerate(reviews):
            if index >= start:
                record = json.loads(line)
                yield (review_tokenize(record["text"]), record["stars"])
            # Line `end` itself has just been handled, so stop after it.
            if end != -1 and index >= end:
                break
    
#w2v_vectorize(tokens)
# Input: tokens - Tokenized version of text
# Output: feat_vect - Average of vector representations for each word in text
def w2v_vectorize(tokens):
    """Average the word vectors of the tokens found in google_vecs.

    tokens: iterable of word strings.
    Returns: a numpy array of length 300 holding the mean of the vectors of
             all tokens that google_vecs knows, or None when none of the
             tokens has a vector (including an empty token list).
    """
    found = []
    for word in tokens:
        try:
            found.append(google_vecs[word])
        except KeyError:
            # word has no embedding; it does not count towards the average
            pass
    if not found:
        return None
    total = numpy.zeros(300)
    # accumulate in token order, exactly like a running sum
    for vector in found:
        total += vector
    total /= len(found)
    return total

#vect_rating_gen(start, end, batch_size)
# Inputs - start, end: same as review_generator()
#          batch_size: the number of vector/rating pair in each batch
# Yields - batch_features: a numpy array with one row per review vector (at most batch_size rows, 300 columns)
#          batch_ratings: a numpy array of one-hot encoded star ratings (5 columns), row-aligned with batch_features
def vect_rating_gen(start=0, end=-1, batch_size=1000):
    """Yield (features, ratings) batches built from review_generator().

    start, end: passed straight through to review_generator().
    batch_size: maximum number of reviews per batch.

    Yields: (batch_features, batch_ratings) where
        batch_features is a float array of shape (n, 300), one averaged
            word vector per row, and
        batch_ratings is a float array of shape (n, 5), the one-hot encoding
            of stars-1 (assumes star ratings run from 1 to 5).
        n equals batch_size except for a possibly shorter final batch.

    Reviews for which w2v_vectorize() returns None (no token has an
    embedding) are skipped rather than written into the batch, where numpy
    would coerce the None into a row of NaNs.
    """
    n_features, n_classes = 300, 5

    def new_batch():
        # A fresh pair of arrays per batch: previously yielded batches must not
        # be overwritten while the consumer may still be holding them.
        return (numpy.zeros((batch_size, n_features)),
                numpy.zeros((batch_size, n_classes)))

    batch_features, batch_ratings = new_batch()
    filled = 0
    for tokens, stars in review_generator(start, end):
        vector = w2v_vectorize(tokens)
        if vector is None:
            continue
        batch_features[filled] = vector
        batch_ratings[filled] = to_categorical(stars - 1, n_classes)
        filled += 1
        if filled >= batch_size:
            yield batch_features, batch_ratings
            batch_features, batch_ratings = new_batch()
            filled = 0
    # Emit whatever is left as a shorter final batch.
    if filled != 0:
        yield batch_features[:filled], batch_ratings[:filled]

#yelpSequence() - wrapper for vect_rating_gen() for parallel processing with Keras
class yelpSequence(Sequence):
    """Keras Sequence serving dataset lines start..end-1 in fixed-size batches.

    start:      0-based index of the first review line to serve.
    end:        index one past the last review line to serve; -1 (default)
                falls back to DEFAULT_END.
    batch_size: number of reviews requested per batch.

    Every batch is rebuilt from the dataset file from its own index, so any
    index can be requested any number of times, in any order.  The previous
    implementation ignored the index and drained a single generator, which is
    exhausted after one pass and cannot honour the index-based access that
    keras.utils.Sequence is documented to require.  The object no longer
    stores a generator attribute.
    """

    # Line count assumed when no explicit end is given.
    # NOTE(review): presumably the size of the sample dataset file - TODO confirm.
    DEFAULT_END = 19000

    def __init__(self, start=0, end=-1, batch_size=100):
        self.st, self.en = start, end
        if self.en == -1:
            self.en = self.DEFAULT_END
        self.batch_s = batch_size

    def __len__(self):
        """Number of batches; a plain int, as len() requires."""
        n_reviews = max(0, self.en - self.st)
        return int(numpy.ceil(n_reviews / float(self.batch_s)))

    def __getitem__(self, idx):
        """Return the (features, ratings) batch number idx (0-based).

        Raises IndexError when idx is outside 0..len(self)-1 or when the
        dataset yields no usable review for that range.
        """
        if idx < 0 or idx >= len(self):
            raise IndexError('batch index %d out of range' % idx)
        first = self.st + idx * self.batch_s
        # vect_rating_gen/review_generator treat their end line as inclusive,
        # hence the -1 to keep this sequence's own range half-open.
        last = min(first + self.batch_s, self.en) - 1
        for batch in vect_rating_gen(first, last, self.batch_s):
            # at most batch_s lines were requested, so the first batch is the
            # whole answer
            return batch
        raise IndexError('no reviews found for batch %d' % idx)

In [5]:
#Training

# Parameters
# NOTE(review): create_model() and nn_test() below take these as arguments with
# their own defaults, so editing the values here does not change what they use.
dropout = 0.5            # dropout rate
learning_rate = 5e-6     # same value as 0.000005
n_hidden = 5             # number of hidden layers
n_epochs = 20            # number of training epochs

#create_model(n_input)
# Input: n_input - number of inputs the resulting model should handle
# Output: model - compiled neural network model with n_input inputs and a 5-unit softmax output
#                 built from the dropout, learning_rate and n_hidden arguments
def create_model(n_input=300, dropout=0.5, learning_rate=0.000005, n_hidden=5):
    """Build and compile a fully connected softmax classifier.

    n_input:       number of input features.
    dropout:       rate given to the Dropout layer after every hidden layer.
    learning_rate: learning rate given to the RMSprop optimizer.
    n_hidden:      number of hidden layers; the first one is always added, so
                   values below 1 still produce one hidden layer.

    Every hidden layer has ceil(2/3 * n_input) ReLU units.
    Returns: the compiled Sequential model with a 5-unit softmax output,
             categorical cross-entropy loss and an accuracy metric.
    """
    width = int(math.ceil(float(n_input) * 2.0 / 3.0))
    net = Sequential()

    # First hidden layer: fixes the input size of the network.
    net.add(Dense(width, activation='relu', input_dim=n_input))
    net.add(Dropout(dropout))

    # Remaining hidden layers, each followed by dropout.
    for _ in range(n_hidden - 1):
        net.add(Dense(width, activation='relu'))
        net.add(Dropout(dropout))

    net.add(Dense(5, activation='softmax'))

    optimizer = RMSprop(lr=learning_rate)
    net.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
    return net

#nn_test(dropout, learning_rate, n_hidden, n_epochs)
# Inputs: parameters for the neural network
# Output: acc - list with format [loss, accuracy]
#         hist - a History object that contains validation accuracies/losses after each epoch in training
def nn_test(dropout=0.5, learning_rate=0.000005, n_hidden=5, n_epochs=100):
    """Train one network configuration and evaluate it on the test split.

    dropout, learning_rate, n_hidden: forwarded to create_model().
    n_epochs: number of training epochs.

    The splits are hard-coded yelpSequence ranges: (0, 15200) for training,
    (15200, 17100) for validation and (17100,) for testing.

    Returns: (acc, hist) where acc is the result of evaluate_generator() on
             the test split (the later cells of this notebook read it as
             [loss, accuracy]) and hist is the History object returned by
             fit_generator().
    """
    train_seq = yelpSequence(0, 15200)
    valid_seq = yelpSequence(15200, 17100)
    test_seq = yelpSequence(17100)

    # Start every run from a fresh TensorFlow graph and Keras session.
    tf.reset_default_graph()
    K.clear_session()
    K.set_session(tf.Session())

    net = create_model(300, dropout, learning_rate, n_hidden)
    fit_options = dict(epochs=n_epochs,
                       use_multiprocessing=True,
                       workers=16,
                       validation_data=valid_seq,
                       verbose=0)
    history = net.fit_generator(train_seq, **fit_options)
    scores = net.evaluate_generator(test_seq)
    return scores, history

In [6]:
# Hyper-parameter grid: dropout rates x learning rates x hidden-layer counts.
drs = [0, 0.1, 0.25, 0.5, 0.75, 0.9]
lrs = [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1]
hls = [1,2,3,4,5]
# Every (dropout, learning rate, hidden layers) combination, dropout varying slowest.
configs = [(dr, lr, hl) for dr in drs for lr in lrs for hl in hls]

# res_acc:  config -> evaluation result returned by nn_test()
# res_hist: config -> History object returned by nn_test()
res_acc, res_hist = {}, {}
for dr, lr, hl in configs:
    # Single-argument print(...) gives the same output on Python 2 and 3,
    # unlike the print statement, which is a syntax error on Python 3.
    print((dr, lr, hl))
    st_t = datetime.now()
    # n_epochs is not passed, so nn_test() uses its own default.
    acc, hist = nn_test(dr, lr, hl)
    print("%s %s" % (acc, datetime.now() - st_t))
    res_acc[(dr, lr, hl)] = acc
    res_hist[(dr, lr, hl)] = hist

(0, 1e-07, 1)
[1.5318041161486977, 0.4426315771905999] 0:06:26.198857
(0, 1e-07, 2)
[1.5670071526577598, 0.44631578734046534] 0:08:35.575425
(0, 1e-07, 3)
[1.564958509645964, 0.44684210262800517] 0:07:32.364283
(0, 1e-07, 4)
[1.5692729322533858, 0.4457894720529255] 0:08:05.448509
(0, 1e-07, 5)
[1.5710327374307733, 0.4457894736214688] 0:09:36.300811
(0, 1e-06, 1)
[1.373999181546663, 0.4473684194840883] 0:09:44.793102
(0, 1e-06, 2)
[1.3586083525105526, 0.4452631567653857] 0:08:49.128866
(0, 1e-06, 3)
[1.327427067254719, 0.4447368430463891] 0:07:23.745895
(0, 1e-06, 4)
[1.3170927888468693, 0.4442105277588493] 0:07:29.485090
(0, 1e-06, 5)
[1.2979943626805355, 0.44578947675855535] 0:07:38.999941
(0, 1e-05, 1)
[1.0879853775626736, 0.564736838403501] 0:07:40.293150
(0, 1e-05, 2)
[1.0386838379659151, 0.5799999942905024] 0:07:44.993078
(0, 1e-05, 3)
[1.1875887826869362, 0.5673684242524599] 0:07:47.825563
(0, 1e-05, 4)
[1.551643453146282, 0.5557894706726074] 0:07:51.834032
(0, 1e-05, 5)
[2.24086

[1.036196376148023, 0.570000000690159] 0:09:58.399879
(0.5, 1e-05, 3)
[1.0335498075736196, 0.5768421104079798] 0:10:10.577005
(0.5, 1e-05, 4)
[1.0391123357572054, 0.5594736823910161] 0:10:07.388329
(0.5, 1e-05, 5)
[1.1427177316264103, 0.5394736873476129] 0:10:24.192139
(0.5, 0.0001, 1)
[1.3100221439411766, 0.5605263176717257] 0:10:02.353256
(0.5, 0.0001, 2)
[2.8869348701677824, 0.5584210502473932] 0:10:14.887589
(0.5, 0.0001, 3)
[3.8815327945508455, 0.5689473669779929] 0:10:07.219955
(0.5, 0.0001, 4)
[4.472057053917332, 0.5668421026907469] 0:10:14.182798
(0.5, 0.0001, 5)
[4.6133035609596655, 0.5478947382224234] 0:10:19.627219
(0.5, 0.001, 1)
[4.027184047197041, 0.5315789486232557] 0:10:05.986385
(0.5, 0.001, 2)
[5.279657853277106, 0.5278947384733903] 0:10:17.368675
(0.5, 0.001, 3)
[4.965394885916459, 0.5494736809479562] 0:10:24.154885
(0.5, 0.001, 4)
[5.3372336939761515, 0.5257894757546877] 0:10:27.776707
(0.5, 0.001, 5)
[5.00573626317476, 0.529473685904553] 0:10:30.832221
(0.5, 0.01, 

In [16]:
# Rank the configurations by test accuracy (index 1 of each result), best
# first.  sorted() already returns a list, so no extra list() is needed.
sort_accs = sorted(res_acc.items(), key=lambda item: -item[1][1])
t_dict = {
    'Parameters': [str(params) for params, _ in sort_accs],
    'Accuracy': [scores[1] for _, scores in sort_accs],
    'Loss': [scores[0] for _, scores in sort_accs],
}
df_accs = pd.DataFrame(t_dict, columns=['Parameters', 'Accuracy', 'Loss'])
# The separator is passed by keyword: positional arguments after the path are
# deprecated in recent pandas releases.  The "csv" directory must already exist.
df_accs.to_csv("csv/yelp_accs.csv", sep=',')

[((0.25, 1e-05, 2), [1.0150066865117926, 0.5821052601462916]),
 ((0.25, 1e-05, 3), [1.0619752061994452, 0.5810526327082985]),
 ((0, 1e-05, 2), [1.0386838379659151, 0.5799999942905024]),
 ((0.1, 1e-05, 3), [1.0970749792299772, 0.5794736790029626]),
 ((0.9, 0.0001, 1), [1.0337722019145363, 0.5784210499964262]),
 ((0.1, 1e-05, 2), [1.0348515855638605, 0.5773684194213465]),
 ((0.5, 1e-05, 3), [1.0335498075736196, 0.5768421104079798]),
 ((0.25, 1e-05, 4), [1.1679851600998326, 0.5742105261275643]),
 ((0.75, 0.0001, 1), [1.14287731522008, 0.573684213977111]),
 ((0.25, 1e-05, 5), [1.2634757443478233, 0.5710526249910656]),
 ((0.75, 0.001, 2), [4.887982493952701, 0.5705263159776989]),
 ((0.5, 1e-05, 2), [1.036196376148023, 0.570000000690159]),
 ((0.5, 0.0001, 3), [3.8815327945508455, 0.5689473669779929]),
 ((0, 1e-05, 3), [1.1875887826869362, 0.5673684242524599]),
 ((0.5, 0.0001, 4), [4.472057053917332, 0.5668421026907469]),
 ((0.75, 0.0001, 2), [2.0337837244334973, 0.5663157795604906]),
 ((0, 1

In [35]:
# Collect the per-epoch validation accuracy and loss of every configuration,
# one column per configuration (named by the configuration tuple's str()).
config_names = [str(p) for p in configs]
t_val_acc, t_losses = {}, {}
for p, name in zip(configs, config_names):
    epoch_log = res_hist[p].history
    t_val_acc[name] = epoch_log['val_acc']
    t_losses[name] = epoch_log['val_loss']
# Columns are kept in grid order rather than dict order.
df_vacc = pd.DataFrame(t_val_acc, columns=config_names)
df_loss = pd.DataFrame(t_losses, columns=config_names)
# The separator is passed by keyword: positional arguments after the path are
# deprecated in recent pandas releases.  The "csv" directory must already exist.
df_vacc.to_csv("csv/yelp_val_accs.csv", sep=',')
df_loss.to_csv("csv/yelp_val_loss.csv", sep=',')