# Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score

%matplotlib inline
# Notebook-wide matplotlib defaults: 10x5 figures, and no smoothing between
# pixels when an array is rendered as an image.
plt.rcParams.update({
  'figure.figsize': (10.0, 5.0),
  'image.interpolation': 'nearest',
})

In [2]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, SimpleRNN, Activation, Masking
from keras.optimizers import Adam
from keras.callbacks import Callback
from keras.regularizers import l2

import keras.backend as K

Using Theano backend.
Using gpu device 0: GeForce GTX 670 (CNMeM is disabled, CuDNN 4007)


In [3]:
# Names of the nine binary label columns, in the order the model outputs them.
categories = """
  good_for_lunch
  good_for_dinner
  takes_reservations
  outdoor_seating
  restaurant_is_expensive
  has_alcohol
  has_table_service
  ambience_is_classy
  good_for_kids
""".split()

# Names of the 4096 per-row feature columns: 'f0', 'f1', ..., 'f4095'
# (presumably VGG activations, going by the variable name -- TODO confirm).
vgg_cols = ['f%d' % i for i in range(4096)]

In [19]:
def loss(ytrue, ypred):
  """Squared-error objective: total squared error divided by the batch size.

  Every sample's squared errors are summed over all of its non-batch
  dimensions and the per-sample totals are averaged over the batch. This is
  the same value as ``K.sum((ytrue - ypred) ** 2) / ytrue.shape[0]`` but it
  uses backend ops only, so it does not depend on the batch dimension being
  readable from ``ytrue.shape`` (it is not on backends with a static shape
  of ``None``).

  Args:
    ytrue: tensor of target values, batch on axis 0.
    ypred: tensor of predictions, same shape as ``ytrue``.

  Returns:
    Scalar backend tensor with the loss.
  """
  # Collapse everything except the batch axis so that one row == one sample.
  squared_error = K.batch_flatten(K.square(ytrue - ypred))
  per_sample = K.sum(squared_error, axis=-1)
  return K.mean(per_sample)

# Load train data

In [5]:
# data = pd.read_hdf('data/Xtrain_full_dataframe.ndf5', 'Xtrain')
# cases = data.business_id.unique()
# train_cases, val_cases = cases[:-400], cases[-400:]
# data_train = data[data.business_id.map(lambda v: v in train_cases)]
# data_val = data[data.business_id.map(lambda v: v in val_cases)]

# print len(data_train), len(data_val), len(data_train) + len(data_val), len(data)

# Read the pre-split train / validation frames from their HDF5 stores.
data_train = pd.read_hdf('data/Dataframe_train1596.h5', 'Xtrain')
data_val = pd.read_hdf('data/Dataframe_val400.h5', 'Xval')

# A "case" is one distinct business_id; each case can own many rows.
train_cases = data_train['business_id'].unique()
val_cases = data_val['business_id'].unique()
num_train = len(train_cases)
num_val = len(val_cases)

# Sanity check: number of cases, then number of rows, for train and val.
print('%d %d %d %d' % (num_train, num_val, len(data_train), len(data_val)))

1596 400 198809 35736


## Train / val generators

In [6]:
np.random.seed(0)  # fixed seed so the shuffling and sampling below are repeatable


def train_generator(n_batches, maxlen, dimin, dimout):
  """Endlessly yield shuffled (X, Y) training batches, one sequence per business.

  On every pass the businesses in ``train_cases`` are reshuffled and split
  into ``n_batches`` groups. For each business its rows of ``data_train``
  are put in random order, at most ``maxlen`` of them are kept, and unused
  time steps keep the padding value -1.

  Args:
    n_batches: number of batches per pass over ``train_cases``.
    maxlen: number of time steps in every sequence.
    dimin: number of features per time step (width of ``vgg_cols``).
    dimout: number of labels (width of ``categories``).

  Yields:
    (X, Y) where X has shape (batch, maxlen, dimin) and Y has shape
    (batch, dimout).
  """
  # Row positions of every business, computed once instead of re-scanning
  # the whole frame with a boolean mask for each case of each batch.
  rows_of = data_train.groupby('business_id').indices

  while True:
    shuffled = train_cases.copy()
    np.random.shuffle(shuffled)

    for batch in np.array_split(shuffled, n_batches):
      X = np.zeros((len(batch), maxlen, dimin)) - 1
      Y = np.zeros((len(batch), dimout))

      for idx, case in enumerate(batch):
        res = data_train.iloc[rows_of[case]]
        # Random subset when there are too many rows, plain shuffle otherwise.
        res = res.sample(min(len(res), maxlen))
        X[idx, :len(res), :] = np.array(res[vgg_cols])
        # The labels are read from the first sampled row.
        Y[idx] = np.array(res[categories])[0]
      yield (X, Y)


def val_generator(n_batches, maxlen, dimin, dimout):
  """Endlessly yield (X, Y) validation batches in a fixed order.

  ``val_cases`` is split into ``n_batches`` groups without shuffling. For
  each business the first ``maxlen`` rows of ``data_val`` are used, and
  unused time steps keep the padding value -1.

  Args:
    n_batches: number of batches per pass over ``val_cases``.
    maxlen: number of time steps in every sequence.
    dimin: number of features per time step (width of ``vgg_cols``).
    dimout: number of labels (width of ``categories``).

  Yields:
    (X, Y) where X has shape (batch, maxlen, dimin) and Y has shape
    (batch, dimout).
  """
  # Row positions of every business, computed once instead of re-scanning
  # the whole frame with a boolean mask for each case of each batch.
  rows_of = data_val.groupby('business_id').indices

  while True:
    for batch in np.array_split(val_cases, n_batches):
      X = np.zeros((len(batch), maxlen, dimin)) - 1
      Y = np.zeros((len(batch), dimout))

      for idx, case in enumerate(batch):
        # Deterministic truncation: keep the first maxlen rows only.
        res = data_val.iloc[rows_of[case][:maxlen]]
        X[idx, :len(res), :] = np.array(res[vgg_cols])
        Y[idx] = np.array(res[categories])[0]
      yield (X, Y)

In [7]:
np.random.seed(0)  # fixed seed so the batch shuffling below is repeatable


def train_generator_chunked(n_batches, maxlen, dimin, dimout):
  """Endlessly yield shuffled (X, Y) training batches built from chunk means.

  On every pass the businesses in ``train_cases`` are reshuffled and split
  into ``n_batches`` groups. A business with fewer than ``maxlen`` rows in
  ``data_train`` contributes its rows as they are, with the remaining time
  steps left at the padding value -1. Otherwise its rows are split, in
  their stored order, into ``maxlen`` consecutive chunks and every time
  step is the mean feature vector of one chunk. Rows are not shuffled
  within a business.

  Args:
    n_batches: number of batches per pass over ``train_cases``.
    maxlen: number of time steps in every sequence.
    dimin: number of features per time step (width of ``vgg_cols``).
    dimout: number of labels (width of ``categories``).

  Yields:
    (X, Y) where X has shape (batch, maxlen, dimin) and Y has shape
    (batch, dimout).
  """
  # Row positions of every business, computed once instead of re-scanning
  # the whole frame with a boolean mask for each case of each batch.
  rows_of = data_train.groupby('business_id').indices

  while True:
    shuffled = train_cases.copy()
    np.random.shuffle(shuffled)

    for batch in np.array_split(shuffled, n_batches):
      X = np.zeros((len(batch), maxlen, dimin)) - 1
      Y = np.zeros((len(batch), dimout))

      for idx, case in enumerate(batch):
        res = data_train.iloc[rows_of[case]]

        # The labels are read from the first row of the business.
        Y[idx] = np.array(res[categories])[0]

        feats = np.array(res[vgg_cols])
        if len(feats) < maxlen:
          # Too few rows: use them directly, the tail stays padded with -1.
          X[idx, :len(feats)] = feats
        else:
          # Enough rows: one time step per chunk, averaged over the chunk.
          for step, chunk in enumerate(np.array_split(feats, maxlen)):
            X[idx, step] = chunk.mean(axis=0)

      yield (X, Y)


def val_generator_chunked(n_batches, maxlen, dimin, dimout):
  """Endlessly yield (X, Y) validation batches built from chunk means.

  ``val_cases`` is split into ``n_batches`` groups without shuffling. A
  business with fewer than ``maxlen`` rows in ``data_val`` contributes its
  rows as they are, with the remaining time steps left at the padding
  value -1. Otherwise its rows are split, in their stored order, into
  ``maxlen`` consecutive chunks and every time step is the mean feature
  vector of one chunk.

  Args:
    n_batches: number of batches per pass over ``val_cases``.
    maxlen: number of time steps in every sequence.
    dimin: number of features per time step (width of ``vgg_cols``).
    dimout: number of labels (width of ``categories``).

  Yields:
    (X, Y) where X has shape (batch, maxlen, dimin) and Y has shape
    (batch, dimout).
  """
  # Row positions of every business, computed once instead of re-scanning
  # the whole frame with a boolean mask for each case of each batch.
  rows_of = data_val.groupby('business_id').indices

  while True:
    for batch in np.array_split(val_cases, n_batches):
      X = np.zeros((len(batch), maxlen, dimin)) - 1
      Y = np.zeros((len(batch), dimout))

      for idx, case in enumerate(batch):
        res = data_val.iloc[rows_of[case]]

        # The labels are read from the first row of the business.
        Y[idx] = np.array(res[categories])[0]

        feats = np.array(res[vgg_cols])
        if len(feats) < maxlen:
          # Too few rows: use them directly, the tail stays padded with -1.
          X[idx, :len(feats)] = feats
        else:
          # Enough rows: one time step per chunk, averaged over the chunk.
          for step, chunk in enumerate(np.array_split(feats, maxlen)):
            X[idx, step] = chunk.mean(axis=0)

      yield (X, Y)

# Define params

In [17]:
# maxlen: time steps per sequence; dimin: features per time step;
# dimout: number of labels predicted per sequence.
maxlen, dimin, dimout = 10, 4096, 9

# Define custom callback

In [12]:
class f1printerCallback(Callback):
  """Print the micro-averaged validation F1 after each epoch and keep the best weights.

  At the end of every epoch the whole validation set is drawn as a single
  batch from ``val_generator_chunked`` (using the notebook-level ``maxlen``,
  ``dimin`` and ``dimout``), the model's predictions are thresholded at 0.5
  and scored with ``f1_score(average='micro')``. Whenever the score beats
  the best one seen by this instance, the weights are saved to
  ``weights_path``.

  Args:
    weights_path: file the best weights are written to (overwritten on
      every improvement).

  Attributes:
    bestf1val: best validation F1 seen so far (starts at 0; it is not reset
      when training starts again).
    epochs: epoch indices for which a score was recorded; reset when
      training begins.
    history: dict with a 'val' list of per-epoch F1 scores; the 'train'
      list exists but is never filled. Reset when training begins.
  """

  def __init__(self, weights_path='models/lstm_basic.h5'):
    # Run the base-class constructor so any state it sets up is present.
    super(f1printerCallback, self).__init__()
    self.bestf1val = 0
    self.weights_path = weights_path

  def on_train_begin(self, logs=None):
    """Start with empty per-epoch records for this training run."""
    self.epochs = []
    self.history = {'train': [], 'val': []}

  def on_epoch_end(self, epoch, logs=None):
    """Score the validation set, report it, and checkpoint on improvement."""
    # n_batches == 1: the first batch is the entire validation set.
    vgen = val_generator_chunked(1, maxlen, dimin, dimout)
    x, val_true = next(vgen)
    val_pred = self.model.predict(x)

    # Turn the sigmoid outputs into hard 0/1 decisions.
    val_pred[val_pred >= .5] = 1
    val_pred[val_pred < .5] = 0
    valf1 = f1_score(val_true, val_pred, average='micro')

    print(' - val F1: %f' % valf1)

    if valf1 > self.bestf1val:
      print('F1 val score improved! From %f to %f. Saving model...' % (self.bestf1val, valf1))
      self.bestf1val = valf1
      self.model.save_weights(self.weights_path, overwrite=True)

    self.epochs.append(epoch)
    self.history['val'].append(valf1)

In [13]:
# One callback instance for the training run below; it tracks the best
# validation F1 and the per-epoch F1 history.
cb = f1printerCallback()

# RNN model part

In [20]:
# Multi-label sequence classifier: masking -> SimpleRNN -> one sigmoid per label.
model = Sequential()
# Time steps filled with -1 (the padding value written by the generators)
# are masked so the recurrent layer skips them.
model.add(Masking(mask_value=-1, input_shape=(maxlen, dimin)))
# Alternative recurrent layer kept for reference:
# model.add(LSTM(
#     100, 
#     input_dim=dimin, 
#     input_length=maxlen, 
# #     W_regularizer=l2(), 
# #     U_regularizer=l2(), 
#     dropout_W=0.5, 
#     dropout_U=0.5,
#     return_sequences=True
# ))
model.add(SimpleRNN(
    100, 
    input_dim=dimin, 
    input_length=maxlen, 
    dropout_W=0.5, 
    dropout_U=0.5,
))
# model.add(Dense(100, activation='relu', W_regularizer=l2()))
# The output width follows dimout so it always matches the generators' Y.
model.add(Dense(dimout, activation='sigmoid', W_regularizer=l2()))
# model.add(Dense(dimout, activation='sigmoid'))
# model.compile(loss=loss, optimizer=Adam(lr=0.0003))
model.compile(loss=loss, optimizer='adam')

In [21]:
# Train from the chunked generators. Each epoch consumes num_train training
# sequences, delivered in 40 batches per pass over the training businesses;
# validation consumes num_val sequences, delivered in 5 batches per pass.
# Up to 100 epochs are requested; the recorded run below was interrupted by
# hand during epoch 34. `cb` prints the validation F1 and saves the best weights.
h = model.fit_generator(
  # Non-chunked alternative:
  # generator=train_generator(50, maxlen, dimin, dimout), 
  generator=train_generator_chunked(40, maxlen, dimin, dimout), 
  samples_per_epoch=num_train,
  validation_data=val_generator_chunked(5, maxlen, dimin, dimout),
  nb_val_samples=num_val,
  nb_epoch=100,
  callbacks=[cb]
)

Epoch 1/100
F1 val score improved! From 0.000000 to 0.693953. Saving model...
Epoch 2/100
F1 val score improved! From 0.693953 to 0.713481. Saving model...
Epoch 3/100
Epoch 4/100
F1 val score improved! From 0.713481 to 0.734830. Saving model...
Epoch 5/100
F1 val score improved! From 0.734830 to 0.738323. Saving model...
Epoch 6/100
F1 val score improved! From 0.738323 to 0.739625. Saving model...
Epoch 7/100
F1 val score improved! From 0.739625 to 0.744271. Saving model...
Epoch 8/100
Epoch 9/100
Epoch 10/100
F1 val score improved! From 0.744271 to 0.754660. Saving model...
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
F1 val score improved! From 0.754660 to 0.757895. Saving model...
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100

KeyboardInterrupt: 