In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from keras.models import Sequential
from keras.models import Model as KerasModel
from keras.layers import Input, Dense, Activation, Reshape, Merge, Conv1D, MaxPooling1D, Flatten
from keras.layers import Concatenate
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint

import pickle
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [3]:
# Load the cleaned traffic events and report how many rows were read.
ahmd_traffic = pd.read_csv('data/ahmd_traffic_clean_df.csv')
sample_count = len(ahmd_traffic)
print("Total samples:", sample_count, sep="")
ahmd_traffic.head()

Total samples:401399


Unnamed: 0,Event Name,Bus no.,Route Name,Stop Name,weekday,hour
0,2,31,83,176,1,3
1,2,34,86,105,1,3
2,2,31,83,176,1,3
3,2,34,86,105,1,3
4,2,31,83,176,1,3


In [4]:
# Shuffle the rows with a fixed seed so the train/validation split below is
# reproducible across kernel restarts.
ahmd_traffic = ahmd_traffic.sample(frac=1, random_state=42).reset_index(drop=True)

# 'Event Name' is the prediction target; every remaining column is a
# categorical input feature.
# NOTE: this cell rebinds `ahmd_traffic` without the target column, so re-running
# it without first re-running the load cell fails on the 'Event Name' lookup.
Y_labels = ahmd_traffic['Event Name']
ahmd_traffic = ahmd_traffic.drop("Event Name", axis=1)
all_cols = ahmd_traffic.columns

#### Displaying the number of unique values in each categorical column:

In [5]:
# Cardinality of every feature column, printed as "<column> <count>".
unique_counts = [(column, ahmd_traffic[column].nunique()) for column in all_cols]
for column, n_unique in unique_counts:
    print(column, n_unique)

Bus no. 182
Route Name 119
Stop Name 182
weekday 7
hour 24


#### Training and validation split

In [6]:
# First 80% of the (already shuffled) rows train the model, the rest validate it.
feature_cutoff = int(0.8 * len(ahmd_traffic))
ahmd_traffic_train = ahmd_traffic.iloc[:feature_cutoff]
ahmd_traffic_val = ahmd_traffic.iloc[feature_cutoff:]

In [7]:
# Split the labels at the same 80% row position used for the features.
label_cutoff = int(0.8 * len(ahmd_traffic))
Y_train = Y_labels.iloc[:label_cutoff]
Y_val = Y_labels.iloc[label_cutoff:]

In [8]:
def preproc(X_train, columns=None, reference=None):
    """Integer-encode categorical columns for the Keras embedding inputs.

    Every selected column is mapped to contiguous codes ``0 .. n_categories - 1``
    because an embedding layer indexes its rows by position, so raw category
    values with gaps (e.g. 1, 2, 3, 5) cannot be fed in directly
    (see https://stackoverflow.com/a/45988584).

    Codes follow the *sorted* order of the category values rather than their
    order of first appearance, so the mapping does not depend on how the rows
    happen to be shuffled.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame whose columns are encoded.
    columns : iterable of str, optional
        Columns to encode, in output order. Defaults to the notebook-level
        ``all_cols``.
    reference : pandas.DataFrame, optional
        Frame that defines the category vocabulary. Defaults to ``X_train``
        itself. Pass the full dataset here when encoding a train or validation
        split so that both splits share one value -> code mapping.

    Returns
    -------
    list of numpy.ndarray
        One 1-D integer array per column, each with ``len(X_train)`` entries.

    Raises
    ------
    ValueError
        If a column of ``X_train`` holds a value (including a missing value)
        that is absent from the vocabulary; such a row would otherwise get the
        code -1, which is not a valid embedding index.
    """
    if columns is None:
        columns = all_cols
    if reference is None:
        reference = X_train

    input_list_train = []
    for c in columns:
        # Sorted unique values of the reference column define the code table.
        vocabulary = pd.factorize(np.asarray(reference[c]), sort=True)[1]
        codes = pd.Index(vocabulary).get_indexer(np.asarray(X_train[c]))
        if (codes < 0).any():
            raise ValueError(
                "column %r contains values that are missing from the reference vocabulary" % (c,)
            )
        input_list_train.append(codes)
    return input_list_train

In [9]:
# Encode the full frame once and split the codes by row position, so a given
# category value gets the same integer code in the training and validation
# inputs (encoding each split separately can assign different codes to the
# same value). This relies on `ahmd_traffic_train` being the leading rows of
# `ahmd_traffic`, which is how the split cell above builds it.
encoded_all = preproc(ahmd_traffic)
n_train_rows = len(ahmd_traffic_train)
ahmd_traffic_train_modified = [codes[:n_train_rows] for codes in encoded_all]
ahmd_traffic_val = [codes[n_train_rows:] for codes in encoded_all]

#### As expected, the processed training data is a list with 5 elements because we had 5 categorical columns:

In [10]:
# Number of encoded input arrays returned by preproc (one per feature column).
len(ahmd_traffic_train_modified)

5

#### The length of each array in the processed list is the same as the number of training rows we have:


In [11]:
# Row count of the first three encoded input arrays.
for position in range(3):
    print(len(ahmd_traffic_train_modified[position]))

321119
321119
321119


##### The code below adds an embedding network for each of the categorical variables. The embedding size is set according to the rule (as mentioned in the paper):

Embedding size = min(ceil(number of unique categories / 2), 50)

##### Each model is appended to a list named `models`, initialised in the cell below.

In [12]:
# Collects the embedding sub-models that are later merged into `full_model`.
models = []

In [13]:
# Build one small Sequential model (Embedding -> Reshape) per categorical column.
# The list is reset here so that re-running this cell does not append a second
# set of sub-models to `models`.
models = []

for categoical_var in all_cols :
    print ("------------------------------------------------------------------")
    print ("for categoical column ", categoical_var     )
    model = Sequential()
    no_of_unique_cat  = ahmd_traffic[categoical_var].nunique()
    print ("number of unique cat",no_of_unique_cat)

    # Rule of thumb: half the cardinality (rounded up), capped at 50 dimensions.
    embedding_size = int(min(np.ceil((no_of_unique_cat)/2), 50 ))
    print ("embedding_size set as ", embedding_size)

    # The table has one row more than the number of categories in the column.
    model.add(  Embedding( no_of_unique_cat+1, embedding_size, input_length = 1 ) )

    # Drop the length-1 sequence axis: (None, 1, embedding_size) -> (None, embedding_size).
    model.add(Reshape(target_shape=( [embedding_size] )))

    # summary() prints the table itself and returns None, so it is called
    # directly instead of being wrapped in print().
    model.summary()

    models.append( model )

------------------------------------------------------------------
for categoical column  Bus no.
number of unique cat 182
embedding_size set as  50
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 50)             9150      
_________________________________________________________________
reshape_1 (Reshape)          (None, 50)                0         
Total params: 9,150
Trainable params: 9,150
Non-trainable params: 0
_________________________________________________________________
None
------------------------------------------------------------------
for categoical column  Route Name
number of unique cat 119
embedding_size set as  50
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 50)             6000      
________________________________

In [14]:
# Concatenate the outputs of all embedding sub-models into one feature vector;
# this merged layer is the first layer of the combined network.
full_model = Sequential([Merge(models, mode='concat')])

  


In [15]:
# Inspect the merged model before the convolutional head is added.
full_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_1 (Merge)              (None, 166)               0         
Total params: 24,632
Trainable params: 24,632
Non-trainable params: 0
_________________________________________________________________


#### Now our input layers and embedding layers are done. We can build on those as with any other Keras sequential model.

In [16]:
# Width of the concatenated embedding vector. Reading it from the model (instead
# of hard-coding it) keeps this cell valid when a column's cardinality, and
# therefore its embedding size, changes.
merged_width = full_model.output_shape[-1]

# Treat the concatenated embeddings as a 1-D sequence with a single channel so
# that Conv1D can slide over it.
full_model.add(Reshape((merged_width, 1)))

# Three Conv1D(128 filters, kernel 5) + ReLU stages; the first two are followed
# by a pool of size 5, the last by a pool of size 1.
full_model.add(Conv1D(128, 5))
full_model.add(Activation('relu'))
full_model.add(MaxPooling1D(5))
full_model.add(Conv1D(128, 5))
full_model.add(Activation('relu'))
full_model.add(MaxPooling1D(5))
full_model.add(Conv1D(128, 5))
full_model.add(Activation('relu'))
full_model.add(MaxPooling1D(1))
full_model.add(Flatten())

# One softmax output unit per distinct label value.
full_model.add(Dense(Y_labels.nunique()))
full_model.add(Activation('softmax'))
full_model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [17]:
# Inspect the complete network after the convolutional head and output layer were added.
full_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_1 (Merge)              (None, 166)               0         
_________________________________________________________________
reshape_6 (Reshape)          (None, 166, 1)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 162, 128)          768       
_________________________________________________________________
activation_1 (Activation)    (None, 162, 128)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 32, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 28, 128)           82048     
_________________________________________________________________
activation_2 (Activation)    (None, 28, 128)           0         
__________

In [18]:
from  keras.utils.np_utils  import to_categorical

# Pin the one-hot width to the number of distinct labels, which is also the size
# of the model's output layer. Without it each call infers the width from the
# largest label present in that split, so the training targets, validation
# targets and model output could end up with different widths.
n_classes = int(Y_labels.nunique())
Y_train_cat = to_categorical(Y_train.tolist(), num_classes=n_classes)
Y_val_cat = to_categorical(Y_val.tolist(), num_classes=n_classes)

In [19]:
# Train for a single epoch; each list of encoded arrays supplies one input per
# embedding sub-model.
validation_pair = (ahmd_traffic_val, Y_val_cat)
history = full_model.fit(
    x=ahmd_traffic_train_modified,
    y=Y_train_cat,
    batch_size=64,
    epochs=1,
    validation_data=validation_pair,
)

Train on 321119 samples, validate on 80280 samples
Epoch 1/1


In [33]:
# Column order here is the order of the sub-models in `models` (both were built by iterating `all_cols`).
all_cols

Index(['Bus no.', 'Route Name', 'Stop Name', 'weekday', 'hour'], dtype='object')

#### Saving the trained embedding weights for clustering and visualizing later

In [35]:
# Pull the weight matrix out of the first layer (the Embedding) of each of the
# first five sub-models; their order follows `all_cols`.
(bus_embedding,
 route_embedding,
 stop_embedding,
 weekday_embedding,
 hour_embedding) = [sub_model.layers[0].get_weights()[0] for sub_model in models[:5]]

In [45]:
# Persist each embedding matrix. `ndarray.dump` writes a pickle of the array
# (the on-disk format is unchanged from before, despite the .npy suffix); the
# files are now opened in a `with` block so every handle is flushed and closed.
embedding_files = [
    ('data/embeddings/bus_embedding.npy', bus_embedding),
    ('data/embeddings/route_embedding.npy', route_embedding),
    ('data/embeddings/stop_embedding.npy', stop_embedding),
    ('data/embeddings/weekday_embedding.npy', weekday_embedding),
    ('data/embeddings/hour_embedding.npy', hour_embedding),
]
for embedding_path, embedding_matrix in embedding_files:
    with open(embedding_path, 'wb') as embedding_file:
        embedding_matrix.dump(embedding_file)