In [4]:
import tensorflow as tf
import keras
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from tensorflow.keras.utils import normalize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,accuracy_score, f1_score 
from matplotlib import pyplot as plt
from scipy import stats
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [5]:
"""
Utility functions required for classification.
"""
def accuracy_f_score(y_pred,y_true):
    """
    Report classification quality on stdout.

    Prints the accuracy as a percentage rounded to two decimals, followed by
    the F1 score. Note the argument order: predictions first, true labels
    second. The F1 line is wrapped in ANSI escape sequences (code 92, then a
    reset) so terminals/notebooks that honour them render it highlighted.
    """
    accuracy_pct = round(accuracy_score(y_true, y_pred) * 100, 2)
    print(f"Accuracy score: {accuracy_pct}%")

    highlight_on, highlight_off = '\033[92m', '\033[0m'
    f_score = f1_score(y_true, y_pred)
    print(highlight_on + f"F1 score: {f_score}" + highlight_off)
    
def in_city(x_pred,y_pred):
    """
    Flag whether a coordinate pair lies inside the city-centre bounding box.

    Returns 1 when both coordinates fall within the box (limits inclusive),
    otherwise 0. The y-coordinate is only examined once the x-coordinate has
    been found to be in range.
    """
    if not (3750901.5068 <= x_pred <= 3770901.5069):
        return 0
    if not (-19268905.6133 <= y_pred <= -19208905.6133):
        return 0
    return 1

def sigmoid(x):
    """
    Logistic function 1 / (1 + e^(-x)), with e taken from np.exp(1).
    """
    euler = np.exp(1)
    return 1 / (1 + euler ** (-x))

def journey_time(x,y):
    """
    Elapsed time, in seconds, from timestamp `x` to timestamp `y`.

    Both arguments are parsed with pandas.to_datetime before subtracting;
    the result is negative when `y` is earlier than `x`.
    """
    start = pd.to_datetime(x)
    end = pd.to_datetime(y)
    elapsed = end - start
    return elapsed.total_seconds()

def to_binary(x):
    """
    Collapse per-row score vectors into class indices.

    Every element of `x` is passed to np.argmax; the resulting indices are
    returned as a list in the same order as the input rows.
    """
    return [np.argmax(row) for row in x]

In [6]:
from keras import backend as K

def f1(y_true, y_pred):
    """
    Batch-wise F1 score expressed with Keras backend ops.

    Precision (how many selected items are relevant) and recall (how many
    relevant items are selected) are both derived from tensors that are
    clipped to [0, 1] and rounded, so the value is an average over the
    current batch only. K.epsilon() is added to every denominator to avoid
    division by zero.
    """
    def _rounded_total(tensor):
        # Sum of the entries after clipping into [0, 1] and rounding.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    true_pos = _rounded_total(y_true * y_pred)

    # precision = true positives / predicted positives
    precision_val = true_pos / (_rounded_total(y_pred) + K.epsilon())
    # recall = true positives / possible positives
    recall_val = true_pos / (_rounded_total(y_true) + K.epsilon())

    return 2*((precision_val*recall_val)/(precision_val+recall_val+K.epsilon()))

In [13]:
# Load the full feature table; its x_exit / y_exit columns are used further
# down to derive the classification labels.
df = pd.read_csv('../data_train/all_features.csv')
# Show the first rows as the cell output.
df.head()

Unnamed: 0_level_0,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit,dist,...,prev_tr,x_home,y_home,nj,dist_pct_ch,j_time,dpc,time_x,time_y,home
trajectory_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
traj_0000a8602cf2def930488dee7cdad104_1_5,15:02:31,15:18:33,0.0,0.0,0.0,3744945.0,-19281830.0,3744785.0,-19281480.0,45797.982227,...,3544.948847,3751014.0,-19093980.0,6.0,-0.071843,962.0,0.482047,-151.0,-1113.0,0
traj_0000cf177130469eeac79f67b6bcf3df_9_3,15:00:32,15:29:48,1.149404,1.149404,1.149404,3749088.0,-19266050.0,3749610.0,-19265940.0,29603.985176,...,270.043451,3749450.0,-19265060.0,4.0,-0.009039,1756.0,0.49774,-32.0,-1788.0,0
traj_0001f97b99a80f18f62e2d44e54ef33d_3_1,14:34:35,15:19:51,30.167742,30.167742,30.167742,3758738.0,-19375940.0,3769687.0,-19142580.0,137051.659155,...,-1867.319643,3771461.0,-19104130.0,2.0,0.013813,2716.0,0.503453,1525.0,-1191.0,0
traj_0002124248b0ca510dea42824723ccac_31_10,15:28:54,15:28:54,0.0,0.0,0.0,3767866.0,-19177970.0,3767866.0,-19177970.0,61336.955341,...,-59655.060438,3765544.0,-19172270.0,9.0,35.468958,0.0,1.0,-1734.0,-1734.0,0
traj_000219c2a6380c307e8bffd85b5e404b_23_16,15:08:05,15:08:05,0.0,0.0,0.0,3747641.0,-19226950.0,3747641.0,-19226950.0,17851.785279,...,0.0,3760336.0,-19228180.0,8.0,0.0,0.0,0.5,-485.0,-485.0,1


In [16]:
"""
Compute y using 'x_exit' and 'y_exit'.
"""
# Label every row of the frame currently held in `df`: 1 if its exit point is
# inside the city centre (see in_city), 0 otherwise.
df["final_loc"] = [
    in_city(x_exit, y_exit)
    for x_exit, y_exit in zip(df["x_exit"], df["y_exit"])
]
y = df["final_loc"].values

# From here on `df` is re-bound to the pre-computed binary feature set.
# NOTE(review): this relies on the rows of binary_features.csv being in the
# same order as the frame `y` was derived from -- confirm.
df = pd.read_csv('../data_train/binary_features.csv') #binary features computed previously.

#df = df[(np.abs(stats.zscore(df)) < 5).all(axis=1)] #removes outlier if required.

# Remove the label column and the raw (unscaled) columns from the features.
df = df.drop(columns=["final_loc","j_time","dist_pct_ch","dist"])
df.head()

Unnamed: 0,dpc,home,start_CC,net_tr_b,prev_tr_b,odd_even_nj,dist_scaled,jt_scaled
0,0.482047,0,0,1,1,1,0.238078,0.050261
1,0.49774,0,0,0,1,1,0.153702,0.091745
2,0.503453,0,0,0,0,1,0.713536,0.141902
3,1.0,0,0,1,0,0,0.31904,0.0
4,0.5,1,0,0,0,1,0.09247,0.0


In [18]:
"""
Feature transformations -- Not required here when using the binary featureset.
"""
# NOTE(review): although the cell description says this step is not required
# for the binary feature set, these statements are live code. log10(|x|) sends
# both 0 and 1 to 0 (once -inf is replaced), so 0/1 indicator columns become
# constant -- confirm this is intended before relying on the result.
#
# Whole-frame operations replace the previous row-by-row `apply` calls; the
# values produced are the same, without a Python-level loop over rows.
with np.errstate(divide='ignore', invalid='ignore'):
    df = np.log10(df.abs())  # log10(0) yields -inf, handled on the next line
df = df.replace(-np.inf, 0.0)

In [19]:
from sklearn.preprocessing import MinMaxScaler
# The scaler is constructed but not applied: the fit_transform call at the
# bottom of this cell is commented out.
scaler = MinMaxScaler(feature_range=(-1,1))
# Feature matrix taken directly from the current contents of `df`.
X = df.values
# X = scaler.fit_transform(X) #similarly not required when using binary featureset.

In [None]:
"""
Here we can implement SMOTE or ADASYN for class-balancing. Both improved 
performance significantly within the training set tests. However, on the 
out-of-sample test-set performance was degraded. These methods are not 
implemented here.
"""
# Oversampling is opt-in. The description of this cell states that SMOTE /
# ADASYN are not used for the final model, so by default X and y are left
# exactly as produced by the preceding cells and a plain "Run All" does not
# silently resample the data.
# NOTE(review): when enabled, resampling happens before the train/test split,
# so synthetic samples can end up in the held-out split.
USE_OVERSAMPLING = False

if USE_OVERSAMPLING:
    from collections import Counter
    from imblearn.over_sampling import SMOTE, ADASYN

    s = SMOTE(sampling_strategy='minority', random_state=10, k_neighbors=3)
    #s = ADASYN()
    #df.fillna(0, inplace=True)
    X, y = s.fit_resample(df, y)

    # Class counts after resampling.
    print(sorted(Counter(y).items()))

In [20]:
"""
Implement learning-rate scheduler for learning-rate decay.
"""
from keras.callbacks import LearningRateScheduler
import math
def step_decay(epoch):
    """
    Step-wise learning-rate schedule.

    Starts at 0.005 and multiplies the rate by 0.5 once per block of 5
    epochs. The number of halvings is floor((1 + epoch) / 5), so with a
    0-based epoch index the first reduction applies at epoch 4.
    """
    base_rate, factor, period = .005, 0.5, 5
    halvings = math.floor((1 + epoch) / period)
    return base_rate * math.pow(factor, halvings)
# Callback wrapping step_decay so Keras can query it for the learning rate
# during training; it is passed to fit() via the callbacks list.
lrate = LearningRateScheduler(step_decay)

class LossHistory(keras.callbacks.Callback):
    """
    Keras callback that records, for every epoch, the training loss and the
    learning rate that step_decay prescribes for that epoch.

    Attributes (re-created at the start of every training run):
        losses: value of logs['loss'] at the end of each epoch.
        lr:     step_decay(epoch) for each completed epoch.
    """
    def on_train_begin(self, logs=None):
        # Fresh lists per training run; no mutable default arguments.
        self.losses = []
        self.lr = []

    def on_epoch_end(self, batch, logs=None):
        # Keras passes the 0-based epoch index in the `batch` position; the
        # parameter name is kept unchanged for backward compatibility.
        logs = logs or {}
        self.losses.append(logs.get('loss'))
        # Record the rate of the epoch that just finished. The previous
        # step_decay(len(self.losses)) was evaluated after the append above
        # and therefore logged the *next* epoch's rate.
        self.lr.append(step_decay(batch))

loss_history = LossHistory()
# `lrate` (the LearningRateScheduler) is already created earlier in this cell,
# so it is not constructed a second time here.

In [21]:
"""
Deep neural network model used for classification, consisting of an input layer,
an output layer and 3 fully-connected hidden layers. LeakyReLU used as 
activation function and dropout not used as it did not improve performance.
Instead 'class_weight' is used to provide information on imbalanced classes to 
the classifier, below.
"""
NN_model = Sequential()

# The Input Layer :
NN_model.add(Dense(256, kernel_initializer='normal', input_dim=X.shape[1], activation=None, name='layer1'))
NN_model.add(keras.layers.LeakyReLU(alpha=0.3, name='activation_1'))
NN_model.add(BatchNormalization(name='BN1'))
#NN_model.add(Dropout(0.1, name='DO1'))

# The Hidden Layers : three identical Dense(1024) -> LeakyReLU -> BatchNormalization
# stages. Layer names are unchanged: fc1..fc3, activation_2..activation_4, BN2..BN4.
for _stage in (1, 2, 3):
    NN_model.add(Dense(1024, kernel_initializer='normal', activation=None,
                       kernel_regularizer=keras.regularizers.l1(0.000),  # coefficient 0: no penalty
                       name=f'fc{_stage}'))
    NN_model.add(keras.layers.LeakyReLU(alpha=0.3, name=f'activation_{_stage + 1}'))
    NN_model.add(BatchNormalization(name=f'BN{_stage + 1}'))
    #NN_model.add(Dropout(0.1, name=f'DO{_stage + 1}'))

# The Output Layer :
NN_model.add(Dense(2, activation='softmax', name='classifier'))

# Start from the schedule's first value instead of 0. The LearningRateScheduler
# callback still sets the rate at every epoch, so training with that callback
# is unaffected; but with lr=0 the model would silently not learn if fit()
# were ever called without it.
adam = keras.optimizers.Adam(lr=step_decay(0), beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
# Compile the network :
NN_model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
NN_model.summary()



Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer1 (Dense)               (None, 256)               2304      
_________________________________________________________________
activation_1 (LeakyReLU)     (None, 256)               0         
_________________________________________________________________
BN1 (BatchNormalization)     (None, 256)               1024      
_________________________________________________________________
fc1 (Dense)                  (None, 1024)              263168    
_________________________________________________________________
activation_2 (LeakyReLU)     (None, 1024)              0         
_________________________________________________________________
BN2 (BatchNormalization)     (None, 1024)              4096      
_________________________________________________________________
fc2 

In [23]:
# Checkpointing: the file name pattern is filled in with the epoch number and
# validation loss; only epochs that improve val_loss are written.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' #name for saving model weights
checkpoint = ModelCheckpoint(
    checkpoint_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
callbacks_list = [checkpoint, loss_history, lrate]

# class-balancing: per-class weights handed to fit()
class_weight = {0: .5, 1: .66}

# Hold out 20% of the samples for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=410, shuffle=True
)

# converts y-labels to format required with loss function.
y_train = to_categorical(y_train)

# A further 20% of the training portion is used by Keras for validation.
NN_model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=256,
    validation_split=0.2,
    callbacks=callbacks_list,
    class_weight=class_weight,
)

Instructions for updating:
Use tf.cast instead.
Train on 85783 samples, validate on 21446 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.27170, saving model to Weights-001--0.27170.hdf5
Epoch 2/50

Epoch 00002: val_loss did not improve
Epoch 3/50

Epoch 00003: val_loss improved from 0.27170 to 0.17695, saving model to Weights-003--0.17695.hdf5
Epoch 4/50

Epoch 00004: val_loss improved from 0.17695 to 0.17429, saving model to Weights-004--0.17429.hdf5
Epoch 5/50

Epoch 00005: val_loss improved from 0.17429 to 0.15547, saving model to Weights-005--0.15547.hdf5
Epoch 6/50

Epoch 00006: val_loss improved from 0.15547 to 0.15494, saving model to Weights-006--0.15494.hdf5
Epoch 7/50

Epoch 00007: val_loss did not improve
Epoch 8/50

Epoch 00008: val_loss did not improve
Epoch 9/50

Epoch 00009: val_loss did not improve
Epoch 10/50

Epoch 00010: val_loss improved from 0.15494 to 0.14327, saving model to Weights-010--0.14327.hdf5
Epoch 11/50

Epoch 00011: val_loss did not im


Epoch 00040: val_loss improved from 0.13568 to 0.13525, saving model to Weights-040--0.13525.hdf5
Epoch 41/50

Epoch 00041: val_loss did not improve
Epoch 42/50

Epoch 00042: val_loss improved from 0.13525 to 0.13521, saving model to Weights-042--0.13521.hdf5
Epoch 43/50

Epoch 00043: val_loss did not improve
Epoch 44/50

Epoch 00044: val_loss did not improve
Epoch 45/50

Epoch 00045: val_loss did not improve
Epoch 46/50

Epoch 00046: val_loss did not improve
Epoch 47/50

Epoch 00047: val_loss did not improve
Epoch 48/50

Epoch 00048: val_loss did not improve
Epoch 49/50

Epoch 00049: val_loss did not improve
Epoch 50/50

Epoch 00050: val_loss did not improve


<keras.callbacks.History at 0x294fc6310f0>

In [25]:
# Checkpoint with the lowest val_loss in the training log above (epoch 42).
# NOTE(review): the file name embeds the val_loss of one particular run, so it
# will not exist after retraining -- update it to the new best checkpoint.
weights_file = 'Weights-042--0.13521.hdf5' # choose the best checkpoint 'Weights-049--0.09069.hdf5'
NN_model.load_weights(weights_file) # load it
# Re-compile with the same loss, optimizer and metric used when the model was built.
NN_model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

In [26]:
# Class predictions for the held-out split: softmax scores -> class indices.
pred = NN_model.predict(x_test)
pred_b = to_binary(pred)

# One-hot encode the labels only while they are still a flat label vector.
# Encoding an already one-hot array a second time yields shape (n, 2, 2), and
# the argmax taken in to_binary would then invert the labels, so re-running
# this cell used to report wrong scores without any error.
if np.ndim(y_test) == 1:
    y_test = to_categorical(y_test)
y_test_b = to_binary(y_test)

accuracy_f_score(pred_b,y_test_b)

Accuracy score: 89.28%
[92mF1 score: 0.8092109629039749[0m
