## Correct path prediction :

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime

In [102]:
import tensorflow as tf
import kerastuner as kt
from tensorflow .data import Dataset
from tensorflow.keras import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, RMSprop

In [3]:
# tf.test.is_gpu_available() is deprecated; query the physical device list instead.
# Evaluates to True when at least one GPU is visible to TensorFlow.
len(tf.config.list_physical_devices('GPU')) > 0

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [4]:
# Read the cleaned GPS records in a single pass (low_memory=False) so that column
# dtypes are inferred from the whole file, then print the frame summary.
picktime = pd.read_csv(
    filepath_or_buffer='../dataPreprocessing/data/gps_clean.csv',
    low_memory=False,
)
picktime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1950296 entries, 0 to 1950295
Data columns (total 12 columns):
 #   Column         Dtype  
---  ------         -----  
 0   vehicle_id     int64  
 1   line_id        int64  
 2   latitude       float64
 3   longitude      float64
 4   datetime       object 
 5   station_id     int64  
 6   vehicle_type   float64
 7   sequence_id    object 
 8   order          int64  
 9   line_label     object 
 10  datetime_diff  float64
 11  outlier        int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 178.6+ MB


In [5]:
# line_id -> integer class code, numbered in the order returned by unique()
target_encoder = {target: code for code, target in enumerate(picktime.line_id.unique())}
# integer class code -> line_id (inverse of the mapping above)
target_decoder = {code: target for target, code in target_encoder.items()}

In [6]:
# Count of distinct line_id values in the cleaned GPS data
nb_targets = picktime['line_id'].nunique()

In [9]:
# Number of distinct line_id values (one target class each)
nb_targets

43

In [8]:
# Reference line for every sequence: the first line_id recorded for each sequence_id.
# The result is a Series indexed by sequence_id.
correct_lineid = picktime.groupby('sequence_id')['line_id'].first()

### DATA ETL :

In [44]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import gc
import random
from time import time
import os

fullpick_dir = '../dataPreprocessing/data/fullpick'

# List the directory once and keep only the CSV chunks, in a deterministic order.
# The previous `os.listdir(...)[1:]` depended on an arbitrary listing order, assumed the
# first entry was a non-data file, and listed the directory twice (so `names` and
# `filenames` were only aligned by luck).
chunk_files = sorted(entry for entry in os.listdir(fullpick_dir) if entry.endswith('.csv'))
filenames = [os.path.join(fullpick_dir, entry) for entry in chunk_files]
names = [os.path.splitext(entry)[0] for entry in chunk_files]

# The ETL draws random samples (history selection, truncation of long windows);
# seed the generator so a re-run produces the same arrays.
random.seed(42)

print('---------------------------ETL------------------------------', end='\n')
print('\n\n',end='')

data_length_sum      = 0      # total number of generated samples (all chunks)
features_records_sum = 0      # total number of time-step records (all chunks)
maxlength_exeeded    = 0      # number of windows longer than sequence_fixedSize
sequence_fixedSize   = 200    # fixed number of records per generated window
history_track_rate   = 0.25   # share of each past stop's records carried into later windows

#columns rescaled to [0, 1] before the sequences are built
scaled_columns = ['line_id', 'latitude', 'longitude', 'direction', 'vehicle_type', 'timestamp']
#values stored per GPS record : latitude, longitude, direction, timestamp, vehicle_type, line_id
nb_features = 6

for c, filename in enumerate(filenames):
    print('loading chunk ',names[c], end=' ... ')
    start = time()
    #load chunk
    fullpick = pd.read_csv(filename)
    stop = time()
    print('done in {:.2f} second(s)'.format(stop-start))

    print('chunk preprocessing', end=' ... ')
    start = time()
    #convert str to datetime (vectorised parse instead of one strptime call per row)
    fullpick['datetime'] = pd.to_datetime(fullpick['datetime'], format='%Y-%m-%d %H:%M:%S')

    #timestamp feature : seconds elapsed since 1970-01-01
    fullpick['timestamp'] = (fullpick['datetime'] - pd.Timestamp('1970-01-01')).dt.total_seconds()

    #map true line_id
    fullpick['correct_line_id'] = fullpick.sequence_id.map(correct_lineid)   #affect target (True line_id)
    fullpick['correct_line_id'] = fullpick.correct_line_id.map(target_encoder)    #target label encoder

    #scale data between 0 and 1
    #NOTE(review): the scaler is re-fitted on every chunk, so the same raw value can be
    #mapped to different scaled values in different chunks - confirm this is intended.
    scaler = MinMaxScaler()
    for column in scaled_columns:
        fullpick[column] = scaler.fit_transform(fullpick[[column]]).ravel()

    #grouping sequences : one row per (sequence_id, station_id) with the records kept as lists
    grouping_dict = {'sequence_id':'first', 'station_id':'first', 'line_id':'first', 'correct_line_id':'first', 'vehicle_type':'first',
                     'latitude':list, 'longitude':list, 'direction':list, 'timestamp':list }
    fullpick = fullpick.set_index('datetime').groupby(['sequence_id','station_id'], as_index=False).agg(grouping_dict).reset_index(drop = True)
    stop = time()
    print('done in {:.2f} second(s)'.format(stop-start))

    print('generating stop steps sequences', end=' ... ')
    #generating X_train and y_train time step squences
    start = time()
    labels = []
    sequence_timestep = {}     #sequence_id -> list of stops, each stop being a list of records

    for sequence_id, station_id, line_id, correct_line_id, vehicle_type, latitude, longitude, direction, timestamp in fullpick.values :
        #build a NEW row for every record ; reusing one shared array made every
        #appended row alias the same object (all rows ended up equal to the last record)
        samples = [[lat, lon, heading, ts, vehicle_type, line_id]
                   for lat, lon, heading, ts in zip(latitude, longitude, direction, timestamp)]
        sequence_timestep.setdefault(sequence_id, []).append(samples)
        labels.append(correct_line_id)

    #add history of past time steps to sequences, then pad / truncate to a fixed size
    for timesteps in sequence_timestep.values() :    #loop on each sequence
        history = []         #random share (history_track_rate) of the records of each past stop
        for i, timestep in enumerate(timesteps) :     #loop on each sequence time step
            #history goes first ; `+` builds a new list so `timestep` itself is never
            #modified (in-place padding used to leak zero rows into the history sample)
            window = history + timestep

            #if the window is shorter than sequence_fixedSize, add zero padding at the end
            if len(window) < sequence_fixedSize :
                window += [[0.0] * nb_features for _ in range(sequence_fixedSize - len(window))]

            #if the window exceeds sequence_fixedSize, keep a random selection of that size
            elif len(window) > sequence_fixedSize :
                window = random.sample(window, sequence_fixedSize)
                maxlength_exeeded += 1

            timesteps[i] = np.asarray(window, dtype=float)

            #carry a random share of the current (unpadded) stop into the next windows
            k = round(len(timestep) * history_track_rate)
            history += random.sample(timestep, k)

    #stack every window of every sequence, in the same order the labels were collected
    features = np.array([window for timesteps in sequence_timestep.values() for window in timesteps],
                        dtype=float)
    labels = np.array(labels).reshape(-1,1)
    assert features.shape[0] == labels.shape[0], 'features and labels are misaligned'
    stop = time()
    print('done in {:.2f} second(s)'.format(stop-start))

    print('extracting preprocessed numpy array data', end=' ... ')
    #export data ready to consume by TF models
    start = time()
    np.save('../dataPreprocessing/data/Fullpick_ETL/features/'+names[c]+'.npy', features)
    np.save('../dataPreprocessing/data/Fullpick_ETL/labels/'+names[c]+'.npy', labels)
    stop = time()
    print('done in {:.2f} second(s)'.format(stop-start))

    #sequences and records data stats
    tot_recs = sum(len(window) for window in features)
    features_records_sum += tot_recs
    data_length_sum += labels.shape[0]

    print('chunk statistics report : ')
    print('                          * total records sum = ',tot_recs)
    print('                          * data length       = ',labels.shape[0])

    #delete temp objects (the large per-chunk structures) before loading the next chunk
    print('delete temp objects', end=' ... ')
    del features
    del labels
    del sequence_timestep
    del fullpick
    gc.collect()
    print('done', end='\n\n')

    print('------------------------------------------------------------------------', end='\n\n')

# Totals accumulated over every chunk processed by the ETL loop
print('Final report : ')
print('               * total records sum = ',features_records_sum)
print('               * data length       = ',data_length_sum)
# maxlength_exeeded is counted in the loop but was never reported
print('               * truncated windows = ',maxlength_exeeded)

---------------------------ETL------------------------------


loading chunk  fullpick_chunk1 ... done in 15.24 second(s)
chunk preprocessing ... done in 294.01 second(s)
generating stop steps sequences ... done in 65.97 second(s)
extracting preprocessed numpy array data ... done in 63.42 second(s)
chunk statistics report : 
                          * total records sum =  14157000
                          * data length       =  70785
delete temp objects ... done

------------------------------------------------------------------------

loading chunk  fullpick_chunk10 ... done in 33.53 second(s)
chunk preprocessing ... done in 206.15 second(s)
generating stop steps sequences ... done in 48.55 second(s)
extracting preprocessed numpy array data ... done in 9.70 second(s)
chunk statistics report : 
                          * total records sum =  9943400
                          * data length       =  49717
delete temp objects ... done

--------------------------------------------------

### Extract a stratified (with respect to line_id) sequence sample (10% of data)

In [10]:
import os
import re

features_path = '../dataPreprocessing/data/Fullpick_ETL/features/'
labels_path = '../dataPreprocessing/data/Fullpick_ETL/labels/'

# Keep only the exported .npy chunks: any other entry (e.g. a hidden checkpoint folder)
# has no chunk number and would make the numeric sort key raise IndexError.
# Chunks are ordered by the first number found in their file name.
filenames = sorted(
    (entry for entry in os.listdir(features_path) if entry.endswith('.npy')),
    key = lambda x : int(re.findall(r'\d+',x)[0]),
)

# Feature and label files share the same file names, so both lists stay aligned
features_names = [ features_path+filename for filename in filenames ]
label_names = [ labels_path+filename for filename in filenames ]

In [16]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# zip() has no length, so give tqdm the total to get a real progress bar
for feature_filename, label_filename in tqdm(zip(features_names, label_names), total=len(features_names)):

    #load data (plain numeric arrays : pickle support is not needed and is left disabled)
    features = np.load(feature_filename, allow_pickle=False)
    labels = np.load(label_filename, allow_pickle=False)

    #stratified data sampling : keep 10% of the chunk with the same label proportions ;
    #a fixed random_state makes the extracted sample reproducible
    _, stratified_features, _, stratified_labels  = train_test_split(
        features, labels, test_size=0.1, stratify=labels, random_state=42)

    #save stratified samples (create the output folders on a fresh run)
    stratified_feature_file = feature_filename.replace('features','startified_sample/features')
    stratified_label_file = label_filename.replace('labels','startified_sample/labels')
    os.makedirs(os.path.dirname(stratified_feature_file), exist_ok=True)
    os.makedirs(os.path.dirname(stratified_label_file), exist_ok=True)
    np.save(stratified_feature_file, stratified_features)
    np.save(stratified_label_file, stratified_labels)

24it [06:31, 16.29s/it]


In [18]:
# Sanity check: load one stratified feature chunk and show its dimensions
chunk = np.load(file='../dataPreprocessing/data/Fullpick_ETL/startified_sample/features/fullpick_chunk1.npy')
np.shape(chunk)

(7079, 200, 6)

### Join all stratified samples into one train/test sample

In [36]:
import os
import re

features_path = '../dataPreprocessing/data/Fullpick_ETL/startified_sample/features/'
labels_path = '../dataPreprocessing/data/Fullpick_ETL/startified_sample/labels/'

# Keep only the .npy chunks: any other entry (e.g. a hidden checkpoint folder) has no
# chunk number and would make the numeric sort key raise IndexError.
# Chunks are ordered by the first number found in their file name.
filenames = sorted(
    (entry for entry in os.listdir(features_path) if entry.endswith('.npy')),
    key = lambda x : int(re.findall(r'\d+',x)[0]),
)

# Feature and label files share the same file names, so both lists stay aligned
features_names = [ features_path+filename for filename in filenames ]
label_names = [ labels_path+filename for filename in filenames ]

In [37]:
# Collect every chunk first and concatenate once: calling np.append inside the loop
# re-copies the whole accumulated array at every iteration.
feature_parts = []
label_parts = []
for feature_filename, label_filename in tqdm(zip(features_names, label_names), total=len(features_names)):
    #load data (plain numeric arrays : pickle support is not needed and is left disabled)
    feature_parts.append(np.load(feature_filename, allow_pickle=False))
    #labels are flattened so the merged target is 1-D whatever the number of files
    label_parts.append(np.load(label_filename, allow_pickle=False).ravel())

if not feature_parts:
    raise FileNotFoundError('no stratified sample file found in ' + features_path)

data = np.concatenate(feature_parts, axis=0)
target = np.concatenate(label_parts)
del feature_parts, label_parts

print('sample length : ', len(target))

24it [00:21,  1.13it/s]

sample length :  139397





In [38]:
# Merged stratified features: the chunks are stacked along axis 0
data.shape

(139397, 200, 6)

In [39]:
# Merged labels: should have the same length as the first axis of `data`
target.shape

(139397,)

In [40]:
extraction_path = '../dataPreprocessing/data/Fullpick_ETL/startified_sample/merged_sample/'

# Persist the merged sample; np.save appends the .npy extension to each file stem
for stem, array in (('features', data), ('labels', target)):
    np.save(extraction_path+stem, array)

### Model training and validation

In [41]:
# Reload the merged sample so that training can start from this cell.
# The files hold plain numeric arrays, so pickle support stays disabled
# (allow_pickle=True is not needed and would allow code execution on load).
data = np.load('../dataPreprocessing/data/Fullpick_ETL/startified_sample/merged_sample/features.npy', allow_pickle=False)
target = np.load('../dataPreprocessing/data/Fullpick_ETL/startified_sample/merged_sample/labels.npy', allow_pickle=False)

In [42]:
# Shape of the feature array reloaded from merged_sample/features.npy
data.shape

(139397, 200, 6)

In [92]:
#model architecture keras API

def generate_model(hp=None, num_classes=43, input_shape=(200, 6)):
    """Build and compile the CNN + GRU sequence classifier.

    A Conv1D/MaxPooling branch and a GRU branch both read the input sequence; their
    outputs are concatenated and passed through a stack of dense blocks that ends in
    a softmax over the target classes.

    Parameters
    ----------
    hp : keras-tuner HyperParameters, optional
        When given, every hyper parameter is drawn from `hp`; otherwise the defaults
        written at the top of the function are used.
    num_classes : int
        Size of the softmax output layer.
    input_shape : tuple
        Shape of one sample, (time steps, features per record).

    Returns
    -------
    A compiled Keras `Model` (Adam optimizer, sparse categorical cross-entropy loss,
    sparse categorical accuracy metric).
    """
    #default hyper parameters
    num_stacked_dense = 1
    droput_rate       = 0.2
    gru_units         = 128
    dense_units       = 128
    learning_rate     = 0.002
    num_stacked_gru   = 0
    leakyRelu_alpha   = 0
    num_stacked_convolution = 1
    num_convolution_kernels = 32

    #explicit None test : do not rely on the truthiness of the HyperParameters object
    if hp is not None:
        num_stacked_dense = hp.Choice('num_stacked_dense', values=[1, 2, 3])
        num_stacked_gru   = hp.Choice('num_stacked_gru', values=[0, 1])
        leakyRelu_alpha   = hp.Float('leakyRelu_alpha', min_value=0 , max_value=0.3)
        droput_rate       = hp.Float('droput_rate', min_value=0.1 , max_value=0.5)
        learning_rate     = hp.Float('learning_rate', min_value=0.001, max_value=0.01)
        gru_units         = hp.Int('gru_units', min_value=64, max_value=256, step=32)
        dense_units       = hp.Int('dense_units', min_value=128, max_value=320, step=32)
        num_stacked_convolution = hp.Choice('num_stacked_convolution', values=[1, 2, 3])
        num_convolution_kernels = hp.Int('num_convolution_kernels', min_value=16, max_value=64, step=16)

    inputs = layers.Input(shape=input_shape)

    #CNN + Maxpool block
    #The convolution branch reads the first 4 features of every record. The previous
    #`inputs[:4]` sliced the BATCH axis (first 4 samples) instead of the feature axis.
    #NOTE(review): assumes the intent was "first 4 features" - confirm.
    num_conv_features = 4
    x_conv = inputs[:, :, :num_conv_features]
    for _ in range(num_stacked_convolution):
        x_conv = layers.Conv1D(num_convolution_kernels, 3, activation='relu')(x_conv)
        x_conv = layers.Conv1D(num_convolution_kernels, 3, activation='relu')(x_conv)
        x_conv = layers.MaxPooling1D(2)(x_conv)
    x_conv = layers.Flatten()(x_conv)

    #GRU layers Block : num_stacked_gru sequence-returning layers followed by one final
    #layer. Every layer consumes the previous one (the last layer used to be fed
    #`inputs`, which discarded the layers stacked before it).
    x = inputs
    for _ in range(num_stacked_gru):
        x = layers.GRU(units=gru_units, activation='tanh', return_sequences=True)(x)
    x = layers.GRU(units=gru_units, activation='tanh', return_sequences=False)(x)

    #reduce output size of convolution block outputs same as the gru outputs
    x_conv = layers.Dense(gru_units, activation='linear')(x_conv)
    x_conv = layers.BatchNormalization()(x_conv)
    x_conv = layers.LeakyReLU(leakyRelu_alpha)(x_conv)

    x = layers.concatenate([x_conv, x], axis=1)

    #Dense layers block : each block consumes the output of the previous one (every
    #block used to restart from the concatenation, so only the last one had an effect)
    for _ in range(num_stacked_dense):
        x = layers.Dense(dense_units, activation='linear')(x)
        x = layers.BatchNormalization()(x)
        x = layers.LeakyReLU(leakyRelu_alpha)(x)
        x = layers.Dropout(droput_rate)(x)
    #output layer : class probabilities
    out = layers.Dense(num_classes, activation='softmax')(x)

    #build model
    model = Model(inputs, out)
    #model compilation
    optimizer = Adam(learning_rate=learning_rate)
    #the output layer already applies softmax, so the loss must not treat it as logits
    loss = SparseCategoricalCrossentropy(from_logits=False)
    metric = SparseCategoricalAccuracy()
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    return model

In [115]:
from sklearn.model_selection import KFold
import gc

class CVTuner(kt.engine.tuner.Tuner):
    """Tuner that scores each trial by k-fold cross-validation.

    For every trial a fresh model is built and trained on each fold; the value reported
    to the oracle is the mean, over the folds, of the best validation accuracy reached
    during training.
    """

    #run trial override
    def run_trial(self, trial, X, y, epochs=1, verbose=True, callbacks=None, batch_size=512, k=5):
        """Train and evaluate one hyper parameter set on k folds.

        Parameters
        ----------
        trial : trial object supplied by the tuner (its hyperparameters build the model).
        X, y : arrays indexable by integer index arrays (features and labels).
        epochs, verbose, callbacks : forwarded to `model.fit`.
        batch_size : batch size of the tf.data pipelines.
        k : number of cross-validation folds.
        """
        fold_scores = []      #best validation accuracy of every fold
        model = None

        #KFold is not shuffled : every validation fold is a contiguous block of X
        kf = KFold(n_splits=k)
        for k_iter, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
            gc.collect()
            print('')
            print('------KFold--CrossValidation--iteration--N°{}------'.format(k_iter), end='\n\n')
            #kfold cross-validation split
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            #tensorflow data pipeline
            train_set, validation_set = self.__tensorflow_dataPipeLine(X_train, y_train, X_val, y_val, batch_size)

            #a new model per fold, so no weights leak from one fold to the next
            model = self.hypermodel.build(trial.hyperparameters)
            hist = model.fit(train_set,
                      validation_data=validation_set,
                      epochs=epochs,
                      verbose = verbose,
                      callbacks=callbacks)

            fold_scores.append(max(hist.history['val_sparse_categorical_accuracy']))

        self.oracle.update_trial(trial.trial_id, {'val_sparse_categorical_accuracy':np.mean(fold_scores)})
        #only the model trained on the last fold is saved for the trial
        self.save_model(trial.trial_id, model)
        gc.collect()

    def __tensorflow_dataPipeLine(self, X_train, y_train, X_val, y_val, batch_size):
        """Wrap the fold arrays into batched, prefetched tf.data datasets.

        Returns the tuple (train_set, validation_set).
        """
        #Shuffle individual samples BEFORE batching ; shuffling after `batch` only
        #permutes whole batches. The buffer holds as many samples as the previous
        #1000-batch buffer did.
        shuffle_buffer = max(1, min(len(X_train), 1000 * batch_size))

        dataset = Dataset.from_tensor_slices((X_train, y_train))
        dataset = dataset.shuffle(shuffle_buffer).batch(batch_size)
        #two passes over the training fold per Keras epoch (kept from the original pipeline)
        dataset = dataset.repeat(2)
        train_set = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        #validation needs neither shuffling nor repetition : one ordered pass gives
        #the same accuracy at half the cost
        dataset = Dataset.from_tensor_slices((X_val, y_val))
        dataset = dataset.batch(batch_size)
        validation_set = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        return train_set, validation_set

In [116]:
# Bayesian-optimisation oracle: maximise the cross-validated accuracy over 20 trials
bayesian_oracle = kt.oracles.BayesianOptimization(
    objective=kt.Objective("val_sparse_categorical_accuracy", direction="max"),
    max_trials=20,
)

# Cross-validating tuner; overwrite=True discards results of a previous run
tuner = CVTuner(
    oracle=bayesian_oracle,
    hypermodel=generate_model,
    directory='./tuner',
    project_name='path_perdiction',
    overwrite=True,
)

In [117]:
# Print the hyper parameters registered by the hypermodel and their ranges
tuner.search_space_summary()

Search space summary
Default search space size: 9
num_stacked_dense (Choice)
{'default': 1, 'conditions': [], 'values': [1, 2, 3], 'ordered': True}
num_stacked_gru (Choice)
{'default': 0, 'conditions': [], 'values': [0, 1], 'ordered': True}
leakyRelu_alpha (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.3, 'step': None, 'sampling': None}
droput_rate (Float)
{'default': 0.1, 'conditions': [], 'min_value': 0.1, 'max_value': 0.5, 'step': None, 'sampling': None}
learning_rate (Float)
{'default': 0.001, 'conditions': [], 'min_value': 0.001, 'max_value': 0.01, 'step': None, 'sampling': None}
gru_units (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 256, 'step': 32, 'sampling': None}
dense_units (Int)
{'default': None, 'conditions': [], 'min_value': 128, 'max_value': 320, 'step': 32, 'sampling': None}
num_stacked_convolution (Choice)
{'default': 1, 'conditions': [], 'values': [1, 2, 3], 'ordered': True}
num_convolution_kernels (Int)
{'defaul

In [None]:
# Search on the merged stratified sample (`data`, `target`). `features` is not the
# merged sample: it is a leftover of the per-chunk loops and does not match `target`.
# The monitored quantity is an accuracy, so the "max" mode is stated explicitly.
early_stopping = EarlyStopping(monitor='val_sparse_categorical_accuracy', mode='max',
                               patience=3, restore_best_weights=True)
tuner.search(data, target, epochs=30, k=5, batch_size=64,
             callbacks=[early_stopping])

In [None]:
# Retrieve the best model found by the search and print its layer summary
best_models = tuner.get_best_models(num_models=1)
model = best_models[0]
model.summary()

In [None]:
#export model : save the weights of the best model retrieved from the tuner
#(`path_model` is never defined in this notebook ; the variable is `model`)
model.save_weights('./checkpoint/path_prediction.cpkt')