## Correct path prediction :

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime

In [2]:
import tensorflow as tf
from tensorflow .data import Dataset
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Input, RNN, GRU, LSTM, Dropout, BatchNormalization
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, RMSprop

In [3]:
tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [4]:
picktime = pd.read_csv('../Data preprocessing/data/gps_clean.csv', low_memory=False)
picktime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1950296 entries, 0 to 1950295
Data columns (total 12 columns):
 #   Column         Dtype  
---  ------         -----  
 0   vehicle_id     int64  
 1   line_id        int64  
 2   latitude       float64
 3   longitude      float64
 4   datetime       object 
 5   station_id     int64  
 6   vehicle_type   float64
 7   sequence_id    object 
 8   order          int64  
 9   line_label     object 
 10  datetime_diff  float64
 11  outlier        int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 178.6+ MB


In [5]:
target_encoder = dict( (target,code) for code, target in enumerate(picktime.line_id.unique()) )
target_decoder = dict( (code, target) for target, code in target_encoder.items() ) 

In [6]:
nb_targets = picktime.line_id.nunique()

In [7]:
nb_targets

43

In [8]:
correct_lineid = picktime[['sequence_id','line_id']].groupby('sequence_id').first().line_id

### DATA ETL :

In [11]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import gc
import random
from time import time
import os

filenames = os.listdir('../Data preprocessing/data/fullpick')[1:]
filenames = ['../Data preprocessing/data/fullpick/'+ filename for filename in filenames]
names = [name[:-4] for name in os.listdir('../Data preprocessing/data/fullpick') ][1:]

print('---------------------------ETL------------------------------', end='\n')
print('\n\n',end='')

data_length_sum = 0
features_records_sum = 0
maxlength_exeeded = 0

for c, filename in enumerate(filenames):
    print('loading chunk ',names[c], end=' ... ')
    start = time()
    #load chunk
    fullpick = pd.read_csv(filename)
    stop = time()
    print('done in {:.2f} second(s)'.format(stop-start))

    print('chunk preprocessing', end=' ... ')
    #convert str to datetime
    start = time()
    fullpick.datetime = fullpick.datetime.apply(lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

    #datetime features : 
    fullpick['hour'] = fullpick.datetime.dt.hour
    fullpick['day'] = fullpick.datetime.dt.day
    fullpick['month'] = fullpick.datetime.dt.month
    fullpick['quarter'] = fullpick.datetime.dt.quarter
    fullpick['dayofweek'] = fullpick.datetime.dt.dayofweek
    fullpick['service_class'] = fullpick['datetime'].dt.weekday.apply(lambda x : 1 if x in [5,6] else 0)

    #map true line_id
    fullpick['correct_line_id'] = fullpick.sequence_id.map(correct_lineid)   #affect target (True line_id)
    fullpick['correct_line_id'] = fullpick.correct_line_id.map(target_encoder)    #target label encoder

    #scale data between 0 and 1
    scaler = MinMaxScaler()
    fullpick.vehicle_id =  scaler.fit_transform(fullpick[['vehicle_id']])
    fullpick.line_id =  scaler.fit_transform(fullpick[['line_id']])
    fullpick.latitude =  scaler.fit_transform(fullpick[['latitude']])
    fullpick.longitude =  scaler.fit_transform(fullpick[['longitude']])
    fullpick.direction =  scaler.fit_transform(fullpick[['direction']])
    fullpick.vehicle_type =  scaler.fit_transform(fullpick[['vehicle_type']])
    fullpick.hour =  scaler.fit_transform(fullpick[['hour']])
    fullpick.day =  scaler.fit_transform(fullpick[['day']])
    fullpick.month =  scaler.fit_transform(fullpick[['month']])
    fullpick.quarter =  scaler.fit_transform(fullpick[['quarter']])
    fullpick.dayofweek =  scaler.fit_transform(fullpick[['dayofweek']])
    fullpick.service_class =  scaler.fit_transform(fullpick[['service_class']])

    #grouping sequences
    grouping_dict = {'sequence_id':'first', 'station_id':'first', 'vehicle_id':'first', 'line_id':'first', 'correct_line_id':'first', 'vehicle_type':'first',
                     'latitude':list, 'longitude':list, 'direction':list, 'hour':list, 'day':list, 'month':list, 'quarter':list,
                     'dayofweek':list, 'service_class':list}
    fullpick = fullpick.set_index('datetime').groupby(['sequence_id','station_id'], as_index=False).agg(grouping_dict).reset_index(drop = True)
    stop = time()
    print('done in {:.2f} second(s)'.format(stop-start))

    print('generating stop steps sequences', end=' ... ')
    #generating X_train and y_train time step squences
    start = time()
    n = fullpick.shape[0]
    features = []
    labels = []
    sequence_timestep = {}
    line_id_sequence_map = {}
    
    for sequence_id, station_id, vehicle_id, line_id, correct_line_id, vehicle_type, \
        latitude, longitude, direction, hour, day, month, quarter, dayofweek, service_class in fullpick.values :
        n = len(latitude)
        samples = []
        sample = np.zeros(12)
        for i in range(n):
            sample[0] = vehicle_id
            sample[1] = line_id
            sample[2] = latitude[i]
            sample[3] = longitude[i]
            sample[4] = direction[i]
            sample[5] = vehicle_type
            sample[6] = hour[i]
            sample[7] = day[i]
            sample[8] = month[i]
            sample[9] = quarter[i]
            sample[10] = dayofweek[i]
            sample[11] = service_class[i]
            samples.append(sample)
        if sequence_id in sequence_timestep : 
            sequence_timestep[sequence_id].append(samples)       
        else:
            sequence_timestep[sequence_id] = [samples]
        labels.append(np.array(correct_line_id))

    #add history of past time steps to sequences
    for sequence in sequence_timestep.keys() :    #loop on each sequence
        history = []         #history keeps 25% of data for each past time step (data is selected randomly)
        for i, timestep in enumerate(sequence_timestep[sequence]) :     #loop on each sequence time step
            if i != 0 :
                sequence_timestep[sequence][i] = history + sequence_timestep[sequence][i]    #add history list at the start of the current timestep
                
            #if sequence_timestep dims exeeded 200, select 200 random sample 
            if len(sequence_timestep[sequence][i]) > 200 :
                sequence_timestep[sequence][i] = random.sample(sequence_timestep[sequence][i], 200)
                maxlength_exeeded += 1
                
            #add 25% of the current time_step at the end of history list 
            n = len(timestep)
            k = round(n*0.25)
            history += random.sample(timestep, k)

    #convert sequences to numpy arrays
    for sequence in sequence_timestep.keys() :
        for i in range(len(sequence_timestep[sequence])) :
            sequence_timestep[sequence][i] = np.array(sequence_timestep[sequence][i])
        sequence_timestep[sequence] = np.array(sequence_timestep[sequence], object)

    #generate X_train numpy array 
    features = []
    for path in sequence_timestep.values():
        for sequence in path : 
            features.append(sequence)

    #convert X_train and y_train to numpy arrays
    features = np.array(features, dtype=object)
    labels = np.array(labels).reshape(-1,1)
    stop = time()
    print('done in {:.2f} second(s)'.format(stop-start))

    print('extracting preprocessed numpy array data', end=' ... ')
    #export data ready to consume by TF models
    start = time()
    np.save('../Data preprocessing/data/Fullpick ETL/features/'+names[c]+'.npy', features)
    np.save('../Data preprocessing/data/Fullpick ETL/labels/'+names[c]+'.npy', labels)
    stop = time()
    print('done in {:.2f} second(s)'.format(stop-start))

    #sequences and records data stats
    tot_recs = 0
    for i in range(features.shape[0]):
        tot_recs += features[i].shape[0]
    features_records_sum += tot_recs
    data_length_sum += labels.shape[0]

    print('chunk statistics report : ')
    print('                          * total records sum = ',tot_recs)
    print('                          * data length       = ',labels.shape[0])

    #delete temp objects
    print('delete temp objects', end=' ... ')
    del features
    del labels
    del samples
    del sample
    del fullpick
    gc.collect()
    print('done', end='\n\n')

    print('------------------------------------------------------------------------', end='\n\n')

print('Final report : ')
print('               * total records sum = ',features_records_sum)
print('               * data length       = ',data_length_sum)

---------------------------ETL------------------------------


loading chunk  fullpick_chunk1 ... done in 6.37 second(s)
chunk preprocessing ... done in 114.75 second(s)
generating stop steps sequences ... done in 14.07 second(s)
extracting preprocessed numpy array data ... done in 69.77 second(s)
chunk statistics report : 
                          * total records sum =  9475562
                          * data length       =  70785
delete temp objects ... done

------------------------------------------------------------------------

loading chunk  fullpick_chunk10 ... done in 10.72 second(s)
chunk preprocessing ... done in 78.22 second(s)
generating stop steps sequences ... done in 9.81 second(s)
extracting preprocessed numpy array data ... done in 12.18 second(s)
chunk statistics report : 
                          * total records sum =  6076372
                          * data length       =  49717
delete temp objects ... done

-----------------------------------------------------

In [17]:
maxlength_exeeded

246753

### Extract a stratified (with respect to line_id) sequence sample (10% of data)

In [12]:
import os
import re

features_path = '../Data preprocessing/data/Fullpick ETL/features/'
labels_path = '../Data preprocessing/data/Fullpick ETL/labels/'
filenames = sorted(os.listdir('../Data preprocessing/data/Fullpick ETL/features'), key = lambda x : int(re.findall(r'\d+',x)[0]) )

features_names = [ features_path+filename for filename in filenames ]
label_names = [ labels_path+filename for filename in filenames ]

In [13]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm

for feature_filename, label_filename in tqdm(zip(features_names, label_names)):

    #load data
    features = np.load(feature_filename, allow_pickle=True)
    labels = np.load(label_filename, allow_pickle=False)
    
    #startified data sampling
    _, stratified_features, _, stratified_labels  = train_test_split(features, labels, test_size=0.1, stratify=labels)

    #save startified samples
    np.save(feature_filename.replace('features','startified sample/features'), stratified_features)
    np.save(label_filename.replace('labels','startified sample/labels'), stratified_labels)

24it [10:47, 26.99s/it]


### Join all startified samples into one train/test sample

In [14]:
import os
import re

features_path = '../Data preprocessing/data/Fullpick ETL/startified sample/features/'
labels_path = '../Data preprocessing/data/Fullpick ETL/startified sample/labels/'

filenames = sorted(os.listdir('../Data preprocessing/data/Fullpick ETL/startified sample/features'), key = lambda x : int(re.findall(r'\d+',x)[0]) )

features_names = [ features_path+filename for filename in filenames ]
label_names = [ labels_path+filename for filename in filenames ]

In [15]:
first  = True
for feature_filename, label_filename in tqdm(zip(features_names, label_names)):
    #load data
    if first :
        first  = False
        data   = np.load(feature_filename, allow_pickle=True)
        target = np.load(label_filename, allow_pickle=False)
    else : 
        data = np.append(data, np.load(feature_filename, allow_pickle=True))
        target = np.append(target, np.load(label_filename, allow_pickle=False))

print('sample length : ', len(target))

24it [00:36,  1.53s/it]

sample length :  139397





In [16]:
extraction_path = '../Data preprocessing/data/Fullpick ETL/startified sample/merged sample/'

np.save(extraction_path+'features', data)
np.save(extraction_path+'labels', target)

### Model training and validation

In [3]:
data = np.load('../Data preprocessing/data/Fullpick ETL/startified sample/merged sample/features.npy', allow_pickle=True)
target = np.load('../Data preprocessing/data/Fullpick ETL/startified sample/merged sample/labels.npy')

In [11]:
#model architecture keras API

with tf.device('/gpu:0'):
    #three time step inputs
    inputs = Input(shape = (None,12), ragged=True)

    #Lstm / Gru / Rnn
    x = LSTM(units = 124, activation='tanh' , input_shape = (-1, None, 12), return_sequences = False )(inputs)

    #Dense layers
    x = Dropout(0.2)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(256, activation='relu')(x)
    out = Dense(43, activation='softmax')(x)

    path_model = Model(inputs, out)
    path_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None, 12)]        0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 124)               67952     
_________________________________________________________________
dropout_2 (Dropout)          (None, 124)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               64000     
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_4 (Dense)              (None, 43)                11051 

In [12]:
optimizer = Adam(lr=0.002)
loss = SparseCategoricalCrossentropy(from_logits = True)
metric = SparseCategoricalAccuracy()
path_model.compile(optimizer=optimizer, loss=loss, metrics=metric)

In [13]:
#split train and validation size
chunk_size = target.shape[0]
validation_size = round(chunk_size*0.1) 
train_size = chunk_size - validation_size

In [14]:
train_size

125457

In [15]:
validation_size

13940

In [None]:
data   = tf.ragged.constant(data, dtype=tf.float32)
target = tf.constant(target, dtype=tf.int32)

In [None]:
# tensorflow data pipeline
ds = tf.data.Dataset.from_tensor_slices((data, target))
ds = ds.shuffle(1024).repeat(2)
ds = ds.batch(300)
ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
train = ds.take(train_size)
validation = ds.skip(train_size)

In [None]:
path_model.fit(train, validation_data = validation, epochs=20 , shuffle=False, workers=8, use_multiprocessing=True,
               callbacks = EarlyStopping(monitor = 'val_sparse_categorical_accuracy', patience = 3, restore_best_weights=True)  )

In [None]:
#export model
path_model.save_weights('./checkpoint/path_many_to_one.cpkt')