# 2. Tensorflow Prophet Testing

## I - Setup

In [1]:
# Import of librairies
import tensorflow as tf
import mysql.connector as mariadb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime
import math
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tqdm import tqdm
from joblib import dump

class sql_query:
    def __init__(self, credentials_path):
        self.db_credentials = pd.read_csv(credentials_path, index_col="Field")
      
    
    def __call__(self, query):
        
        mariadb_connection = mariadb.connect(
            user=self.db_credentials.loc["user"][0],
            password=self.db_credentials.loc["password"][0],
            host=self.db_credentials.loc["host"][0],
            port=3306,
            db = "db_velib")
        
        self.cursor = mariadb_connection.cursor()
    
        cursor = self.cursor
        cursor.execute(query)
        field_names = [i[0] for i in cursor.description]
        df = pd.DataFrame(cursor, columns=field_names)
        return df
    
# Transforming the input data in the proper format 



def data_preparation(dataset, target, start_index, end_index, history_size,
                      target_size, step, single_step=False):
    data = []
    labels = []

    start_index = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    for i in range(start_index, end_index):
        indices = range(i-history_size, i, step)
        data.append(dataset[indices])

        if single_step:
            labels.append(target[i+target_size])
        else:
            labels.append(target[i:i+target_size])

    return np.array(data), np.array(labels)


def measure_rmse(actual, predicted):
    return math.sqrt(mean_squared_error(actual, predicted))

In [9]:
# Main pipelinhe

# Variables
day_of_testing = '2020-05-19'
past_history = 36
future_target = 6
tf.random.set_seed(13)
past_history = 36
future_target = 6
STEP = 1
BATCH_SIZE = 32
BUFFER_SIZE = 100000
EPOCHS = 6
EVALUATION_INTERVAL = 20000

In [10]:
# Extracting the list of the stations

request = sql_query("../../aws_mariadb_crendentials.csv")
query = """
SELECT DISTINCT station_id FROM velib_realtime
"""
df= request(query)
# Removing bad values
df= df.drop(0)
df = df.drop(1391)
list_of_stations = list(df.station_id)
print(list_of_stations[0:5])


# Initializing and building dataset

request = sql_query("../../aws_mariadb_crendentials.csv")

# Intialization


query = """
SELECT DISTINCT date_of_update, nb_total_free_bikes FROM velib_realtime
WHERE station_id = {}
AND date_of_update > DATE('2020-05-05')
AND date_of_update <= DATE_ADD(DATE('{}'), INTERVAL 1 DAY)
AND MINUTE(date_of_update)%5=0
ORDER BY date_of_update ASC
""".format(list_of_stations[0], day_of_testing)

df = request(query)
df.index = df['date_of_update']
df = df.nb_total_free_bikes

TRAIN_SPLIT = round(df.shape[0]*0.7)

# StandardScaler transformation of the dataset

std = StandardScaler()
std.fit(df[:TRAIN_SPLIT].values.reshape(-1,1))
df = std.transform(df.values.reshape(-1,1))

# Creating proper format data

x_train, y_train = data_preparation(df, df[1:], 0, TRAIN_SPLIT,
                                           past_history,
                                           future_target, STEP)
x_val, y_val = data_preparation(df, df[1:], TRAIN_SPLIT, None,
                                       past_history,
                                       future_target, STEP)

# Creating conso bases

x_train_conso = x_train
y_train_conso = y_train
x_val_conso = x_val
y_val_conso = y_val

# Looping with station_id
for station_id in tqdm(list_of_stations[1:]):

    query = """
    SELECT DISTINCT date_of_update, nb_total_free_bikes FROM velib_realtime
    WHERE station_id = {}
    AND date_of_update > DATE('2020-05-05')
    AND date_of_update <= DATE_ADD(DATE('{}'), INTERVAL 1 DAY)
    AND MINUTE(date_of_update)%5=0
    ORDER BY date_of_update ASC
    """.format(station_id, day_of_testing)

    df = request(query)
    df.index = df['date_of_update']
    df = df.nb_total_free_bikes



    TRAIN_SPLIT = round(df.shape[0]*0.7)

    # StandardScaler transformation of the dataset

    df = std.transform(df.values.reshape(-1,1))

    # Creating proper format data

    x_train, y_train = data_preparation(df, df[1:], 0, TRAIN_SPLIT,
                                               past_history,
                                               future_target, STEP)
    x_val, y_val = data_preparation(df, df[1:], TRAIN_SPLIT, None,
                                           past_history,
                                           future_target, STEP)

    # Creating format for NN intput
    
    x_train_conso = np.concatenate([x_train_conso, x_train])
    y_train_conso = np.concatenate([y_train_conso, y_train])
    x_val_conso = np.concatenate([x_val_conso, x_val])
    y_val_conso =  np.concatenate([y_val_conso, y_val])
    
    #print('x_train_conso shape : ', x_train_conso.shape)

  7%|▋         | 100/1389 [00:40<08:48,  2.44it/s]


KeyboardInterrupt: 

In [11]:
# Back to regular baseline

x_train_conso = x_train_conso.reshape(x_train_conso.shape[0], x_train_conso.shape[1], 1)
x_val_conso = x_val_conso.reshape(x_val_conso.shape[0], x_val_conso.shape[1], 1)


# Creating batches for tensorflow use

train_data_conso = tf.data.Dataset.from_tensor_slices((x_train_conso, y_train_conso))
train_data_conso = train_data_conso.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()

val_data_conso = tf.data.Dataset.from_tensor_slices((x_val_conso, y_val_conso))
val_data_conso = val_data_conso.batch(BATCH_SIZE).repeat()

# Modeling A

LSTM_model_A = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(32, input_shape=x_train_conso.shape[-2:]),
    tf.keras.layers.Dense(future_target)
])

LSTM_model_A.compile(optimizer='adam', loss='mean_squared_error')

LSTM_model_A_history = LSTM_model_A.fit(train_data_conso, epochs=EPOCHS,
                                            steps_per_epoch=EVALUATION_INTERVAL,
                                            validation_data=val_data_conso,
                                            validation_steps=200)

# Modeling B

LSTM_model_B = keras.Sequential()
LSTM_model_B.add(
  keras.layers.Bidirectional(
    keras.layers.LSTM(
      units=64,
      input_shape=(x_train_conso.shape[-2:])
    )
  )
)
LSTM_model_B.add(keras.layers.Dropout(rate=0.2))
LSTM_model_B.add(keras.layers.Dense(units=future_target))

LSTM_model_B.compile(loss='mean_squared_error', optimizer='adam')

LSTM_model_B_history = LSTM_model_B.fit(train_data_conso, epochs=EPOCHS,
                                        steps_per_epoch=EVALUATION_INTERVAL,
                                        validation_data=val_data_conso,




Train for 20000 steps, validate for 200 steps
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train for 20000 steps, validate for 200 steps
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
# Export of the models

LSTM_model_A.save('/home/exalis/Github/velib-prediction-v2/4. Models/Tensorflow Univariate - {} - {} - LSTM_A.h5'.format(day_of_testing, 'global'))
LSTM_model_B.save('/home/exalis/Github/velib-prediction-v2/4. Models/Tensorflow Univariate - {} - {} - LSTM_B.h5'.format(day_of_testing, 'global'))
dump(std, '/home/exalis/Github/velib-prediction-v2/4. Models/Tensorflow Univariate - {} - {} - std.joblib'.format(day_of_testing, 'global'))