# 2. Tensorflow Prophet Testing

## I - Setup

In [1]:
# Import of libraries
import tensorflow as tf
import mysql.connector as mariadb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime
import math
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tqdm import tqdm
from joblib import load
from timeloop import Timeloop
from datetime import timedelta
import time 

class sql_query:
    """Callable helper that runs one SQL query on `db_velib` and returns a DataFrame.

    The credentials are read from a CSV file whose index column is named
    "Field" and which contains the rows "user", "password" and "host"; the
    value is taken from the first remaining column.
    """

    def __init__(self, credentials_path):
        """Load the database credentials from `credentials_path` (CSV)."""
        self.db_credentials = pd.read_csv(credentials_path, index_col="Field")

    def __call__(self, query):
        """Execute `query` and return the full result set as a DataFrame.

        A new connection is opened for every call and is always closed
        before returning, even when the query fails.

        Parameters
        ----------
        query : str
            SQL statement to execute (expected to return rows).

        Returns
        -------
        pandas.DataFrame
            One column per field of the cursor description.
        """
        credentials = self.db_credentials

        # `.iloc[0]` selects the first value of the row explicitly; the former
        # `[0]` relied on positional fallback for a string-labelled Series,
        # which is deprecated in recent pandas releases.
        mariadb_connection = mariadb.connect(
            user=credentials.loc["user"].iloc[0],
            password=credentials.loc["password"].iloc[0],
            host=credentials.loc["host"].iloc[0],
            port=3306,
            db="db_velib")

        try:
            # The cursor stays exposed on the instance, as before.
            self.cursor = mariadb_connection.cursor()
            cursor = self.cursor
            cursor.execute("SET  time_zone = 'Europe/Paris'")
            cursor.execute(query)
            field_names = [column[0] for column in cursor.description]
            rows = cursor.fetchall()
            return pd.DataFrame(rows, columns=field_names)
        finally:
            # Release the connection so repeated calls do not leak sockets.
            mariadb_connection.close()
    
# Transforming the input data in the proper format 


def measure_rmse(actual, predicted):
    """Return the root of the mean squared error between `actual` and `predicted`."""
    squared_error = mean_squared_error(actual, predicted)
    return math.sqrt(squared_error)

def list_stations():
    """Return the list of distinct station ids found in `velib_realtime`.

    The rows labelled 0 and 1391 of the query result are discarded as bad
    values. Missing labels are ignored, so the function no longer raises a
    KeyError when the table holds fewer rows than expected.

    Returns
    -------
    list
        Values of the `station_id` column after removing the bad rows.
    """
    request = sql_query("../../aws_mariadb_crendentials - write.csv")
    query = """
    SELECT DISTINCT station_id FROM velib_realtime
    """
    stations = request(query)

    # Removing bad values.
    # NOTE(review): these are row labels of an unordered query, not station
    # ids -- confirm they still point at the intended rows.
    stations = stations.drop(index=[0, 1391], errors="ignore")

    return list(stations.station_id)

def loading_models_unique(station_id, day_of_testing,
                          models_dir='/home/exalis/Github/velib-prediction-v2/4. Models'):
    """Load the two LSTM models and the scaler saved for one station/day.

    Parameters
    ----------
    station_id : str or int
        Identifier used in the file names (e.g. 'global').
    day_of_testing : str
        Date used in the file names (e.g. '2020-05-19').
    models_dir : str, optional
        Directory holding the saved artefacts. Defaults to the location that
        was previously hard-coded.

    Returns
    -------
    tuple
        (LSTM_A, LSTM_B, std)

    Raises
    ------
    Exception
        Whatever the underlying loaders raise; the failure is reported on
        stdout and then re-raised so the caller sees the real cause.
    """
    prefix = '{}/Tensorflow Univariate - {} - {} - '.format(models_dir, day_of_testing, station_id)

    try:
        LSTM_A = tf.keras.models.load_model(prefix + 'LSTM_A.h5')
        LSTM_B = tf.keras.models.load_model(prefix + 'LSTM_B.h5')
        # NOTE: joblib.load unpickles the file -- only load trusted artefacts.
        std = load(prefix + 'std.joblib')
    except Exception:
        # The former handler referenced undefined names (list_of_stations[i])
        # and therefore hid the original error behind a NameError.
        print('impossible to load ', station_id)
        raise

    return LSTM_A, LSTM_B, std


def create_result_df():
    """Build the frame that receives the predictions, indexed by station.

    The last 185 minutes of `velib_realtime` are fetched (only rows whose
    minute is a multiple of 5) and pivoted by station. Each station's row of
    the pivoted table is stored as a NumPy array in `last_observations`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `station_id` with the columns `last_observations`,
        `model_A`, `model_B` (both left empty here) and `date_of_prediction`
        (current time truncated to the minute, as a string).
    """
    # Extracting base for prediction
    request = sql_query("../../aws_mariadb_crendentials - write.csv")

    query = """
    SELECT station_id, date_of_update, nb_total_free_bikes FROM db_velib.velib_realtime
    WHERE date_of_update >= DATE_SUB(NOW(), INTERVAL 185 Minute) AND MINUTE(date_of_update)%5 = 0
    ORDER BY station_id, date_of_update ASC;
    """
    df = request(query)
    df.index = df.date_of_update
    df = df[['station_id','nb_total_free_bikes']]
    # Pivot call kept exactly as it was: rows are keyed by station_id and the
    # columns by the update timestamps carried in the index.
    df = df.pivot_table(df, index= 'station_id', columns=df.index)

    # Creating dataframe for proper prediction
    df_prediction = pd.DataFrame(index=df.index, columns=['last_observations','model_A', 'model_B', 'date_of_prediction'])

    # Build the whole column at once. The former per-row chained assignment
    # (`df[col].loc[i] = ...`) writes to a temporary object under pandas
    # copy-on-write and would leave the column empty.
    df_prediction["last_observations"] = pd.Series(
        [np.array(df.loc[i]) for i in df.index],
        index=df.index,
        dtype=object,
    )

    df_prediction['date_of_prediction'] = str(pd.Timestamp.now())[:16]
    return df_prediction


def predict_iteration_unique(station_id, df_prediction, LSTM_A, LSTM_B, std, past_history=36):
    """Fill `model_A` and `model_B` of `df_prediction` for one station.

    Parameters
    ----------
    station_id
        Index label of the station inside `df_prediction`.
    df_prediction : pandas.DataFrame
        Frame holding a `last_observations` array per station; it is updated
        in place.
    LSTM_A, LSTM_B
        Models exposing `predict`, fed with an array of shape
        (1, past_history, 1).
    std
        Scaler exposing `transform` and `inverse_transform`.
    past_history : int, optional
        Number of most recent observations given to the models. Defaults to
        36, the window length that was previously hard-coded in the slice.

    Returns
    -------
    pandas.DataFrame
        Always `df_prediction` itself. When the prediction fails the error is
        reported on stdout and the frame is returned unchanged, so that a
        caller doing `df = predict_iteration_unique(...)` never ends up
        with None.
    """
    try:
        observations = df_prediction[df_prediction.index == station_id]["last_observations"].iloc[0]
        scaled = std.transform(observations.reshape(-1, 1))[-past_history:]
        window = scaled.reshape(1, past_history, 1)

        forecast_A = std.inverse_transform(LSTM_A.predict(window)[0])
        forecast_B = std.inverse_transform(LSTM_B.predict(window)[0])

        # `.at` writes straight into the frame; the former
        # `df.loc[row][col] = ...` was a chained assignment that can end up
        # modifying a temporary copy.
        df_prediction.at[station_id, 'model_A'] = forecast_A
        df_prediction.at[station_id, 'model_B'] = forecast_B
    except Exception as error:
        print('error on ', station_id, '-', repr(error))

    return df_prediction
            
   

## I - Basic Run

In [7]:
%%time
# Main pipeline

# Variables
day_of_testing = '2020-05-19'

# Loading global model for all stations
station_id = 'global'
past_history = 36
future_target = 6
list_of_stations = list_stations()
df_prediction = create_result_df()

# Loading the models only once
LSTM_A, LSTM_B, std = loading_models_unique(station_id, day_of_testing)

# A dedicated loop variable keeps `station_id` (the model identifier) intact.
for current_station in tqdm(list_of_stations):

    updated = predict_iteration_unique(current_station, df_prediction, LSTM_A, LSTM_B, std)
    # Keep the previous frame if the call returns nothing for a failed
    # station; otherwise one failure would make every later station fail and
    # break the export below.
    if updated is not None:
        df_prediction = updated

df_prediction.to_csv('prediction - {}.csv'.format(str(pd.Timestamp.now())[:16]))
df_prediction.head()



100%|██████████| 1390/1390 [01:30<00:00, 15.37it/s]


CPU times: user 1min 34s, sys: 1.45 s, total: 1min 35s
Wall time: 1min 32s


In [8]:
df_prediction.head()

Unnamed: 0_level_0,last_observations,model_A,model_B,date_of_prediction
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1001,"[14, 14, 15, 14, 14, 12, 13, 15, 15, 14, 13, 1...","[15.273676, 15.231695, 15.175198, 15.235238, 1...","[14.727909, 14.658568, 14.602522, 14.43373, 14...",2020-05-27 19:59
1002,"[21, 21, 21, 20, 20, 18, 18, 19, 20, 21, 20, 1...","[18.044127, 17.870247, 17.801634, 18.003893, 1...","[17.52366, 17.463417, 17.327711, 17.12547, 17....",2020-05-27 19:59
1003,"[41, 40, 38, 39, 38, 40, 40, 40, 39, 40, 40, 3...","[38.40056, 38.845818, 38.19857, 38.379547, 38....","[40.199287, 40.079327, 39.655148, 39.16375, 38...",2020-05-27 19:59
1006,"[32, 32, 32, 30, 30, 32, 29, 30, 31, 30, 31, 3...","[22.111727, 21.848858, 21.868937, 21.952255, 2...","[22.12679, 22.086313, 21.87791, 21.54091, 21.5...",2020-05-27 19:59
1007,"[16, 16, 16, 15, 16, 16, 16, 15, 15, 15, 15, 1...","[3.229335, 3.2705624, 3.3252575, 3.276993, 3.3...","[3.8500624, 3.9042127, 3.967713, 3.9145994, 3....",2020-05-27 19:59


# II. Multiprocessing

Monitoring CUDA and CPU usage shows that only one CPU core is used for prediction. Can the work be parallelised (multithreading / multiprocessing)?

In [9]:
from multiprocessing import Pool
from multiprocessing import freeze_support

In [10]:
%%time
# Main pipeline -- setup for the multiprocessing experiment

# Run configuration
day_of_testing = '2020-05-19'
station_id = 'global'  # one global model is loaded for all stations
past_history, future_target = 36, 6

# Only the first 15 stations are kept for this run
list_of_stations = list_stations()[:15]
df_prediction = create_result_df()

# The models are loaded a single time and reused for every station
LSTM_A, LSTM_B, std = loading_models_unique(station_id, day_of_testing)

CPU times: user 1.2 s, sys: 48.1 ms, total: 1.25 s
Wall time: 1.64 s


In [27]:
class df_prediction_generation_class:
    """Callable that binds the prediction frame, both models and the scaler.

    An instance can be called with a single `station_id`, which is the
    one-argument shape expected by `Pool.map`.

    NOTE(review): the recorded run of this notebook failed with
    "TypeError: can't pickle _thread.RLock objects" when an instance was sent
    to a process pool -- presumably because the objects stored here cannot be
    pickled; confirm before relying on multiprocessing.
    """

    def __init__(self, df_prediction, LSTM_A, LSTM_B, std):
        """Store the objects forwarded to `predict_iteration_unique`."""
        self.df_prediction = df_prediction
        self.LSTM_A = LSTM_A
        self.LSTM_B = LSTM_B
        self.std = std

    def __call__(self, station_id):
        """Run the prediction for `station_id` and return its result.

        The stored attributes are used. The former body assigned to a local
        `df_prediction` while reading it as an argument, which raised
        UnboundLocalError on every call, and it returned nothing.
        """
        return predict_iteration_unique(
            station_id, self.df_prediction, self.LSTM_A, self.LSTM_B, self.std
        )

In [28]:
df_prediction_generation_temp = df_prediction_generation_class(df_prediction, LSTM_A, LSTM_B, std)

In [29]:
def run_multiprocessing(func, i, n_processors):
    """Map `func` over the iterable `i` using a pool of `n_processors` workers.

    Returns the list produced by `Pool.map`; the pool is released when the
    `with` block exits.
    """
    with Pool(processes=n_processors) as workers:
        mapped = workers.map(func, i)
        return mapped

def main(station_id, n_processors=14):
    """Run the per-station prediction callable over `station_id` in parallel.

    Parameters
    ----------
    station_id : iterable
        Station identifiers, each passed to `df_prediction_generation_temp`.
    n_processors : int, optional
        Size of the worker pool. Defaults to 14, the value that was
        previously hard-coded.

    Returns
    -------
    list
        The per-station results from `run_multiprocessing`; they were
        previously computed and then discarded.
    """
    return run_multiprocessing(df_prediction_generation_temp, station_id, n_processors)

In [30]:
%%time
# Rebuild the prediction frame from the latest observations.
# NOTE(review): `df_prediction_generation_temp` was created in an earlier cell
# with the previous frame, so this new frame is not the one handed to the workers.
df_prediction = create_result_df()
if __name__ == "__main__":
    # Per the multiprocessing documentation, freeze_support() only has an
    # effect in frozen Windows executables; it is harmless elsewhere.
    freeze_support()   # kept from the original cell
    # The recorded run of this call failed with
    # "TypeError: can't pickle _thread.RLock objects" (see the cell output).
    main(list_of_stations)

TypeError: can't pickle _thread.RLock objects

# III. Looping

In [6]:
# Save the predictions currently held in memory before starting the loop.
df_prediction.to_csv('prediction - {}.csv'.format(str(pd.Timestamp.now())[:16]))


tl = Timeloop()


# NOTE(review): the job name mentions 5 minutes but the interval is 30
# seconds -- confirm which one is intended.
@tl.job(interval=timedelta(seconds=30))
def predicting_by_5_minutes():
    """Refresh the observations, predict every station and export a CSV.

    The former body called `predict_iteration()`, which is not defined
    anywhere in this notebook. It is replaced by the same steps as the
    basic-run cell, reusing the models, the scaler and the station list
    already loaded at module level.
    """
    df_prediction = create_result_df()
    for current_station in list_of_stations:
        updated = predict_iteration_unique(current_station, df_prediction, LSTM_A, LSTM_B, std)
        # Keep the previous frame if the call returns nothing for a failed station.
        if updated is not None:
            df_prediction = updated
    df_prediction.to_csv('prediction - {}.csv'.format(str(pd.Timestamp.now())[:16]))

# Blocks the cell and runs the job until the kernel is interrupted.
tl.start(block=True)