# Building and Testing the performance of different models for the "Occupation Quota" & "Hourly influx" of a Parisian street

In [1]:
"""
This notebook is used to fine-tune and test different hyperparameter & feature combinations.

- The different possible hyperparameters are set in the variable leaf 
  In the code check:
      leaf in [5,6,7,8,9,10] 

- The different Feature combinations are set in the variable experiments. 
  In the code check:

    # Sepearate features to use in combinations later
    feat_time = ['weekday','weekofyear', 'month', 'year', 'time']
    feat_meteo_d = ['tmin', 'tmax']
    feat_conf_fer = ['confin_0','confin_1', 'confin_2', 'couvrefeu', 'ferie']
    feat_mean = ['mean_taux_occupation_past_week'] if TARGET =="Taux d'occupation" else ['mean_debit_horaire_past_week']
    feat_covid = ['taux_occupation_sae']
    
    # Create feature combinations
    experiments = [feat_time+feat_mean , feat_time+feat_mean+feat_meteo_d , 
                feat_time+feat_mean+feat_conf_fer,
                feat_time+feat_mean+feat_meteo_d+feat_conf_fer,
                feat_time+feat_mean+feat_covid, feat_time+feat_mean+feat_meteo_d+feat_covid,
                feat_time+feat_mean+feat_meteo_d+feat_conf_fer+feat_covid
                ]


The notebook is meant to be run once. After each execution, all models have been created and their parameters
and scores are saved in CSV files.
"""

'\nThis notebook is used finetune and test out different Hyperparameter & Feature combinations.\n\n- The different possible hyperparameters are set in the variable leaf \n  In the code check:\n      leaf in [5,6,7,8,9,10] \n\n- The different Feature combinations are set in the variable experiments. \n  In the code check:\n\n    # Sepearate features to use in combinations later\n    feat_time = [\'weekday\',\'weekofyear\', \'month\', \'year\', \'time\']\n    feat_meteo_d = [\'tmin\', \'tmax\']\n    feat_conf_fer = [\'confin_0\',\'confin_1\', \'confin_2\', \'couvrefeu\', \'ferie\']\n    feat_mean = [\'mean_taux_occupation_past_week\'] if TARGET =="Taux d\'occupation" else [\'mean_debit_horaire_past_week\']\n    feat_covid = [\'taux_occupation_sae\']\n    \n    # Create feature combinations\n    experiments = [feat_time+feat_mean , feat_time+feat_mean+feat_meteo_d , \n                feat_time+feat_mean+feat_conf_fer,\n                feat_time+feat_mean+feat_meteo_d+feat_conf_fer,\n     

#### Libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
import time
import os
import tqdm.notebook as tq
import tensorflow as tf
import warnings
import matplotlib.image as mpimg
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor

## ---------- Building and Testing of several models with different Hyperparameters & Features ---------------



In [None]:
def set_date(df):
  """Convert the "Date et heure de comptage" column of ``df`` to datetimes.

  The column is parsed with the fixed format ``%Y-%m-%d %H:%M:%S%z`` and
  written back into ``df`` itself; the same (mutated) frame is returned so
  the call can be used in an assignment.
  """
  date_col = "Date et heure de comptage"
  parsed = pd.to_datetime(df[date_col], format='%Y-%m-%d %H:%M:%S%z')
  df[date_col] = parsed
  return df

Regression Forest

In [None]:
# Grid search over `min_samples_leaf` and feature combinations for a
# RandomForestRegressor, run for every street and every target.
#
# For each (street, target, leaf, feature set) a run folder is created under
# <cwd>/<street>/<target>/ containing a one-row CSV describing the run and two
# JPEG plots (whole test set / first window of the test set). A summary CSV
# sorted by total MSE is written per (street, target).

# Number of leading test rows used for the "first" score and plot
# (the plot title calls this window "first_6days").
FIRST_WINDOW = 24 * 6
# Fixed seed so that differences between runs come from the hyperparameters
# and features, not from the forest's random bootstrap / feature sampling.
SEED = 42


def _make_run_dir(parent_dir):
  """Create a new, uniquely named run folder inside ``parent_dir``.

  The folder is named after the current time with one-second resolution.
  If that name is already taken (two runs finishing within the same second,
  or a re-run), a numeric suffix is appended instead of letting ``os.mkdir``
  abort the whole experiment loop.

  Returns:
    (name, path): the folder name and its full path.
  """
  stamp = datetime.datetime.now().strftime('%Y-%m-%d %H_%M_%S')
  name = stamp
  attempt = 0
  while True:
    path = os.path.join(parent_dir, name)
    try:
      os.mkdir(path)
      return name, path
    except FileExistsError:
      attempt += 1
      name = "{}_{}".format(stamp, attempt)


def _save_prediction_plot(dates, y_true, y_pred, title, out_path):
  """Plot real vs. predicted values over ``dates`` and save to ``out_path``.

  The figure is closed after saving: the loop below produces hundreds of
  figures and leaving them open keeps all of them in memory.
  """
  fig, ax = plt.subplots()
  ax.plot(dates, y_true, color='r', label='real values')
  ax.plot(dates, y_pred, color='b', label='predicted values')
  ax.set_title(title)
  ax.legend()  # labels were set but never displayed without this call
  fig.savefig(out_path)
  plt.close(fig)


for street in ['washington','convention','sts'] :
  data_path = "datasets/{}_edited.csv".format(street)

  for TARGET in ["Taux d'occupation", "Débit horaire"]:
    resume = []

    print("Street : {}".format(street) )
    print('Target : {}'.format(TARGET) )

    # Create the <street>/<target> output folders if they do not exist yet
    street_dir = os.path.join(os.getcwd(), street)
    target_dir = os.path.join(street_dir, TARGET)
    if not os.path.isdir(street_dir):
      os.mkdir(street_dir)
      print('Created Folder')
    os.makedirs(target_dir, exist_ok=True)

    # Load dataframe and adapt it to target
    dataframe = pd.read_csv(data_path)
    dataframe = dataframe.drop(columns=["Unnamed: 0","index"])
    dataframe = set_date(dataframe)
    # Keep only the first two characters of `time`, as an integer
    dataframe["time"] = dataframe["time"].apply(lambda x : int(x[:2]))

    # Remove the columns belonging to the *other* target so they cannot be
    # used as features, plus the descriptive columns that are never used.
    if TARGET == "Taux d'occupation":
      other_target_cols = ["Débit horaire","mean_debit_horaire_past_week","debit_horaire_past_week"]
    else :
      other_target_cols = ["Taux d'occupation","mean_taux_occupation_past_week","taux_occupation_past_week"]
    unused_cols = ["Etat trafic","Libelle noeud amont","Libelle noeud aval","date"]
    dataframe = dataframe.drop(columns=other_target_cols + unused_cols)
    dataframe = dataframe.interpolate() #get rid of NaN values

    # Separate features to use in combinations later
    feat_time = ['weekday','weekofyear', 'month', 'year', 'time']
    feat_meteo_d = ['tmin', 'tmax']
    feat_conf_fer = ['confin_0','confin_1', 'confin_2', 'couvrefeu', 'ferie']
    feat_mean = ['mean_taux_occupation_past_week'] if TARGET =="Taux d'occupation" else ['mean_debit_horaire_past_week']
    feat_covid = ['taux_occupation_sae']

    # Create feature combinations
    experiments = [feat_time+feat_mean , feat_time+feat_mean+feat_meteo_d ,
                feat_time+feat_mean+feat_conf_fer,
                feat_time+feat_mean+feat_meteo_d+feat_conf_fer,
                feat_time+feat_mean+feat_covid, feat_time+feat_mean+feat_meteo_d+feat_covid,
                feat_time+feat_mean+feat_meteo_d+feat_conf_fer+feat_covid
                ]

    # Split in training and testing data (row order is kept, no shuffling)
    split_rate = 0.8
    i_split = int(dataframe.shape[0] * split_rate)
    df_train = dataframe.iloc[:i_split]
    df_test = dataframe.iloc[i_split:]
    tt = df_test['Date et heure de comptage']

    # The target vectors do not depend on the experiment, build them once
    y_train = df_train[TARGET].values
    y_test = df_test[TARGET].values

    # Run for different hyperparameters & feature combinations (experiments) and store results
    for leaf in [5,6,7,8,9,10] :
      for e in tq.tqdm(experiments) :
        timestep, run_dir = _make_run_dir(target_dir)

        x_train = df_train[e].values
        x_test = df_test[e].values

        model = RandomForestRegressor(min_samples_leaf=leaf, random_state=SEED)
        model.fit(x_train,y_train)
        pred = model.predict(x_test)

        mse_tot = mean_squared_error(y_test,pred)
        mse_first = mean_squared_error(y_test[:FIRST_WINDOW],pred[:FIRST_WINDOW])

        record = {'name' : timestep , 'features' : e, "target": TARGET, "street": street, "mse_tot":mse_tot , "mse_first" :mse_first , "min_samples_leaf" : leaf, "path":run_dir}
        resume.append(record)
        pd.DataFrame.from_records([record]).to_csv(os.path.join(run_dir,timestep+".csv"))

        _save_prediction_plot(
            tt, y_test, pred,
            "Regression Forest  ||  The MSE_tot: {}".format(mse_tot),
            os.path.join(run_dir,'total'+timestep+'.jpeg'))
        # .iloc: `tt` keeps the original row labels, so slice by position explicitly
        _save_prediction_plot(
            tt.iloc[:FIRST_WINDOW], y_test[:FIRST_WINDOW], pred[:FIRST_WINDOW],
            "Regression Forest  ||  The MSE_first_6days: {}".format(mse_first),
            os.path.join(run_dir,'first'+timestep+'.jpeg'))

    # Summary of all runs for this street/target, best (lowest total MSE) first
    pd.DataFrame.from_records(resume).sort_values('mse_tot').to_csv(os.path.join(target_dir,"Regression_Forest_"+street+'_'+TARGET+".csv"))


Output hidden; open in https://colab.research.google.com to view.