## **Introduction**

In this Jupyter notebook, we build LSTM models to forecast hourly taxi trip flows over the period 2015–2016.

## **Setting up the environment**

In [None]:
!pip install holidays

!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading https://files.pythonhosted.org/packages/e6/f9/0626bbdb322e3a078d968e87e3b01341e7890544de891d0cb613641220e6/ipython-autotime-0.1.tar.bz2
Building wheels for collected packages: ipython-autotime
  Building wheel for ipython-autotime (setup.py) ... [?25l[?25hdone
  Created wheel for ipython-autotime: filename=ipython_autotime-0.1-cp36-none-any.whl size=1832 sha256=726c7672a4611d26380959f1a1003f41f15f9f06cf6be1cd13a0d22d702df31a
  Stored in directory: /root/.cache/pip/wheels/d2/df/81/2db1e54bc91002cec40334629bc39cfa86dff540b304ebcd6e
Successfully built ipython-autotime
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.1


In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from numpy import array
import datetime
from datetime import date
from datetime import timedelta
import os
import holidays

from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras import optimizers 
import keras.backend as K

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from google.colab import drive
drive.mount('/content/gdrive')

Using TensorFlow backend.


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
time: 20.7 s


## **Parameters and functions common to all models**

In [None]:
#origin of the hourly index: Jan 1st 2015 at midnight corresponds to hour index 0
initial_date = datetime.datetime(year=2015, month=1, day=1, hour=0, minute=0)

#length of the validation window, in hours (2 weeks)
validation_range = 2 * 7 * 24

#length of the test window, in hours (3 weeks)
test_range = 3 * 7 * 24

def convertDatetoHourIndex(taxiTripTimeStamp, yeartaxitrip):
  """Return the number of hours elapsed between initial_date and a trip time.

  taxiTripTimeStamp is read as [month, day, hour] (positions 0, 1 and 2) and
  yeartaxitrip supplies the year. The elapsed time is converted to hours and
  truncated by int().
  """
  monthValue = taxiTripTimeStamp[0]
  dayValue = taxiTripTimeStamp[1]
  hourValue = taxiTripTimeStamp[2]
  elapsed = datetime.datetime(yeartaxitrip, monthValue, dayValue, hourValue) - initial_date
  hoursElapsed = elapsed.total_seconds() / 3600.0
  return int(hoursElapsed)

def returnValidTestIndexes(endTrainingIndex):
  """Return [startValidationIndex, startTestIndex] for a given end of training.

  Validation starts exactly where training ends; the test period starts
  validation_range hours later.
  """
  return [endTrainingIndex, endTrainingIndex + validation_range]


time: 6.17 ms


In [None]:
#CAN BE MODIFIED!!!!!!!
#hour index and labels of the default test start (Sept 7th 2016, 8 AM)
end_training_index = convertDatetoHourIndex([9, 7, 8], 2016)
test_StartDate = "2016-09-07 08:00"
test_start_date_title = "2016_09_07_800"


#loading the dataframe
taxitrips_df = pd.read_csv("/content/two_years_2015_2016_dataframe.csv")

#list of regions by new/end flows
regions_to_loop_newflow = ['26_new', '53_new', '60_new', '62_new', '66_new', '68_new', '72_new', '73_new', '75_new', '76_new', '77_new', '78_new', '82_new', '83_new', '88_new', '125_new']
regions_to_loop_endflow = ['26_end', '53_end', '60_end', '62_end', '66_end', '68_end', '72_end', '73_end', '75_end', '76_end', '77_end', '78_end', '82_end', '83_end', '88_end', '125_end']

#setting training data
forecastHourRange = test_range
start_training_index = 0


#NO MODIFICATION NEEDED: path to folder containing the forecasts
base_results_csv_path_LSTM_twovariate = "/content/gdrive/My Drive/urban-computing-project/LSTM_twovariate_results/"
base_results_csv_path_LSTM_multivariate = "/content/gdrive/My Drive/urban-computing-project/LSTM_multivariate_results/"

#method labels: one per model family, matching the results folders above
statistical_method_twovariate = "LSTM_twovariate"
#distinct label so that multivariate results are not tagged as two-variate
statistical_method_multivariate = "LSTM_multivariate"

#federal holidays
us_holidays = holidays.UnitedStates()

time: 122 ms


**RMSE functions**

In [None]:
def returnRMSEofTensors(yforecast, y):
  """Return the RMSE between forecasts and observations, truncated to an int.

  Negative forecasts are replaced by 0 before the error is computed. The
  replacement is done in place, so the array passed as yforecast is modified.
  Both arguments are indexed on two axes (shape[0] and shape[1]).
  """
  negativeMask = yforecast < 0
  yforecast[negativeMask] = 0
  nElements = y.shape[0] * y.shape[1]
  meanSquaredError = ((yforecast - y) ** 2).sum() / nElements
  return int(meanSquaredError ** 0.5)

def returnRMSEofTensorsForPeriodForecastArray(yforecast, y, periodForecastArray):
  """Return one RMSE per forecast horizon listed in periodForecastArray.

  For each horizon, only the first `horizon` rows of yforecast and y are
  compared, using returnRMSEofTensors.
  """
  return [returnRMSEofTensors(yforecast[:horizon], y[:horizon]) for horizon in periodForecastArray]

def returnAverageValueTaxiFlows(y):
  """Return the mean of all entries of the two-axis array y, truncated to an int."""
  nElements = y.shape[0] * y.shape[1]
  total = y.sum()
  return int(total / nElements)

def returnAverageTaxiFlowsArray(y, periodForecastArray):
  """Return the average flow over the first `horizon` rows of y, for each horizon."""
  return [returnAverageValueTaxiFlows(y[:horizon]) for horizon in periodForecastArray]

def returnRelativeRMSEtoAverage(rmseArray, averageTaxiTripsArray):
  """Return each RMSE as an integer percentage of the matching average flow.

  Entries are matched by position. When the average is not strictly positive
  the percentage is reported as 0 instead of dividing.
  """
  relativeErrors = []
  for position, rmseValue in enumerate(rmseArray):
    averageValue = averageTaxiTripsArray[position]
    if averageValue > 0:
      percentage = int(100*rmseValue/averageValue)
    else:
      percentage = 0
    relativeErrors.append(percentage)
  return relativeErrors


time: 24.9 ms


**File management functions**

In [None]:
def returnBaseNameOfFileFullPath(resultDatasetPath):
    """Return the last '/'-separated segment of the path, cut at its first '.'."""
    lastSegment = resultDatasetPath.rsplit('/', 1)[-1]
    return lastSegment.partition('.')[0]

def returnForecastsInDataframe(yForecastTest, yTest, regionsTitleArray, initialTestingDate):
    """Return a dataframe holding forecasts, observations and hourly timestamps.

    Columns are "<title>_forecast" for every title, then "<title>_test" for
    every title, then "timestamp". Timestamps start at initialTestingDate and
    advance one hour per row of yTest.
    """
    forecastColumns = [title + "_forecast" for title in regionsTitleArray]
    observedColumns = [title + "_test" for title in regionsTitleArray]

    # forecasts on the left, observations on the right
    values = np.concatenate((yForecastTest, yTest), axis=1)
    forecastsFrame = pd.DataFrame(data=np.array(values), columns=forecastColumns + observedColumns)

    forecastsFrame["timestamp"] = pd.date_range(pd.Timestamp(initialTestingDate), periods=yTest.shape[0], freq='h')

    return forecastsFrame

def saveforecastsintoCSV(yForecastTest, yTest, regionsTitleArray, initialTestingDate, resultsCSVFilePath):
  """Build the forecasts dataframe and write it to resultsCSVFilePath as CSV."""
  returnForecastsInDataframe(yForecastTest, yTest, regionsTitleArray, initialTestingDate).to_csv(resultsCSVFilePath)

time: 17.5 ms


**LSTM model**

In [None]:
def returnLSTMModel(sequenceLength, xtrain, ytrain, batchSize, nweights, initialLearningRate):
  """Return [lstm_model, generator] ready for training.

  The generator yields windows of sequenceLength rows of xtrain with ytrain as
  targets, in batches of batchSize. The model is an LSTM layer (nweights,
  relu activation) followed by a Dense layer with one output per column of
  ytrain, compiled with Adam at initialLearningRate and an MSE loss.
  """
  nInputs = xtrain.shape[1]
  nOutputs = ytrain.shape[1]

  generator = TimeseriesGenerator(xtrain, ytrain, length=sequenceLength, batch_size=batchSize)

  lstm_model = Sequential([
      LSTM(nweights, activation='relu', input_shape=(sequenceLength, nInputs)),
      Dense(nOutputs),
  ])
  lstm_model.compile(optimizer=optimizers.Adam(learning_rate=initialLearningRate), loss='mse')

  return [lstm_model, generator]


def returnLSTMForecast(lstm_model, sequenceLength, forecastPeriodRange, xtrain, featuresArray):
  """Return a list of forecastPeriodRange successive one-step predictions.

  The input window starts as the last sequenceLength rows of xtrain. After
  each step the prediction, followed by the entries of featuresArray for that
  step, is appended to the window as a new row and the oldest row is dropped.
  The returned predictions are the raw outputs of lstm_model.predict.
  """
  nInputs = xtrain.shape[1]
  window = xtrain[-sequenceLength:].reshape((-1, sequenceLength, nInputs))

  forecasts = []
  for step in range(forecastPeriodRange):
    stepForecast = lstm_model.predict(window)[0]
    forecasts.append(stepForecast)

    # next input row = predicted targets followed by the features of this step
    nextRow = np.array(list(stepForecast) + list(featuresArray[step])).reshape((1, nInputs))

    # slide the window: append the new row, drop the oldest one
    window = np.append(window[0], nextRow, axis=0)[1:].reshape((-1, sequenceLength, nInputs))

  return forecasts

time: 33.7 ms


**Simulation function**

In [None]:
def runLSTMSimulation(resultsCSVFilePath, regionsTitleArray, dataSets, featuresSets, initialTestingDate, endingIndexForTraining, periodForecastArray,sequenceLength, batchSize, nweights, lstmModelName, numberEpochsArray,learningRateArray, verbose, saveModelToPath = "No"):
  """Train an LSTM, evaluate it on the validation and test windows, save the test forecasts.

  The rows of dataSets and featuresSets are split into training, validation
  and test parts at the indexes given by returnValidTestIndexes. Targets are
  min-max scaled with a scaler fitted on the training part only; features are
  concatenated to the scaled targets without scaling. The model is trained in
  two phases, then used for recursive forecasts over the validation window
  (seeded with the end of the training input) and over the test window
  (seeded with the end of the validation input). Metrics are printed and the
  test forecasts are written to CSV.

  Args:
    resultsCSVFilePath: destination passed to saveforecastsintoCSV.
    regionsTitleArray: series titles, printed and passed to saveforecastsintoCSV.
    dataSets: target series, one row per time step (assumed to be a 2-D numpy
      array of shape (time, series) - TODO confirm against callers).
    featuresSets: extra inputs, sliced with the same row indexes as dataSets.
    initialTestingDate: passed to saveforecastsintoCSV as the start date.
    endingIndexForTraining: row index at which training ends.
    periodForecastArray: [validation horizons, test horizons].
    sequenceLength, batchSize, nweights: forwarded to returnLSTMModel.
    lstmModelName: not used in the body.
    numberEpochsArray: epochs of the first and second training phase.
    learningRateArray: learning rate of the first and second training phase.
    verbose: forwarded to fit_generator.
    saveModelToPath: the model is saved there only when the string is longer
      than 5 characters; the default "No" therefore disables saving.

  Returns:
    None.
  """

  startValidationIndex, startTestIndex = returnValidTestIndexes(endingIndexForTraining)

  #getting the data
  y_ = dataSets.copy()
  y_train = y_[:startValidationIndex].copy()
  y_validation = y_[startValidationIndex:startTestIndex].copy()
  y_test = y_[startTestIndex:].copy()
  
  #features
  features_train = featuresSets[:startValidationIndex]
  features_validation = featuresSets[startValidationIndex:startTestIndex]
  features_test = featuresSets[startTestIndex:]
  
  #scaling the target data
  scaler = MinMaxScaler()

  #scaling the data based on data inside the train dataset
  scaler.fit(y_train)
  scaled_train_data = scaler.transform(y_train)

  #the input of the test is the validation set
  scaled_test_data = scaler.transform(y_validation)

  #combining the data
  scaled_train_data_input = np.concatenate((scaled_train_data, features_train), axis=1)

  #the input for the test is the data of the validation
  scaled_test_data_input = np.concatenate((scaled_test_data, features_validation), axis=1)

  #get the lstm model
  lstmModel, generator = returnLSTMModel(sequenceLength, scaled_train_data_input, scaled_train_data, batchSize, nweights, learningRateArray[0])

  #training: 1st training with first learning rate and number of epochs
  lstmModel.fit_generator(generator, epochs=numberEpochsArray[0], verbose=verbose)

  #training: 2nd training with second learning rate and number of epochs
  K.set_value(lstmModel.optimizer.lr, learningRateArray[1])
  lstmModel.fit_generator(generator, epochs=numberEpochsArray[1], verbose=verbose)
  
  #forecasts on validation set
  #returnLSTMForecast(lstm_model, sequenceLength, forecastPeriodRange, xtrain, featuresArray)
  period_to_forecast = y_validation.shape[0]
  lstm_predictions_scaled = returnLSTMForecast(lstmModel, sequenceLength, period_to_forecast, scaled_train_data_input, features_validation)
  lstm_predictions = scaler.inverse_transform(lstm_predictions_scaled)
  y_forecast_validation = lstm_predictions.copy()

  #forecast on validation period, computing RMSE for validation set
  rmse_validation_array = returnRMSEofTensorsForPeriodForecastArray(y_forecast_validation, y_validation, periodForecastArray[0])
  average_taxi_trips_validation_array = returnAverageTaxiFlowsArray(y_validation, periodForecastArray[0])
  rmse_to_average_taxi_trips_validation_array = returnRelativeRMSEtoAverage(rmse_validation_array, average_taxi_trips_validation_array)
  print("Validation set results")
  print("Region IDs :", regionsTitleArray)
  print("Forecasting :", periodForecastArray[0])
  print("RMSE :", rmse_validation_array)
  print("Average Taxi flows :", average_taxi_trips_validation_array)
  print("RMSE to average % :", rmse_to_average_taxi_trips_validation_array)

  #forecast on test period, computing RMSE for test set
  period_to_forecast = y_test.shape[0]
  lstm_predictions_scaled = returnLSTMForecast(lstmModel, sequenceLength, period_to_forecast, scaled_test_data_input, features_test)
  lstm_predictions = scaler.inverse_transform(lstm_predictions_scaled)
  y_forecast_test = lstm_predictions.copy()

  # NOTE(review): returnRMSEofTensors zeroes negative forecasts in place on the
  # slices it receives; presumably those slices are views of y_forecast_test, so
  # the rows covered by the evaluated horizons are clipped before the CSV is written.
  rmse_test_array = returnRMSEofTensorsForPeriodForecastArray(y_forecast_test, y_test, periodForecastArray[1])
  average_taxi_trips_test_array = returnAverageTaxiFlowsArray(y_test, periodForecastArray[1])
  rmse_to_average_taxi_trips_test_array = returnRelativeRMSEtoAverage(rmse_test_array, average_taxi_trips_test_array)
  print()
  print("Test set results")
  print("Forecasting :", periodForecastArray[1])
  print("RMSE :", rmse_test_array)
  print("Average Taxi flows :", average_taxi_trips_test_array)
  print("RMSE to average % :", rmse_to_average_taxi_trips_test_array)

  print()
  
  #save results in CSV
  #to save: regionsTitleArray, y_forecast_test, y_test, initialTestingDate, resultsCSVFilePath 
  saveforecastsintoCSV(y_forecast_test, y_test, regionsTitleArray, initialTestingDate, resultsCSVFilePath)

  #save model: only when a real path is given (the default "No" is too short to pass this check)
  if len(saveModelToPath) > 5:
    lstmModel.save(saveModelToPath)


time: 78.3 ms


In [None]:
# Preview the first rows of the loaded dataframe to check its columns.
taxitrips_df.head()

Unnamed: 0.1,Unnamed: 0,timestamp,month,day,hour,weekday,holiday,26_new,26_end,53_new,53_end,60_new,60_end,62_new,62_end,66_new,66_end,68_new,68_end,72_new,72_end,73_new,73_end,75_new,75_end,76_new,76_end,77_new,77_end,78_new,78_end,82_new,82_end,83_new,83_end,88_new,88_end,125_new,125_end
0,0,2015-01-01 00:00:00,1,1,0,4,1,20,0,108,96,904,818,59,84,1695,1168,123,103,410,382,62,100,94,55,207,231,149,137,169,140,19,23,68,72,39,33,2,3
1,1,2015-01-01 01:00:00,1,1,1,4,1,15,0,177,180,1626,1595,118,158,2117,1589,169,156,512,420,56,37,78,36,203,350,171,223,246,255,35,45,50,107,35,68,9,0
2,2,2015-01-01 02:00:00,1,1,2,4,1,4,0,229,253,1967,2107,186,262,2019,1683,212,217,341,388,77,35,91,54,192,326,189,218,274,303,34,54,48,84,42,95,1,0
3,3,2015-01-01 03:00:00,1,1,3,4,1,0,8,168,245,1529,1791,124,166,1650,1290,163,195,173,289,50,31,79,40,122,235,113,155,163,221,32,35,22,52,25,61,0,0
4,4,2015-01-01 04:00:00,1,1,4,4,1,0,4,39,117,693,823,40,86,1018,663,82,117,61,149,14,29,39,20,37,108,52,102,62,145,20,26,16,28,5,41,0,0


time: 79.4 ms


### **Two-variate LSTM: launching the simulations**

In [None]:
regions_to_loop_endflow = []
regions_to_loop_newflow = []

#scan the column labels only: the column data is never needed here, and
#DataFrame.iteritems() is no longer available in pandas 2.0 and later
for columnName in taxitrips_df.columns:
  if "end" in columnName:
    regions_to_loop_endflow.append(columnName)
  if "new" in columnName:
    regions_to_loop_newflow.append(columnName)

#the pairing below is positional, so both lists must hold one entry per region
if len(regions_to_loop_newflow) != len(regions_to_loop_endflow):
  raise ValueError("Unequal number of new-flow and end-flow columns: cannot pair them by position")

#one [new flow, end flow] pair per region, in column order
regions_pairs_to_loop = [[newFlow, endFlow] for newFlow, endFlow in zip(regions_to_loop_newflow, regions_to_loop_endflow)]

print(regions_pairs_to_loop)

[['26_new', '26_end'], ['53_new', '53_end'], ['60_new', '60_end'], ['62_new', '62_end'], ['66_new', '66_end'], ['68_new', '68_end'], ['72_new', '72_end'], ['73_new', '73_end'], ['75_new', '75_end'], ['76_new', '76_end'], ['77_new', '77_end'], ['78_new', '78_end'], ['82_new', '82_end'], ['83_new', '83_end'], ['88_new', '88_end'], ['125_new', '125_end']]
time: 6.68 ms


### **Testing 2016-09-07 08:00**

In [None]:
#region pairs (new flow, end flow); one two-variate LSTM is trained per pair
regions_to_loop_flows = [["26_new", "26_end"], ["125_new", "125_end"]]

#calendar columns given to the model as extra inputs
features_array = ["month", "weekday", "hour", "holiday"]


#MAY BE MODIFIED!!!!!!!
statistical_method = "LSTM_two_variate"
end_validation_index = convertDatetoHourIndex([9, 7, 8], 2016)
test_startDate = "2016-09-07 08:00"
test_start_date_title = "2016_09_07_800"
base_results_csv_path = "/content/gdrive/My Drive/urban-computing-project/LSTM_two_variate_results/"

#the Drive folder above is overridden: results are written to /content/
base_results_csv_path = "/content/"

#hour indexes delimiting the training, validation and test windows
start_training_index = 0
end_training_index = end_validation_index - validation_range
end_testing_index = end_validation_index + test_range

#forecast horizons (in hours) at which the metrics are reported
period_forecast_array_validation = [1, 2, 3, 4, 5, 10, 12, 16, 24, 48, 72, 168, 336]
period_forecast_array_test = [1, 2, 3, 4, 5, 10, 12, 16, 24, 48, 72, 168, 336, 504]
period_forecast_array = [period_forecast_array_validation, period_forecast_array_test]

#lstm parameters
lstm_model_name = statistical_method
sequence_length = 6
batch_size = 64
n_weights = 420
number_epochs_array = [16, 10]
learning_rate_array = [0.01, 0.001]
verboseValue = 1

print("Testing on :", test_startDate)
print("Features :", features_array)
print()

for regionIDsarray in regions_to_loop_flows:

  print("Region IDs :", regionIDsarray)

  #suffix of the results file: every column name prefixed with "_"
  regionsIDstr = "".join("_" + regionIDval for regionIDval in regionIDsarray)

  #targets: one column per flow, rows from start_training_index up to end_testing_index
  data_set = np.column_stack([array(taxitrips_df[regionIDval][start_training_index:end_testing_index]) for regionIDval in regionIDsarray])

  #features: one column per calendar variable, same rows as the targets
  features_set = np.column_stack([array(taxitrips_df[feature_title][start_training_index:end_testing_index]) for feature_title in features_array])

  results_CSV_FilePath = base_results_csv_path + lstm_model_name + regionsIDstr + "_" + test_start_date_title + ".csv"

  print("saving to", results_CSV_FilePath)

  runLSTMSimulation(results_CSV_FilePath, regionIDsarray, data_set, features_set, test_startDate, end_training_index, period_forecast_array, sequence_length, batch_size, n_weights, lstm_model_name, number_epochs_array, learning_rate_array, verboseValue)


Testing on : 2016-09-07 08:00
Features : ['month', 'weekday', 'hour', 'holiday']

Region IDs : ['26_new', '26_end']
saving to /content/LSTM_two_variate_26_new_26_end_2016_09_07_800.csv
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation set results
Region IDs : ['26_new', '26_end']
Forecasting : [1, 2, 3, 4, 5, 10, 12, 16, 24, 48, 72, 168, 336]
RMSE : [2, 34, 34, 30, 33, 45, 41, 38, 32, 32, 28, 32, 33]
Average Taxi flows : [87, 83, 89, 103, 121, 165, 162, 150, 109, 112, 105, 98, 90]
RMSE to average % : [2, 40, 38, 29, 27, 27, 25, 25, 29, 28, 26, 32, 36]

Test set results
Forecasting : [1, 2, 3, 4, 5, 10, 12, 16, 24, 48, 72, 168, 336, 504]
RMSE : [20, 44, 42, 47, 46, 41, 37, 42, 35, 39, 39, 46, 55, 59]
Average Taxi flows : [101, 128, 133,

26:

nweights : 420

batch : 64

epochs: 15, 10

RMSE : [5, 36, 36, 32, 33, 46, 42, 38, 34, 32, 29, 34, 35]

Average Taxi : [87, 83, 89, 103, 121, 165, 162, 150, 109, 112, 105, 98, 90]



In [None]:
#read back the forecasts written by the 2016-09-07 simulation for the region 26 pair;
#the file name must match the "saving to ..." path printed by that simulation
resultsdf = pd.read_csv("/content/LSTM_two_variate_26_new_26_end_2016_09_07_800.csv")
resultsdf.head()

Unnamed: 0.1,Unnamed: 0,26_new_forecast,26_end_forecast,26_new_test,26_end_test,timestamp
0,0,125.312207,89.977704,126.0,77.0,2016-09-07 08:00:00
1,1,160.439267,101.281889,216.0,93.0,2016-09-07 09:00:00
2,2,175.858911,116.185771,207.0,84.0,2016-09-07 10:00:00
3,3,187.447994,136.100535,247.0,100.0,2016-09-07 11:00:00
4,4,194.948627,159.136458,197.0,84.0,2016-09-07 12:00:00


time: 30.6 ms


### **Testing 2016-10-05 08:00**

In [None]:
#every (new flow, end flow) pair; replace with e.g. [["66_new", "66_end"]] to run a single region
regions_to_loop_flows = regions_pairs_to_loop

#calendar columns given to the model as extra inputs
features_array = ["month", "weekday", "hour", "holiday"]

#MAY BE MODIFIED!!!!!!!
statistical_method = "LSTM_two_var_MWHH"
end_validation_index = convertDatetoHourIndex([10, 5, 8], 2016)
test_startDate = "2016-10-05 08:00"
test_start_date_title = "2016_10_05_800"
base_results_csv_path = "/content/gdrive/My Drive/urban-computing-project/LSTM_two_variate_results/"

#the Drive folder above is overridden: results are written to /content/
base_results_csv_path = "/content/"

#hour indexes delimiting the training, validation and test windows
start_training_index = 0
end_training_index = end_validation_index - validation_range
end_testing_index = end_validation_index + test_range

#forecast horizons (in hours) at which the metrics are reported
period_forecast_array_validation = [1, 2, 3, 4, 5, 10, 12, 16, 24, 48, 72, 168, 336]
period_forecast_array_test = [1, 2, 3, 4, 5, 10, 12, 16, 24, 48, 72, 168, 336, 504]
period_forecast_array = [period_forecast_array_validation, period_forecast_array_test]

#lstm parameters
lstm_model_name = statistical_method
sequence_length = 6
batch_size = 64
n_weights = 450
number_epochs_array = [15, 10]
learning_rate_array = [0.01, 0.001]
verboseValue = 1

print("Testing on :", test_startDate)
print("Features :", features_array)
print()

for regionIDsarray in regions_to_loop_flows:

  print("Region IDs :", regionIDsarray)

  #suffix of the results file: every column name prefixed with "_"
  regionsIDstr = "".join("_" + regionIDval for regionIDval in regionIDsarray)

  #targets: one column per flow, rows from start_training_index up to end_testing_index
  data_set = np.column_stack([array(taxitrips_df[regionIDval][start_training_index:end_testing_index]) for regionIDval in regionIDsarray])

  #features: one column per calendar variable, same rows as the targets
  features_set = np.column_stack([array(taxitrips_df[feature_title][start_training_index:end_testing_index]) for feature_title in features_array])

  results_CSV_FilePath = base_results_csv_path + lstm_model_name + regionsIDstr + "_" + test_start_date_title + ".csv"

  print("saving to", results_CSV_FilePath)

  runLSTMSimulation(results_CSV_FilePath, regionIDsarray, data_set, features_set, test_startDate, end_training_index, period_forecast_array, sequence_length, batch_size, n_weights, lstm_model_name, number_epochs_array, learning_rate_array, verboseValue)


Testing on : 2016-10-05 08:00
Features : ['month', 'weekday', 'hour', 'holiday']

Region IDs : ['66_new', '66_end']
saving to /content/LSTM_two_var_MWHH_66_new_66_end_2016_10_05_800.csv
Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation set results
Region IDs : ['66_new', '66_end']
Forecasting : [1, 2, 3, 4, 5, 10, 12, 16, 24, 48, 72, 168, 336]
RMSE : [167, 170, 163, 142, 130, 145, 136, 130, 114, 146, 164, 140, 158]
Average Taxi flows : [1095, 1126, 1071, 1015, 1019, 1051, 1101, 1037, 755, 796, 859, 710, 674]
RMSE to average % : [15, 15, 15, 13, 12, 13, 12, 12, 15, 18, 19, 19, 23]

Test set results
Forecasting : [1, 2, 3, 4, 5, 10, 12, 16, 24, 48, 72, 168, 336, 504]
RMSE : [109, 85, 83, 72, 70, 77, 104, 98, 108, 172, 183, 168, 174, 180]
Average Taxi flows : [8

66:

batch 65 
nweights 470
epochs 10 10

RMSE: [161, 183, 178, 154, 145, 135, 127, 132, 109, 115, 129, 110, 144]

RMSE (other with 465 17 10): [109, 100, 97, 94, 85, 93, 88, 112, 98, 121, 139, 135, 158]

Average Taxi flows : [1095, 1126, 1071, 1015, 1019, 1051, 1101, 1037, 755, 796, 859, 710, 674]

In [None]:
# NOTE(review): this file name does not follow the "saving to ..." pattern
# printed by the simulation cell (model name, region columns, test start date);
# presumably it comes from an earlier run - confirm the file still exists or
# update the path.
resultsdf = pd.read_csv("/content/LSTM_two_variate__66_new_66_end.csv")
resultsdf.head()

Unnamed: 0.1,Unnamed: 0,66_new_forecast,66_end_forecast,66_new_test,66_end_test,timestamp
0,0,998.559728,560.217304,1141.0,595.0,2016-10-05 08:00:00
1,1,992.678063,696.645304,1103.0,724.0,2016-10-05 09:00:00
2,2,797.702052,676.854485,907.0,803.0,2016-10-05 10:00:00
3,3,763.601174,758.709401,797.0,825.0,2016-10-05 11:00:00
4,4,842.06761,888.98894,871.0,1022.0,2016-10-05 12:00:00


time: 25 ms


In [None]:
#read back the forecasts written by the 2016-10-05 simulation for the region 66 pair;
#the file name must match the "saving to ..." path printed by that simulation
resultsdf = pd.read_csv("/content/LSTM_two_var_MWHH_66_new_66_end_2016_10_05_800.csv")
resultsdf.head()

Unnamed: 0.1,Unnamed: 0,66_new_forecast,66_end_forecast,66_new_test,66_end_test,timestamp
0,0,1010.937213,541.247729,1141.0,595.0,2016-10-05 08:00:00
1,1,1039.500943,695.708975,1103.0,724.0,2016-10-05 09:00:00
2,2,849.303351,689.876416,907.0,803.0,2016-10-05 10:00:00
3,3,784.445345,752.201009,797.0,825.0,2016-10-05 11:00:00
4,4,885.360155,909.178052,871.0,1022.0,2016-10-05 12:00:00


time: 40.7 ms


In [None]:
#TBATS results for region 66, one file per flow direction
results_df_66_end = pd.read_csv("/content/gdrive/My Drive/urban-computing-project/TBATS_results/TBATS_66_end_2016_10_05_800.csv")
results_df_66_new = pd.read_csv("/content/gdrive/My Drive/urban-computing-project/TBATS_results/TBATS_66_new_2016_10_05_800.csv")

#side-by-side table: LSTM forecast, observed value and TBATS forecast for each flow,
#aligned on the row index of the LSTM results
all_results_df = pd.DataFrame({"timestamp": resultsdf["timestamp"]}).assign(
    y_new_LSTM_forecast=resultsdf["66_new_forecast"],
    y_new=resultsdf["66_new_test"],
    y_new_TBATS_forecast=results_df_66_new["yforecast"],
    y_end_LSTM_forecast=resultsdf["66_end_forecast"],
    y_end=resultsdf["66_end_test"],
    y_end_TBATS_forecast=results_df_66_end["yforecast"],
)
all_results_df.head()

Unnamed: 0,timestamp,y_new_LSTM_forecast,y_new,y_new_TBATS_forecast,y_end_LSTM_forecast,y_end,y_end_TBATS_forecast
0,2016-10-05 08:00:00,1010.937213,1141.0,936.009549,541.247729,595.0,533.711667
1,2016-10-05 09:00:00,1039.500943,1103.0,991.40312,695.708975,724.0,685.03368
2,2016-10-05 10:00:00,849.303351,907.0,854.976458,689.876416,803.0,714.773246
3,2016-10-05 11:00:00,784.445345,797.0,810.130331,752.201009,825.0,778.292041
4,2016-10-05 12:00:00,885.360155,871.0,856.986224,909.178052,1022.0,922.48575


time: 41.6 ms


In [None]:
# Preview the TBATS results loaded for the 66_end flow.
results_df_66_end.head()

Unnamed: 0.1,Unnamed: 0,timestamp,regionID_flow,ytest,yforecast
0,0,2016-10-05 08:00:00,66_end,595,533.711667
1,1,2016-10-05 09:00:00,66_end,724,685.03368
2,2,2016-10-05 10:00:00,66_end,803,714.773246
3,3,2016-10-05 11:00:00,66_end,825,778.292041
4,4,2016-10-05 12:00:00,66_end,1022,922.48575


time: 19.7 ms


### **All multivariate series LSTM**

In [None]:
#flat list of every flow column, interleaved as new, end, new, end, ...
all_regions_together_to_loop = []

for p in range(len(regions_to_loop_newflow)):
  all_regions_together_to_loop.extend([regions_to_loop_newflow[p], regions_to_loop_endflow[p]])

print(all_regions_together_to_loop)

['26_new', '26_end', '53_new', '53_end', '60_new', '60_end', '62_new', '62_end', '66_new', '66_end', '68_new', '68_end', '72_new', '72_end', '73_new', '73_end', '75_new', '75_end', '76_new', '76_end', '77_new', '77_end', '78_new', '78_end', '82_new', '82_end', '83_new', '83_end', '88_new', '88_end', '125_new', '125_end']
time: 2.53 ms


In [None]:
#a single group holding every flow column: one LSTM forecasts all series jointly
regions_to_loop_flows = [all_regions_together_to_loop]

#calendar columns given to the model as extra inputs
features_array = ["month", "weekday", "hour", "holiday"]

#MAY BE MODIFIED!!!!!!!
statistical_method = "LSTM_multivar_"
end_validation_index = convertDatetoHourIndex([10, 5, 8], 2016)
test_startDate = "2016-10-05 08:00"
test_start_date_title = "2016_10_05_800"
base_results_csv_path = "/content/gdrive/My Drive/urban-computing-project/LSTM_multivariate_results/"

#the Drive folder above is overridden: results are written to /content/
base_results_csv_path = "/content/"

#hour indexes delimiting the training, validation and test windows
start_training_index = 0
end_training_index = end_validation_index - validation_range
end_testing_index = end_validation_index + test_range

#forecast horizons (in hours) at which the metrics are reported
period_forecast_array_validation = [1, 2, 3, 4, 5, 10, 12, 16, 24, 48, 72, 168, 336]
period_forecast_array_test = [1, 2, 3, 4, 5, 10, 12, 16, 24, 48, 72, 168, 336, 504]
period_forecast_array = [period_forecast_array_validation, period_forecast_array_test]

#lstm parameters
lstm_model_name = statistical_method
sequence_length = 6
batch_size = 64
n_weights = 2000
number_epochs_array = [10, 10]
learning_rate_array = [0.01, 0.001]
verboseValue = 1

print("Testing on :", test_startDate)
print("Features :", features_array)
print()

for regionIDsarray in regions_to_loop_flows:

  print("Region IDs : ALL REGION IDs")

  #fixed file suffix: the individual column names are not spelled out in the file name
  regionsIDstr = "all_regions"

  #targets: one column per flow, rows from start_training_index up to end_testing_index
  data_set = np.column_stack([array(taxitrips_df[regionIDval][start_training_index:end_testing_index]) for regionIDval in regionIDsarray])

  #features: one column per calendar variable, same rows as the targets
  features_set = np.column_stack([array(taxitrips_df[feature_title][start_training_index:end_testing_index]) for feature_title in features_array])

  results_CSV_FilePath = base_results_csv_path + lstm_model_name + regionsIDstr + "_" + test_start_date_title + ".csv"

  print("saving to", results_CSV_FilePath)

  print()

  runLSTMSimulation(results_CSV_FilePath, regionIDsarray, data_set, features_set, test_startDate, end_training_index, period_forecast_array, sequence_length, batch_size, n_weights, lstm_model_name, number_epochs_array, learning_rate_array, verboseValue)
