# Travel Time Estimation

## ETA (Estimated Time of Arrival)

## ETD (Estimated Time of Departure)

## 1. Importing libraries

In [1]:
## Created on 7 Feb, 2021 08:23 PM IST 
## For IXIGO Evaluation

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.express as px
import datetime
import seaborn as sns
import numpy as np
import math
from time import strftime
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## 2. Load Data

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

## 3. Basic Info on Data

In [3]:
train.head()


Unnamed: 0,runDate,stations,trainCode,trainStationId,scheduledArrival,scheduledDeparture,actualArrival,actualDeparture,distance,dayCount,ArrivalDelay,DepartureDelay
0,2020-01-01,FZR,12138,2657812,,2020-01-01 21:40:00,2020-01-01 00:00:00,2020-01-01 21:40:00,0,0,0,0
1,2020-01-01,FDK,12138,2659393,2020-01-01 22:05:00,2020-01-01 22:07:00,2020-01-01 22:16:00,2020-01-01 22:18:00,32,0,11,11
2,2020-01-01,KKP,12138,2658108,2020-01-01 22:23:00,2020-01-01 22:25:00,2020-01-01 22:37:00,2020-01-01 22:38:00,45,0,14,13
3,2020-01-01,GJUT,12138,2734206,2020-01-01 22:38:00,2020-01-01 22:40:00,2020-01-01 22:56:00,2020-01-01 22:57:00,61,0,18,17
4,2020-01-01,GNA,12138,2742572,2020-01-01 22:54:00,2020-01-01 22:56:00,2020-01-01 23:14:00,2020-01-01 23:15:00,76,0,20,19


In [4]:
test.head()

Unnamed: 0,runDate,stations,trainId,trainCode,index,trainStationId,scheduledArrival,scheduledDeparture,distance,dayCount,day
0,2020-02-20,FZR,11528,12138,1,2657812,,2020-02-20 21:40:00,0,0,Thursday
1,2020-02-20,FDK,11528,12138,2,2659393,2020-02-20 22:05:00,2020-02-20 22:07:00,32,0,Thursday
2,2020-02-20,KKP,11528,12138,3,2658108,2020-02-20 22:23:00,2020-02-20 22:25:00,45,0,Thursday
3,2020-02-20,GJUT,11528,12138,4,2734206,2020-02-20 22:38:00,2020-02-20 22:40:00,61,0,Thursday
4,2020-02-20,GNA,11528,12138,5,2742572,2020-02-20 22:54:00,2020-02-20 22:56:00,76,0,Thursday


In [5]:
print(train.shape, test.shape)

(31875, 12) (695, 11)


### Missing Columns 
1. Train ID 
2. Index
3. Actual Arrival 
4. Actual Departure 
5. Day 
6. Arrival Delay 
7. Departure Delay

In [6]:
## Info

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31875 entries, 0 to 31874
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   runDate             31875 non-null  object
 1   stations            31875 non-null  object
 2   trainCode           31875 non-null  int64 
 3   trainStationId      31875 non-null  int64 
 4   scheduledArrival    31184 non-null  object
 5   scheduledDeparture  31097 non-null  object
 6   actualArrival       31875 non-null  object
 7   actualDeparture     31775 non-null  object
 8   distance            31875 non-null  int64 
 9   dayCount            31875 non-null  int64 
 10  ArrivalDelay        31875 non-null  int64 
 11  DepartureDelay      31875 non-null  int64 
dtypes: int64(6), object(6)
memory usage: 2.9+ MB


## 4. Correcting inconsistencies in Data

1. Checking percentage of NA values
2. Dropping NA values 
3. Imputing NA values
4. Renaming Column Names
5. Converting Data Types to datetime64[ns] format.

### 4.1 Imputing Data {2}
We can easily remove all the NA values but after careful filter rows inspection, it has come to our notice that stations like "CSMT" would have no data point left. Hence we cannot simply drop the data we have to impute the data after checking the values from "ArrivalDelay" and "DepartureDelay". 

NA Values in 
SELECT NA values

1. scheduledArrival = actualArrival - ArrivalDelay
2. scheduledDeparture = actualDeparture - DepartureDelay

Since, there are no NA values in actualArrival, we jump to actualDeparture

3. actualDeparture = scheduledDeparture + DepartureDelay  ( In cases where scheduledDeparture is available)

There are no cases of NA in ArrivalDelay, DepartureDelay.

In [7]:
## Percentage of missing values in each column

missing_values = pd.DataFrame({c:(train[c].isna().sum()/len(train))*100 for c in train.columns},index=["% missing values in train"])
missing_values = missing_values.append(pd.DataFrame({c:(test[c].isna().sum()/len(test))*100 for c in test.columns},index=["% missing values in test"]))
missing_values

Unnamed: 0,runDate,stations,trainCode,trainStationId,scheduledArrival,scheduledDeparture,actualArrival,actualDeparture,distance,dayCount,ArrivalDelay,DepartureDelay,trainId,index,day
% missing values in train,0.0,0.0,0.0,0.0,2.167843,2.440784,0.0,0.313725,0.0,0.0,0.0,0.0,,,
% missing values in test,0.0,0.0,0.0,0.0,2.158273,2.446043,,,0.0,0.0,,,0.0,0.0,0.0


In [8]:
train = train.rename(columns = {'scheduledArrival': 'sA', 'scheduledDeparture': 'sD', 'actualArrival': 'aA', 'actualDeparture': 'aD'})
test = test.rename(columns = {'scheduledArrival': 'sA', 'scheduledDeparture': 'sD'})


to_dt = lambda x: pd.to_datetime(x)

train['runDate'] = pd.to_datetime(train['runDate'])
train['aA'] = pd.to_datetime(train['aA'])
train['aD'] = pd.to_datetime(train['aD'])
train['sA'] = pd.to_datetime(train['sA'])
train['sD'] = pd.to_datetime(train['sD'])
test['sA'] = pd.to_datetime(test['sA'])
test['sD'] = pd.to_datetime(test['sD'])
test['runDate'] = pd.to_datetime(test['runDate'])


train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31875 entries, 0 to 31874
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   runDate         31875 non-null  datetime64[ns]
 1   stations        31875 non-null  object        
 2   trainCode       31875 non-null  int64         
 3   trainStationId  31875 non-null  int64         
 4   sA              31184 non-null  datetime64[ns]
 5   sD              31097 non-null  datetime64[ns]
 6   aA              31875 non-null  datetime64[ns]
 7   aD              31775 non-null  datetime64[ns]
 8   distance        31875 non-null  int64         
 9   dayCount        31875 non-null  int64         
 10  ArrivalDelay    31875 non-null  int64         
 11  DepartureDelay  31875 non-null  int64         
dtypes: datetime64[ns](5), int64(6), object(1)
memory usage: 2.9+ MB


## 5. Data Wrangling, Feature Engineering and EDA

1. Finding Unique Values in every Feature
2. Dropping Redundant Columns
3. Imputing NaN
4. Dropping some feautures
5. Breaking `datetime` to columns namely year, month, date, hours, minutes, secs

In [9]:
for i in range(len(train.columns)): 
    # print("Unique Values in " + train.columns[i] +  " = "  + " {} ".format(train[train.columns[i]].unique()))
    print(" Unique Values in " + train.columns[i] +  " = "  + " {}  ".format(train[train.columns[i]].unique().shape))

 Unique Values in runDate =  (50,)  
 Unique Values in stations =  (349,)  
 Unique Values in trainCode =  (15,)  
 Unique Values in trainStationId =  (349,)  
 Unique Values in sA =  (24480,)  
 Unique Values in sD =  (23505,)  
 Unique Values in aA =  (25594,)  
 Unique Values in aD =  (25496,)  
 Unique Values in distance =  (573,)  
 Unique Values in dayCount =  (3,)  
 Unique Values in ArrivalDelay =  (501,)  
 Unique Values in DepartureDelay =  (455,)  


After finding the unique values in all the features we get to know that the data is from the time span `runDate` 2020-01-01 to 2020-02-19 among 337  `stations` with 15 trains `trainCode` . Train Stations having their 337 `trainStationID` with Arrival and Departure frequency of 25051 in this time span. Trains take around `dayCount` from 0 to 2

Since, `stations` are already coded with their `trainStationID` we can remove one of them 

In [10]:
## Dropping the `stations` column to remove redundancy 
try:
    train = train.drop('stations', axis=1)
    test = test.drop('stations', axis=1)
except: 
    train
train

Unnamed: 0,runDate,trainCode,trainStationId,sA,sD,aA,aD,distance,dayCount,ArrivalDelay,DepartureDelay
0,2020-01-01,12138,2657812,NaT,2020-01-01 21:40:00,2020-01-01 00:00:00,2020-01-01 21:40:00,0,0,0,0
1,2020-01-01,12138,2659393,2020-01-01 22:05:00,2020-01-01 22:07:00,2020-01-01 22:16:00,2020-01-01 22:18:00,32,0,11,11
2,2020-01-01,12138,2658108,2020-01-01 22:23:00,2020-01-01 22:25:00,2020-01-01 22:37:00,2020-01-01 22:38:00,45,0,14,13
3,2020-01-01,12138,2734206,2020-01-01 22:38:00,2020-01-01 22:40:00,2020-01-01 22:56:00,2020-01-01 22:57:00,61,0,18,17
4,2020-01-01,12138,2742572,2020-01-01 22:54:00,2020-01-01 22:56:00,2020-01-01 23:14:00,2020-01-01 23:15:00,76,0,20,19
...,...,...,...,...,...,...,...,...,...,...,...
31870,2020-02-19,12925,2658284,2020-02-20 17:19:00,2020-02-20 17:21:00,2020-02-20 18:39:00,2020-02-20 18:41:00,1723,1,80,80
31871,2020-02-19,12925,2657701,2020-02-20 17:37:00,2020-02-20 17:39:00,2020-02-20 19:04:00,2020-02-20 19:06:00,1739,1,87,87
31872,2020-02-19,12925,2657676,2020-02-20 17:53:00,2020-02-20 17:58:00,2020-02-20 19:15:00,2020-02-20 19:30:00,1744,1,82,92
31873,2020-02-19,12925,2658299,2020-02-20 18:30:00,2020-02-20 18:32:00,2020-02-20 19:58:00,2020-02-20 20:00:00,1780,1,88,88


In [11]:
## Imputing NA values in sA and sD


to_td = lambda x : pd.to_timedelta(x, unit = 'm')


train['sA'] = train['aA'] -  train['ArrivalDelay'].apply(to_td)
train['sD'] = train['aD'] - train['DepartureDelay'].apply(to_td)

In [12]:
missing_values = pd.DataFrame({c:(train[c].isna().sum()/len(train))*100 for c in train.columns},index=["% missing values in train"])
missing_values = missing_values.append(pd.DataFrame({c:(test[c].isna().sum()/len(test))*100 for c in test.columns},index=["% missing values in test"]))
missing_values

Unnamed: 0,runDate,trainCode,trainStationId,sA,sD,aA,aD,distance,dayCount,ArrivalDelay,DepartureDelay,trainId,index,day
% missing values in train,0.0,0.0,0.0,0.0,0.313725,0.0,0.313725,0.0,0.0,0.0,0.0,,,
% missing values in test,0.0,0.0,0.0,2.158273,2.446043,,,0.0,0.0,,,0.0,0.0,0.0


### After Imputing the data 

Now we know that `ArrivalDelay`, `DepartureDelay` were derived columns but we can use those columns to derive the missing values in `scheduledArrival` and `scheduledDeparture`. 

For the missing values in test `sA` and `sD`; train `sD`. We will drop those values in both test and train.


In [13]:
## Dropping NA values  

train = train.dropna()
test = test.dropna()

### Dropping some features

Since `Arrival Delay` is a derived column we can remove the `actualArrival` and `actualDeparture` from the train data

In [14]:
try:
    train = train.drop('aA', axis = 1)
    train = train.drop('aD', axis = 1)
except:
    train
train

Unnamed: 0,runDate,trainCode,trainStationId,sA,sD,distance,dayCount,ArrivalDelay,DepartureDelay
0,2020-01-01,12138,2657812,2020-01-01 00:00:00,2020-01-01 21:40:00,0,0,0,0
1,2020-01-01,12138,2659393,2020-01-01 22:05:00,2020-01-01 22:07:00,32,0,11,11
2,2020-01-01,12138,2658108,2020-01-01 22:23:00,2020-01-01 22:25:00,45,0,14,13
3,2020-01-01,12138,2734206,2020-01-01 22:38:00,2020-01-01 22:40:00,61,0,18,17
4,2020-01-01,12138,2742572,2020-01-01 22:54:00,2020-01-01 22:56:00,76,0,20,19
...,...,...,...,...,...,...,...,...,...
31870,2020-02-19,12925,2658284,2020-02-20 17:19:00,2020-02-20 17:21:00,1723,1,80,80
31871,2020-02-19,12925,2657701,2020-02-20 17:37:00,2020-02-20 17:39:00,1739,1,87,87
31872,2020-02-19,12925,2657676,2020-02-20 17:53:00,2020-02-20 17:58:00,1744,1,82,92
31873,2020-02-19,12925,2658299,2020-02-20 18:30:00,2020-02-20 18:32:00,1780,1,88,88


### Breaking down the datetime values

1. Dropping the year 2020 from all the datetime formats
2. Dropping the `second` from datetime values. 
3. Breaking all the date time values in `test` and `train` data.

Not taking year and seconds because of having the same value all along 
 

In [15]:
## For train data

train['runDate_month'] = pd.DatetimeIndex(train['runDate']).month
train['runDate_date'] = pd.DatetimeIndex(train['runDate']).day

train['sA_month'] = pd.DatetimeIndex(train['sA']).month
train['sA_day'] = pd.DatetimeIndex(train['sA']).day
train['sA_minute'] = pd.DatetimeIndex(train['sA']).minute
train['sA_hour'] = pd.DatetimeIndex(train['sA']).hour

train['sD_month'] = pd.DatetimeIndex(train['sD']).month
train['sD_day'] = pd.DatetimeIndex(train['sD']).day
train['sD_hour'] = pd.DatetimeIndex(train['sD']).hour
train['sD_minute'] = pd.DatetimeIndex(train['sD']).minute

train
## Not taking year and seconds because of having the same value all along 


Unnamed: 0,runDate,trainCode,trainStationId,sA,sD,distance,dayCount,ArrivalDelay,DepartureDelay,runDate_month,runDate_date,sA_month,sA_day,sA_minute,sA_hour,sD_month,sD_day,sD_hour,sD_minute
0,2020-01-01,12138,2657812,2020-01-01 00:00:00,2020-01-01 21:40:00,0,0,0,0,1,1,1,1,0,0,1,1,21,40
1,2020-01-01,12138,2659393,2020-01-01 22:05:00,2020-01-01 22:07:00,32,0,11,11,1,1,1,1,5,22,1,1,22,7
2,2020-01-01,12138,2658108,2020-01-01 22:23:00,2020-01-01 22:25:00,45,0,14,13,1,1,1,1,23,22,1,1,22,25
3,2020-01-01,12138,2734206,2020-01-01 22:38:00,2020-01-01 22:40:00,61,0,18,17,1,1,1,1,38,22,1,1,22,40
4,2020-01-01,12138,2742572,2020-01-01 22:54:00,2020-01-01 22:56:00,76,0,20,19,1,1,1,1,54,22,1,1,22,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31870,2020-02-19,12925,2658284,2020-02-20 17:19:00,2020-02-20 17:21:00,1723,1,80,80,2,19,2,20,19,17,2,20,17,21
31871,2020-02-19,12925,2657701,2020-02-20 17:37:00,2020-02-20 17:39:00,1739,1,87,87,2,19,2,20,37,17,2,20,17,39
31872,2020-02-19,12925,2657676,2020-02-20 17:53:00,2020-02-20 17:58:00,1744,1,82,92,2,19,2,20,53,17,2,20,17,58
31873,2020-02-19,12925,2658299,2020-02-20 18:30:00,2020-02-20 18:32:00,1780,1,88,88,2,19,2,20,30,18,2,20,18,32


Since, Number of features of the model must match the input. Model n_features is 14 and input n_features is 17 

In [16]:
## For test data

test['runDate_month'] = pd.DatetimeIndex(test['runDate']).month
test['runDate_date'] = pd.DatetimeIndex(test['runDate']).day

test['sA_month'] = pd.DatetimeIndex(test['sA']).month
test['sA_day'] = pd.DatetimeIndex(test['sA']).day
test['sA_hour'] = pd.DatetimeIndex(test['sA']).hour
test['sA_minute'] = pd.DatetimeIndex(test['sA']).minute

test['sD_month'] = pd.DatetimeIndex(test['sD']).month
test['sD_day'] = pd.DatetimeIndex(test['sD']).day
test['sD_hour'] = pd.DatetimeIndex(test['sD']).hour
test['sD_minute'] = pd.DatetimeIndex(test['sD']).minute

test = test.drop(['trainId', 'index'], axis=1)
test

Unnamed: 0,runDate,trainCode,trainStationId,sA,sD,distance,dayCount,day,runDate_month,runDate_date,sA_month,sA_day,sA_hour,sA_minute,sD_month,sD_day,sD_hour,sD_minute
1,2020-02-20,12138,2659393,2020-02-20 22:05:00,2020-02-20 22:07:00,32,0,Thursday,2,20,2,20,22,5,2,20,22,7
2,2020-02-20,12138,2658108,2020-02-20 22:23:00,2020-02-20 22:25:00,45,0,Thursday,2,20,2,20,22,23,2,20,22,25
3,2020-02-20,12138,2734206,2020-02-20 22:38:00,2020-02-20 22:40:00,61,0,Thursday,2,20,2,20,22,38,2,20,22,40
4,2020-02-20,12138,2742572,2020-02-20 22:54:00,2020-02-20 22:56:00,76,0,Thursday,2,20,2,20,22,54,2,20,22,56
5,2020-02-20,12138,2657716,2020-02-20 23:25:00,2020-02-20 23:50:00,88,0,Thursday,2,20,2,20,23,25,2,20,23,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689,2020-02-20,12925,2658435,2020-02-21 16:46:00,2020-02-21 16:51:00,1687,1,Thursday,2,20,2,21,16,46,2,21,16,51
690,2020-02-20,12925,2658284,2020-02-21 17:19:00,2020-02-21 17:21:00,1723,1,Thursday,2,20,2,21,17,19,2,21,17,21
691,2020-02-20,12925,2657701,2020-02-21 17:37:00,2020-02-21 17:39:00,1739,1,Thursday,2,20,2,21,17,37,2,21,17,39
692,2020-02-20,12925,2657676,2020-02-21 17:53:00,2020-02-21 17:58:00,1744,1,Thursday,2,20,2,21,17,53,2,21,17,58


In [17]:
## Dropping sA, sD, and runDate

train = train.drop(['sA', 'sD', 'runDate'], axis=1)
test = test.drop(['sA', 'sD', 'runDate', 'day'], axis=1)

# Since the day is a derived feature and there are only 2 uniques values for 'day' i.e. Wednesday or Thursday. Hence dropping 'day' also
train


Unnamed: 0,trainCode,trainStationId,distance,dayCount,ArrivalDelay,DepartureDelay,runDate_month,runDate_date,sA_month,sA_day,sA_minute,sA_hour,sD_month,sD_day,sD_hour,sD_minute
0,12138,2657812,0,0,0,0,1,1,1,1,0,0,1,1,21,40
1,12138,2659393,32,0,11,11,1,1,1,1,5,22,1,1,22,7
2,12138,2658108,45,0,14,13,1,1,1,1,23,22,1,1,22,25
3,12138,2734206,61,0,18,17,1,1,1,1,38,22,1,1,22,40
4,12138,2742572,76,0,20,19,1,1,1,1,54,22,1,1,22,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31870,12925,2658284,1723,1,80,80,2,19,2,20,19,17,2,20,17,21
31871,12925,2657701,1739,1,87,87,2,19,2,20,37,17,2,20,17,39
31872,12925,2657676,1744,1,82,92,2,19,2,20,53,17,2,20,17,58
31873,12925,2658299,1780,1,88,88,2,19,2,20,30,18,2,20,18,32


## 6. Model Selection

1. DecisionTreeRegressor
2. RandomForestRegressor

## 6.1 Model `ArrivalDelay`

In [18]:
# Get the Dependent and Independent Features.
##  Model AD

X = train.drop(['ArrivalDelay', 'DepartureDelay'], axis=1)
Y = train['ArrivalDelay']
print(X.shape, Y.shape, train.shape, test.shape)


(31775, 14) (31775,) (31775, 16) (663, 14)


Creating a test data from the train data to check the accuracy level of the model by cross validation with the existing data

In [19]:
## 6.1.1 DecisionTreeRegressor

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

dt_reg_AD = DecisionTreeRegressor(random_state=0)
dt_reg_AD.fit(X_train, Y_train)


DecisionTreeRegressor(random_state=0)

## Feature Importance 

Checked whether feature Importance is bogus by removing the `trainCode` feature and got to know that regressor score becomes `0.1113` but when the `trainCode` is supplied the score becomes mpre than `93`. But since `DepartureDelay` cannot be passed the score remains out to be `0.77763`.


In [20]:
print("Feature Importance:\n")
for name, importance in zip(X.columns, np.sort(dt_reg_AD.feature_importances_)[::-1]):
    print("{} -- {:.2f}".format(name, importance))


dt_reg_AD.score(X_test, Y_test)
y_pred_dt_AD = dt_reg_AD.predict(X_test)


Feature Importance:

trainCode -- 0.30
trainStationId -- 0.24
distance -- 0.15
dayCount -- 0.09
runDate_month -- 0.09
runDate_date -- 0.03
sA_month -- 0.03
sA_day -- 0.03
sA_minute -- 0.02
sA_hour -- 0.01
sD_month -- 0.01
sD_day -- 0.01
sD_hour -- 0.01
sD_minute -- 0.00


In [21]:
## For actual Data

preds = pd.DataFrame()
dt_reg_AD.fit(X, Y)
preds['ArrivalDelay_DT'] = dt_reg_AD.predict(test)


In [22]:
## 6.1.2 RandomForestRegressor

rf_reg_AD = RandomForestRegressor(random_state=0, max_depth = 5, min_samples_split =  2, n_estimators = 20)
rf_reg_AD.fit(X_train, Y_train)

print("Feature Importance:\n")
for name, importance in zip(X.columns, np.sort(rf_reg_AD.feature_importances_)[::-1]):
    print("{} -- {:.2f}".format(name, importance))


rf_reg_AD.score(X_test, Y_test)

Feature Importance:

trainCode -- 0.39
trainStationId -- 0.34
distance -- 0.07
dayCount -- 0.06
runDate_month -- 0.05
runDate_date -- 0.04
sA_month -- 0.02
sA_day -- 0.01
sA_minute -- 0.01
sA_hour -- 0.01
sD_month -- 0.00
sD_day -- 0.00
sD_hour -- 0.00
sD_minute -- 0.00


0.3087801777044078

In [23]:
y_pred_rf_AD = rf_reg_AD.predict(X_test)

In [24]:
## Grid Search


params = {'n_estimators':[10,20], 'max_depth':[None, 5], 'min_samples_split':[2,3]}
rf_reg_AD = RandomForestRegressor(random_state=0)
reg_grid_AD = GridSearchCV(rf_reg_AD, params, cv=5, n_jobs=-1, verbose=1)
reg_grid_AD.fit(X, Y)
print(reg_grid_AD.best_params_)
print(reg_grid_AD.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   10.7s finished
{'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 20}
-0.6408014325728745


In [25]:
## For actual Data

rf_reg_AD.fit(X, Y)
preds['ArrivalDelay_RF'] = rf_reg_AD.predict(test)
preds

Unnamed: 0,ArrivalDelay_DT,ArrivalDelay_RF
0,2.0,9.66
1,19.0,13.97
2,29.0,16.81
3,16.0,8.17
4,4.0,3.54
...,...,...
658,76.0,69.62
659,76.0,71.79
660,76.0,70.12
661,67.0,68.13


## 6.2 Model `DepartureDelay`

In [26]:
# Get the Dependent and Independent Features.
##  Model DD

X = train.drop(['ArrivalDelay', 'DepartureDelay'], axis=1)
Y = train['DepartureDelay']
print(X.shape, Y.shape, train.shape, test.shape)

(31775, 14) (31775,) (31775, 16) (663, 14)


In [27]:
## 6.2.1 DecisionTreeRegressor

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

dt_reg_DD = DecisionTreeRegressor(random_state=0)
dt_reg_DD.fit(X_train, Y_train)


DecisionTreeRegressor(random_state=0)

In [28]:
print("Feature Importance:\n")
for name, importance in zip(X.columns, np.sort(dt_reg_DD.feature_importances_)[::-1]):
    print("{} -- {:.2f}".format(name, importance))


dt_reg_DD.score(X_test, Y_test)
y_pred_dt_DD = dt_reg_DD.predict(X_test)

Feature Importance:

trainCode -- 0.23
trainStationId -- 0.22
distance -- 0.16
dayCount -- 0.12
runDate_month -- 0.12
runDate_date -- 0.03
sA_month -- 0.03
sA_day -- 0.03
sA_minute -- 0.03
sA_hour -- 0.02
sD_month -- 0.01
sD_day -- 0.01
sD_hour -- 0.01
sD_minute -- 0.00


In [29]:
## For actual Data

dt_reg_DD.fit(X, Y)
preds['DepartureDelay_DT'] = dt_reg_DD.predict(test)

In [30]:
## 6.2.2 RandomForestRegressor

rf_reg_DD = RandomForestRegressor(random_state=0, max_depth = 5, min_samples_split= 2, n_estimators= 20)
rf_reg_DD.fit(X_train, Y_train)

print("Feature Importance:\n")
for name, importance in zip(X.columns, np.sort(rf_reg_DD.feature_importances_)[::-1]):
    print("{} -- {:.2f}".format(name, importance))


rf_reg_DD.score(X_test, Y_test)

Feature Importance:

trainCode -- 0.36
trainStationId -- 0.35
distance -- 0.07
dayCount -- 0.05
runDate_month -- 0.04
runDate_date -- 0.04
sA_month -- 0.03
sA_day -- 0.02
sA_minute -- 0.02
sA_hour -- 0.01
sD_month -- 0.00
sD_day -- 0.00
sD_hour -- 0.00
sD_minute -- 0.00


0.2924207970153211

In [31]:
y_pred_rf_DD = rf_reg_DD.predict(X_test)

### Grid Search for Model DD

In [32]:
params = {'n_estimators':[10,20], 'max_depth':[None, 5], 'min_samples_split':[2,3]}
rf_reg_DD = RandomForestRegressor(random_state=0)
reg_grid_DD = GridSearchCV(rf_reg_DD, params, cv=5, n_jobs=-1, verbose=1)
reg_grid_DD.fit(X, Y)
print(reg_grid_DD.best_params_)
print(reg_grid_DD.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    8.9s finished
{'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 20}
-0.6289715615423939


In [33]:
## For actual Data

rf_reg_DD.fit(X, Y)
preds['DepartureDelay_RF'] = rf_reg_DD.predict(test)

In [34]:
preds

Unnamed: 0,ArrivalDelay_DT,ArrivalDelay_RF,DepartureDelay_DT,DepartureDelay_RF
0,2.0,9.66,9.0,9.92
1,19.0,13.97,21.0,14.73
2,29.0,16.81,1.0,11.66
3,16.0,8.17,1.0,11.05
4,4.0,3.54,1.0,5.03
...,...,...,...,...
658,76.0,69.62,72.0,70.34
659,76.0,71.79,76.0,73.43
660,76.0,70.12,76.0,72.51
661,67.0,68.13,68.0,70.32


In [38]:
test = test.append(preds)
test.to_csv('Anoushkrit_Goel_Solution.csv')