### Importing required packages


In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import requests as re
import json
from tqdm import tqdm
from datetime import date,timedelta
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
import numpy as np
import pickle
import matplotlib.pyplot as plt

### Data extraction and data exploration

In [2]:
# start and end data of data extraction
start = date(2020,8,1) # start date 
end = date(2021,9,24) # end date

In [3]:
mob_data = pd.DataFrame()
# going through all the data
for i in tqdm(range((end - start).days+1),desc = 'DataLoading'):
    date = start + timedelta(i)
    # going through all the intersection
    for j in range(1,4):
        params = {
        "date": date,
        "intersectionid": j
            }
        # sending requests
        res = re.get('https://opendata.citywindsor.ca/api/traffic',params = params,verify=False)
        # converting the json data to dataframe
        df = pd.json_normalize(res.json(),record_path=['traffic'],meta=['intersectionDescription'])
        frames = [mob_data,df]
        mob_data = pd.concat(frames)     

DataLoading: 100%|██████████| 420/420 [49:14<00:00,  7.04s/it]


In [5]:
# extracting date and time from timestamp
mob_data['day'] = pd.to_datetime(mob_data.timeStamp).dt.day
mob_data['month'] = pd.to_datetime(mob_data.timeStamp).dt.month
mob_data['year'] = pd.to_datetime(mob_data.timeStamp).dt.year
mob_data['hour'] = pd.to_datetime(mob_data.timeStamp).dt.hour
mob_data['day of the week'] = pd.to_datetime(mob_data.timeStamp).dt.dayofweek

# Selecting only required columns
mob_data = mob_data[['intersectionDescription','year','month','day','hour','day of the week','qty']]

In [None]:
mod_data.to_csv('mob_data.csv',index = False)

In [6]:
mob_data.head()

Unnamed: 0,intersectionDescription,year,month,day,hour,day of the week,qty
0,Dorchester Road and Huron Church Road,2020,8,1,0,5,2.0
1,Dorchester Road and Huron Church Road,2020,8,1,0,5,2.0
2,Dorchester Road and Huron Church Road,2020,8,1,0,5,4.0
3,Dorchester Road and Huron Church Road,2020,8,1,0,5,2.0
4,Dorchester Road and Huron Church Road,2020,8,1,0,5,3.0


In [8]:
# making different dataframes for all three intersections
totten_street = mob_data[mob_data['intersectionDescription'] == 'Totten Street and Huron Church Road']
dorchester_road = mob_data[mob_data['intersectionDescription'] == 'Dorchester Road and Huron Church Road']
malden_road = mob_data[mob_data['intersectionDescription'] == 'Malden Road and Huron Church Road']

In [9]:
# making the required changes in the data
totten_street_new = totten_street.groupby(['year','month','day of the week','day','hour'],as_index = False)['qty'].sum()
dorchester_road_new = dorchester_road.groupby(['year','month','day of the week','day','hour'],as_index = False)['qty'].sum()
malden_road_new = malden_road.groupby(['year','month','day of the week','day','hour'],as_index = False)['qty'].sum()

In [10]:
totten_street_new.to_csv('totten.csv',index = False)
dorchester_road_new.to_csv('dorchester.csv',index = False)
malden_road_new.to_csv('malden.csv',index = False)

In [12]:
# extracting the train and test data for all the three given intersection
# we have used the september 2021 for testing data for all three intersection

X_test_totten = totten_street_new[(totten_street_new['month'] == 9) & (totten_street_new['year'] == 2021)].drop('qty',axis = 1)
y_test_totten = totten_street_new[(totten_street_new['month'] == 9) & (totten_street_new['year'] == 2021)]['qty']

X_train_totten = totten_street_new[(totten_street_new['month'] < 9) | (totten_street_new['year'] == 2020) ].drop('qty',axis = 1)
y_train_totten = totten_street_new[(totten_street_new['month'] < 9) | (totten_street_new['year'] == 2020)]['qty']


X_test_dorchester = dorchester_road_new[(dorchester_road_new['month'] == 9) & (dorchester_road_new['year'] == 2021)].drop('qty',axis = 1)
y_test_dorchester = dorchester_road_new[(dorchester_road_new['month'] == 9) & (dorchester_road_new['year'] == 2021)]['qty']

X_train_dorchester = dorchester_road_new[(dorchester_road_new['month'] < 9) | (dorchester_road_new['year'] == 2020) ].drop('qty',axis = 1)
y_train_dorchester = dorchester_road_new[(dorchester_road_new['month'] < 9) | (dorchester_road_new['year'] == 2020)]['qty']


X_test_malden = malden_road_new[(malden_road_new['month'] == 9) & (malden_road_new['year'] == 2021)].drop('qty',axis = 1)
y_test_malden = malden_road_new[(malden_road_new['month'] == 9) & (malden_road_new['year'] == 2021)]['qty']

X_train_malden = malden_road_new[(malden_road_new['month'] < 9) | (malden_road_new['year'] == 2020) ].drop('qty',axis = 1)
y_train_malden = malden_road_new[(malden_road_new['month'] < 9) | (malden_road_new['year'] == 2020)]['qty']

### XGBoost Regressor

In [14]:
# selecting parameters to tune the xgboost regressor
xgb_params = {
    'reg':'squarederror',
    'max_depth': [int(i) for i in np.linspace(2,10,5)],
    'min_child_weight':[int(i) for i in np.linspace(2,10,5)],
    'gamma' : [i for i in np.linspace(0.0,0.5,6)],
    'subsample' : [ i for i in np.linspace(0.6,1.0,9)],
    'colsample_bytree' : [i for i in np.linspace(0.6,1.0,9)],
    'reg_lambda' : [0.01,0.1,1.0,10,20],
    'learning_rate' : [0.05,0.1,0.15,0.2,0.25,0.3]
}
xgb_params

{'colsample_bytree': [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0],
 'gamma': [0.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5],
 'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
 'max_depth': [2, 4, 6, 8, 10],
 'min_child_weight': [2, 4, 6, 8, 10],
 'reg': 'squarederror',
 'reg_lambda': [0.01, 0.1, 1.0, 10, 20],
 'subsample': [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]}

In [15]:
lst = ['totten','dorchester','malden'] # list of intersection 

In [16]:
# creating and saving the models for all the three intersection
for i in lst:
  xgb = XGBRegressor()
  print(f'For {i} intersection')
  print('')
  xgb_hptune = RandomizedSearchCV(xgb,xgb_params,cv=5,random_state = 0,n_iter = 10,n_jobs = -1,verbose = 2)
  X_train = locals()['X_train_' + i]
  y_train = locals()['y_train_' + i]
  xgb_hptune.fit(X_train,y_train)
  pickle.dump(xgb_hptune.best_estimator_,open(i+'_1.pkl','wb'))
  print('')

For totten intersection

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   21.3s finished



For dorchester intersection

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   19.5s finished



For malden intersection

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   19.8s finished





### RandomForest Regressor

In [23]:
for i in lst:

  X_train = locals()['X_train_' + i]
  y_train = locals()['y_train_' + i]

  # Number of trees in random forest
  n_estimators = [int(x) for x in np.linspace(start = 50, stop = 250, num = 5)]
  # Number of features to consider at every split
  max_features = ['auto', 'sqrt','log2']
  # Maximum number of levels in tree
  max_depth = [int(x) for x in np.linspace(2, 10, num = 5)]
  max_depth.append(None)
  # Minimum number of samples required to split a node
  min_samples_split = [int(x) for x in np.linspace(2,20,5)]
  # Minimum number of samples required at each leaf node
  min_samples_leaf = [int(x) for x in np.linspace(2,10,5)]
  # Method of selecting samples for training each tree
  bootstrap = [True]

  # Create the random grid
  random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}

  print(f'For {i} initersection')
  print('')

  rf = RandomForestRegressor()
  rf_random_search = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2, random_state=9, n_jobs = -1)
  rf_random_search.fit(X_train,y_train)


  # for fine tuning I have used GridsearchCV after RandomizedSearchCV
  grid = {
      'n_estimators' : [rf_random_search.best_params_['n_estimators'],rf_random_search.best_params_['n_estimators']+50,rf_random_search.best_params_['n_estimators']+100],
      'min_samples_split' : [rf_random_search.best_params_['min_samples_split'],rf_random_search.best_params_['min_samples_split']-1,rf_random_search.best_params_['min_samples_split']+1],
      'min_samples_leaf' : [rf_random_search.best_params_['min_samples_leaf'],rf_random_search.best_params_['min_samples_leaf']-1,rf_random_search.best_params_['min_samples_leaf']+1],
      'max_depth' : [rf_random_search.best_params_['max_depth'],None],
      'max_features' : [rf_random_search.best_params_['max_features']],
      'bootstrap' : [rf_random_search.best_params_['bootstrap']]
  }
  rf_clf = RandomForestRegressor()
  rf_grid_search = GridSearchCV(rf_clf,grid,n_jobs = -1,cv=5,verbose = 2)
  rf_grid_search.fit(X_train,y_train)

  pickle.dump(rf_grid_search.best_estimator_,open(i+'_2.pkl','wb'))
  print('')


For totten initersection

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   30.8s finished


Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   57.1s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:  8.1min finished



For dorchester initersection

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   30.8s finished


Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   55.7s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:  7.4min finished



For malden initersection

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   31.4s finished


Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   58.6s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:  8.3min finished





### Checking the R2 score after averaging the prediction of both the models

##### Dorchester Road and Huron Church Road

In [40]:
model_1 = pickle.load(open('dorchester_1.pkl','rb'))
model_2 = pickle.load(open('dorchester_2.pkl','rb'))



In [42]:
y_pred_train = (model_1.predict(X_train_dorchester) + model_2.predict(X_train_dorchester))/2
r2_score(y_train_dorchester,y_pred_train)

0.9662424530840088

In [43]:
y_pred_test = (model_1.predict(X_test_dorchester) + model_2.predict(X_test_dorchester))/2
r2_score(y_test_dorchester,y_pred_test)

0.866805792905263

#### Totten Street and Huron Church Road

In [44]:
model_1 = pickle.load(open('totten_1.pkl','rb'))
model_2 = pickle.load(open('totten_2.pkl','rb'))



In [45]:
y_pred_train = (model_1.predict(X_train_totten) + model_2.predict(X_train_totten))/2
r2_score(y_train_totten,y_pred_train)

0.9586991370177642

In [46]:
y_pred_test = (model_1.predict(X_test_totten) + model_2.predict(X_test_totten))/2
r2_score(y_test_totten,y_pred_test)

0.8795961004299547

#### Malden Road and Huron Church Road

In [47]:
model_1 = pickle.load(open('malden_1.pkl','rb'))
model_2 = pickle.load(open('malden_2.pkl','rb'))



In [48]:
y_pred_train = (model_1.predict(X_train_malden) + model_2.predict(X_train_malden))/2
r2_score(y_train_malden,y_pred_train)

0.956636978461213

In [49]:
y_pred_test = (model_1.predict(X_test_malden) + model_2.predict(X_test_malden))/2
r2_score(y_test_dorchester,y_pred_test)

0.8938129016885403