In [1]:
import numpy as np
import pandas as pd
import psycopg2
import psycopg2.extras as extras
import matplotlib.pyplot as plt
from sklearn import preprocessing
from IPython.display import display, HTML, clear_output
from psycopg2 import Error
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

rand_seed = 4321

In [2]:
# set up connection variables
db_host = "localhost"
db_port = "5432"
db_user = "postgres"
db_pass = "hope"
db_name = "stib_transport"

# function to connect with postgres
def connect_postgres(db_host, db_port, db_user, db_pass, db_name):
    try:
        # connect to an existing database
        connection = psycopg2.connect(host = db_host,
                                      port = db_port,
                                      user = db_user,
                                      password = db_pass,
                                      database = db_name)
        # set auto-commit
        connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT);
        # create a cursor to perform database operations
        cur = connection.cursor()
        # print PostgreSQL details
        print("PostgreSQL server information")
        print(connection.get_dsn_parameters(), "\n")
        # execute a SQL query
        cur.execute("SELECT version();")
        # fetch result
        record = cur.fetchone()
        print("You are connected to - ", record, "\n")

    except (Exception, Error) as error:
        print("Error while connecting to PostgreSQL", error)
    else:
        return cur

In [3]:
# connect to postgres

cur = connect_postgres(db_host, db_port, db_user, db_pass, db_name)

PostgreSQL server information
{'user': 'postgres', 'dbname': 'stib_transport', 'host': 'localhost', 'port': '5432', 'tty': '', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'gssencmode': 'prefer', 'krbsrvname': 'postgres', 'target_session_attrs': 'any'} 

You are connected to -  ('PostgreSQL 14.5, compiled by Visual C++ build 1914, 64-bit',) 



## 1. Data Preprocessing & Transformations

In [4]:
# query 

cur.execute(
    f"""
    
    with base as (
        select
          (extract(epoch from vh_time) - 0) / (86400 - 0) as epoch_time_norm
        , line
        , direction
        , stop_seq
        , lower(vehicle) as vehicle
        , actual_headway - sched_avg_headway as extra_headway_duration
        from actual_headways 
        where day_category = 'Weekday'
    )

    , max_stop_seq as (
        select
          distinct
          line
        , direction
        , max(stop_seq) over(partition by line, direction order by stop_seq range between unbounded preceding and unbounded following) as max_stop_seq
        from stop_details_combined
    )

    , final_result as (
        select
          a.epoch_time_norm
        , a.line
        , a.direction - 1 as direction
        , (a.stop_seq::float - 1) / (b.max_stop_seq::float - 1) as stop_seq_norm
        , a.vehicle
        , a.extra_headway_duration
        from base a
        inner join max_stop_seq b
            on a.line = b.line
            and a.direction = b.direction
    )

    select * from final_result;
        
    """
)

df = pd.DataFrame(cur.fetchall(), columns = [desc[0] for desc in cur.description])

In [5]:
df.line = df.line.astype('int64')
df.extra_headway_duration = df.extra_headway_duration.astype('float64')
df.epoch_time_norm = df.epoch_time_norm.astype('float64')

In [6]:
df = df.join(pd.get_dummies(df.vehicle, prefix = 'vehicle'))
df = df.join(pd.get_dummies(df.direction, prefix = 'direction'))

In [7]:
df.line = df.line.apply(
    lambda x: str("{0:0=7d}".format(int(np.binary_repr(x))))
).astype('str')

df.head(3)

Unnamed: 0,epoch_time_norm,line,direction,stop_seq_norm,vehicle,extra_headway_duration,vehicle_bus,vehicle_metro,vehicle_tram,direction_0,direction_1
0,0.657986,1,0,0.0,metro,2.37,0,1,0,1,0
1,0.661701,1,0,0.0,metro,0.19,0,1,0,1,0
2,0.66544,1,0,0.0,metro,0.21,0,1,0,1,0


In [8]:
for i in range(7):
    df[f'line_bit_{i+1}'] = df['line'].apply(
    lambda x: x[i]
).astype('uint8')
    
df.drop(
    columns = [
        'line',
        'vehicle'
    ],
    inplace = True
)

df = df[['epoch_time_norm', 'stop_seq_norm', 'direction_0', 'direction_1',
       'vehicle_bus', 'vehicle_metro', 'vehicle_tram', 'line_bit_1',
       'line_bit_2', 'line_bit_3', 'line_bit_4', 'line_bit_5', 'line_bit_6',
       'line_bit_7', 'extra_headway_duration']]
    
df.head(3)

Unnamed: 0,epoch_time_norm,stop_seq_norm,direction_0,direction_1,vehicle_bus,vehicle_metro,vehicle_tram,line_bit_1,line_bit_2,line_bit_3,line_bit_4,line_bit_5,line_bit_6,line_bit_7,extra_headway_duration
0,0.657986,0.0,1,0,0,1,0,0,0,0,0,0,0,1,2.37
1,0.661701,0.0,1,0,0,1,0,0,0,0,0,0,0,1,0.19
2,0.66544,0.0,1,0,0,1,0,0,0,0,0,0,0,1,0.21


## 2. Data Training

In [10]:
from sklearn.model_selection import train_test_split

X =  df[['epoch_time_norm', 'stop_seq_norm', 'direction_0', 'direction_1',
       'vehicle_bus', 'vehicle_metro', 'vehicle_tram', 'line_bit_1',
       'line_bit_2', 'line_bit_3', 'line_bit_4', 'line_bit_5', 'line_bit_6',
       'line_bit_7']].values

y = df[['extra_headway_duration']].values

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = rand_seed, shuffle = True)

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, r2_score


def find_optimal_params(model, param_grid):
    rmse = make_scorer(mean_squared_error, squared = False)
    gs_cv = GridSearchCV(model, param_grid, scoring = rmse, n_jobs= -1, verbose = 2, cv = 2)
    gs_cv.fit(x_train, y_train.ravel())
    optimal_param = gs_cv.best_params_
    best_model = gs_cv.best_estimator_
    return best_model, optimal_param

In [12]:
from sklearn.ensemble import RandomForestRegressor

rfc = RandomForestRegressor(random_state = rand_seed, n_jobs = -1)

rfc_param_grid = {
    'n_estimators': np.arange(50, 120, 5).astype(int),
    'criterion': ['squared_error', 'friedman_mse'],
    'max_depth': [None] + list(np.arange(18, 30, 2).astype(int)),
    # 'max_leaf_nodes': [None],
    'min_samples_split': np.arange(2, 8).astype(int)
}

rfc_model, rfc_best_params = find_optimal_params(rfc, rfc_param_grid)

print('\nRandom Forest - Optimal hyperparameters:')
rfc_best_params

Fitting 2 folds for each of 1176 candidates, totalling 2352 fits

Random Forest - Optimal hyperparameters:


{'criterion': 'squared_error',
 'max_depth': None,
 'min_samples_split': 2,
 'n_estimators': 50}

In [24]:
y_predictions = rfc_model.predict(x_test)

In [25]:
mean_squared_error(y_test, y_predictions, squared = False)

7.04300510676131

In [27]:
rfc = RandomForestRegressor(random_state = rand_seed, n_jobs = -1, criterion = 'squared_error', max_depth = None, min_samples_split = 2)

rfc_param_grid = {
    'n_estimators': np.arange(5, 50, 5).astype(int),
}

rfc_model2, rfc_best_params2 = find_optimal_params(rfc, rfc_param_grid)

print('\nRandom Forest - Optimal hyperparameters:')
rfc_best_params2

Fitting 2 folds for each of 9 candidates, totalling 18 fits

Random Forest - Optimal hyperparameters:


{'n_estimators': 5}

In [28]:
y_predictions2 = rfc_model.predict(x_test)

In [29]:
mean_squared_error(y_test, y_predictions2, squared = False)

7.043005106761309

In [30]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

poly = PolynomialFeatures(degree = 4, include_bias=False)

poly_features = poly.fit_transform(x_train)

poly_reg_model = LinearRegression()
poly_reg_model.fit(poly_features, y_train)

LinearRegression()

In [33]:
test_features = poly.fit_transform(x_test)
y_predicted = poly_reg_model.predict(test_features)
mean_squared_error(y_test, y_predicted, squared = False)

6.670957305242388