## Analysis Of Daily Public Transport Passenger Boardings By Ticket Type


In [127]:
## imports
import pandas as pd
import numpy as np


In [128]:
df=pd.read_csv("Daily_Public_Transport_Passenger_Boardings_By_Ticket_Type_20240513.csv")
df.head()

Unnamed: 0,Date,MyWay,Paper Ticket
0,01/07/2019,66215,4325
1,15/09/2023,63800,7349
2,28/12/2021,9994,1882
3,11/01/2023,43769,3991
4,11/09/2021,3810,685


In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1778 entries, 0 to 1777
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Date          1778 non-null   object
 1   MyWay         1778 non-null   int64 
 2   Paper Ticket  1778 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 41.8+ KB


In [130]:
df.isnull().sum()

Date            0
MyWay           0
Paper Ticket    0
dtype: int64

In [131]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MyWay,1778.0,40985.889201,23275.162569,0.0,17207.75,45079.5,60946.5,88313.0
Paper Ticket,1778.0,3744.153543,2153.56276,13.0,1991.5,3374.5,5383.0,10310.0


In [132]:
df.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Date,1778.0,1778.0,01/07/2019,1.0,,,,,,,
MyWay,1778.0,,,,40985.889201,23275.162569,0.0,17207.75,45079.5,60946.5,88313.0
Paper Ticket,1778.0,,,,3744.153543,2153.56276,13.0,1991.5,3374.5,5383.0,10310.0


In [133]:
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df

Unnamed: 0,Date,MyWay,Paper Ticket
0,2019-07-01,66215,4325
1,2023-09-15,63800,7349
2,2021-12-28,9994,1882
3,2023-01-11,43769,3991
4,2021-09-11,3810,685
...,...,...,...
1773,2023-01-19,46083,4622
1774,2023-08-16,70115,6955
1775,2022-01-10,21932,2315
1776,2020-05-04,14842,854


In [134]:
df.sort_values(by='Date', inplace=True)
df.head()

Unnamed: 0,Date,MyWay,Paper Ticket
0,2019-07-01,66215,4325
1037,2019-07-02,69181,4764
1344,2019-07-03,68410,5086
1087,2019-07-04,68258,4750
1411,2019-07-05,64088,4919


## TASK 2

In [79]:
pip install statsmodels 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [102]:
## creating a time series prediction models
##importing time series prediction model
from statsmodels.tsa.arima.model import ARIMA

In [135]:
# Load data and set frequency to daily
data = pd.read_csv("Daily_Public_Transport_Passenger_Boardings_By_Ticket_Type_20240513.csv", parse_dates=['Date'], index_col='Date')

data=data.sort_index()
data=df
# data.set_index('Date', inplace=True)
data

Unnamed: 0,Date,MyWay,Paper Ticket
0,2019-07-01,66215,4325
1037,2019-07-02,69181,4764
1344,2019-07-03,68410,5086
1087,2019-07-04,68258,4750
1411,2019-07-05,64088,4919
...,...,...,...
1380,2024-05-08,72435,9850
1447,2024-05-09,71487,9768
43,2024-05-10,67172,10310
1227,2024-05-11,54,2766


In [136]:
data

Unnamed: 0,Date,MyWay,Paper Ticket
0,2019-07-01,66215,4325
1037,2019-07-02,69181,4764
1344,2019-07-03,68410,5086
1087,2019-07-04,68258,4750
1411,2019-07-05,64088,4919
...,...,...,...
1380,2024-05-08,72435,9850
1447,2024-05-09,71487,9768
43,2024-05-10,67172,10310
1227,2024-05-11,54,2766


In [137]:
data = data.sort_index()

data.index.freq = 'D'


In [142]:
## excluding the paper ticket column
myway = data['MyWay']
myway.tail()


1773    46083
1774    70115
1775    21932
1776    14842
1777    69067
Name: MyWay, dtype: int64

In [143]:
from sklearn.model_selection import GridSearchCV
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from math import sqrt

# Define the range of ARIMA parameters
p_values = range(3)
d_values = range(3)
q_values = range(3)

# Initialize variables to store best model and its performance
best_model = None
best_rmse = float('inf')

# Define the date range for forecasting (1st May to 7th May)
start_date = '2024-05-01'
end_date = '2024-05-07'

# Extract the relevant portion of the data for training
myway_data_train = myway_data.loc[myway_data.index < start_date]

# Perform grid search
for p in p_values:
    for d in d_values:
        for q in q_values:
            order = (p, d, q)
            try:
                # Fit ARIMA model
                model = ARIMA(myway_data_train, order=order)
                model_fit = model.fit()

                # Forecast for the next 7 days
                forecast = model_fit.forecast(steps=7)

                # Calculate RMSE
                rmse = sqrt(mean_squared_error(myway_data[start_date:end_date], forecast))

                # Update best model if RMSE improves
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_model = model_fit

                print(f"ARIMA{order} - RMSE: {rmse}")
            except:
                continue

# Check if a model was successfully fitted
if best_model is not None:
    # Forecast for the next 7 days using the best model
    forecast = best_model.forecast(steps=7)

    print("Forecasted patronage for 1st May to 7th May:")
    print(forecast)
else:
    print("No valid model found during hyperparameter tuning.")


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


ARIMA(0, 0, 0) - RMSE: 28611.65013543891
ARIMA(0, 0, 1) - RMSE: 26342.45043971131
ARIMA(0, 0, 2) - RMSE: 25944.30644444939
ARIMA(0, 1, 0) - RMSE: 28918.03293053365
ARIMA(0, 1, 1) - RMSE: 29874.770831965634


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'


ARIMA(0, 1, 2) - RMSE: 23722.82098133791
ARIMA(0, 2, 0) - RMSE: 96556.56147416547


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


ARIMA(0, 2, 1) - RMSE: 29030.444991777993


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'


ARIMA(0, 2, 2) - RMSE: 30031.43166411425
ARIMA(1, 0, 0) - RMSE: 23735.577739923287


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


ARIMA(1, 0, 1) - RMSE: 25132.563483159985


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


ARIMA(1, 0, 2) - RMSE: 26278.635042438957
ARIMA(1, 1, 0) - RMSE: 29247.226797934236


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


ARIMA(1, 1, 1) - RMSE: 22680.365969045008


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'


ARIMA(1, 1, 2) - RMSE: 23557.008415199467
ARIMA(1, 2, 0) - RMSE: 117857.01417952735


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


ARIMA(1, 2, 1) - RMSE: 29390.51924067475


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


ARIMA(1, 2, 2) - RMSE: 30772.00604137952
ARIMA(2, 0, 0) - RMSE: 24839.321748184768


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


ARIMA(2, 0, 1) - RMSE: 25308.263958463038


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


ARIMA(2, 0, 2) - RMSE: 28575.44803018749
ARIMA(2, 1, 0) - RMSE: 24867.881335410577


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


ARIMA(2, 1, 1) - RMSE: 27212.511340613066


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


ARIMA(2, 1, 2) - RMSE: 30191.460785080133
ARIMA(2, 2, 0) - RMSE: 85549.98611924161


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


ARIMA(2, 2, 1) - RMSE: 24888.115458068773


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


ARIMA(2, 2, 2) - RMSE: 27254.438454576015
Forecasted patronage for 1st May to 7th May:
2024-05-01    60825.053664
2024-05-02    54713.427996
2024-05-03    51903.730458
2024-05-04    50612.028268
2024-05-05    50018.194048
2024-05-06    49745.190655
2024-05-07    49619.682810
Freq: D, Name: predicted_mean, dtype: float64


In [138]:
from sklearn.model_selection import GridSearchCV
p_values = range(3)
d_values = range(3)
q_values = range(3)

# Initialize variables to store best model and its performance
best_model = None
best_rmse = float('inf')

# Perform grid search
for p in p_values:
    for d in d_values:
        for q in q_values:
            order = (p, d, q)
            try:
                # Fit ARIMA model
                model = ARIMA(myway, order=order)
                model_fit = model.fit()

                # Forecast next 7 days
                forecast = model_fit.forecast(steps=7)

                # Calculate RMSE
                rmse = sqrt(mean_squared_error(myway_data[-7:], forecast))

                # Update best model if RMSE improves
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_model = model_fit

                print(f"ARIMA{order} - RMSE: {rmse}")
            except:
                continue

# Check if a model was successfully fitted
if best_model is not None:
    # Forecast next 7 days using the best model
    forecast = best_model.forecast(steps=7)

    print("Forecasted patronage for next 7 days:")
    print(forecast)
else:
    print("No valid model found during hyperparameter tuning.")

ARIMA(0, 0, 0) - RMSE: 33249.42047092916
ARIMA(0, 0, 1) - RMSE: 33535.59047717581
ARIMA(0, 0, 2) - RMSE: 34853.814717438836
ARIMA(0, 1, 0) - RMSE: 59624.09416981887
ARIMA(0, 1, 1) - RMSE: 58620.81507971069


  warn('Non-invertible starting MA parameters found.'


ARIMA(0, 1, 2) - RMSE: 32819.74094263825
ARIMA(0, 2, 0) - RMSE: 59760.74702511677
ARIMA(0, 2, 1) - RMSE: 59624.29149217823


  warn('Non-invertible starting MA parameters found.'


ARIMA(0, 2, 2) - RMSE: 58475.413844696865
ARIMA(1, 0, 0) - RMSE: 40910.56273000746
ARIMA(1, 0, 1) - RMSE: 35382.653025756066
ARIMA(1, 0, 2) - RMSE: 34826.99102147069
ARIMA(1, 1, 0) - RMSE: 59625.59421567687
ARIMA(1, 1, 1) - RMSE: 37037.67293249353


  warn('Non-invertible starting MA parameters found.'


ARIMA(1, 1, 2) - RMSE: 33060.46679735529
ARIMA(1, 2, 0) - RMSE: 113615.19382362958
ARIMA(1, 2, 1) - RMSE: 59625.9261585025
ARIMA(1, 2, 2) - RMSE: 53271.786192593034
ARIMA(2, 0, 0) - RMSE: 35656.02269236512
ARIMA(2, 0, 1) - RMSE: 35138.119358478514


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


ARIMA(2, 0, 2) - RMSE: 33882.92063475942
ARIMA(2, 1, 0) - RMSE: 42725.47288609704
ARIMA(2, 1, 1) - RMSE: 30380.58203583444


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


ARIMA(2, 1, 2) - RMSE: 23870.474222567696
ARIMA(2, 2, 0) - RMSE: 108477.15731625212
ARIMA(2, 2, 1) - RMSE: 42688.66913046751
ARIMA(2, 2, 2) - RMSE: 30381.569144091274
Forecasted patronage for next 7 days:
2024-05-13    35839.924150
2024-05-14    65952.858982
2024-05-15    70036.986196
2024-05-16    49986.839033
2024-05-17    24464.149649
2024-05-18    12576.790857
2024-05-19    20245.443038
Freq: D, Name: predicted_mean, dtype: float64


## RANDOM FOREST

In [116]:
data.head()

Unnamed: 0_level_0,MyWay,Paper Ticket
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-07-01,66215,4325
2019-07-02,69181,4764
2019-07-03,68410,5086
2019-07-04,68258,4750
2019-07-05,64088,4919


In [117]:
myway.head()

Date
2019-07-01    66215
2019-07-02    69181
2019-07-03    68410
2019-07-04    68258
2019-07-05    64088
Freq: D, Name: MyWay, dtype: int64

In [113]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

In [124]:
my=data['MyWay']

In [None]:
myway_data_excluded = my.loc[~((my.index >= '2024-05-01') & (my.index <= '2024-05-07'))]

In [114]:
X_train, X_test, y_train, y_test = train_test_split(myway.dropna(), myway.dropna(), test_size=0.2, shuffle=False)


NameError: name 'X' is not defined

In [None]:
#RandomForest

from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

predicted_patronage_rf = rf_model.predict(X_test)

print("Predicted Patronage for the next 7 days using Random Forest:")
for date, patronage in zip(predicted_dates, predicted_patronage_rf):
    print(f"{date.strftime('%d/%m/%Y')}: {patronage}")



In [126]:
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta
data=df
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
data_range = data[(data['Date'] >= '2023-04-01') & (data['Date'] <= '2024-04-30')]
data_range['MyWay'] = data_range['MyWay'].str.replace(',', '')
data_range['MyWay'] = data_range['MyWay'].astype(float)


X = data_range[['MyWay']]
y = data_range['MyWay']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

KeyError: 'Date'

## Evaluation Metrics

In [144]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [145]:
mae = mean_absolute_error(test, forecast)
mae

26258.777004337604

In [146]:
mse = mean_squared_error(test, forecast)
mse

924268696.8350552

In [147]:
rmse = np.sqrt(mse)
rmse

30401.787724327252

In [148]:
mape = np.mean(np.abs((test - forecast) / test)) * 100
mape

29.709982643407283