## Forecasting bus demand in Banana Republic municipalities

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_percentage_error as mape

In [2]:
# load the raw utilization readings, ordered by time and then by municipality
df = pd.read_csv('municipality_bus_utilization.csv').sort_values(
    by=['timestamp', 'municipality_id']
)
df

Unnamed: 0,timestamp,municipality_id,usage,total_capacity
3,2017-06-04 07:59:42,0,204,2813
7,2017-06-04 07:59:42,1,129,397
6,2017-06-04 07:59:42,2,273,697
9,2017-06-04 07:59:42,3,623,1930
2,2017-06-04 07:59:42,4,1090,3893
...,...,...,...,...
13064,2017-08-19 16:30:35,5,202,587
13068,2017-08-19 16:30:35,6,1680,3113
13067,2017-08-19 16:30:35,7,1354,2019
13066,2017-08-19 16:30:35,8,1193,2947


In [3]:
# keep only the leading 'YYYY-MM-DD HH' characters so each reading maps to its hour
df['timestamp'] = df['timestamp'].str[:13]
# one row per (hour, municipality): the maximum usage and the maximum reported capacity
data = df.groupby(['timestamp', 'municipality_id'], as_index=False).agg(
    {'usage': 'max', 'total_capacity': 'max'}
)
# parse the truncated hour strings into real datetimes
data['timestamp'] = pd.to_datetime(data['timestamp'])
data

Unnamed: 0,timestamp,municipality_id,usage,total_capacity
0,2017-06-04 07:00:00,0,204,2813
1,2017-06-04 07:00:00,1,129,397
2,2017-06-04 07:00:00,2,273,697
3,2017-06-04 07:00:00,3,623,1930
4,2017-06-04 07:00:00,4,1090,3893
...,...,...,...,...
6735,2017-08-19 16:00:00,5,239,587
6736,2017-08-19 16:00:00,6,1727,3113
6737,2017-08-19 16:00:00,7,1450,2019
6738,2017-08-19 16:00:00,8,1253,2947


In [4]:
# one capacity value per municipality_id; `.item()` raises if a municipality
# has more than one distinct total_capacity, so this also acts as a sanity check
total_capacities = (
    data.groupby('municipality_id')['total_capacity']
    .apply(lambda capacities: capacities.unique().item())
    .reset_index()
)
total_capacities

Unnamed: 0,municipality_id,total_capacity
0,0,2813
1,1,397
2,2,697
3,3,1930
4,4,3893
5,5,587
6,6,3113
7,7,2019
8,8,2947
9,9,1332


In [5]:
# Append seven days of placeholder rows (usage = NaN) that the model will predict.
# All future rows are built in one go and concatenated once, instead of growing
# `data` with one pd.concat call per row.
# NOTE: this cell extends `data` from its current last timestamp, so running it
# twice without re-running the cells above appends a second week.
test_days = 7*24
last_date = data.timestamp.max() # last observed interval
# hours of the day that appear in the observed data, i.e. when buses operate
service_hours = data.timestamp.dt.hour.unique()

# every hour of the horizon, then only the ones in which buses work
future_hours = last_date + pd.to_timedelta(np.arange(1, test_days + 1), unit='h')
future_hours = future_hours[future_hours.hour.isin(service_hours)]

# municipality ids in order of first appearance, with their capacities aligned
municipality_ids = data.municipality_id.unique()
capacity_by_id = total_capacities.set_index('municipality_id')['total_capacity']
capacities = capacity_by_id.loc[municipality_ids].to_numpy()

# one row per (future hour, municipality): for each hour, all municipalities in turn
future_rows = pd.DataFrame({
    'timestamp': np.repeat(future_hours.values, len(municipality_ids)),
    'municipality_id': np.tile(municipality_ids, len(future_hours)),
    'usage': np.nan,
    'total_capacity': np.tile(capacities, len(future_hours)),
})
data = pd.concat([data, future_rows], axis=0, ignore_index=True)

In [6]:
# derive calendar features from the timestamp column on a freshly numbered index
data = data.reset_index(drop=True)
data = data.assign(
    month=data.timestamp.dt.month,
    day=data.timestamp.dt.day,
    hour=data.timestamp.dt.hour,
    dayofweek=data.timestamp.dt.dayofweek,
)
data

Unnamed: 0,timestamp,municipality_id,usage,total_capacity,month,day,hour,dayofweek
0,2017-06-04 07:00:00,0,204.0,2813,6,4,7,6
1,2017-06-04 07:00:00,1,129.0,397,6,4,7,6
2,2017-06-04 07:00:00,2,273.0,697,6,4,7,6
3,2017-06-04 07:00:00,3,623.0,1930,6,4,7,6
4,2017-06-04 07:00:00,4,1090.0,3893,6,4,7,6
...,...,...,...,...,...,...,...,...
7435,2017-08-26 16:00:00,5,,587,8,26,16,5
7436,2017-08-26 16:00:00,6,,3113,8,26,16,5
7437,2017-08-26 16:00:00,7,,2019,8,26,16,5
7438,2017-08-26 16:00:00,8,,2947,8,26,16,5


In [7]:
# split the frame into a training period and an evaluation window by timestamp
def TrainTestSplit(data, features, target, startdate, enddate):
    """Split `data` by its `timestamp` column.

    Rows with timestamp < startdate form the training set; rows with
    startdate <= timestamp < enddate form the test set.

    Returns:
        (X_train, y_train, X_test, y_test), where the X parts hold the
        `features` columns and the y parts hold the `target` column.
    """
    before_start = data.timestamp < startdate
    in_window = (data.timestamp >= startdate) & (data.timestamp < enddate)
    train_rows = data[before_start]
    test_rows = data[in_window]
    return train_rows[features], train_rows[target], test_rows[features], test_rows[target]

In [8]:
# train a fresh regressor and score the held-out rows
def FitPredict(X_train, y_train, X_test):
    """Fit a new LGBMRegressor on the training data and return its predictions for `X_test`.

    The regressor is created with `verbose=-1` and otherwise default settings.
    """
    return LGBMRegressor(verbose=-1).fit(X_train, y_train).predict(X_test)

In [9]:
# score predictions as one minus the mean absolute percentage error
def Accuracy(actual, prediction):
    """Return 1 - MAPE between `actual` and `prediction`."""
    relative_error = mape(actual, prediction)
    return 1 - relative_error

In [10]:
# model inputs and the column being forecast
features = ['municipality_id','total_capacity','month','day','hour','dayofweek']
target = 'usage'
# back-test windows as (start, end) pairs; each model is trained on all rows before `start`
evaluation_windows = [('2017-08-05', '2017-08-12'), ('2017-08-12', '2017-08-20')]
accuracies = []
for startdate, enddate in evaluation_windows:
    X_train, y_train, X_test, y_test = TrainTestSplit(data, features, target, startdate, enddate)
    y_hat = FitPredict(X_train, y_train, X_test)
    accuracies.append(Accuracy(y_test, y_hat))

In [11]:
# average the per-window scores into a single figure
mean_accuracy = np.mean(accuracies)
print('Accuracy of the model for in prediction of last two weeks: %.3f' % mean_accuracy)

Accuracy of the model for in prediction of last two weeks: 0.819


In [12]:
# forecast the appended week: train on every row before the window, predict inside it
forecast_start, forecast_end = '2017-08-20', '2017-08-27'
X_train, y_train, X_test, y_test = TrainTestSplit(data, features, target, forecast_start, forecast_end)
in_forecast_window = (data.timestamp >= forecast_start) & (data.timestamp < forecast_end)
# predictions are rounded up to whole numbers
predicted_usage = np.ceil(FitPredict(X_train, y_train, X_test)).astype(int)
data_pred = (
    data.loc[in_forecast_window, ['timestamp', 'municipality_id']]
    .assign(usage=predicted_usage)
    .reset_index(drop=True)
)

In [13]:
# write the forecast to disk without the row index, then display it
output_path = 'predictions.csv'
data_pred.to_csv(output_path, index=False)
data_pred

Unnamed: 0,timestamp,municipality_id,usage
0,2017-08-20 07:00:00,0,633
1,2017-08-20 07:00:00,1,186
2,2017-08-20 07:00:00,2,351
3,2017-08-20 07:00:00,3,660
4,2017-08-20 07:00:00,4,1317
...,...,...,...
695,2017-08-26 16:00:00,5,241
696,2017-08-26 16:00:00,6,1488
697,2017-08-26 16:00:00,7,1354
698,2017-08-26 16:00:00,8,1204
