In [64]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error,  r2_score, mean_absolute_error

In [65]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/parameter warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')


In [66]:
# Read the CSV and keep only the rows whose week_no is below 25.
path = './data/bus_running_times_feature_added_all.csv'
df = pd.read_csv(path).loc[lambda frame: frame['week_no'] < 25]


In [67]:
# Columns retained for the rest of the notebook; the last one,
# run_time_in_seconds, is used as the training label further down.
selected_columns = [
    'deviceid', 'week_no', 'segment', 'length', 'direction',
    'month', 'day', 'day_of_week',
    'time_of_day',
    'dt(n-1)', 'rt(w-1)', 'rt(w-2)', 'rt(w-3)',
    'rt(t-1)', 'rt(t-2)', 'rt(n-1)', 'rt(n-2)', 'rt(n-3)',
    'precip', 'windspeed', 'temp', 'run_time_in_seconds',
]
df = df[selected_columns]
df

Unnamed: 0,deviceid,week_no,segment,length,direction,month,day,day_of_week,time_of_day,dt(n-1),...,rt(w-3),rt(t-1),rt(t-2),rt(n-1),rt(n-2),rt(n-3),precip,windspeed,temp,run_time_in_seconds
0,262.0,1.0,1.0,0.6261,1.0,10.0,1.0,4.0,6.50,0.0,...,96.0,96.0,96.0,96.0,96.0,96.0,0.0,6.1,20.0,69.0
1,262.0,1.0,2.0,1.2808,1.0,10.0,1.0,4.0,6.50,74.0,...,247.0,247.0,247.0,69.0,247.0,247.0,0.0,6.1,20.0,210.0
2,262.0,1.0,3.0,2.1125,1.0,10.0,1.0,4.0,6.75,0.0,...,506.0,506.0,506.0,210.0,69.0,506.0,0.0,6.1,20.0,496.0
3,262.0,1.0,4.0,1.5513,1.0,10.0,1.0,4.0,6.75,6.0,...,192.0,192.0,192.0,496.0,210.0,69.0,0.0,6.1,20.0,195.0
4,262.0,1.0,5.0,0.8450,1.0,10.0,1.0,4.0,6.75,0.0,...,114.0,114.0,114.0,195.0,496.0,210.0,0.0,6.1,20.0,97.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173930,121.0,24.0,30.0,2.5600,2.0,7.0,3.0,6.0,13.00,15.0,...,439.0,439.0,439.0,307.0,210.0,95.0,0.0,9.4,22.1,501.0
173931,121.0,24.0,31.0,0.4200,2.0,7.0,3.0,6.0,13.25,15.0,...,65.0,65.0,65.0,501.0,307.0,210.0,0.0,9.4,22.1,79.0
173932,121.0,24.0,32.0,1.3000,2.0,7.0,3.0,6.0,13.25,15.0,...,262.0,262.0,262.0,79.0,501.0,307.0,0.0,9.4,22.1,217.0
173933,121.0,24.0,33.0,1.2200,2.0,7.0,3.0,6.0,13.25,15.0,...,200.0,200.0,200.0,217.0,79.0,501.0,0.0,9.4,22.1,172.0


In [68]:
# Keep only the rows that have a run_time_in_seconds value, then show the
# remaining null count for every column.
has_target = df['run_time_in_seconds'].notna()
df = df[has_target]
df.isna().sum()

deviceid               0
week_no                0
segment                0
length                 0
direction              0
month                  0
day                    0
day_of_week            0
time_of_day            0
dt(n-1)                0
rt(w-1)                0
rt(w-2)                0
rt(w-3)                0
rt(t-1)                0
rt(t-2)                0
rt(n-1)                0
rt(n-2)                0
rt(n-3)                0
precip                 0
windspeed              0
temp                   0
run_time_in_seconds    0
dtype: int64

In [69]:
def predict(model, dt, t):
    """Score ``model`` on one batch, print the mean absolute error and return it.

    Parameters
    ----------
    model
        Fitted model exposing ``predict``; it is called as ``model.predict(dt)``.
    dt
        Input handed to ``model.predict`` (the notebook passes an ``xgb.DMatrix``).
    t
        Table holding the true values under the ``'run_time_in_seconds'`` key.

    Returns
    -------
    The mean absolute error between ``t['run_time_in_seconds']`` and the
    predictions, so callers can collect scores instead of parsing printed text.
    """
    preds = model.predict(dt)
    # The RMSE that used to be computed here was never printed or returned,
    # so the dead computation has been dropped.
    mae = mean_absolute_error(t['run_time_in_seconds'], preds)
    print("MAE (1): %f" % (mae))
    return mae
    

<hr>

In [70]:
# First batch: every row with week_no <= 19. Each of weeks 20, 21, 22 and 23
# then becomes a batch of its own.
# NOTE(review): rows with week_no == 24 pass the `week_no < 25` filter applied
# at load time but are not placed in any batch here — confirm this is intended.
batch = [df[df['week_no'] <= 19]]
batch.extend(df[df['week_no'] == week] for week in range(20, 24))

# One DMatrix per batch: week_no and the label column are removed from the
# features, and run_time_in_seconds is supplied as the label.
dbatch = [
    xgb.DMatrix(
        frame.drop(columns=['week_no', 'run_time_in_seconds']),
        label=frame['run_time_in_seconds'],
    )
    for frame in batch
]

In [71]:
def inc_train(is_initial, params, data_batch, prev_model,
              initial_rounds=10, update_rounds=5):
    """Train a new booster, or continue training an existing one, on one batch.

    Parameters
    ----------
    is_initial : bool
        True to train from scratch; False to continue from ``prev_model``.
    params : dict
        Parameters forwarded unchanged to ``xgb.train``.
    data_batch
        Training data for this step, passed to ``xgb.train`` as the training set.
    prev_model
        Passed to ``xgb.train`` as ``xgb_model`` when ``is_initial`` is false;
        unused otherwise.
    initial_rounds : int, default 10
        ``num_boost_round`` used for the from-scratch fit.
    update_rounds : int, default 5
        ``num_boost_round`` used for each continuation step.

    Returns
    -------
    The model returned by ``xgb.train``.
    """
    if is_initial:
        return xgb.train(params, data_batch, num_boost_round=initial_rounds)
    # Train on the batch that was passed in. The earlier version read the
    # module-level ``dbatch[i]`` here, which ignored ``data_batch`` and only
    # worked while a matching global ``i`` happened to exist.
    return xgb.train(
        params,
        data_batch,
        num_boost_round=update_rounds,
        xgb_model=prev_model,
    )

In [72]:
# Settings handed to xgb.train for every training step.
# NOTE(review): 'n_estimators' and 'verbose' look like options of the
# scikit-learn wrapper; xgb.train takes its round count from num_boost_round.
# Confirm whether these two keys have any effect before relying on them.
params = dict(
    objective='reg:squarederror',
    verbose=False,
    colsample_bytree=0.7,
    learning_rate=0.1,
    max_depth=6,
    alpha=10,
    subsample=0.7,
    n_estimators=100,
)

In [73]:
# Display the first batch (the rows with week_no <= 19).
batch[0]

Unnamed: 0,deviceid,week_no,segment,length,direction,month,day,day_of_week,time_of_day,dt(n-1),...,rt(w-3),rt(t-1),rt(t-2),rt(n-1),rt(n-2),rt(n-3),precip,windspeed,temp,run_time_in_seconds
0,262.0,1.0,1.0,0.6261,1.0,10.0,1.0,4.0,6.50,0.0,...,96.0,96.0,96.0,96.0,96.0,96.0,0.0,6.1,20.0,69.0
1,262.0,1.0,2.0,1.2808,1.0,10.0,1.0,4.0,6.50,74.0,...,247.0,247.0,247.0,69.0,247.0,247.0,0.0,6.1,20.0,210.0
2,262.0,1.0,3.0,2.1125,1.0,10.0,1.0,4.0,6.75,0.0,...,506.0,506.0,506.0,210.0,69.0,506.0,0.0,6.1,20.0,496.0
3,262.0,1.0,4.0,1.5513,1.0,10.0,1.0,4.0,6.75,6.0,...,192.0,192.0,192.0,496.0,210.0,69.0,0.0,6.1,20.0,195.0
4,262.0,1.0,5.0,0.8450,1.0,10.0,1.0,4.0,6.75,0.0,...,114.0,114.0,114.0,195.0,496.0,210.0,0.0,6.1,20.0,97.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130455,1358.0,19.0,30.0,2.5612,2.0,2.0,6.0,6.0,17.25,143.0,...,414.0,436.0,436.0,233.0,103.0,45.0,0.0,10.4,25.7,473.0
130456,1358.0,19.0,31.0,0.4169,2.0,2.0,6.0,6.0,17.50,0.0,...,84.0,59.0,60.0,473.0,233.0,103.0,0.0,10.4,25.7,60.0
130457,1358.0,19.0,32.0,1.2960,2.0,2.0,6.0,6.0,17.50,0.0,...,266.0,223.0,245.0,60.0,473.0,233.0,0.0,10.4,25.7,281.0
130458,1358.0,19.0,33.0,1.2245,2.0,2.0,6.0,6.0,17.50,59.0,...,180.0,198.0,150.0,281.0,60.0,473.0,0.0,10.4,25.7,135.0


In [74]:
# Train one model per batch: the first from scratch, every later one as a
# continuation of the model before it. After each step, score the new model
# on every batch and print the result.
models = []
for i, train_matrix in enumerate(dbatch):
    is_first = (i == 0)
    previous = None if is_first else models[i - 1]
    curr_model = inc_train(is_first, params, train_matrix, previous)
    models.append(curr_model)
    for j, (eval_matrix, eval_frame) in enumerate(zip(dbatch, batch)):
        print(f"Results of data batch {j+1}  with model {i+1}")
        predict(curr_model, eval_matrix, eval_frame)

Results of data batch 1  with model 1
MAE (1): 45.564690
Results of data batch 2  with model 1
MAE (1): 44.561479
Results of data batch 3  with model 1
MAE (1): 50.460889
Results of data batch 4  with model 1
MAE (1): 46.569083
Results of data batch 5  with model 1
MAE (1): 51.609299
Results of data batch 1  with model 2
MAE (1): 38.412658
Results of data batch 2  with model 2
MAE (1): 36.470178
Results of data batch 3  with model 2
MAE (1): 42.705692
Results of data batch 4  with model 2
MAE (1): 38.776352
Results of data batch 5  with model 2
MAE (1): 44.543309
Results of data batch 1  with model 3
MAE (1): 35.601157
Results of data batch 2  with model 3
MAE (1): 33.303125
Results of data batch 3  with model 3
MAE (1): 37.083068
Results of data batch 4  with model 3
MAE (1): 34.765023
Results of data batch 5  with model 3
MAE (1): 40.623367
Results of data batch 1  with model 4
MAE (1): 34.587223
Results of data batch 2  with model 4
MAE (1): 32.008578
Results of data batch 3  with m