## Using LGBM
To Do's
- add visualizations
- add feature importance  [done] 
    - for 4 weeks of data [done] 
    - for 8 weeks of data [done] 
- test 8 weeks of data [done] 

### 4 weeks of data

In [1]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

# Load the pre-built 4-week tabular dataset from the transformed-data directory.
# As the displayed frame shows, it holds hourly lag columns
# (rides_previous_672_hour ... rides_previous_1_hour) plus pickup_hour,
# pickup_location_id and the target_rides_next_hour column.
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,1,0.0
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89300,3.0,0.0,2.0,3.0,2.0,3.0,13.0,8.0,9.0,9.0,...,6.0,5.0,3.0,1.0,6.0,1.0,3.0,2022-12-27,265,3.0
89301,6.0,4.0,0.0,0.0,2.0,0.0,14.0,7.0,8.0,4.0,...,4.0,2.0,1.0,2.0,2.0,2.0,8.0,2022-12-28,265,1.0
89302,7.0,2.0,3.0,4.0,7.0,4.0,10.0,9.0,7.0,11.0,...,2.0,3.0,5.0,1.0,1.0,0.0,8.0,2022-12-29,265,3.0
89303,6.0,5.0,4.0,3.0,0.0,3.0,11.0,12.0,9.0,10.0,...,3.0,3.0,1.0,2.0,0.0,1.0,2.0,2022-12-30,265,7.0


In [2]:
from datetime import datetime
from src.data_split import train_test_split

# Split on a fixed cutoff date and separate the target column from the features.
# NOTE(review): the exact split rule lives in src.data_split -- presumably rows
# before the cutoff form the training set; confirm there.
X_train, y_train, X_test, y_test = train_test_split(
    df,
    target_column_name='target_rides_next_hour',
    cutoff_date=datetime(2022, 6, 1),
)

# One print call, one shape per line (same stdout as four separate prints).
print(
    f'{X_train.shape=}',
    f'{y_train.shape=}',
    f'{X_test.shape=}',
    f'{y_test.shape=}',
    sep='\n',
)

X_train.shape=(32595, 674)
y_train.shape=(32595,)
X_test.shape=(56710, 674)
y_test.shape=(56710,)


In [3]:
import lightgbm as lgb

In [4]:
# Model inputs: only the historical ride-count columns (names starting with
# 'rides_'); every other column of X_train is left out of the feature matrix.
past_rides_columns = [name for name in X_train.columns if name.startswith('rides_')]
X_train_only_numeric = X_train.loc[:, past_rides_columns]

In [5]:
# Train a LightGBM regressor with library-default hyperparameters on the
# lag-only feature matrix. The fit call is the cell's last expression, so the
# notebook displays whatever it returns.
model = lgb.LGBMRegressor()
model.fit(X_train_only_numeric, y_train)

In [6]:
# Score the test set using exactly the columns the model was trained on.
X_test_only_numeric = X_test.loc[:, past_rides_columns]
predictions = model.predict(X_test_only_numeric)
predictions

array([0.10154606, 0.10154606, 0.13547459, ..., 5.78329344, 7.54141115,
       4.14833214])

In [7]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the full lag-feature model on the test split.
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'{test_mae=}')

test_mae=2.577599250450685


### Feature Importance for 4 weeks of data

In [17]:
# feature importance
import numpy as np

# One row per input feature, ranked from most to least important.
# Built as a single chained expression so no frame is mutated in place.
feature_importance = (
    pd.DataFrame(
        {'importance': model.feature_importances_},
        index=X_train_only_numeric.columns,
    )
    .sort_values('importance', ascending=False)
)
feature_importance.head(20)

Unnamed: 0,importance
rides_previous_1_hour,224
rides_previous_335_hour,58
rides_previous_504_hour,42
rides_previous_168_hour,41
rides_previous_167_hour,35
rides_previous_672_hour,34
rides_previous_503_hour,33
rides_previous_24_hour,32
rides_previous_336_hour,30
rides_previous_169_hour,25


Top 3 features by importance:
- rides_previous_1_hour = 224
- rides_previous_335_hour = 58
- rides_previous_504_hour = 42

In [12]:
# Keep only the features whose importance is strictly above a fixed cut-off,
# then retrain a fresh default LightGBM regressor on that reduced column set.
#
# `model` was created with LightGBM defaults, so its feature_importances_ are
# integer split counts and the cut-off is expressed as a count. A model that
# reports normalised importances (e.g. XGBoost) would need a far smaller
# cut-off, on the order of 0.001.
MIN_IMPORTANCE = 10
is_important = feature_importance['importance'] > MIN_IMPORTANCE
important_features = feature_importance.loc[is_important].index
X_train_important = X_train[important_features]
X_test_important = X_test[important_features]

# Fit is the last expression so the notebook displays the fitted estimator.
model_important = lgb.LGBMRegressor()
model_important.fit(X_train_important, y_train)

In [13]:
# Display the selected feature names; they keep the descending-importance
# order of the frame they were filtered from.
important_features

Index(['rides_previous_1_hour', 'rides_previous_335_hour',
       'rides_previous_504_hour', 'rides_previous_168_hour',
       'rides_previous_167_hour', 'rides_previous_672_hour',
       'rides_previous_503_hour', 'rides_previous_24_hour',
       'rides_previous_336_hour', 'rides_previous_169_hour',
       'rides_previous_170_hour', 'rides_previous_671_hour',
       'rides_previous_2_hour', 'rides_previous_23_hour',
       'rides_previous_337_hour', 'rides_previous_192_hour',
       'rides_previous_338_hour', 'rides_previous_166_hour',
       'rides_previous_362_hour', 'rides_previous_648_hour',
       'rides_previous_506_hour', 'rides_previous_3_hour',
       'rides_previous_502_hour', 'rides_previous_529_hour',
       'rides_previous_260_hour', 'rides_previous_497_hour',
       'rides_previous_669_hour', 'rides_previous_360_hour',
       'rides_previous_4_hour', 'rides_previous_446_hour',
       'rides_previous_449_hour', 'rides_previous_574_hour',
       'rides_previous_222_hour'],

In [14]:
# Sanity check: the number of selected features must match the column count of
# both reduced matrices.
len(important_features), len(X_train_important.columns), len(X_test_important.columns)

(33, 33, 33)

In [15]:
# Test-set MAE of the model retrained on the reduced feature set.
predictions_important = model_important.predict(X_test_important)
test_mae_important = mean_absolute_error(y_true=y_test, y_pred=predictions_important)
print(f'{test_mae_important=}')

test_mae_important=2.546605798384063


### Summary 4 weeks 
[w/o feature importance] 
- mae = 2.58
- feature_size = 672 (only the `rides_previous_*` columns are used)
- train time = 5 sec

[with feature importance] 
- mae_important = 2.55
- feature_size = 33
- train time = 2 sec

### 8 weeks of data

In [40]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

# Load the 8-week variant of the tabular dataset; this rebinds `df`, replacing
# the 4-week frame loaded earlier. As the displayed frame shows, the lag
# columns now run from rides_previous_1344_hour to rides_previous_1_hour,
# followed by pickup_hour, pickup_location_id and target_rides_next_hour.
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data_2_months.parquet')
df

Unnamed: 0,rides_previous_1344_hour,rides_previous_1343_hour,rides_previous_1342_hour,rides_previous_1341_hour,rides_previous_1340_hour,rides_previous_1339_hour,rides_previous_1338_hour,rides_previous_1337_hour,rides_previous_1336_hour,rides_previous_1335_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,0.0,0.0,2022-02-26,1,1.0
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,2022-02-27,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,3.0,2.0,0.0,2.0,0.0,0.0,0.0,2022-02-28,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-03-01,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.0,0.0,0.0,0.0,0.0,0.0,2022-03-02,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81880,4.0,6.0,0.0,1.0,1.0,7.0,22.0,18.0,25.0,17.0,...,6.0,5.0,3.0,1.0,6.0,1.0,3.0,2022-12-27,265,3.0
81881,2.0,6.0,4.0,2.0,4.0,10.0,16.0,17.0,20.0,12.0,...,4.0,2.0,1.0,2.0,2.0,2.0,8.0,2022-12-28,265,1.0
81882,5.0,1.0,3.0,1.0,2.0,8.0,18.0,9.0,18.0,19.0,...,2.0,3.0,5.0,1.0,1.0,0.0,8.0,2022-12-29,265,3.0
81883,8.0,6.0,3.0,3.0,2.0,8.0,15.0,16.0,8.0,8.0,...,3.0,3.0,1.0,2.0,0.0,1.0,2.0,2022-12-30,265,7.0


In [41]:
from datetime import datetime
from src.data_split import train_test_split

# Same cutoff date as the 4-week experiment, applied to the 8-week frame.
# NOTE(review): the exact split rule lives in src.data_split -- presumably rows
# before the cutoff form the training set; confirm there.
X_train, y_train, X_test, y_test = train_test_split(
    df,
    target_column_name='target_rides_next_hour',
    cutoff_date=datetime(2022, 6, 1),
)

# One print call, one shape per line (same stdout as four separate prints).
print(
    f'{X_train.shape=}',
    f'{y_train.shape=}',
    f'{X_test.shape=}',
    f'{y_test.shape=}',
    sep='\n',
)

X_train.shape=(25175, 1346)
y_train.shape=(25175,)
X_test.shape=(56710, 1346)
y_test.shape=(56710,)


In [42]:
# Model inputs: only the historical ride-count columns (names starting with
# 'rides_'); every other column of X_train is left out of the feature matrix.
past_rides_columns = [name for name in X_train.columns if name.startswith('rides_')]
X_train_only_numeric = X_train.loc[:, past_rides_columns]

In [43]:
# Train a LightGBM regressor with library-default hyperparameters on the
# 8-week lag features; this rebinds `model`, replacing the 4-week model.
# The fit call is the cell's last expression, so the notebook displays
# whatever it returns.
model = lgb.LGBMRegressor()
model.fit(X_train_only_numeric, y_train)

In [44]:
# Score the test set using exactly the columns the model was trained on.
X_test_only_numeric = X_test.loc[:, past_rides_columns]
predictions = model.predict(X_test_only_numeric)
predictions

array([0.10231421, 0.10231421, 0.10231421, ..., 5.50707188, 7.19027161,
       4.20759656])

In [45]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the full 8-week lag-feature model on the test split.
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'{test_mae=}')

test_mae=2.745466295622436


### Feature Importance for 8 weeks of data

In [46]:
# feature importance
import numpy as np

# One row per input feature, ranked from most to least important.
# Built as a single chained expression so no frame is mutated in place.
feature_importance = (
    pd.DataFrame(
        {'importance': model.feature_importances_},
        index=X_train_only_numeric.columns,
    )
    .sort_values('importance', ascending=False)
)
feature_importance.head(20)

Unnamed: 0,importance
rides_previous_1_hour,180
rides_previous_504_hour,43
rides_previous_168_hour,35
rides_previous_838_hour,32
rides_previous_1008_hour,29
rides_previous_24_hour,28
rides_previous_167_hour,28
rides_previous_336_hour,24
rides_previous_335_hour,21
rides_previous_337_hour,20


In [47]:
# Keep only the features whose importance is strictly above a fixed cut-off,
# then retrain a fresh default LightGBM regressor on that reduced column set.
#
# `model` was created with LightGBM defaults, so its feature_importances_ are
# integer split counts and the cut-off is expressed as a count. A model that
# reports normalised importances (e.g. XGBoost) would need a far smaller
# cut-off, on the order of 0.001.
MIN_IMPORTANCE = 10
is_important = feature_importance['importance'] > MIN_IMPORTANCE
important_features = feature_importance.loc[is_important].index
X_train_important = X_train[important_features]
X_test_important = X_test[important_features]

# Fit is the last expression so the notebook displays the fitted estimator.
model_important = lgb.LGBMRegressor()
model_important.fit(X_train_important, y_train)

In [48]:
# Sanity check: the number of selected features must match the column count of
# both reduced matrices.
len(important_features), len(X_train_important.columns), len(X_test_important.columns)

(24, 24, 24)

In [49]:
# Test-set MAE of the 8-week model retrained on the reduced feature set.
predictions_important = model_important.predict(X_test_important)
test_mae_important = mean_absolute_error(y_true=y_test, y_pred=predictions_important)
print(f'{test_mae_important=}')

test_mae_important=2.6692208294709654


### Summary 8 weeks 
[w/o feature importance] 
- mae = 2.75
- feature_size = 1344 (only the `rides_previous_*` columns are used)
- train time = 7.5 sec

[with feature importance] 
- mae_important = 2.67
- feature_size = 24
- train time = 1.5 sec
