In [27]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
# Project package; its submodules (features, util, models, logging_config)
# are accessed as attributes of `src` throughout this notebook.
import src
import pandas as pd
import numpy as np

# Remove the cap on the number of rows pandas renders when displaying a frame.
pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt
import logging
# Set up the project's logging at INFO level so pipeline/model steps report progress.
src.logging_config.setup_logging(logging.INFO)

Logging has been configured.


# Data Pipeline

In [29]:
# Stage 1: load, clean and sort the COVID dataset.
covid_data_loader = src.features.CovidDataLoader(level=1)
# Stage 2: build the target column (no day shift is applied here).
target_creator = src.features.TargetCreator(shift_day=0)

# Stages run in list order; each one receives the frame returned by the previous one.
pipeline = [covid_data_loader, target_creator]

# The first stage is called with None and produces the initial frame itself.
df = None
for stage in pipeline:
    df = stage.transform(df)

2024-10-03 01:27:51,385 - src.features - INFO - ---- Covid Data Loader --------
2024-10-03 01:27:51,387 - src.features - INFO -   loading dataset ...
2024-10-03 01:27:56,272 - src.features - INFO -   cleaning dataset ...
2024-10-03 01:28:01,065 - src.features - INFO -   sorting dataset by location and date...
2024-10-03 01:28:01,376 - src.features - INFO - ---- Target Creator (new_confirmed / population * 100) --------
2024-10-03 01:28:01,378 - src.features - INFO -  shifting 0 day(s)...
2024-10-03 01:28:01,379 - src.util - INFO - truncating target outliers ...


# Base Model
Naive baseline: use the `target` value from the latest available date as the prediction for each of the next 7 days.

### Load Config

In [30]:
# Load the shared experiment configuration. Per the recorded output it includes
# "pred_days" (forecast horizon), "pred_date" (test reference date) and "features".
CONFIG = src.util.load_config()

2024-10-03 01:28:01,939 - src.util - INFO - Loaded CONFIG:
{
    "pred_days": 7,
    "pred_date": "2021-07-01",
    "features": [
        "loc_roll7_mean",
        "loc_roll14_mean",
        "loc_roll28_mean",
        "loc_roll56_mean",
        "loc_roll7_std",
        "loc_roll14_std",
        "loc_roll28_std",
        "loc_roll56_std",
        "location_duration_days",
        "population",
        "weekday",
        "is_sunday",
        "gap_days",
        "loc_lag0",
        "loc_lag1",
        "loc_lag2",
        "loc_lag3",
        "loc_lag4",
        "loc_lag5",
        "loc_lag6",
        "new_vaccinated_rate",
        "cumsum_vaccinated_rate",
        "loc_rain_roll7_sum",
        "loc_rain_roll14_sum",
        "loc_rain_roll28_sum",
        "loc_rain_roll56_sum",
        "location_key_encoded",
        "latitude",
        "longitude",
        "relative_humidity",
        "loc_same_weekday_roll2_mean",
        "loc_same_weekday_roll3_mean",
        "loc_same_weekday_roll4_mean

### Cross Validation Model Train & Test

In [31]:
# Reference dates for the cross-validation folds; each date is the first day of
# a validation window that spans CONFIG["pred_days"] days.
CV_DATES = ["2021-06-03", "2021-06-10", "2021-06-17", "2021-06-24"]


def fit_predict_score(fit_df, eval_df, pred_days):
    """Fit a fresh BaseModel, predict on the evaluation frame and score it.

    Parameters
    ----------
    fit_df
        Frame passed to ``BaseModel.fit``.
    eval_df
        Frame passed to ``BaseModel.transform``. The returned frame must carry
        the ``population``, ``prediction`` and ``target`` columns.
    pred_days
        Forecast horizon forwarded to ``BaseModel``.

    Returns
    -------
    tuple
        ``(fitted_model, prediction_frame, score)`` where ``score`` is the
        aggregated MAE returned by ``src.util.score``.
    """
    fitted = src.models.BaseModel(pred_days=pred_days)
    fitted.fit(fit_df)
    predicted = fitted.transform(eval_df)
    mae = src.util.score(predicted['population'], predicted['prediction'], predicted['target'])
    return fitted, predicted, mae


score = {}
model = {}

# Cross Validation Score
for i, cv_date in enumerate(CV_DATES):
    # train / test split
    cv_train_df, cv_test_df = src.util.train_test_split(
        df, ref_date=cv_date, pred_days=CONFIG["pred_days"], gap_days=0
    )
    # The horizon comes from CONFIG so the model and the split always agree.
    model[f'cv{i}'], cv_pred_df, score[f'cv{i}'] = fit_predict_score(
        cv_train_df, cv_test_df, CONFIG["pred_days"]
    )

# Test Score
# NOTE(review): gap_days is not passed here, so the helper's default applies;
# the CV folds pass gap_days=0 explicitly — confirm the default matches.
train_df, test_df = src.util.train_test_split(df, ref_date=CONFIG["pred_date"], pred_days=CONFIG["pred_days"])
base_model, pred_df, score['test'] = fit_predict_score(train_df, test_df, CONFIG["pred_days"])
model['test'] = base_model

2024-10-03 01:28:01,987 - src.util - INFO - train / test split: 
  gap days = 0
  2020-05-01 <= train <= 2021-06-02
  2021-06-03 <= test <= 2021-06-09
2024-10-03 01:28:02,050 - src.util - INFO - train size: 282730; test_size: 5415
2024-10-03 01:28:02,062 - src.models - INFO - ---- Base Model fitting --------
2024-10-03 01:28:02,108 - src.models - INFO - ---- Base Model predicting --------
2024-10-03 01:28:02,119 - src.util - INFO - train / test split: 
  gap days = 0
  2020-05-01 <= train <= 2021-06-09
  2021-06-10 <= test <= 2021-06-16
2024-10-03 01:28:02,187 - src.util - INFO - train size: 288145; test_size: 5266
2024-10-03 01:28:02,196 - src.models - INFO - ---- Base Model fitting --------
2024-10-03 01:28:02,242 - src.models - INFO - ---- Base Model predicting --------
2024-10-03 01:28:02,253 - src.util - INFO - train / test split: 
  gap days = 0
  2020-05-01 <= train <= 2021-06-16
  2021-06-17 <= test <= 2021-06-23
2024-10-03 01:28:02,313 - src.util - INFO - train size: 293411; t

### Evaluation Score

In [33]:
# Write the test-window predictions to CSV, omitting the index column.
# NOTE(review): assumes ./predictions already exists — to_csv does not create directories.
pred_df.to_csv("./predictions/base_model.csv", index=False)

In [34]:
# Report each CV fold score, the CV mean and std, and the test score
# (rounded to 4 decimals in the recorded output).
src.util.display_score(score)

2024-10-03 01:28:21,825 - src.util - INFO - ---- cv / test scores ----


cv0 score = 0.0035
cv1 score = 0.0037
cv2 score = 0.0030
cv3 score = 0.0027
cv score = 0.0032
test score = 0.0029
cv score std  = 0.0004


In [40]:
# Gather every cross-validation fold score (keys 'cv0', 'cv1', ...) from the
# score dict, so this summary stays correct if the number of folds changes.
# The 'test' entry is deliberately excluded.
cv_scores = [value for key, value in score.items() if key.startswith('cv')]

# One-row summary of the CV performance; np.std uses its default ddof=0
# (population standard deviation), as before.
tuning_df = pd.DataFrame({'cv_score_mean': [np.mean(cv_scores)],
                          'cv_score_std': [np.std(cv_scores)],
                         })
display(tuning_df)
tuning_df.to_csv("./scores/base_model.csv", index=False)

Unnamed: 0,cv_score_mean,cv_score_std
0,0.003216,0.000395


In [37]:
# Show the raw score dict (per-fold and test values at full float precision).
score

{'cv0': 0.003471805840295558,
 'cv1': 0.0036958101146007337,
 'cv2': 0.003020672227133181,
 'cv3': 0.002675863354043167,
 'test': 0.002906537865053964}

### Save Model

In [26]:
# Persist the dict of fitted models (one per CV fold plus the test model);
# the recorded log shows the file is written under ./models/.
# NOTE(review): ".jolib" looks like a typo for ".joblib" — confirm before
# renaming, since any loader must use the exact same file name.
src.util.save_model(model, "base_model.jolib")

2024-10-03 01:11:31,544 - src.util - INFO - Saving model to ./models/base_model.jolib
