In [1]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
import plotly.express as px

In [2]:
from scripts.preproc import load_data, split_df, get_xxyy
from scripts.train import train_grid, predict_val
from scripts.submit import get_submitable
train_cols = ['precipitation_amt_mm', 'weekofyear_sin', 'weekofyear_cos']
target_col = 'total_cases'
test_size = 0

# 1.Preprocessing

## 1.1 Train features and labes are merged, both train and test DFs are devided by cities

In [3]:
train_sj, train_iq, test_sj, test_iq = load_data()

## 1.2. Visualisation

In [4]:
# fig = px.line(train_sj, x='date', y=["precipitation_amt_mm", 'station_avg_temp_c', 'total_cases', 'weekofyear_sin', 'weekofyear_cos'], title='San Juan')
# fig.show()
# fig = px.line(train_iq, x='date', y=["precipitation_amt_mm", 'station_avg_temp_c', 'total_cases', 'weekofyear_sin', 'weekofyear_cos'], title='Iquitos')
# fig.show()

## 1.3. Spliting into training and validation sets

I suggest that at the end of preprocessing for either city we get **one** dictionary with **four** dataframes: X_train, X_val, y_train, y_val

In [5]:
xxyy_sj = get_xxyy(train_sj, test_size, train_cols, target_col)
xxyy_iq = get_xxyy(train_iq, test_size, train_cols, target_col)

Train size: 936, validation size: 0
Train size: 520, validation size: 0


# 2. Training

At the end of training we need:
1. a model compatible with scikit-learn api
2. (maybe???) an encoder

In [6]:
from sklearn.model_selection import TimeSeriesSplit

cv = TimeSeriesSplit(5)
scoring = 'neg_mean_absolute_error'

In [7]:
from sklearn.ensemble import RandomForestRegressor
estimator = RandomForestRegressor()
params = dict(
	n_estimators = [3000, 2500, 2000],
	max_depth = [3, 4, 5],
	# eta = [0.3, 0.1, 0.01],
	# subsample = [0.3, 0.5, 0.8, 1],
	# colsample_bytree = 1,
)

In [8]:
model_sj = train_grid(xxyy_sj, estimator, params, scoring)
model_iq = train_grid(xxyy_iq, estimator, params, scoring)

In [9]:
pd.DataFrame(model_sj.cv_results_).columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_max_depth', 'param_n_estimators', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'split3_test_score',
       'split4_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score', 'split0_train_score', 'split1_train_score',
       'split2_train_score', 'split3_train_score', 'split4_train_score',
       'mean_train_score', 'std_train_score'],
      dtype='object')

In [10]:
print('San Jose')
print(pd.DataFrame(model_sj.cv_results_)
	.sort_values(by='rank_test_score')[[
		'param_max_depth', 'param_n_estimators', 'mean_train_score']])
print()
print('Iquitos')
print(pd.DataFrame(model_iq.cv_results_)
	.sort_values(by='rank_test_score')[[
		'param_max_depth', 'param_n_estimators', 'mean_train_score']])

San Jose
  param_max_depth param_n_estimators  mean_train_score
0               3               3000        -24.043547
1               3               2500        -24.045690
4               4               2500        -23.018336
2               3               2000        -24.056120
5               4               2000        -23.028105
3               4               3000        -23.044900
6               5               3000        -21.734006
8               5               2000        -21.736993
7               5               2500        -21.743137

Iquitos
  param_max_depth param_n_estimators  mean_train_score
2               3               2000         -5.892714
0               3               3000         -5.895742
1               3               2500         -5.899944
5               4               2000         -5.544294
4               4               2500         -5.540441
3               4               3000         -5.544559
7               5               2500         -5

# 3. Predict for test features and format for submition

For that we need:
1. [from precrocessing] - (list of) test DFs
2. [from training]      - (list of) models

In [11]:
get_submitable([test_sj, test_iq], [model_sj, model_iq], train_cols)