## Generate submission

So far, the best model seems to be SVR with linear kernel

Features had been selected by looking at feature importance and testing different MA windows and shifts

In [1]:
import pandas as pd
import os
import seaborn as sns
from dateutil import parser
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
tqdm.pandas()
import numpy as np
import hydra
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing
import lightgbm as lgb
import copy
from sklearn.model_selection import train_test_split

In [2]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

### Load data

In [3]:
data_folder = 'data'
features_train_path = 'dengue_features_train.csv'
labels_train_path = 'dengue_labels_train.csv'
features_test_path = 'dengue_features_test.csv'
subm_format_path = "submission_format.csv"

In [4]:
features_train_df = pd.read_csv(os.path.join(data_folder, features_train_path))
labels_train = pd.read_csv(os.path.join(data_folder, labels_train_path))
features_test_df = pd.read_csv(os.path.join(data_folder, features_test_path))
subm_format = pd.read_csv(os.path.join(data_folder, subm_format_path))

### Basic Pre-processing

In [5]:
# transform week_start_date into a datetime feature
features_train_df['week_start_date'] = features_train_df.progress_apply(lambda x: parser.parse(x.week_start_date), axis=1)
features_test_df['week_start_date'] = features_test_df.progress_apply(lambda x: parser.parse(x.week_start_date), axis=1)

  0%|          | 0/1456 [00:00<?, ?it/s]

  0%|          | 0/416 [00:00<?, ?it/s]

In [6]:
# Merge features and labels for easier exploration
train_df = features_train_df.merge(labels_train, on=['city', 'year', 'weekofyear'])

# SJ

In [9]:
city_df = copy.deepcopy(train_df[train_df.city == 'sj'])

In [10]:
sj_df_test = copy.deepcopy(features_test_df[features_test_df.city == 'sj'])

In [11]:
fill_ma_cols = [col for col in city_df.columns if city_df[city_df[col].isna()].shape[0] > 0]
for col in tqdm(fill_ma_cols):
    city_df[col].fillna(method='ffill', inplace=True)

  0%|          | 0/20 [00:00<?, ?it/s]

In [12]:
fill_ma_cols = [col for col in sj_df_test.columns if sj_df_test[sj_df_test[col].isna()].shape[0] > 0]
for col in tqdm(fill_ma_cols):
    sj_df_test[col].fillna(method='ffill', inplace=True)

  0%|          | 0/20 [00:00<?, ?it/s]

In [13]:
for col in list(city_df.select_dtypes(include=np.number).columns):
    if col not in ["year", "weekofyear", "total_cases"]:
        for s in [10]:
            city_df[col + "_SHIFT_" + str(s)] = city_df.groupby("city")[col].shift(s).fillna(city_df[col].mean())
            city_df[col + "_MA_" + str(s)] = city_df[col].rolling(s).mean().fillna(city_df[col].mean())

In [14]:
for col in list(sj_df_test.select_dtypes(include=np.number).columns):
    if col not in ["year", "weekofyear", "total_cases"]:
        for s in [10]:
            sj_df_test[col + "_SHIFT_" + str(s)] = sj_df_test.groupby("city")[col].shift(s).fillna(sj_df_test[col].mean())
            sj_df_test[col + "_MA_" + str(s)] = sj_df_test[col].rolling(s).mean().fillna(sj_df_test[col].mean())

In [15]:
train_years = sorted(city_df['year'].drop_duplicates().values)[:int(city_df['year'].nunique() * 0.8)]
eval_years = sorted(city_df['year'].drop_duplicates().values)[int(city_df['year'].nunique() * 0.8):]

city_train_df = city_df[city_df.year.isin(train_years)]
city_eval_df = city_df[city_df.year.isin(eval_years)]

In [16]:
initial_list = [
    'weekofyear',
#      'ndvi_ne',
     'ndvi_nw',
     'ndvi_se',
#      'ndvi_sw',
#      'precipitation_amt_mm',
#      'reanalysis_air_temp_k',
#      'reanalysis_avg_temp_k',
     'reanalysis_dew_point_temp_k',
     'reanalysis_max_air_temp_k',
     'reanalysis_min_air_temp_k',
#      'reanalysis_precip_amt_kg_per_m2',
#      'reanalysis_relative_humidity_percent',
#      'reanalysis_sat_precip_amt_mm',
     'reanalysis_specific_humidity_g_per_kg',
     'reanalysis_tdtr_k',
     'station_avg_temp_c',
     'station_diur_temp_rng_c',
#      'station_max_temp_c',
#      'station_min_temp_c',
#      'station_precip_mm',
     'total_cases']
final_list = []
for col in initial_list:
    final_list.append(col)
    if col not in ['total_cases', 'weekofyear']:
        for s in [10]:
            final_list.append(col + '_MA_' + str(s))
            final_list.append(col + '_SHIFT_' + str(s))
initial_list = final_list

In [17]:
TARGET = 'total_cases'
ignore_cols = ['week_start_date'] + [TARGET]
num_cols = [col for col in initial_list if col not in ignore_cols]
cat_cols = [col for col in [col for col in city_train_df.columns if col not in list(city_train_df.select_dtypes(include=np.number).columns)] if col not in ignore_cols]

In [18]:
features_train = city_train_df[num_cols].join(pd.get_dummies(city_train_df[cat_cols])).values
label_train = city_train_df[TARGET].values
features_eval = city_eval_df[num_cols].join(pd.get_dummies(city_eval_df[cat_cols])).values
label_eval = city_eval_df[TARGET].values
features_test = sj_df_test[num_cols].join(pd.get_dummies(sj_df_test[cat_cols])).values

In [20]:
scaler = 'sklearn.preprocessing.MinMaxScaler'
scaler = hydra.utils.get_class(scaler)()
scaler.fit(features_train)
features_train = scaler.transform(features_train)
features_eval = scaler.transform(features_eval)
features_test = scaler.transform(features_test)

In [21]:
model = 'sklearn.svm.SVR'
clf = hydra.utils.get_class(model)(kernel="linear")
clf = clf.fit(features_train, label_train)
y_hat = clf.predict(features_eval)
print(model, mean_absolute_error(label_eval, y_hat))

sklearn.svm.SVR 16.181309392124227


In [22]:
y_pred =  clf.predict(features_test)

In [23]:
y_pred.round()

array([22., 20., 22., 19., 22., 17., 19., 20., 18., 16., 12., 12., 16.,
       18., 19., 24., 22., 25., 26., 34., 27., 28., 32., 28., 31., 27.,
       30., 30., 27., 29., 25., 28., 24., 22., 23., 21., 17., 16., 16.,
       14., 13.,  6.,  7.,  9.,  4.,  5.,  3.,  9.,  8.,  7.,  6.,  6.,
        3.,  4.,  3.,  6.,  1.,  4.,  9., 11., 13., 13., 17., 13., 18.,
       18., 19., 23., 21., 19., 22., 23., 28., 29., 27., 28., 25., 25.,
       26., 27., 24., 29., 24., 23., 25., 26., 26., 23., 21., 21., 19.,
       19., 22., 21., 20., 19., 14., 17., 14., 11., 11., 11.,  7.,  7.,
        8., 10., 13., 10., 12., 16., 16., 13., 14., 21., 23., 24., 25.,
       27., 23., 30., 29., 28., 28., 29., 29., 33., 29., 29., 33., 31.,
       26., 32., 32., 33., 28., 27., 27., 21., 22., 17., 15., 20., 14.,
       11.,  9.,  8., -2.,  8.,  9.,  7.,  3.,  6.,  6.,  6.,  4.,  4.,
        3.,  3.,  0.,  5.,  6.,  7., 17., 20., 17., 19., 20., 26., 28.,
       29., 31., 32., 33., 34., 35., 35., 31., 34., 30., 30., 31

In [24]:
sj_df_test['total_cases'] = [int(y) if y > 0 else 0 for y in y_pred.round()]

In [25]:
sj_df_test.head()

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,station_avg_temp_c_MA_10,station_diur_temp_rng_c_SHIFT_10,station_diur_temp_rng_c_MA_10,station_max_temp_c_SHIFT_10,station_max_temp_c_MA_10,station_min_temp_c_SHIFT_10,station_min_temp_c_MA_10,station_precip_mm_SHIFT_10,station_precip_mm_MA_10,total_cases
0,sj,2008,18,2008-04-29,-0.0189,-0.0189,0.102729,0.0912,78.6,298.492857,...,27.261923,6.151538,6.151538,31.666538,31.666538,23.096538,23.096538,34.001538,34.001538,22
1,sj,2008,19,2008-05-06,-0.018,-0.0124,0.082043,0.072314,12.56,298.475714,...,27.261923,6.151538,6.151538,31.666538,31.666538,23.096538,23.096538,34.001538,34.001538,20
2,sj,2008,20,2008-05-13,-0.0015,-0.0124,0.151083,0.091529,3.66,299.455714,...,27.261923,6.151538,6.151538,31.666538,31.666538,23.096538,23.096538,34.001538,34.001538,22
3,sj,2008,21,2008-05-20,-0.0015,-0.019867,0.124329,0.125686,0.0,299.69,...,27.261923,6.151538,6.151538,31.666538,31.666538,23.096538,23.096538,34.001538,34.001538,19
4,sj,2008,22,2008-05-27,0.0568,0.039833,0.062267,0.075914,0.76,299.78,...,27.261923,6.151538,6.151538,31.666538,31.666538,23.096538,23.096538,34.001538,34.001538,22


# iq

In [26]:
city_df = copy.deepcopy(train_df[train_df.city == 'iq'])

In [27]:
iq_df_test = copy.deepcopy(features_test_df[features_test_df.city == 'iq'])

In [28]:
fill_ma_cols = [col for col in city_df.columns if city_df[city_df[col].isna()].shape[0] > 0]
for col in tqdm(fill_ma_cols):
    city_df[col].fillna(method='ffill', inplace=True)

  0%|          | 0/20 [00:00<?, ?it/s]

In [29]:
fill_ma_cols = [col for col in iq_df_test.columns if iq_df_test[iq_df_test[col].isna()].shape[0] > 0]
for col in tqdm(fill_ma_cols):
    iq_df_test[col].fillna(method='ffill', inplace=True)

  0%|          | 0/5 [00:00<?, ?it/s]

In [30]:
for col in list(city_df.select_dtypes(include=np.number).columns):
    if col not in ["year", "weekofyear", "total_cases"]:
        for s in [10]:
            city_df[col + "_SHIFT_" + str(s)] = city_df.groupby("city")[col].shift(s).fillna(city_df[col].mean())
            city_df[col + "_MA_" + str(s)] = city_df[col].rolling(s).mean().fillna(city_df[col].mean())

In [31]:
for col in list(iq_df_test.select_dtypes(include=np.number).columns):
    if col not in ["year", "weekofyear", "total_cases"]:
        for s in [10]:
            iq_df_test[col + "_SHIFT_" + str(s)] = iq_df_test.groupby("city")[col].shift(s).fillna(iq_df_test[col].mean())
            iq_df_test[col + "_MA_" + str(s)] = iq_df_test[col].rolling(s).mean().fillna(iq_df_test[col].mean())

In [32]:
train_years = sorted(city_df['year'].drop_duplicates().values)[:int(city_df['year'].nunique() * 0.8)]
eval_years = sorted(city_df['year'].drop_duplicates().values)[int(city_df['year'].nunique() * 0.8):]

city_train_df = city_df[city_df.year.isin(train_years)]
city_eval_df = city_df[city_df.year.isin(eval_years)]

In [33]:
initial_list = [
    'weekofyear',
#      'ndvi_ne',
     'ndvi_nw',
     'ndvi_se',
#      'ndvi_sw',
#      'precipitation_amt_mm',
#      'reanalysis_air_temp_k',
#      'reanalysis_avg_temp_k',
     'reanalysis_dew_point_temp_k',
     'reanalysis_max_air_temp_k',
     'reanalysis_min_air_temp_k',
#      'reanalysis_precip_amt_kg_per_m2',
#      'reanalysis_relative_humidity_percent',
#      'reanalysis_sat_precip_amt_mm',
     'reanalysis_specific_humidity_g_per_kg',
     'reanalysis_tdtr_k',
     'station_avg_temp_c',
     'station_diur_temp_rng_c',
#      'station_max_temp_c',
#      'station_min_temp_c',
#      'station_precip_mm',
     'total_cases']
final_list = []
for col in initial_list:
    final_list.append(col)
    if col not in ['total_cases', 'weekofyear']:
        for s in [10]:
            final_list.append(col + '_MA_' + str(s))
            final_list.append(col + '_SHIFT_' + str(s))
initial_list = final_list

In [34]:
TARGET = 'total_cases'
ignore_cols = ['week_start_date'] + [TARGET]
num_cols = [col for col in initial_list if col not in ignore_cols]
cat_cols = [col for col in [col for col in city_train_df.columns if col not in list(city_train_df.select_dtypes(include=np.number).columns)] if col not in ignore_cols]

In [35]:
features_train = city_train_df[num_cols].join(pd.get_dummies(city_train_df[cat_cols])).values
label_train = city_train_df[TARGET].values
features_eval = city_eval_df[num_cols].join(pd.get_dummies(city_eval_df[cat_cols])).values
label_eval = city_eval_df[TARGET].values
features_test = iq_df_test[num_cols].join(pd.get_dummies(iq_df_test[cat_cols])).values

In [36]:
scaler = 'sklearn.preprocessing.MinMaxScaler'
scaler = hydra.utils.get_class(scaler)()
scaler.fit(features_train)
features_train = scaler.transform(features_train)
features_eval = scaler.transform(features_eval)
features_test = scaler.transform(features_test)

In [37]:
model = 'sklearn.svm.SVR'
clf = hydra.utils.get_class(model)(kernel="linear")
clf = clf.fit(features_train, label_train)
y_hat = clf.predict(features_eval)
print(model, mean_absolute_error(label_eval, y_hat))

sklearn.svm.SVR 8.079637742308044


In [38]:
y_pred = clf.predict(features_test)

In [39]:
y_pred.round()

array([5., 5., 5., 4., 5., 5., 4., 5., 5., 4., 4., 3., 3., 2., 3., 3., 3.,
       4., 4., 4., 5., 5., 6., 6., 6., 6., 6., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 6., 5., 6., 6., 6., 6., 7., 8., 8., 6., 7., 7., 7., 6., 6.,
       5., 5., 5., 3., 3., 3., 3., 3., 2., 2., 1., 2., 3., 2., 2., 3., 3.,
       4., 3., 4., 5., 6., 7., 7., 8., 7., 8., 8., 6., 6., 6., 6., 6., 6.,
       6., 6., 5., 5., 5., 5., 6., 5., 6., 6., 5., 6., 5., 5., 5., 5., 5.,
       4., 5., 5., 3., 4., 4., 2., 3., 3., 2., 2., 3., 2., 3., 2., 3., 2.,
       3., 3., 4., 5., 5., 6., 7., 7., 8., 9., 9., 8., 7., 7., 8., 7., 8.,
       7., 7., 7., 7., 7., 8., 8., 8., 8., 8., 7., 6., 7., 6., 6., 7., 6.,
       5., 5., 5.])

In [40]:
iq_df_test['total_cases'] = [int(y) if y > 0 else 0 for y in y_pred.round()]

In [41]:
iq_df_test.head()

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,station_avg_temp_c_MA_10,station_diur_temp_rng_c_SHIFT_10,station_diur_temp_rng_c_MA_10,station_max_temp_c_SHIFT_10,station_max_temp_c_MA_10,station_min_temp_c_SHIFT_10,station_min_temp_c_MA_10,station_precip_mm_SHIFT_10,station_precip_mm_MA_10,total_cases
260,iq,2010,26,2010-07-02,0.183783,0.1425,0.225129,0.150214,82.29,297.648571,...,27.520121,10.622212,10.622212,33.970513,33.970513,21.152564,21.152564,33.897436,33.897436,5
261,iq,2010,27,2010-07-09,0.291657,0.272267,0.3307,0.320914,25.3,298.224286,...,27.520121,10.622212,10.622212,33.970513,33.970513,21.152564,21.152564,33.897436,33.897436,5
262,iq,2010,28,2010-07-16,0.208543,0.366457,0.212629,0.255514,62.14,297.955714,...,27.520121,10.622212,10.622212,33.970513,33.970513,21.152564,21.152564,33.897436,33.897436,5
263,iq,2010,29,2010-07-23,0.089286,0.063214,0.122057,0.081957,47.8,295.715714,...,27.520121,10.622212,10.622212,33.970513,33.970513,21.152564,21.152564,33.897436,33.897436,4
264,iq,2010,30,2010-07-30,0.3061,0.327683,0.250086,0.267914,56.3,298.502857,...,27.520121,10.622212,10.622212,33.970513,33.970513,21.152564,21.152564,33.897436,33.897436,5


In [42]:
subm_format_1 = subm_format.merge(iq_df_test[['city', 'year', 'weekofyear', 'total_cases']], on=['city', 'year', 'weekofyear'], how='left', suffixes=('','_iq'))

In [43]:
subm_format_2 = subm_format_1.merge(sj_df_test[['city', 'year', 'weekofyear', 'total_cases']], on=['city', 'year', 'weekofyear'], how='left', suffixes=('','_sj'))

In [44]:
subm_format

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,0
1,sj,2008,19,0
2,sj,2008,20,0
3,sj,2008,21,0
4,sj,2008,22,0
...,...,...,...,...
411,iq,2013,22,0
412,iq,2013,23,0
413,iq,2013,24,0
414,iq,2013,25,0


In [45]:
subm_format_2['total_cases'] = subm_format_2.apply(lambda x: x.total_cases_iq if x.city == 'iq' else x.total_cases_sj, axis=1)

In [46]:
subm_format_3 = subm_format_2[['city', 'year', 'weekofyear', 'total_cases']]

In [47]:
subm_format_3

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,22.0
1,sj,2008,19,20.0
2,sj,2008,20,22.0
3,sj,2008,21,19.0
4,sj,2008,22,22.0
...,...,...,...,...
411,iq,2013,22,7.0
412,iq,2013,23,6.0
413,iq,2013,24,5.0
414,iq,2013,25,5.0


In [48]:
subm_format_3.total_cases = [int(y) for y in subm_format_3.total_cases]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subm_format_3.total_cases = [int(y) for y in subm_format_3.total_cases]


In [49]:
subm_format_3

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,22
1,sj,2008,19,20
2,sj,2008,20,22
3,sj,2008,21,19
4,sj,2008,22,22
...,...,...,...,...
411,iq,2013,22,7
412,iq,2013,23,6
413,iq,2013,24,5
414,iq,2013,25,5


In [50]:
subm_format_3.to_csv("subm4.csv", index=False) # --> SCORE 27.26

In [51]:
subm_format_3.describe()

Unnamed: 0,year,weekofyear,total_cases
count,416.0,416.0,416.0
mean,2010.766827,26.439904,14.137019
std,1.434835,14.978257,10.15983
min,2008.0,1.0,0.0
25%,2010.0,13.75,5.0
50%,2011.0,26.0,9.0
75%,2012.0,39.0,23.0
max,2013.0,53.0,35.0


After looking at the results, there are a lot of room for improvement.

Another submission using LightGBM and a different set of features and hyperparamenters scored 26.55

Now, one can start playing with different combination of features, different models, different combination of MA and SHIFTS

Also, here the same features and model configs are used for each city (just training a separate instance)
Since the distribution is slightly different, each city could have it's own model type and set of features.

Once this has been explored, next obvious step is to move into Recurrent Networks (LSTMs)

Combination (ensemble) of models and adding ARIMA as one of the models can also be a strong candidate.