In [8]:
import os
import numpy as np
import pandas as pd

from lightgbm import LGBMRegressor
from sklearn.preprocessing import OrdinalEncoder

In [15]:
DATA_DIR = 'new_data'

# Read both splits and attach CRPS, a per-location key formed by concatenating
# Country_Region with Province_State (a missing province counts as '').
train = (
    pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    .assign(CRPS=lambda frame: frame['Country_Region'] + frame['Province_State'].fillna(''))
)
test = (
    pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
    .assign(CRPS=lambda frame: frame['Country_Region'] + frame['Province_State'].fillna(''))
)
train

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,CRPS
0,1,,Afghanistan,2020-01-22,0.0,0.0,Afghanistan
1,2,,Afghanistan,2020-01-23,0.0,0.0,Afghanistan
2,3,,Afghanistan,2020-01-24,0.0,0.0,Afghanistan
3,4,,Afghanistan,2020-01-25,0.0,0.0,Afghanistan
4,5,,Afghanistan,2020-01-26,0.0,0.0,Afghanistan
5,6,,Afghanistan,2020-01-27,0.0,0.0,Afghanistan
6,7,,Afghanistan,2020-01-28,0.0,0.0,Afghanistan
7,8,,Afghanistan,2020-01-29,0.0,0.0,Afghanistan
8,9,,Afghanistan,2020-01-30,0.0,0.0,Afghanistan
9,10,,Afghanistan,2020-01-31,0.0,0.0,Afghanistan


In [None]:
def log_features(df: pd.DataFrame, label: str):
    """Add log-scale level and per-location first-difference columns for `label`.

    Two columns are written onto `df` in place:

    * ``L<label>``  - ``log1p`` of the ``<label>`` column.
    * ``LD<label>`` - difference between consecutive ``L<label>`` values within
      each ``CRPS`` group (the first row of every group is NaN).

    Args:
        df: Frame containing a ``CRPS`` column and a ``<label>`` column.
        label: Name of the column to transform.

    Returns:
        The same `df` object, with the two columns added or overwritten.
    """
    level_col = f'L{label}'
    delta_col = f'LD{label}'
    df[level_col] = np.log1p(df[label])
    df[delta_col] = df.groupby('CRPS')[[level_col]].diff()
    return df

def exp_features(df: pd.DataFrame, label: str, serd):
    """Turn the predicted log-difference for one time step back into levels.

    Only rows whose ``serd`` column equals `serd` are touched. For those rows:

    1. negative ``LD<label>`` values are clipped to 0,
    2. ``L<label>`` is set to ``LD<label> + L<label>1`` (the lag-1 log level),
    3. ``<label>`` is set to ``exp(L<label>) - 1``, undoing the ``log1p`` transform.

    Args:
        df: Frame containing ``serd``, ``LD<label>`` and ``L<label>1`` columns.
        label: Base name of the target column.
        serd: Value of the ``serd`` column identifying the rows to update.

    Returns:
        The same `df` object, modified in place.
    """
    level_col = f'L{label}'
    delta_col = f'LD{label}'
    step_rows = df['serd'] == serd

    # The target column name has to be an f-string: the original wrote to a
    # literal column called 'LD{label}', so the clipping never took effect.
    df.loc[step_rows & (df[delta_col] < 0), delta_col] = 0
    df.loc[step_rows, level_col] = df.loc[step_rows, delta_col] + df.loc[step_rows, f'{level_col}1']
    # expm1 is the numerically exact inverse of the log1p used to build L<label>.
    df.loc[step_rows, label] = np.expm1(df.loc[step_rows, level_col])
    return df


def group_features(df: pd.DataFrame, label: str):
    """Add lag and five-point average columns for `label`, per ``CRPS`` group.

    Starting from the existing ``L<label>`` and ``LD<label>`` columns, the
    following columns are written onto `df` in place (k = 1..4):

    * ``L<label>k`` / ``LD<label>k``     - the value k rows earlier in the same group.
    * ``LD<label>MA`` / ``L<label>MA``   - mean of the current value and its four lags.
    * ``LD<label>MAk`` / ``L<label>MAk`` - the averages k rows earlier in the same group.

    Args:
        df: Frame containing ``CRPS``, ``L<label>`` and ``LD<label>`` columns.
        label: Base name of the target column.

    Returns:
        The same `df` object, with the columns added or overwritten.
    """
    lags = (1, 2, 3, 4)

    def add_lags(col):
        # One shifted copy of `col` per lag, never crossing a CRPS boundary.
        for lag in lags:
            df[f'{col}{lag}'] = df.groupby('CRPS')[[col]].shift(lag)

    def add_average(col):
        # Current value plus lags 1..4 (added in that order), divided by 5.
        running = df[col]
        for lag in lags:
            running = running + df[f'{col}{lag}']
        df[f'{col}MA'] = running / 5

    level_col = f'L{label}'
    delta_col = f'LD{label}'

    add_lags(level_col)
    add_lags(delta_col)
    add_average(delta_col)
    add_lags(f'{delta_col}MA')
    add_average(level_col)
    add_lags(f'{level_col}MA')
    return df

In [16]:
# Build the log-level, log-difference, lag and averaged columns for each target.
# NOTE(review): the train preview displayed after loading lists no RecoveredCases
# column - confirm that new_data/train.csv actually provides it.
for target in ('ConfirmedCases', 'Fatalities', 'RecoveredCases'):
    train = group_features(log_features(train, target), target)

# serd: 0-based position of each row within its CRPS group.
train['serd'] = train.groupby('CRPS').cumcount()

# days_since_confirmed: 0 on rows with no confirmed cases; on rows with cases it
# counts those rows per CRPS group, starting at 0 for the first one to avoid leakage.
no_cases = train['ConfirmedCases'] == 0
has_cases = train['ConfirmedCases'] > 0
train.loc[no_cases, 'days_since_confirmed'] = 0
train.loc[has_cases, 'days_since_confirmed'] = train[has_cases].groupby('CRPS').cumcount()

In [17]:
# One gradient-boosted regressor per target; each is later fit on that target's
# log-difference column. Several hyper-parameters are written as powers of ten.

# Confirmed cases.
lgbm_cc = LGBMRegressor(
    num_leaves=85,
    learning_rate=10 ** -1.89,
    n_estimators=100,
    min_sum_hessian_in_leaf=(10 ** -4.1),
    min_child_samples=2,
    subsample=0.97,
    subsample_freq=10,
    colsample_bytree=0.68,
    reg_lambda=10 ** 1.4,
    random_state=1234,
    n_jobs=4,
)

# Fatalities.
lgbm_f = LGBMRegressor(
    num_leaves=26,
    learning_rate=10 ** -1.63,
    n_estimators=100,
    min_sum_hessian_in_leaf=(10 ** -4.04),
    min_child_samples=14,
    subsample=0.66,
    subsample_freq=5,
    colsample_bytree=0.8,
    reg_lambda=10 ** 1.92,
    random_state=1234,
    n_jobs=4,
)

# Recovered cases: same hyper-parameters as the fatalities model.
lgbm_rc = LGBMRegressor(
    num_leaves=26,
    learning_rate=10 ** -1.63,
    n_estimators=100,
    min_sum_hessian_in_leaf=(10 ** -4.04),
    min_child_samples=14,
    subsample=0.66,
    subsample_freq=5,
    colsample_bytree=0.8,
    reg_lambda=10 ** 1.92,
    random_state=1234,
    n_jobs=4,
)

In [18]:
# Ordinal-encode the two location columns (missing values become '') so they
# can be handed to LightGBM as categorical features.
oe = OrdinalEncoder()
X = oe.fit_transform(train[['Country_Region', 'Province_State']].fillna(''))
train['CR'], train['PS'] = X[:, 0], X[:, 1]


def _lag_feature_columns(secondary, extras=()):
    """List the lagged feature names for ConfirmedCases paired with `secondary`.

    For each column family (LD, LD..MA, L, L..MA) the four ConfirmedCases lags
    come first, then the four lags of `secondary`. The three non-lag features
    follow the first family, and `extras` are appended at the very end.
    """
    names = []
    for prefix, suffix in (('LD', ''), ('LD', 'MA'), ('L', ''), ('L', 'MA')):
        for label in ('ConfirmedCases', secondary):
            names.extend(f'{prefix}{label}{suffix}{lag}' for lag in range(1, 5))
        if (prefix, suffix) == ('LD', ''):
            names.extend(['days_since_confirmed', 'CR', 'PS'])
    return names + list(extras)


# Same-row confirmed-case columns, used only by the fatalities and recovered models.
same_day_confirmed = ['LConfirmedCases', 'LDConfirmedCases']

cc_train_cols = _lag_feature_columns('Fatalities')
lgbm_cc.fit(train[cc_train_cols], train['LDConfirmedCases'], categorical_feature=['CR', 'PS'])

f_train_cols = _lag_feature_columns('Fatalities', same_day_confirmed)
lgbm_f.fit(train[f_train_cols], train['LDFatalities'], categorical_feature=['CR', 'PS'])

rc_train_cols = _lag_feature_columns('RecoveredCases', same_day_confirmed)
lgbm_rc.fit(train[rc_train_cols], train['LDRecoveredCases'], categorical_feature=['CR', 'PS'])

New categorical_feature is ['CR', 'PS']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
              importance_type='split', learning_rate=0.023442288153199226,
              max_depth=-1, min_child_samples=14, min_child_weight=0.001,
              min_split_gain=0.0, min_sum_hessian_in_leaf=9.120108393559096e-05,
              n_estimators=100, n_jobs=4, num_leaves=26, objective=None,
              random_state=1234, reg_alpha=0.0, reg_lambda=83.17637711026708,
              silent=True, subsample=0.66, subsample_for_bin=200000,
              subsample_freq=5)

In [19]:
# Recursive forecast: append the test rows dated after the last training date,
# then predict one time step (serd) at a time so that every step's lag features
# are built from the previous steps' predictions.
train['serd'] = train.groupby('CRPS').cumcount()

# train and test do not share all columns; passing sort=False pins the column
# order across pandas versions and avoids the FutureWarning about the changing
# default. All later access is by column name, so the order itself is irrelevant.
future_rows = test[test.Date > train.Date.max()]
trainpred = pd.concat((train, future_rows), sort=False).reset_index(drop=True)
trainpred.sort_values(['Country_Region', 'Province_State', 'Date'], inplace=True)

# Reuse the encoder fitted on the training frame.
X = oe.transform(trainpred[['Country_Region', 'Province_State']].fillna(''))
trainpred['CR'] = X[:, 0]
trainpred['PS'] = X[:, 1]
trainpred['serd'] = trainpred.groupby('CRPS').cumcount()

# Heuristic: rows still to be forecast get a placeholder of 1 confirmed case so
# that they are included in the days_since_confirmed count below.
trainpred.loc[trainpred.ConfirmedCases.isnull(), 'ConfirmedCases'] = 1
trainpred.loc[trainpred.ConfirmedCases == 0, 'days_since_confirmed'] = 0
has_cases = trainpred.ConfirmedCases > 0
trainpred.loc[has_cases, 'days_since_confirmed'] = trainpred[has_cases].groupby(
    'CRPS').cumcount()  # The first is 0 to avoid leakage

# (target, model, feature columns) in prediction order. ConfirmedCases must come
# first: the other two feature lists include its same-row L/LD columns.
forecast_plan = (
    ('ConfirmedCases', lgbm_cc, cc_train_cols),
    ('Fatalities', lgbm_f, f_train_cols),
    ('RecoveredCases', lgbm_rc, rc_train_cols),
)

for target, _, _ in forecast_plan:
    trainpred = log_features(trainpred, target)

for serd in range(train.serd.max() + 1, trainpred.serd.max() + 1):
    print(serd)
    # Refresh every lag/average column so it reflects the predictions made so far.
    for target, _, _ in forecast_plan:
        trainpred = group_features(trainpred, target)

    step_rows = trainpred.serd == serd
    for target, model, feature_cols in forecast_plan:
        # Predict this step's log-difference, then rebuild the level columns from it.
        trainpred.loc[step_rows, f'LD{target}'] = model.predict(trainpred.loc[step_rows, feature_cols])
        trainpred = exp_features(trainpred, target, serd)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113


In [20]:
# Attach ForecastId by (location key, date) instead of by row position. The
# positional assignment only worked while trainpred's sort order happened to
# match the row order of test.csv; a mismatch would silently mislabel rows.
forecast_id_by_key = test.set_index(['CRPS', 'Date'])['ForecastId']
row_keys = pd.MultiIndex.from_arrays([trainpred['CRPS'], trainpred['Date']])
trainpred['ForecastId'] = forecast_id_by_key.reindex(row_keys).values

# Keep exactly the rows that correspond to a test row; .copy() makes the slice
# independent so the assignment below does not write into a view of trainpred.
submission = trainpred.loc[trainpred['ForecastId'].notnull(),
                           ['ForecastId', 'ConfirmedCases', 'Fatalities', 'RecoveredCases']].copy()
if len(submission) != len(test):
    raise ValueError(
        'Expected one forecast row per test row: got {} rows for {} test rows'.format(
            len(submission), len(test)))

submission['ForecastId'] = submission['ForecastId'].astype('int')
submission = submission.sort_values('ForecastId')
submission.to_csv('new_submission.csv', index=False)