In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas warnings about assigning to filtered
# slices, a pattern used in later cells - consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Read the three competition CSVs (train.csv, test.csv, submission.csv)
# into DataFrames in a single tuple assignment.
train, test, submission = (
    pd.read_csv("../input/covid19-global-forecasting-week-4/train.csv"),
    pd.read_csv("../input/covid19-global-forecasting-week-4/test.csv"),
    pd.read_csv("../input/covid19-global-forecasting-week-4/submission.csv"),
)

In [3]:
# Convert the 'Date' column of both frames to datetime so the date
# comparisons in later cells work on timestamps.
for _frame in (train, test):
    _frame['Date'] = pd.to_datetime(_frame['Date'])

In [4]:
# Keep training observations up to and including 2020-04-14.
# .copy() gives an independent frame, so the column assignments below (and in
# later cells) do not write to a filtered slice of the original DataFrame.
train = train.loc[train['Date'] <= '2020-04-14'].copy()

# Tag every row with its origin; the 'part' column is used later to pull the
# test rows back out of the combined frame.
train['part'] = 'train'
test['part'] = 'test'

In [5]:
# Build one 'Location' key per row by concatenating province and country as
# strings. A missing province is stringified too, which yields keys such as
# 'nanZimbabwe'.
for _frame in (train, test):
    _province = _frame['Province_State'].astype(str)
    _country = _frame['Country_Region'].astype(str)
    _frame['Location'] = _province + _country

In [6]:
# Attach the observed targets to the test rows. Test dates that are also
# present in `train` receive real values; all other test rows get NaN from the
# left join.
_known_targets = train[['ConfirmedCases', 'Fatalities', 'Location', 'Date']]
test = test.merge(_known_targets, on=['Location', 'Date'], how='left')

In [7]:
# Keep only training rows dated 2020-04-01 or earlier. Later dates are
# carried by `test`, which received their targets in the merge above.
_on_or_before_cutoff = train['Date'] <= '2020-04-01'
train = train[_on_or_before_cutoff]

In [8]:
# Stack train on top of test and order the combined rows by country, then date.
data = pd.concat([train, test], axis=0).sort_values(by=['Country_Region', 'Date'])

In [9]:
# Reshape to long format: one row per (ForecastId, Date, Location, part) and
# target variable, with the value in 'Target'. Columns not listed in id_vars
# or value_vars are not carried over.
data = data.melt(
    id_vars=['ForecastId', 'Date', 'Location', 'part'],
    value_vars=['ConfirmedCases', 'Fatalities'],
    value_name='Target',
)
data = data.sort_values(by=['Location', 'Date'])

In [10]:
# Show the long-format frame produced by the melt (pandas truncates the display).
data

Unnamed: 0,ForecastId,Date,Location,part,variable,Target
27018,,2020-01-22,AlabamaUS,train,ConfirmedCases,0.0
62700,,2020-01-22,AlabamaUS,train,Fatalities,0.0
27072,,2020-01-23,AlabamaUS,train,ConfirmedCases,0.0
62754,,2020-01-23,AlabamaUS,train,Fatalities,0.0
27126,,2020-01-24,AlabamaUS,train,ConfirmedCases,0.0
...,...,...,...,...,...,...
71361,13457.0,2020-05-12,nanZimbabwe,test,Fatalities,
35680,13458.0,2020-05-13,nanZimbabwe,test,ConfirmedCases,
71362,13458.0,2020-05-13,nanZimbabwe,test,Fatalities,
35681,13459.0,2020-05-14,nanZimbabwe,test,ConfirmedCases,


In [11]:
# Calendar features.
# 'Day' is month * 100 + day-of-month (e.g. 2020-02-20 -> 220, 2020-05-14 -> 514),
# the same integer the original string split/join produced, but computed
# with vectorized datetime accessors instead of a per-row Python lambda.
_date_parts = data['Date'].dt
data['Day'] = (_date_parts.month * 100 + _date_parts.day).astype('int64')
data['Month'] = _date_parts.month

In [12]:
# Lag features: the target value 1..4 rows earlier within each
# (Location, variable) series. Rows are already ordered by Location and Date,
# so a row shift corresponds to a step back in time within a series.
# The grouping is built once and GroupBy.shift is used directly, replacing
# four copies of transform(lambda x: x.shift(k)).
_target_by_series = data.groupby(['Location', 'variable'])['Target']
_lag_columns = {'lag_' + str(_k): _target_by_series.shift(_k) for _k in (1, 2, 3, 4)}
for _name, _values in _lag_columns.items():
    data[_name] = _values

In [13]:
# Differences between consecutive lags, and their plain average. The sum is
# taken left to right; a NaN in any of the three differences makes the
# average NaN.
data['Diff1'] = data['lag_1'].sub(data['lag_2'])
data['Diff2'] = data['lag_2'].sub(data['lag_3'])
data['Diff3'] = data['lag_3'].sub(data['lag_4'])
data['Diffavg'] = data['Diff1'].add(data['Diff2']).add(data['Diff3']).div(3)

In [14]:
# Percentage change between consecutive lags, and the average of the three.
# When the denominator lag is 0 the result is NaN (0/0) or +/-inf (nonzero/0);
# nothing is done about those values here.
data["Inc1"] = data['Diff1'].div(data['lag_2']).mul(100)
data["Inc2"] = data['Diff2'].div(data['lag_3']).mul(100)
data["Inc3"] = data['Diff3'].div(data['lag_4']).mul(100)
data['Incavg'] = data['Inc1'].add(data['Inc2']).add(data['Inc3']).div(3)

In [15]:
# Keep rows dated after 2020-02-19 and discard the intermediate difference
# columns (only their average is kept). The drop is chained and non-inplace,
# so it yields a fresh frame instead of mutating a filtered slice.
data = (
    data.loc[data['Date'] > '2020-02-19']
    .drop(columns=['Diff1', 'Diff2', 'Diff3'])
)

In [16]:
# Show the frame with the calendar, lag, difference and growth-rate columns added.
data

Unnamed: 0,ForecastId,Date,Location,part,variable,Target,Day,Month,lag_1,lag_2,lag_3,lag_4,Diffavg,Inc1,Inc2,Inc3,Incavg
28584,,2020-02-20,AlabamaUS,train,ConfirmedCases,0.0,220,2,0.0,0.0,0.0,0.0,0.0,,,,
64266,,2020-02-20,AlabamaUS,train,Fatalities,0.0,220,2,0.0,0.0,0.0,0.0,0.0,,,,
28638,,2020-02-21,AlabamaUS,train,ConfirmedCases,0.0,221,2,0.0,0.0,0.0,0.0,0.0,,,,
64320,,2020-02-21,AlabamaUS,train,Fatalities,0.0,221,2,0.0,0.0,0.0,0.0,0.0,,,,
28692,,2020-02-22,AlabamaUS,train,ConfirmedCases,0.0,222,2,0.0,0.0,0.0,0.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71361,13457.0,2020-05-12,nanZimbabwe,test,Fatalities,,512,5,,,,,,,,,
35680,13458.0,2020-05-13,nanZimbabwe,test,ConfirmedCases,,513,5,,,,,,,,,
71362,13458.0,2020-05-13,nanZimbabwe,test,Fatalities,,513,5,,,,,,,,,
35681,13459.0,2020-05-14,nanZimbabwe,test,ConfirmedCases,,514,5,,,,,,,,,


In [17]:
from sklearn.preprocessing import LabelEncoder

# Replace the string-valued 'Location' and 'variable' columns with integer
# codes. The fitted encoders stay available as `encoderloc` / `encodervar`.
encoderloc, encodervar = LabelEncoder(), LabelEncoder()

for _column, _encoder in (('Location', encoderloc), ('variable', encodervar)):
    data[_column] = _encoder.fit_transform(data[_column])

data.head()

Unnamed: 0,ForecastId,Date,Location,part,variable,Target,Day,Month,lag_1,lag_2,lag_3,lag_4,Diffavg,Inc1,Inc2,Inc3,Incavg
28584,,2020-02-20,0,train,0,0.0,220,2,0.0,0.0,0.0,0.0,0.0,,,,
64266,,2020-02-20,0,train,1,0.0,220,2,0.0,0.0,0.0,0.0,0.0,,,,
28638,,2020-02-21,0,train,0,0.0,221,2,0.0,0.0,0.0,0.0,0.0,,,,
64320,,2020-02-21,0,train,1,0.0,221,2,0.0,0.0,0.0,0.0,0.0,,,,
28692,,2020-02-22,0,train,0,0.0,222,2,0.0,0.0,0.0,0.0,0.0,,,,


In [18]:
# Columns fed to the model, in this order. lag_3, lag_4 and Inc3 are computed
# above but are not part of this list.
features = [
    'Day',
    'Location',
    'variable',
    'lag_1',
    'lag_2',
    'Diffavg',
    'Inc1',
    'Inc2',
    'Incavg',
    'Month',
]

In [19]:
# Training rows: everything with a date up to and including 2020-04-14.
x_train = data[data['Date']<='2020-04-14']
y_train = x_train['Target']

# Validation rows: 2020-04-02 .. 2020-04-14.
# NOTE(review): this window lies inside the training range above, so the
# validation score is measured on rows the model is also fitted on - confirm
# this is intended (e.g. as a final fit on all known data).
x_val = data[(data['Date']>='2020-04-02')&(data['Date']<='2020-04-14')]
y_val = x_val['Target']

# Rows that came from the test file. Copied because this frame is written to
# repeatedly during the rolling forecast, and those writes should not target a
# filtered slice of `data`.
test_ = data[data['part']=='test'].copy()

In [20]:
# NOTE: this import rebinds the name `train`. Up to this cell `train` was the
# training DataFrame; from here on it is xgboost's training function, so the
# earlier cells that index `train` cannot be re-run without restarting.
# plot_importance and XGBRegressor are not used in the cells shown.
from xgboost import DMatrix,train,plot_importance,XGBRegressor

In [21]:
# Booster settings passed to the training call: squared-error regression,
# a fixed seed, and n_jobs set to -1.
params = {
    'objective': 'reg:squarederror',
    'n_jobs': -1,
    'seed': 236,
}


In [22]:
from tqdm import tqdm

In [23]:
def feature(test_,ctr):
    """Refresh the lag-derived feature columns of ``test_`` in place.

    Called after new predictions have been written into ``test_['Target']`` so
    that the rows of the following day see those predictions in their lags.

    Parameters
    ----------
    test_ : pandas.DataFrame
        Frame with 'Location', 'variable' and 'Target' columns plus existing
        'lag_1' .. 'lag_4' columns. Rows must already be ordered in time
        within each (Location, variable) group, because lags are row shifts.
    ctr : int
        Forecast step counter. Values 1, 2 and 3 recompute only that many
        lags (lag_1 .. lag_ctr); any other value recomputes all four. Lags
        that are not recomputed keep the values already stored in ``test_``.

    Returns
    -------
    pandas.DataFrame
        The same ``test_`` object, with the refreshed lags and with
        'Diffavg', 'Inc1', 'Inc2', 'Inc3' and 'Incavg' recomputed from them.
        No 'Diff1'/'Diff2'/'Diff3' columns are left behind.

    Notes
    -----
    The growth rates divide by a lag value; a zero lag yields NaN or +/-inf,
    which is left as is.
    """
    # Number of leading lags to refresh for this step.
    n_lags = int(ctr) if ctr in (1, 2, 3) else 4

    # Group once, and compute every shift before assigning any column.
    target_by_series = test_.groupby(['Location','variable'])['Target']
    refreshed = {'lag_'+str(k): target_by_series.shift(k) for k in range(1, n_lags + 1)}
    for column, values in refreshed.items():
        test_[column] = values

    # Step-to-step differences are kept as local Series instead of being
    # written as scratch columns and dropped again.
    diff1 = test_['lag_1'] - test_['lag_2']
    diff2 = test_['lag_2'] - test_['lag_3']
    diff3 = test_['lag_3'] - test_['lag_4']

    test_['Diffavg'] = (diff1 + diff2 + diff3)/3
    test_["Inc1"] = (diff1 / test_['lag_2'])*100
    test_["Inc2"] = (diff2 / test_['lag_3'])*100
    test_["Inc3"] = (diff3 / test_['lag_4'])*100
    test_['Incavg'] = (test_['Inc1'] + test_['Inc2'] + test_['Inc3'])/3

    # Keep the previous guarantee that the output carries no Diff columns,
    # even if the caller passed a frame that still had them.
    test_.drop(columns=['Diff1','Diff2','Diff3'], errors='ignore', inplace=True)

    return test_

In [24]:
# Day codes that still need a forecast, in chronological order.
# Test dates up to 2020-04-14 already carry observed targets from the merge
# with the training data, so only later dates are selected. Selecting by date
# and sorting explicitly replaces skipping the first 13 entries of unique(),
# which depended on the row order of `test_`.
days = (
    test_.loc[test_['Date'] > '2020-04-14', ['Date', 'Day']]
    .drop_duplicates()
    .sort_values('Date')['Day']
    .tolist()
)

In [25]:
# Fit the booster on the training rows, reporting RMSE on the validation rows
# every 50 rounds.
train_set = DMatrix(x_train[features], y_train)
val_set = DMatrix(x_val[features], y_val)
model = train(
    params,
    train_set,
    num_boost_round=100,
    evals=[(val_set, 'validation')],
    verbose_eval=50,
)

# Rolling forecast, one day at a time: predict the rows of day `j`, store the
# predictions as that day's Target, then refresh the lag features so the next
# day's rows are built on top of these predictions.
ctr = 1
for j in days:
    day_mask = test_['Day'] == j
    test_set = DMatrix(test_.loc[day_mask, features])
    test_.loc[day_mask, 'Target'] = model.predict(test_set)
    test_ = feature(test_, ctr)
    ctr += 1

[0]	validation-rmse:9490.96875
[50]	validation-rmse:67.65117
[99]	validation-rmse:34.63649


In [26]:
# Keep only the columns needed to build the submission.
sub = test_.loc[:, ['ForecastId', 'variable', 'Target']]

In [27]:
# Back to wide format: one row per ForecastId, one column per encoded
# target variable.
sub = sub.pivot(index='ForecastId', columns='variable', values='Target').reset_index()

In [28]:
# Name the columns by position. The pivot labelled them with the encoded
# variable codes, where 0 corresponds to ConfirmedCases and 1 to Fatalities.
sub.columns = ['ForecastId','ConfirmedCases','Fatalities']

# ForecastId is stored as a float in the combined frame; cast it to integer.
sub = sub.astype({'ForecastId': int})

In [29]:
# Write the forecast to submission.csv in the working directory, without the
# row index.
sub.to_csv(path_or_buf="submission.csv", index=False)