In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
cc_data = pd.read_csv('data/cleaned_confirmed_cases.csv')

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import r2_score

In [4]:
cc_data.head()

Unnamed: 0,Country_Region,Population,Weight,Date,TargetValue,Med. Age,Density (P/Km²),Urban Pop %,Week,Day,Weekday,first_infection,days_since_1st_infect
0,Afghanistan,27657145,0.058359,2020-01-23,0.0,18.0,60,25.0,4,23,3,2020-02-24,-32
1,Afghanistan,27657145,0.058359,2020-01-24,0.0,18.0,60,25.0,4,24,4,2020-02-24,-31
2,Afghanistan,27657145,0.058359,2020-01-25,0.0,18.0,60,25.0,4,25,5,2020-02-24,-30
3,Afghanistan,27657145,0.058359,2020-01-26,0.0,18.0,60,25.0,4,26,6,2020-02-24,-29
4,Afghanistan,27657145,0.058359,2020-01-27,0.0,18.0,60,25.0,5,27,0,2020-02-24,-28


In [5]:
cc_data = pd.concat([cc_data,pd.get_dummies(cc_data['Country_Region'], prefix='country')],axis=1)

In [6]:
cc_data.head()

Unnamed: 0,Country_Region,Population,Weight,Date,TargetValue,Med. Age,Density (P/Km²),Urban Pop %,Week,Day,...,country_United States,country_Uruguay,country_Uzbekistan,country_Venezuela,country_Vietnam,country_West Bank and Gaza,country_Western Sahara,country_Yemen,country_Zambia,country_Zimbabwe
0,Afghanistan,27657145,0.058359,2020-01-23,0.0,18.0,60,25.0,4,23,...,0,0,0,0,0,0,0,0,0,0
1,Afghanistan,27657145,0.058359,2020-01-24,0.0,18.0,60,25.0,4,24,...,0,0,0,0,0,0,0,0,0,0
2,Afghanistan,27657145,0.058359,2020-01-25,0.0,18.0,60,25.0,4,25,...,0,0,0,0,0,0,0,0,0,0
3,Afghanistan,27657145,0.058359,2020-01-26,0.0,18.0,60,25.0,4,26,...,0,0,0,0,0,0,0,0,0,0
4,Afghanistan,27657145,0.058359,2020-01-27,0.0,18.0,60,25.0,5,27,...,0,0,0,0,0,0,0,0,0,0


In [7]:
diff_split_cc_data = cc_data.copy()
# dropping first infection along with country and date
cc_data.drop(['Country_Region', 'Date', 'first_infection'],axis=1, inplace=True)

In [8]:
feature_cols = list(cc_data)
feature_cols.remove('TargetValue')

In [9]:
cc_data.sort_values('Day')

Unnamed: 0,Population,Weight,TargetValue,Med. Age,Density (P/Km²),Urban Pop %,Week,Day,Weekday,days_since_1st_infect,...,country_United States,country_Uruguay,country_Uzbekistan,country_Venezuela,country_Vietnam,country_West Bank and Gaza,country_Western Sahara,country_Yemen,country_Zambia,country_Zimbabwe
16934,3814672,0.065988,0.0,30.0,58,68.0,9,1,6,-9,...,0,0,0,0,0,0,0,0,0,0
8325,1882450,0.069213,0.0,18.0,239,59.0,5,1,5,-45,...,0,0,0,0,0,0,0,0,0,0
993,2994400,0.067059,39.0,35.0,104,63.0,14,1,2,31,...,0,0,0,0,0,0,0,0,0,0
20733,39598700,0.057161,0.0,20.0,25,35.0,5,1,5,-41,...,0,0,0,0,0,0,0,0,0,0
20722,20966000,0.059318,10.0,34.0,341,18.0,23,1,0,126,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12284,6047800,0.064040,0.0,26.0,34,36.0,5,31,4,-47,...,0,0,0,0,0,0,0,0,0,0
14924,3093100,0.066913,0.0,28.0,2,67.0,5,31,4,-39,...,0,0,0,0,0,0,0,0,0,0
23900,510713,0.076083,0.0,28.0,2,87.0,5,31,4,-65,...,0,0,0,0,0,0,1,0,0,0
21980,1167242,0.071581,0.0,21.0,89,33.0,14,31,1,9,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# good results split

target = cc_data['TargetValue'] #target
features = cc_data[feature_cols]  #features


x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.35)

In [11]:
randomforest = RandomForestRegressor()
randomforest.fit(x_train, y_train)
y_pred = randomforest.predict(x_test)
score = r2_score(y_test, y_pred)

In [12]:
score

0.9395135249166944

In [13]:
mask_test = (diff_split_cc_data['Date'] >= '2020-05-20')
mask_train = (diff_split_cc_data['Date'] < '2020-05-20')

In [14]:
test_cc_df = diff_split_cc_data.loc[mask_test]
train_cc_df = diff_split_cc_data.loc[mask_train]

In [15]:
# time based split

y_time_split_cc = train_cc_df['TargetValue']
x_time_split_cc = train_cc_df[feature_cols] 

y_test_time_split_cc = test_cc_df['TargetValue']
x_test_time_split_cc = test_cc_df[feature_cols]

In [16]:
randomforest = RandomForestRegressor()
randomforest.fit(x_time_split_cc, y_time_split_cc)
y_time_split_pred = randomforest.predict(x_test_time_split_cc)
score_time_split = r2_score(y_test_time_split_cc, y_time_split_pred)

In [17]:
score_time_split

0.8900174361093435

In [18]:
import eli5



In [19]:
f_data = pd.read_csv('data/cleaned_fatalities.csv')

In [20]:
f_data.head()

Unnamed: 0,Country_Region,Population,Weight,Date,TargetValue,Med. Age,Density (P/Km²),Urban Pop %,Week,Day,Weekday,first_infection,days_since_1st_infect
0,Afghanistan,27657145,0.583587,2020-01-23,0.0,18.0,60,25.0,4,23,3,2020-02-24,-32
1,Afghanistan,27657145,0.583587,2020-01-24,0.0,18.0,60,25.0,4,24,4,2020-02-24,-31
2,Afghanistan,27657145,0.583587,2020-01-25,0.0,18.0,60,25.0,4,25,5,2020-02-24,-30
3,Afghanistan,27657145,0.583587,2020-01-26,0.0,18.0,60,25.0,4,26,6,2020-02-24,-29
4,Afghanistan,27657145,0.583587,2020-01-27,0.0,18.0,60,25.0,5,27,0,2020-02-24,-28


In [21]:
mask_f_test = (f_data['Date'] >= '2020-05-20')
mask_f_train = (f_data['Date'] < '2020-05-20')

In [22]:
test_f_df = diff_split_cc_data.loc[mask_f_test]
train_f_df = diff_split_cc_data.loc[mask_f_train]

In [23]:
# time based split

y_time_split_f = train_f_df['TargetValue']
x_time_split_f = train_f_df[feature_cols] 

y_test_time_split_f = test_f_df['TargetValue']
x_test_time_split_f = test_f_df[feature_cols]

In [24]:
randomforest = RandomForestRegressor()
randomforest.fit(x_time_split_f, y_time_split_f)
y_time_split_f_pred = randomforest.predict(x_test_time_split_f)
score_time_split = r2_score(y_test_time_split_f, y_time_split_f_pred)

In [25]:
score_time_split

0.8927538170393956