In [1]:
# Developer: Brady Lange
# Date: 05/30/2020
# Description: NASA Space Apps COVID-19 - Human Factors Challenge.

In [2]:
# Import required libraries
import datetime
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

In [3]:
# Load and Explore data
# =============================================================================
# Load COVID-19 global cases
covid_19_cases_data = pd.read_csv("data/covid_19_cases/covid_19_global_cases.csv")
# Load COVID-19 global mobility percent changes
# NOTE(review): the recorded output of this cell ends with the tail of a 
# warning, presumably pandas' DtypeWarning about mixed-type columns. Reading 
# the file in a single pass (low_memory = False) makes dtype inference 
# consistent - confirm which file triggered it.
mobility_data = pd.read_csv(
    "data/covid_19_mobility/covid_19_global_mobility_report.csv", 
    low_memory = False
)

# Explore COVID-19 global cases, then COVID-19 global mobility percent changes
for frame in (covid_19_cases_data, mobility_data):
    print(frame.shape, "\n\n")
    print(frame.columns, "\n\n")
    print(frame.head(), "\n\n")
    print(frame.tail(), "\n\n")
    # DataFrame.info() writes its report itself and returns None, so wrapping 
    # it in print() only added a stray "None" line to the output
    frame.info()
    print("\n\n")
    print(frame.describe(), "\n\n")

  interactivity=interactivity, compiler=compiler, result=result)


(20293, 11) 


Index(['dateRep', 'day', 'month', 'year', 'cases', 'deaths',
       'countriesAndTerritories', 'geoId', 'countryterritoryCode',
       'popData2018', 'continentExp'],
      dtype='object') 


      dateRep  day  month  year  cases  deaths countriesAndTerritories geoId  \
0  30/05/2020   30      5  2020    623      11             Afghanistan    AF   
1  29/05/2020   29      5  2020    580       8             Afghanistan    AF   
2  28/05/2020   28      5  2020    625       7             Afghanistan    AF   
3  27/05/2020   27      5  2020    658       1             Afghanistan    AF   
4  26/05/2020   26      5  2020    591       1             Afghanistan    AF   

  countryterritoryCode  popData2018 continentExp  
0                  AFG   37172386.0         Asia  
1                  AFG   37172386.0         Asia  
2                  AFG   37172386.0         Asia  
3                  AFG   37172386.0         Asia  
4                  AFG   37172386.0         Asia   


   

None 


       retail_and_recreation_percent_change_from_baseline  \
count                                      358425.000000    
mean                                          -24.307278    
std                                            29.503302    
min                                          -100.000000    
25%                                           -45.000000    
50%                                           -22.000000    
75%                                             1.000000    
max                                           313.000000    

       grocery_and_pharmacy_percent_change_from_baseline  \
count                                      345821.000000   
mean                                           -7.353452   
std                                            21.897202   
min                                          -100.000000   
25%                                           -18.000000   
50%                                            -3.000000   
75%                   

In [4]:
# Preprocess Data
# =============================================================================
# Sum null values
print(pd.isnull(covid_19_cases_data).sum(), "\n\n")
print(pd.isnull(mobility_data).sum(), "\n\n")

# Drop unnecessary features ("popData2018" may be useful)
cases = covid_19_cases_data.drop(["day", "month", "year", "geoId", 
                                  "countryterritoryCode", "popData2018"], 
                                 axis = 1)

# The mobility report mixes national and sub-regional rows (the null summary 
# shows "sub_region_1" is populated on most rows). Dropping the sub-region 
# columns without dropping those rows would leave many mobility rows per 
# (date, country) key and duplicate every case record in the join below.
# NOTE(review): presumably rows with both sub-region fields null are the 
# country-level aggregates - confirm against the data source.
is_country_level = (
    mobility_data["sub_region_1"].isna() & mobility_data["sub_region_2"].isna()
)
mobility = mobility_data.loc[is_country_level].drop(
    ["country_region_code", "sub_region_1", "sub_region_2"], 
    axis = 1
)

# "dateRep" is day-first (e.g. "30/05/2020"); without an explicit format, 
# ambiguous dates such as "05/04/2020" can be parsed month-first, which 
# silently corrupts the join key
cases["dateRep"] = pd.to_datetime(cases["dateRep"], format = "%d/%m/%Y")
mobility["date"] = pd.to_datetime(mobility["date"])

# Inner join COVID-19 cases and mobility
# NOTE(review): the join relies on both sources spelling country names 
# identically (e.g. spaces vs. underscores) - verify, since an inner join 
# drops unmatched countries without warning
cases_mobility = cases.merge(
    mobility, how = "inner", 
    left_on = ["dateRep", "countriesAndTerritories"], 
    right_on = ["date", "country_region"]
).drop(["dateRep", "countriesAndTerritories"], axis = 1)

print(cases_mobility.columns)

# Column groups consumed by the preprocessing pipelines
num_feat_names = ["deaths", "retail_and_recreation_percent_change_from_baseline",
                 "grocery_and_pharmacy_percent_change_from_baseline",
                 "parks_percent_change_from_baseline",
                 "transit_stations_percent_change_from_baseline",
                 "workplaces_percent_change_from_baseline",
                 "residential_percent_change_from_baseline"]
cat_feat_names = ["continentExp", "country_region", "date"]
target_names = ["cases"]

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts a fixed set of columns from a DataFrame.

    Parameters
    ----------
    attribute_names
        Column labels to pull out in ``transform``. The pipelines in this 
        notebook pass lists of column-name strings.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y = None):
        """Learn nothing; return ``self`` unchanged."""
        return self

    def transform(self, df):
        """Return the configured columns of ``df`` via ``.values``."""
        selected_columns = df[self.attribute_names]
        return selected_columns.values

# Numeric features: select columns, fill missing values with the column mean, 
# then standardize
num_pipeline = Pipeline(
    steps = [
        ("selector", DataFrameSelector(num_feat_names)),
        ("mean_imputer", SimpleImputer(strategy = "mean")),
        ("std_scaler", StandardScaler())
    ]
)

# Categoric features: select columns and one-hot encode them as a dense array
# NOTE(review): newer scikit-learn releases rename `sparse` to 
# `sparse_output` - confirm against the installed version before upgrading
cat_pipeline = Pipeline(
    steps = [
        ("selector", DataFrameSelector(cat_feat_names)),
        ("one_hot_encoder", OneHotEncoder(sparse = False))
    ]
)

# Target: passed through untouched
target_pipeline = Pipeline(
    steps = [
        ("selector", DataFrameSelector(target_names))
    ]
)

# Concatenate numeric features, encoded categoric features and the target 
# column-wise, in that order
# NOTE(review): the target ("cases") ends up inside the same matrix as the 
# features; it must be split off again before fitting a model to avoid 
# leaking the label - confirm downstream code does this
ml_pipeline = FeatureUnion(
    transformer_list = [
        ("numeric_pipeline", num_pipeline),
        ("categoric_pipeline", cat_pipeline),
        ("target_pipeline", target_pipeline)
    ]
)

c_m = ml_pipeline.fit_transform(cases_mobility)
print(c_m.shape)

# Sum null values
# (previously printed `sum() == 1`, a boolean, which hid the actual count)
print(pd.isnull(c_m).sum())

dateRep                      0
day                          0
month                        0
year                         0
cases                        0
deaths                       0
countriesAndTerritories      0
geoId                       77
countryterritoryCode       280
popData2018                286
continentExp                 0
dtype: int64 


country_region_code                                      788
country_region                                             0
sub_region_1                                           13332
sub_region_2                                          192954
date                                                       0
retail_and_recreation_percent_change_from_baseline    100140
grocery_and_pharmacy_percent_change_from_baseline     112744
parks_percent_change_from_baseline                    245784
transit_stations_percent_change_from_baseline         214521
workplaces_percent_change_from_baseline                12830
residential_percent_change_from_b