# Data Cleaning
***

## Data Acquisition

### Pandemic Data

In [1]:
import pandas as pd
import numpy as np
from numba import njit, jit
from typing import TypeVar
import multiprocessing
from joblib import Parallel, delayed
import time

# CPU count reported by multiprocessing; convert_time_series() uses it to size
# the joblib worker pool.
num_cores = multiprocessing.cpu_count()
# Alias presumably meant for annotating DataFrame-typed values.
# NOTE(review): a TypeVar is conventionally named after the variable it is
# assigned to; static type checkers may reject this dotted name — confirm intent.
PandasDataFrame = TypeVar('pandas.core.frame.DataFrame')
# Short alias for the floating-point missing-value marker.
NaN = np.nan
# Country_Region values kept when the US and global tables are combined.
# NOTE(review): confirm each label matches the spelling used in the source CSVs
# (e.g. "UK" vs. "United Kingdom"); non-matching labels are silently dropped
# by the isin() filter.
highlighted_countries = ["US", "Australia", "Canada", "China", "Netherlands", "UK", "France", "Denmark"]

#### US Cases

In [2]:
# Load the US confirmed-case time series and tidy it in a single chain:
# skip the leading rows, drop identifier columns that are not needed, and
# align column names with the global table.
cases_US = (
    pd.read_csv("../data/pandemic/time_series_covid19_confirmed_US.csv")
    .iloc[5:]  # exclude US territories
    .drop(columns=["FIPS", "Combined_Key", "code3", "iso2", "iso3", "UID"])
    .rename(columns={"Admin2": "County", "Long_": "Long"})
)

cases_US.head(10)

Unnamed: 0,County,Province_State,Country_Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,...,6/19/20,6/20/20,6/21/20,6/22/20,6/23/20,6/24/20,6/25/20,6/26/20,6/27/20,6/28/20
5,Autauga,Alabama,US,32.539527,-86.644082,0,0,0,0,0,...,405,425,428,436,447,463,473,482,492,497
6,Baldwin,Alabama,US,30.72775,-87.722071,0,0,0,0,0,...,398,405,415,422,435,449,462,500,539,559
7,Barbour,Alabama,US,31.868263,-85.387129,0,0,0,0,0,...,265,271,271,276,279,287,303,309,314,314
8,Bibb,Alabama,US,32.996421,-87.125115,0,0,0,0,0,...,123,123,124,126,132,138,146,150,158,159
9,Blount,Alabama,US,33.982109,-86.567906,0,0,0,0,0,...,136,140,146,150,156,165,173,181,185,186
10,Bullock,Alabama,US,32.100305,-85.712655,0,0,0,0,0,...,318,324,324,325,325,332,347,347,354,353
11,Butler,Alabama,US,31.753001,-86.680575,0,0,0,0,0,...,567,570,574,576,579,582,586,592,597,599
12,Calhoun,Alabama,US,33.774837,-85.826304,0,0,0,0,0,...,202,203,205,207,208,212,225,228,237,237
13,Chambers,Alabama,US,32.913601,-85.390727,0,0,0,0,0,...,493,502,507,514,520,529,535,545,547,547
14,Cherokee,Alabama,US,34.17806,-85.60639,0,0,0,0,0,...,56,56,56,56,56,56,62,65,66,67


#### Global Cases

In [3]:
# Load the global confirmed-case time series, rename the slash-separated
# columns to the underscore names used by the US table, and append an empty
# County column so both tables share the same identifier columns.
cases_global = (
    pd.read_csv("../data/pandemic/time_series_covid19_confirmed_global.csv")
    .rename(columns={"Province/State": "Province_State",
                     "Country/Region": "Country_Region"})
    .assign(County=NaN)
)
cases_global.head(10)

Unnamed: 0,Province_State,Country_Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,6/20/20,6/21/20,6/22/20,6/23/20,6/24/20,6/25/20,6/26/20,6/27/20,6/28/20,County
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,28424,28833,29157,29481,29640,30175,30451,30616,30967,
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,1891,1962,1995,2047,2114,2192,2269,2330,2402,
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,11631,11771,11920,12076,12248,12445,12685,12968,13273,
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,855,855,855,855,855,855,855,855,855,
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,176,183,186,189,197,212,212,259,267,
5,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,26,26,26,26,26,65,65,65,69,
6,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,41204,42785,44931,47203,49851,52457,55343,57744,59933,
7,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,19708,20268,20588,21006,21717,22488,23247,23909,24645,
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,108,108,108,108,108,108,108,108,108,
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,3149,3151,3150,3159,3162,3168,3174,3177,3184,


#### Combine Data and Change Time Series Dimension

**Here I wanted to treat the time series data as a single feature. I explored several ways to approach this but, for lack of a user-friendly solution, I expanded each row iteratively. I was able to reduce the runtime through multiprocessing.**

In [4]:
# Stack the US and global tables (keeping the US column order) and keep only
# the rows whose Country_Region is one of the highlighted countries.
cases_total_temp = (
    pd.concat([cases_US, cases_global], sort=False)
    .loc[lambda combined: combined["Country_Region"].isin(highlighted_countries)]
)

In [5]:
def get_rows(row):
    """Expand one wide time-series row into one long-format row per date.

    Parameters
    ----------
    row : pandas.Series
        A single row of the wide table. The first five entries are treated as
        identifier fields (County, Province_State, Country_Region, Lat, Long
        in this notebook); every remaining entry is a ``date label -> count``
        pair.

    Returns
    -------
    pandas.DataFrame
        One row per date, with the five identifier columns followed by
        ``Date`` and ``Total_Cases``. Every output row carries the input
        row's label (``row.name``) as its index. Column dtypes are inferred
        by pandas from the values.
    """
    # Explicit positional slicing: the split is by position, not by label.
    id_fields = row.iloc[:5]
    daily_counts = row.iloc[5:]
    n_dates = len(daily_counts)

    # Build every column up front and construct the frame once, instead of
    # concatenating a one-row frame per date (which is quadratic in the
    # number of dates).
    data = {label: [value] * n_dates for label, value in id_fields.items()}
    data["Date"] = list(daily_counts.index)
    data["Total_Cases"] = list(daily_counts.values)

    # An unnamed Series becomes row label 0 when converted with
    # to_frame().transpose(); keep that labelling.
    row_label = 0 if row.name is None else row.name
    return pd.DataFrame(data, index=[row_label] * n_dates)

def convert_time_series():
    """Convert the wide ``cases_total_temp`` table to long format in parallel.

    Each row of the module-level ``cases_total_temp`` frame is expanded by
    ``get_rows`` in a joblib worker, and the per-row results are concatenated
    in the original row order.

    Returns
    -------
    pandas.DataFrame
        The concatenated long-format table: the first five columns of
        ``cases_total_temp`` followed by ``Date`` and ``Total_Cases``. If
        ``cases_total_temp`` has no rows, an empty frame with those columns
        is returned.
    """
    # Leave one core free, but never request 0 workers: joblib rejects
    # n_jobs=0, which num_cores - 1 would produce on a single-core machine.
    n_workers = max(1, num_cores - 1)

    long_frames = Parallel(n_jobs=n_workers)(
        delayed(get_rows)(wide_row) for _, wide_row in cases_total_temp.iterrows()
    )

    if not long_frames:
        # pd.concat raises on an empty list, so build the empty result directly.
        cols = cases_total_temp.columns[:5].append(pd.Index(["Date", "Total_Cases"]))
        return pd.DataFrame(columns=cols)
    return pd.concat(long_frames)

# Run the wide-to-long conversion and measure how long it takes.
start_time = time.time()
cases_total = convert_time_series()
# NOTE: despite its name, end_time holds the elapsed seconds, not a timestamp.
end_time = time.time() - start_time
# print("--- %s seconds ---" % (end_time))

In [6]:
# Display the long-format table returned by convert_time_series().
cases_total

Unnamed: 0,County,Province_State,Country_Region,Lat,Long,Date,Total_Cases
5,Autauga,Alabama,US,32.5395,-86.6441,1/22/20,0
5,Autauga,Alabama,US,32.5395,-86.6441,1/23/20,0
5,Autauga,Alabama,US,32.5395,-86.6441,1/24/20,0
5,Autauga,Alabama,US,32.5395,-86.6441,1/25/20,0
5,Autauga,Alabama,US,32.5395,-86.6441,1/26/20,0
5,Autauga,Alabama,US,32.5395,-86.6441,1/27/20,0
5,Autauga,Alabama,US,32.5395,-86.6441,1/28/20,0
5,Autauga,Alabama,US,32.5395,-86.6441,1/29/20,0
5,Autauga,Alabama,US,32.5395,-86.6441,1/30/20,0
5,Autauga,Alabama,US,32.5395,-86.6441,1/31/20,0


In [7]:
# Persist the long-format table next to the raw pandemic data.
# NOTE(review): to_csv writes the index by default, and here the index labels
# repeat once per date — confirm downstream readers expect that first column.
cases_total.to_csv("../data/pandemic/covid_19_time_series_all.csv")