In [1]:
import pandas as pd
from statistics import mean 

In [2]:
individual = pd.read_csv('./data/processed_individual_cases_Sep20th2020.csv', parse_dates=True)
loc = pd.read_csv('./data/processed_location_Sep20th2020.csv', parse_dates=True)

In [3]:
list(individual.columns)

['age',
 'sex',
 'province',
 'country',
 'latitude',
 'longitude',
 'date_confirmation',
 'additional_information',
 'source',
 'outcome']

In [4]:
list(loc.columns)

['Province_State',
 'Country_Region',
 'Last_Update',
 'Lat',
 'Long_',
 'Confirmed',
 'Deaths',
 'Recovered',
 'Active',
 'Combined_Key',
 'Incidence_Rate',
 'Case-Fatality_Ratio']

In [7]:
for col in individual.columns:
    print(f'Column {col} has {individual[col].isna().sum()} values')

Column age has 296874 values
Column sex has 293734 values
Column province has 6568 values
Column country has 24 values
Column latitude has 2 values
Column longitude has 2 values
Column date_confirmation has 462 values
Column additional_information has 522969 values
Column source has 209191 values
Column outcome has 0 values


In [8]:
for col in loc.columns:
    print(f'Column {col} has {loc[col].isna().sum()} values')

Column Province_State has 168 values
Column Country_Region has 0 values
Column Last_Update has 0 values
Column Lat has 80 values
Column Long_ has 80 values
Column Confirmed has 0 values
Column Deaths has 0 values
Column Recovered has 0 values
Column Active has 2 values
Column Combined_Key has 0 values
Column Incidence_Rate has 80 values
Column Case-Fatality_Ratio has 48 values


In [9]:
def try_convert(x):
    try:
        pd.to_numeric(x)
    except:
        if '-' in x:
            try:
                return mean([int(i) for i in x.split('-')])
            except:
                return int(x.split('-')[0])
        elif '+' in x:
            return int(x.split('+')[0])
        else:
            return int(x.split()[0])/12.

In [10]:
individual['age'] = individual['age'].apply(try_convert,)

In [11]:
individual[individual['age'].notna()]

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,additional_information,source,outcome
8,47.0,female,Mecklenburg-Vorpommern,Germany,53.792330,13.801800,19.03.2020,,,nonhospitalized
10,47.0,male,Bayern,Germany,49.246810,11.090850,04.04.2020,,,nonhospitalized
16,24.5,male,New Brunswick,Canada,45.949033,-66.689325,15.03.2020,Close Contact of New Brunswick Case 1,https://www2.gnb.ca/content/gnb/en/news/news_r...,nonhospitalized
20,34.5,female,Vermont,United States,44.461123,-73.081581,16.03.2020,Self-isolating at home,https://www.healthvermont.gov/media/newsroom/v...,nonhospitalized
41,47.0,male,Bayern,Germany,48.370340,10.897880,30.04.2020,,,nonhospitalized
...,...,...,...,...,...,...,...,...,...,...
557318,47.0,female,Bayern,Germany,48.049410,10.876780,15.04.2020,,,nonhospitalized
557330,69.5,female,Bayern,Germany,49.640009,10.914864,26.03.2020,,,nonhospitalized
557339,80.0,female,Nordrhein-Westfalen,Germany,52.150090,7.338950,28.04.2020,,,nonhospitalized
557343,47.0,female,Sachsen,Germany,51.179640,14.424260,02.04.2020,,,nonhospitalized


In [13]:
tmp = individual[individual['date_confirmation'].notna()]
individual['date_confirmation'] = pd.to_datetime(tmp[~tmp['date_confirmation'].str.contains('-')]['date_confirmation'])

In [16]:
new_tmp = tmp[tmp['date_confirmation'].str.contains('-')]['date_confirmation'].iloc[:5].apply(func)

In [15]:
def func(x):
    new_tmp = pd.DataFrame(x.split('-'))
    return pd.DataFrame(new_tmp.apply(pd.to_datetime, result_type='expand'))


In [17]:
loc[loc['Country_Region'] == 'US'].groupby(['Province_State', 'Last_Update']).agg({'Confirmed':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed
Province_State,Last_Update,Unnamed: 2_level_1
Alabama,2020-09-20 04:22:56,144164
Alaska,2020-09-20 04:22:56,6729
Arizona,2020-09-20 04:22:56,212942
Arkansas,2020-09-20 04:22:56,75160
California,2020-09-20 04:22:56,783313
Colorado,2020-09-20 04:22:56,64336
Connecticut,2020-09-20 04:22:56,55527
Delaware,2020-09-20 04:22:56,19449
Diamond Princess,2020-08-04 02:27:56,49
District of Columbia,2020-09-20 04:22:56,14902


In [18]:
individual[individual['country'] == 'United States']

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,additional_information,source,outcome
20,34.5,female,Vermont,United States,44.461123,-73.081581,2020-03-16,Self-isolating at home,https://www.healthvermont.gov/media/newsroom/v...,nonhospitalized
89,,female,Florida,United States,29.057307,-81.184136,2020-07-03,in isolation; recent travel history,http://www.floridahealth.gov/diseases-and-cond...,nonhospitalized
151,,,Mississippi,United States,30.768985,-89.589712,2020-03-19,,"https://msdh.ms.gov/msdhsite/_static/14,0,420....",nonhospitalized
260,84.5,male,Washington,United States,47.491332,-121.803640,2020-06-03,Resident of Ida Culver House,https://www.kiro7.com/news/local/coronavirus-w...,deceased
294,,male,Florida,United States,27.928952,-82.721790,2020-03-26,,https://floridahealthcovid19.gov/,nonhospitalized
...,...,...,...,...,...,...,...,...,...,...
557190,0.5,,Florida,United States,26.152188,-80.487715,2020-03-24,,https://floridahealthcovid19.gov/,nonhospitalized
557195,,,Virginia,United States,37.325836,-76.782770,2020-12-03,,http://www.vdh.virginia.gov/surveillance-and-i...,nonhospitalized
557235,,female,Florida,United States,26.152188,-80.487715,2020-03-18,,http://floridadisaster.org/globalassets/covid-19,nonhospitalized
557314,,male,Florida,United States,26.152188,-80.487715,2020-03-15,,,nonhospitalized
