In [1]:
import pandas as pd
import numpy as np

In [2]:
#set up path to all the files
path = '../Data/Raw/covid_19_data 20200307.csv'
df = pd.read_csv(path)

In [3]:
df.head()

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3992 entries, 0 to 3991
Data columns (total 8 columns):
SNo                3992 non-null int64
ObservationDate    3992 non-null object
Province/State     2594 non-null object
Country/Region     3992 non-null object
Last Update        3992 non-null object
Confirmed          3992 non-null float64
Deaths             3992 non-null float64
Recovered          3992 non-null float64
dtypes: float64(3), int64(1), object(4)
memory usage: 249.6+ KB


In [5]:
#change last update to datetime
df['ObservationDate'] = pd.to_datetime(df['ObservationDate'])
df['Last Update'] = pd.to_datetime(df['Last Update'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3992 entries, 0 to 3991
Data columns (total 8 columns):
SNo                3992 non-null int64
ObservationDate    3992 non-null datetime64[ns]
Province/State     2594 non-null object
Country/Region     3992 non-null object
Last Update        3992 non-null datetime64[ns]
Confirmed          3992 non-null float64
Deaths             3992 non-null float64
Recovered          3992 non-null float64
dtypes: datetime64[ns](2), float64(3), int64(1), object(2)
memory usage: 249.6+ KB


In [6]:
#remove rows where confirmed, suspected, recovered and death is all 0

remove_list = []

for ind, row in df.iterrows():
    if (row.Confirmed == 0 and row.Recovered == 0 and row.Deaths == 0):
        remove_list.append(ind)

if not remove_list:
    boolean = True
else:
    boolean = False
print('Remove list is empty {}'.format(boolean))

Remove list is empty False


In [7]:
#remove all items that have 0 confirmed, deaths and recovered
df = df.drop(remove_list)
df = df.reset_index()
df = df.drop(columns = ['index', 'SNo'])

In [8]:
#first we should sort the values by ObservationDate
df = df.sort_values('ObservationDate').reset_index()
df = df.drop(columns = ['index'])

In [9]:
#Verify if data is consistent with data for all days
verification_array = df[df['ObservationDate'] == '01/22/2020']['Province/State'].unique()

print(verification_array)

len_first = len(df[df['Province/State'] == verification_array[0]])

check_list = []

#check if the length of days is the same for all to verify the data is kept updated with latest information
for state in verification_array[1:]:
    #nan is ignored and given an arbitary boolean
    if type(state) == str:
        len_state = len(df[df['Province/State'] == state])
        boolean = len_first == len_state
    check_list.append(boolean)
all(check_list)

['Anhui' nan 'Zhejiang' 'Yunnan' 'Tianjin' 'Taiwan' 'Sichuan' 'Shanxi'
 'Shanghai' 'Shandong' 'Ningxia' 'Macau' 'Washington' 'Jiangxi' 'Liaoning'
 'Beijing' 'Chongqing' 'Fujian' 'Guangxi' 'Guizhou' 'Guangdong' 'Hebei'
 'Henan' 'Hubei' 'Hunan' 'Jiangsu' 'Hainan']


False

In [10]:
#check what is wrong with array
for ind, boolean in enumerate(check_list):
    if not boolean:
        print(verification_array[ind+1])

nan
Washington


In [11]:
#take a look at Washington
#shows there is inconsistent update to data and could result in incorrect sums
df[df['Province/State'] == 'Washington']

Unnamed: 0,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
14,2020-01-22,Washington,US,2020-01-22 17:00:00,1.0,0.0,0.0
45,2020-01-23,Washington,US,2020-01-23 17:00:00,1.0,0.0,0.0
76,2020-01-24,Washington,US,2020-01-24 17:00:00,1.0,0.0,0.0
114,2020-01-25,Washington,US,2020-01-25 17:00:00,1.0,0.0,0.0
158,2020-01-26,Washington,US,2020-01-26 16:00:00,1.0,0.0,0.0
197,2020-01-27,Washington,US,2020-01-27 23:59:00,1.0,0.0,0.0
252,2020-01-28,Washington,US,2020-01-28 23:00:00,1.0,0.0,0.0
305,2020-01-29,Washington,US,2020-01-29 19:30:00,1.0,0.0,0.0
361,2020-01-30,Washington,US,2020-01-30 16:00:00,1.0,0.0,0.0
431,2020-01-31,Washington,US,2020-01-31 23:59:00,1.0,0.0,0.0


In [12]:
#function to fix the inconsistency by forward filling values of missing location during each time frame


#create a dictionary to update the new dataframe
update = {}

#inital state of dataframe
new_df = {'Province/State': [], 'Country/Region': [], 'ObservationDate': [], 
          'Last Update': [], 'Confirmed': [], 'Recovered':[], 'Deaths': []}

#the current time
curr = df['ObservationDate'][0]

#loop through the dataframe and if the ObservationDate time does not equal to current time we update the new dataframe 
for index, row in df.iterrows():
    if curr != row['ObservationDate']:
        
        #loop through the dictionary and update the all ObservationDate with curr
        for key1, value1 in update.items():
            update[key1]['ObservationDate'] = curr

            #loopthough the dictionary inside update dictionary and append the values onto the new dataframe
            for key2, value2 in value1.items():
                new_df[key2].append(value2)
            
        #update the current time to the new update        
        curr = row['ObservationDate']
        
    #add the values to updated dataframe
    key = (row['Province/State'], row['Country/Region'])
    update[key] = row

#the last part is missing from data is should be updated
for key1, value1 in update.items():
            update[key1]['ObservationDate'] = curr
            for key2, value2 in value1.items():
                new_df[key2].append(value2)

In [13]:
df = pd.DataFrame(new_df)
df.head()

Unnamed: 0,Province/State,Country/Region,ObservationDate,Last Update,Confirmed,Recovered,Deaths
0,Anhui,Mainland China,2020-01-22,2020-01-22 17:00:00,1.0,0.0,0.0
1,,South Korea,2020-01-22,2020-01-22 17:00:00,1.0,0.0,0.0
2,,Thailand,2020-01-22,2020-01-22 17:00:00,2.0,0.0,0.0
3,,Japan,2020-01-22,2020-01-22 17:00:00,2.0,0.0,0.0
4,Zhejiang,Mainland China,2020-01-22,2020-01-22 17:00:00,10.0,0.0,0.0


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4466 entries, 0 to 4465
Data columns (total 7 columns):
Province/State     2977 non-null object
Country/Region     4466 non-null object
ObservationDate    4466 non-null datetime64[ns]
Last Update        4466 non-null datetime64[ns]
Confirmed          4466 non-null float64
Recovered          4466 non-null float64
Deaths             4466 non-null float64
dtypes: datetime64[ns](2), float64(3), object(2)
memory usage: 244.4+ KB


In [15]:
#Feature Engineer change column
df['Confirmed_Change'] = np.nan
df['Recovered_Change'] = np.nan
df['Deaths_Change'] = np.nan

tracking_dictionary = {}

for ind, row in df.iterrows():
    key = (row['Country/Region'], row['Province/State'])
    if key not in tracking_dictionary:
        df['Confirmed_Change'][ind] = row['Confirmed']
        df['Recovered_Change'][ind] = row['Recovered']
        df['Deaths_Change'][ind] = row['Deaths']
    else:
        df['Confirmed_Change'][ind] = row['Confirmed'] - tracking_dictionary[key][0]
        df['Recovered_Change'][ind] = row['Recovered'] - tracking_dictionary[key][1]
        df['Deaths_Change'][ind] = row['Deaths'] - tracking_dictionary[key][2]
    tracking_dictionary[key] = (row['Confirmed'], row['Recovered'], row['Deaths'])
df.info()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy o

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4466 entries, 0 to 4465
Data columns (total 10 columns):
Province/State      2977 non-null object
Country/Region      4466 non-null object
ObservationDate     4466 non-null datetime64[ns]
Last Update         4466 non-null datetime64[ns]
Confirmed           4466 non-null float64
Recovered           4466 non-null float64
Deaths              4466 non-null float64
Confirmed_Change    4466 non-null float64
Recovered_Change    4466 non-null float64
Deaths_Change       4466 non-null float64
dtypes: datetime64[ns](2), float64(6), object(2)
memory usage: 349.0+ KB


In [16]:
df.to_csv('../Data/Processed/Processed.csv')