In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing Tasks

### Loading all datasets

In [31]:
cases_test = pd.read_csv('./datasets/cases_2021_test.csv')
cases_location = pd.read_csv('./datasets/location_2021.csv')

## 1.1 Cleaning messy outcome labels
Datasets involved: cases_2021_train.csv

In [32]:
cases_train = pd.read_csv('./datasets/cases_2021_train.csv')

In [33]:
cases_train.groupby('outcome').size()

outcome
Alive                               127
Dead                                  9
Death                                 4
Deceased                           3361
Died                                  4
Discharged                            2
Discharged from hospital              3
Hospitalized                     135524
Receiving Treatment                 183
Recovered                         62875
Stable                               24
Under treatment                     243
critical condition                    1
death                                29
died                                624
discharge                           114
discharged                           82
recovered                          2435
recovering at home 03.03.2020         2
released from quarantine              3
stable                              107
stable condition                     90
dtype: int64

In [34]:
# labels_test = {
#     'hospitalized': {'Discharged', 'Discharged from hospital', 'Hospitalized', 'critical condition','discharge', 'discharged'},
#     'nonhospitalized': {'Alive', 'Receiving Treatment', 'Stable', 'Under treatment', 'recovering at home 03.03.2020', 'released from quarantine', 'stable', 'stable condition'},
#     'deceased': {'Dead', 'Death', 'Deceased', 'Died', 'death', 'died'},
#     'recovered': {'Recovered', 'recovered'}
# }

### Mapping similar outcomes

In [35]:
labels = {
    'Discharged': 'hospitalized', 'Discharged from hospital': 'hospitalized', 'Hospitalized': 'hospitalized', 'critical condition': 'hospitalized', 'discharge': 'hospitalized', 'discharged': 'hospitalized', 
    'Alive': 'nonhospitalized', 'Receiving Treatment': 'nonhospitalized', 'Stable': 'nonhospitalized', 'Under treatment': 'nonhospitalized', 'recovering at home 03.03.2020': 'nonhospitalized', 'released from quarantine': 'nonhospitalized', 'stable': 'nonhospitalized', 'stable condition': 'nonhospitalized', 
    'Dead': 'deceased', 'Death': 'deceased', 'Deceased': 'deceased', 'Died': 'deceased', 'death': 'deceased', 'died': 'deceased',
     'Recovered': 'recovered', 'recovered': 'recovered'
}

In [36]:
cases_train['outcome_group'] = cases_train['outcome'].map(labels)

In [37]:
cases_train

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,additional_information,source,chronic_disease_binary,outcome,outcome_group
0,,,Maharashtra,India,19.20000,72.96667,23.05.2020,,,False,Hospitalized,hospitalized
1,,,Maharashtra,India,18.94017,72.83483,18.05.2020,,https://t.me/indiacovid/5075,False,Recovered,recovered
2,,,Maharashtra,India,19.20000,72.96667,28.05.2020,,https://phdmah.maps.arcgis.com/apps/opsdashboa...,False,Hospitalized,hospitalized
3,,,West Bengal,India,22.80862,88.79242,09.05.2020,,https://www.wbhealth.gov.in/uploaded_files/cor...,False,Hospitalized,hospitalized
4,,,Maharashtra,India,18.94017,72.83483,17.05.2020,,,False,Hospitalized,hospitalized
...,...,...,...,...,...,...,...,...,...,...,...,...
205841,,,Maharashtra,India,16.70446,74.24137,21.05.2020,,,False,Hospitalized,hospitalized
205842,,,Telangana,India,17.39487,78.47076,26.04.2020,,https://twitter.com/Eatala_Rajender/status/125...,False,Hospitalized,hospitalized
205843,,,Maharashtra,India,19.20000,72.96667,22.05.2020,,,False,Recovered,recovered
205844,,,Maharashtra,India,18.94017,72.83483,24.05.2020,,https://t.me/Allindiacovid/3814,False,Hospitalized,hospitalized


In [38]:
cases_train = cases_train.drop(columns=['outcome'])

In [39]:
cases_train

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,additional_information,source,chronic_disease_binary,outcome_group
0,,,Maharashtra,India,19.20000,72.96667,23.05.2020,,,False,hospitalized
1,,,Maharashtra,India,18.94017,72.83483,18.05.2020,,https://t.me/indiacovid/5075,False,recovered
2,,,Maharashtra,India,19.20000,72.96667,28.05.2020,,https://phdmah.maps.arcgis.com/apps/opsdashboa...,False,hospitalized
3,,,West Bengal,India,22.80862,88.79242,09.05.2020,,https://www.wbhealth.gov.in/uploaded_files/cor...,False,hospitalized
4,,,Maharashtra,India,18.94017,72.83483,17.05.2020,,,False,hospitalized
...,...,...,...,...,...,...,...,...,...,...,...
205841,,,Maharashtra,India,16.70446,74.24137,21.05.2020,,,False,hospitalized
205842,,,Telangana,India,17.39487,78.47076,26.04.2020,,https://twitter.com/Eatala_Rajender/status/125...,False,hospitalized
205843,,,Maharashtra,India,19.20000,72.96667,22.05.2020,,,False,recovered
205844,,,Maharashtra,India,18.94017,72.83483,24.05.2020,,https://t.me/Allindiacovid/3814,False,hospitalized


In [40]:
cases_train.groupby('outcome_group').size()

outcome_group
deceased             4031
hospitalized       135726
nonhospitalized       779
recovered           65310
dtype: int64

## 1.3 Exploratory Data Analysis 

Ideas to explore

Probably want to convert date_information to a datetime object 

Categorical Attributes: Province, Country, Chronic disease, outcome_group, Sex
Numerical: Age, Date
By: country, Age, Sex
Age range: 18 - 30, < 18, 60 >


## 1.4 Data Cleaning and Imputing Missing Values
Datasets involved: cases_2021_train.csv, cases_2021_test.csv, location_2021.csv

In [41]:
# Primary: age column -> remove all entries that are NaN
# format age to standard integer
# potential approaches: 

# other columns -> impute, what strategies can we use to impute missing values of different columns
# additional information: best action is replace NaN with ""

In [42]:
# Removing NaN's from age column in train and test datasets

cases_train = cases_train[cases_train['age'].notna()].reset_index()
cases_test = cases_test[cases_test['age'].notna()].reset_index()


In [108]:
# Strip all whitespace from 'age' columns
cases_train['age'] = cases_train['age'].str.strip()
cases_test['age'] = cases_test['age'].str.strip()

In [115]:
# age range greater than 10 should be removed?

# FIX: Remove empty spaces from 'age' column
# FOUND EDGE CASE: ['80', ''] -> 80 - 

def range_to_num(age):
    range = 10

    a_list = age.split('-')
    # print(a_list)

    if (a_list[0] == ''):
        return round(float(a_list[1]))
    elif ((len(a_list)) == 2 and (a_list[1] == '')):
        return round(float(a_list[0]))

    map_ints = map(float, a_list)
    map_list = list(map_ints)

    if (len(map_list) == 1):
        return round(map_list[0])  

    if ((map_list[1] - map_list[0]) > range):
        return 'remove'
    else:
        return round((map_list[1] + map_list[0]) / 2)


In [113]:
range_to_num('8-')

['8', '']


8

In [None]:
cases_train = cases_train.drop(columns=['index'])
cases_test = cases_test.drop(columns=['index'])

In [47]:
# STEPS TO CLEAN VALUES IN AGE COLUMN
#1. Format all values with 'x-x' range to a single int value or 'remove'
#2. Remove all entries labelled 'remove' 
#3. Convert all column entries to int
#4. Use round() on all values to get rid of 0.3, 0.5, 0.8 values -> some babies

In [116]:
# Cleaning train dataset - age

cases_train['age'] = cases_train['age'].apply(lambda x: range_to_num(x))
cases_train = cases_train[cases_train['age'] != 'remove']
cases_train['age'] = pd.to_numeric(cases_train['age'])
cases_train['age'] = cases_train['age'].apply(lambda x: round(x))

['40']
['37']
['40']
['27']
['54']
['39']
['58']
['37']
['25']
['36']
['45']
['70']
['32']
['34']
['27']
['36']
['30']
['22']
['23']
['1']
['14']
['26']
['45']
['45']
['11']
['28']
['40']
['54']
['47']
['44']
['28']
['22']
['30']
['20']
['46']
['46']
['25']
['42']
['26']
['58']
['24']
['45']
['30']
['32']
['41']
['53']
['24']
['4']
['22', '80']
['34']
['43']
['32']
['50']
['31']
['28']
['75']
['21']
['36']
['28']
['65']
['27']
['40']
['54']
['67']
['63']
['43']
['53']
['46']
['39']
['48']
['22']
['43']
['37']
['16']
['19']
['40']
['29']
['50']
['45']
['53']
['19']
['34']
['9']
['41']
['48']
['43']
['52']
['1']
['55']
['39']
['44']
['32']
['29']
['38']
['46']
['53']
['42']
['3.5']
['43']
['15']
['29']
['18']
['26']
['18']
['26']
['29']
['18']
['11']
['1']
['37']
['30']
['21', '72']
['44']
['30']
['46']
['20']
['29']
['22']
['25']
['33']
['35']
['43']
['42']
['17']
['38']
['1']
['24']
['20']
['22']
['32']
['60']
['29']
['24']
['23']
['40']
['49']
['33']
['47']
['70']
['18']
['25']
['38']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Cleaning test dataset - age

cases_test['age'] = cases_test['age'].apply(lambda x: range_to_num(x))
cases_test = cases_test[cases_test['age'] != 'remove']
cases_test['age'] = pd.to_numeric(cases_test['age'])
cases_test['age'] = cases_test['age'].apply(lambda x: round(x))

In [None]:
# sex	province	country	latitude	longitude	date_confirmation	additional_information	source	chronic_disease_binary	outcome_group
# sex: NaN convert to 'unknown'
# province: can use latitude and longitude to get province -> need to find appropriate dataset / function to determine geographical location
# country: has no NaN values -> is okay
# date_confirmation: some NaN values -> think its okay to keep these entries -> convert NaN's to 'unknown'
# additional_information: some NaN values -> convert to 'unknown'
# source: some NaN values -> convert to 'unknown'
# chronic_disease_binary: no NaN values
# outcome_group: no NaN values



In [None]:
cases_train[cases_train['outcome_group'].isna()]

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,additional_information,source,chronic_disease_binary,outcome_group


### 1.5 Dealing With Outliers

#### Ideas
- For the attributes in our dataset, not many could be determined outliers
- Age could be an outlier, if the majority of deceased are "older" and there is a few random "young" cases -> wouldn't want to remove these though
- Date_confirmation could be an outlier -> this could occur due to misinput and if the dates are way before Covid-19 was detected throughout the world, can remove the entry
- Source could be an attribute to use and remove entries -> if the entry has no source how can we "trust" that it is truthful/accurate

In [None]:
cases_location

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
0,,Afghanistan,2021-04-01 04:27:05,33.93911,67.709953,56454,2484,51550.0,2420.0,Afghanistan,145.020308,4.400043
1,,Albania,2021-04-01 04:27:05,41.15330,20.168300,125157,2235,91271.0,31651.0,Albania,4349.051359,1.785757
2,,Algeria,2021-04-01 04:27:05,28.03390,1.659600,117192,3093,81538.0,32561.0,Algeria,267.250200,2.639259
3,,Andorra,2021-04-01 04:27:05,42.50630,1.521800,12010,115,11315.0,580.0,Andorra,15543.907332,0.957535
4,,Angola,2021-04-01 04:27:05,-11.20270,17.873900,22311,537,20493.0,1281.0,Angola,67.884191,2.406884
...,...,...,...,...,...,...,...,...,...,...,...,...
3999,W.P. Kuala Lumpur,Malaysia,2021-04-01 04:27:05,3.13900,101.686900,37819,118,36631.0,1070.0,"W.P. Kuala Lumpur, Malaysia",2126.574449,0.312012
4000,W.P. Labuan,Malaysia,2021-04-01 04:27:05,5.28310,115.230800,2327,12,2282.0,33.0,"W.P. Labuan, Malaysia",2341.046278,0.515685
4001,W.P. Putrajaya,Malaysia,2021-04-01 04:27:05,2.92640,101.696400,1142,8,1102.0,32.0,"W.P. Putrajaya, Malaysia",1083.491461,0.700525
4002,Unknown,Malaysia,2021-04-01 04:27:05,,,0,0,0.0,0.0,"Unknown, Malaysia",,


In [None]:
cases_train[cases_train['country'].isna()]

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,additional_information,source,chronic_disease_binary,outcome_group
