In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing Tasks

### Loading all datasets

In [13]:
cases_test = pd.read_csv('./datasets/cases_2021_test.csv')
cases_location = pd.read_csv('./datasets/location_2021.csv')

## 1.1 Cleaning messy outcome labels
Datasets involved: cases_2021_train.csv

In [2]:
cases_train = pd.read_csv('./datasets/cases_2021_train.csv')

In [3]:
cases_train.groupby('outcome').size()

outcome
Alive                     25
Dead                       1
Death                      1
Deceased                 581
Died                       1
Hospitalized           24519
Receiving Treatment       24
Recovered              11364
Stable                     1
Under treatment           45
death                      6
died                     111
discharge                 28
discharged                16
recovered                425
stable                    21
stable condition          22
dtype: int64

In [4]:
# labels_test = {
#     'hospitalized': {'Discharged', 'Discharged from hospital', 'Hospitalized', 'critical condition','discharge', 'discharged'},
#     'nonhospitalized': {'Alive', 'Receiving Treatment', 'Stable', 'Under treatment', 'recovering at home 03.03.2020', 'released from quarantine', 'stable', 'stable condition'},
#     'deceased': {'Dead', 'Death', 'Deceased', 'Died', 'death', 'died'},
#     'recovered': {'Recovered', 'recovered'}
# }

### Mapping similar outcomes

In [5]:
labels = {
    'Discharged': 'hospitalized', 'Discharged from hospital': 'hospitalized', 'Hospitalized': 'hospitalized', 'critical condition': 'hospitalized', 'discharge': 'hospitalized', 'discharged': 'hospitalized', 
    'Alive': 'nonhospitalized', 'Receiving Treatment': 'nonhospitalized', 'Stable': 'nonhospitalized', 'Under treatment': 'nonhospitalized', 'recovering at home 03.03.2020': 'nonhospitalized', 'released from quarantine': 'nonhospitalized', 'stable': 'nonhospitalized', 'stable condition': 'nonhospitalized', 
    'Dead': 'deceased', 'Death': 'deceased', 'Deceased': 'deceased', 'Died': 'deceased', 'death': 'deceased', 'died': 'deceased',
     'Recovered': 'recovered', 'recovered': 'recovered'
}

In [6]:
cases_train['outcome_group'] = cases_train['outcome'].map(labels)

In [7]:
cases_train

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,additional_information,source,chronic_disease_binary,outcome,outcome_group
0,,,Maharashtra,India,19.20000,72.96667,23.05.2020,,,False,Hospitalized,hospitalized
1,,,Maharashtra,India,18.94017,72.83483,18.05.2020,,https://t.me/indiacovid/5075,False,Recovered,recovered
2,,,Maharashtra,India,19.20000,72.96667,28.05.2020,,https://phdmah.maps.arcgis.com/apps/opsdashboa...,False,Hospitalized,hospitalized
3,,,West Bengal,India,22.80862,88.79242,09.05.2020,,https://www.wbhealth.gov.in/uploaded_files/cor...,False,Hospitalized,hospitalized
4,,,Maharashtra,India,18.94017,72.83483,17.05.2020,,,False,Hospitalized,hospitalized
...,...,...,...,...,...,...,...,...,...,...,...,...
37187,,,Maharashtra,India,18.94017,72.83483,28.05.2020,,https://phdmah.maps.arcgis.com/apps/opsdashboa...,False,Hospitalized,hospitalized
37188,,,Madhya Pradesh,India,23.26466,77.40518,12.05.2020,,https://twitter.com/ANI/status/126023457988702...,False,Recovered,recovered
37189,,,Maharashtra,India,18.94017,72.83483,29.05.2020,,https://arogya.maharashtra.gov.in/pdf/ncovidep...,False,Recovered,recovered
37190,,,Madhya Pradesh,India,22.71622,75.86512,05.05.2020,,https://twitter.com/JansamparkMP/status/125767...,False,Recovered,recovered


In [8]:
cases_train = cases_train.drop(columns=['outcome'])

In [10]:
cases_train

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,additional_information,source,chronic_disease_binary,outcome_group
0,,,Maharashtra,India,19.20000,72.96667,23.05.2020,,,False,hospitalized
1,,,Maharashtra,India,18.94017,72.83483,18.05.2020,,https://t.me/indiacovid/5075,False,recovered
2,,,Maharashtra,India,19.20000,72.96667,28.05.2020,,https://phdmah.maps.arcgis.com/apps/opsdashboa...,False,hospitalized
3,,,West Bengal,India,22.80862,88.79242,09.05.2020,,https://www.wbhealth.gov.in/uploaded_files/cor...,False,hospitalized
4,,,Maharashtra,India,18.94017,72.83483,17.05.2020,,,False,hospitalized
...,...,...,...,...,...,...,...,...,...,...,...
37187,,,Maharashtra,India,18.94017,72.83483,28.05.2020,,https://phdmah.maps.arcgis.com/apps/opsdashboa...,False,hospitalized
37188,,,Madhya Pradesh,India,23.26466,77.40518,12.05.2020,,https://twitter.com/ANI/status/126023457988702...,False,recovered
37189,,,Maharashtra,India,18.94017,72.83483,29.05.2020,,https://arogya.maharashtra.gov.in/pdf/ncovidep...,False,recovered
37190,,,Madhya Pradesh,India,22.71622,75.86512,05.05.2020,,https://twitter.com/JansamparkMP/status/125767...,False,recovered


In [9]:
cases_train.groupby('outcome_group').size()

outcome_group
deceased             701
hospitalized       24563
nonhospitalized      138
recovered          11789
dtype: int64

## 1.3 Exploratory Data Analysis 

Ideas to explore

Probably want to convert date_information to a datetime object 

Categorical Attributes: Province, Country, Chronic disease, outcome_group, Sex
Numerical: Age, Date
By: country, Age, Sex
Age range: 18 - 30, < 18, 60 >


## 1.4 Data Cleaning and Imputing Missing Values
Datasets involved: cases_2021_train.csv, cases_2021_test.csv, location_2021.csv

In [None]:
# Primary: age column -> remove all entries that are NaN
# format age to standard integer
# potential approaches: 

# other columns -> impute, what strategies can we use to impute missing values of different columns
# additional information: best action is replace NaN with ""

In [24]:
# Removing NaN's from age column in train and test datasets

cases_train = cases_train[cases_train['age'].notna()].reset_index()
cases_test = cases_test[cases_test['age'].notna()].reset_index()


In [50]:
age_range = cases_train[cases_train['age'].str.contains('-') == True].reset_index(drop=True)

In [57]:
age_range.iloc[0]['age']

'22-80'

In [77]:
# age range greater than 10 should be removed?

def range_to_num(age):
    range = 10

    a_list = age.split('-')
    map_ints = map(float, a_list)
    map_list = list(map_ints)

    if (len(map_list) == 1):
        return round(map_list[0])

    if ((map_list[1] - map_list[0]) > range):
        return 'remove'
    else:
        return round((map_list[1] + map_list[0]) / 2)


In [78]:
print(range_to_num('22.3'))

22


In [67]:
cases_train = cases_train.drop(columns=['index'])

In [79]:
cases_train['age'] = cases_train['age'].apply(lambda x: range_to_num(x))

In [None]:
#1. Format all values with 'x-x' range to a single int value or 'remove'
#2. Remove all entries labelled 'remove' 
#3. Convert all column entries to int
#4. Use round() on all values to get rid of 0.3, 0.5, 0.8 values -> some babies

In [85]:
cases_train = cases_train[cases_train['age'] != 'remove']

In [None]:
cases_train['age'] = pd.to_numeric(cases_train['age'])

In [88]:
cases_train['age'] = cases_train['age'].apply(lambda x: round(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
