In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import cos, asin, sqrt, pi #calculate distance

# Preprocessing Tasks

### Loading all datasets

In [None]:
# Read the three CSV inputs in one pass: train cases, test cases and the
# per-location statistics table.
cases_train, cases_test, cases_location = (
    pd.read_csv(csv_path)
    for csv_path in (
        './datasets/cases_2021_train.csv',
        './datasets/cases_2021_test.csv',
        './datasets/location_2021.csv',
    )
)

## 1.1 Cleaning messy outcome labels

In [None]:
# Raw outcome strings listed under the cleaned group they belong to.
_raw_outcomes_by_group = {
    'hospitalized': [
        'Discharged', 'Discharged from hospital', 'Hospitalized',
        'critical condition', 'discharge', 'discharged',
    ],
    'nonhospitalized': [
        'Alive', 'Receiving Treatment', 'Stable', 'Under treatment',
        'recovering at home 03.03.2020', 'released from quarantine',
        'stable', 'stable condition',
    ],
    'deceased': ['Dead', 'Death', 'Deceased', 'Died', 'death', 'died'],
    'recovered': ['Recovered', 'recovered'],
}

# Flat lookup: raw outcome string -> cleaned outcome group.
labels = {
    raw_outcome: group
    for group, raw_outcomes in _raw_outcomes_by_group.items()
    for raw_outcome in raw_outcomes
}

In [None]:
# Translate each raw outcome string into its cleaned group; strings that are
# not keys of `labels` end up as NaN.
cases_train = cases_train.assign(outcome_group=cases_train['outcome'].map(labels))

In [None]:
# The raw outcome column is no longer needed once outcome_group exists.
cases_train = cases_train.drop('outcome', axis=1)

## 1.3 Exploratory Data Analysis 

### 1.3.1 train / test dataset

In [None]:
print('cases_2021_train.csv:')
# Summarise every column and attach a per-column count of missing values.
# DataFrame.append was removed in pandas 2.0, so the extra row is added with
# pd.concat. Rows are selected by label instead of by position, so the result
# does not depend on how many statistics describe() happens to emit.
train_null_counts = pd.DataFrame([cases_train.isna().sum()], index=['null'])
stats_cases_train = pd.concat([cases_train.describe(include='all'), train_null_counts])
stats_cases_train = stats_cases_train.reindex(['count', 'null', 'unique'])  # keep [count, null, unique]
stats_cases_train

In [None]:
print('cases_2021_test.csv')
# Summarise every column (rounded to 2 decimals) and attach a per-column count
# of missing values. DataFrame.append was removed in pandas 2.0, so the extra
# row is added with pd.concat. Rows are selected by label instead of by
# position, so the result does not depend on how many statistics describe() emits.
test_null_counts = pd.DataFrame([cases_test.isna().sum()], index=['null'])
stats_cases_test = pd.concat([cases_test.describe(include='all').round(2), test_null_counts])
stats_cases_test = stats_cases_test.reindex(['count', 'null', 'unique'])  # keep [count, null, unique]
stats_cases_test

In [None]:
# age attribute: one 10-bin histogram per dataset, side by side
fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(14,5))
age_panels = (
    (train_ax, cases_train, 'Train Cases by Age'),
    (test_ax, cases_test, 'Test Cases by Age'),
)
for ax, frame, panel_title in age_panels:
    # entries that cannot be parsed as a number are coerced to NaN
    numeric_age = pd.to_numeric(frame['age'], errors='coerce')
    ax.hist(numeric_age, bins=10)
    ax.set_title(panel_title)
    ax.set_xlabel('Age')
    ax.set_ylabel('Count')

fig.savefig('plots/task-1.3/Train_Test_Age.svg')

In [None]:
# sex attribute: number of cases per recorded sex, train vs. test
fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(6,3))
sex_panels = (
    (train_ax, cases_train, 'Train Cases by Sex'),
    (test_ax, cases_test, 'Test Cases by Sex'),
)
for ax, frame, panel_title in sex_panels:
    sex_counts = frame['sex'].value_counts()
    ax.bar(sex_counts.index, sex_counts)
    ax.set_title(panel_title)
    ax.set_ylabel('Count')
fig.tight_layout()

fig.savefig('plots/task-1.3/Train_Test_Sex.svg')

In [None]:
# age & sex attribute
# Both sexes share one set of bin edges and are drawn semi-transparent, so the
# two distributions can be compared bar by bar and the second histogram does
# not hide the first.
plt.figure(figsize=(14,5))
plt.subplot(1,2,1)
train_ages = pd.to_numeric(cases_train['age'], errors='coerce')  # unparseable ages become NaN
train_bins = np.histogram_bin_edges(train_ages.dropna(), bins=10)
plt.hist(train_ages[cases_train['sex']=='male'].dropna(), bins=train_bins, alpha=0.5)
plt.hist(train_ages[cases_train['sex']=='female'].dropna(), bins=train_bins, alpha=0.5)
plt.title('Train Cases by Age and Sex')
plt.legend(['male', 'female'])
plt.xlabel('Age')
plt.ylabel('Count')

plt.subplot(1,2,2)
test_ages = pd.to_numeric(cases_test['age'], errors='coerce')  # unparseable ages become NaN
test_bins = np.histogram_bin_edges(test_ages.dropna(), bins=10)
plt.hist(test_ages[cases_test['sex']=='male'].dropna(), bins=test_bins, alpha=0.5)
plt.hist(test_ages[cases_test['sex']=='female'].dropna(), bins=test_bins, alpha=0.5)
plt.title('Test Cases by Age and Sex')
plt.legend(['male', 'female'])
plt.xlabel('Age')
plt.ylabel('Count')

plt.savefig('plots/task-1.3/Train_Test_Age_Sex.svg')

In [None]:
# country attribute
# Countries at or below the threshold are merged into a single 'Other' bar that
# holds their combined count. Drawing one bar per small country under a repeated
# 'Other' label would put them all on the same x position, showing only the
# tallest. Working on the value_counts() Series directly also avoids depending
# on the column names reset_index() produces, which differ between pandas versions.
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
train_counts = cases_train['country'].value_counts()
train_labels = pd.Series(train_counts.index, index=train_counts.index).where(train_counts > 100, 'Other')  # summarize others
train_bars = train_counts.groupby(train_labels, sort=False).sum()
plt.bar(train_bars.index, train_bars.to_numpy())
plt.title('Train Cases by Country')
plt.ylabel('Count')

plt.subplot(1,2,2)
test_counts = cases_test['country'].value_counts()
test_labels = pd.Series(test_counts.index, index=test_counts.index).where(test_counts > 60, 'Other')  # summarize others
test_bars = test_counts.groupby(test_labels, sort=False).sum()
plt.bar(test_bars.index, test_bars.to_numpy())
plt.title('Test Cases by Country')
plt.ylabel('Count')
plt.tight_layout()

plt.savefig('plots/task-1.3/Train_Test_Country.svg')

In [None]:
# country attribute, leaving out India and the Philippines
# The two countries are removed by name (matching the plot titles) rather than
# by rank, so the plot stays correct even if they are not the two most frequent.
# Countries at or below the threshold are merged into a single 'Other' bar that
# holds their combined count; repeated 'Other' labels would overlap on one x
# position and show only the tallest bar.
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
train_counts = cases_train['country'].value_counts().drop(['India', 'Philippines'], errors='ignore')
train_labels = pd.Series(train_counts.index, index=train_counts.index).where(train_counts > 60, 'Other')  # summarize others
train_bars = train_counts.groupby(train_labels, sort=False).sum()
plt.bar(train_bars.index, train_bars.to_numpy())
plt.title('Train Cases by Country (w/o India, Philippines)')
plt.ylabel('Count')

plt.subplot(1,2,2)
test_counts = cases_test['country'].value_counts().drop(['India', 'Philippines'], errors='ignore')
test_labels = pd.Series(test_counts.index, index=test_counts.index).where(test_counts > 30, 'Other')  # summarize others
test_bars = test_counts.groupby(test_labels, sort=False).sum()
plt.bar(test_bars.index, test_bars.to_numpy())
plt.title('Test Cases by Country (w/o India, Philippines)')
plt.ylabel('Count')
plt.savefig('plots/task-1.3/Train_Test_Country2.svg')

In [None]:
# province attribute - India
# Provinces with at most 500 cases are merged into a single 'Other' bar that
# holds their combined count; repeated 'Other' labels would overlap on one y
# position and show only the longest bar. Counts are sorted ascending, so the
# largest province ends up at the top of the horizontal bar chart.
plt.figure(figsize=(20,6))
plt.subplot(1,2,1)
train_counts = cases_train.loc[cases_train['country']=='India', 'province'].value_counts().sort_values(ascending=True)
train_labels = pd.Series(train_counts.index, index=train_counts.index).where(train_counts > 500, 'Other')  # summarize others
train_bars = train_counts.groupby(train_labels, sort=False).sum()
plt.barh(train_bars.index, train_bars.to_numpy())
plt.title('Train Cases by Province for India')
plt.xlabel('Count')

plt.subplot(1,2,2)
test_counts = cases_test.loc[cases_test['country']=='India', 'province'].value_counts().sort_values(ascending=True)
test_labels = pd.Series(test_counts.index, index=test_counts.index).where(test_counts > 500, 'Other')  # summarize others
test_bars = test_counts.groupby(test_labels, sort=False).sum()
plt.barh(test_bars.index, test_bars.to_numpy())
plt.title('Test Cases by Province for India')
plt.xlabel('Count')
plt.tight_layout()

plt.savefig('plots/task-1.3/Train_Test_Provinces_India.svg')

In [None]:
# province attribute - Philippines
# Provinces with at most 10 cases are merged into a single 'Other' bar that
# holds their combined count; repeated 'Other' labels would overlap on one y
# position and show only the longest bar. Counts are sorted ascending, so the
# largest province ends up at the top of the horizontal bar chart.
plt.figure(figsize=(20,6))
plt.subplot(1,2,1)
train_counts = cases_train.loc[cases_train['country']=='Philippines', 'province'].value_counts().sort_values(ascending=True)
train_labels = pd.Series(train_counts.index, index=train_counts.index).where(train_counts > 10, 'Other')  # summarize others
train_bars = train_counts.groupby(train_labels, sort=False).sum()
plt.barh(train_bars.index, train_bars.to_numpy())
plt.title('Train Cases by Province for Philippines')
plt.xlabel('Count')

plt.subplot(1,2,2)
test_counts = cases_test.loc[cases_test['country']=='Philippines', 'province'].value_counts().sort_values(ascending=True)
test_labels = pd.Series(test_counts.index, index=test_counts.index).where(test_counts > 10, 'Other')  # summarize others
test_bars = test_counts.groupby(test_labels, sort=False).sum()
plt.barh(test_bars.index, test_bars.to_numpy())
plt.title('Test Cases by Province for Philippines')
plt.xlabel('Count')
plt.tight_layout()

plt.savefig('plots/task-1.3/Train_Test_Provinces_Philippines.svg')

In [None]:
# latitude / longitude attribute: one dot per case, latitude on x and longitude on y
fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(16,5))
coordinate_panels = (
    (train_ax, cases_train, 'Train Cases by Latitude / Longitude'),
    (test_ax, cases_test, 'Test Cases by Latitude / Longitude'),
)
for ax, frame, panel_title in coordinate_panels:
    ax.plot(frame['latitude'], frame['longitude'], '.')
    ax.set_title(panel_title)
    ax.set_xlabel('Latitude')
    ax.set_ylabel('Longitude')

fig.savefig('plots/task-1.3/Train_Test_Lat_Lon.svg')

In [None]:
# date_confirmation attribute
# add month attribute: characters 3-4 and 6-9 of the date string are joined into
# an 'mmyyyy' label (presumably the dates are formatted dd.mm.yyyy - verify
# against the CSV). Missing dates stringify to 'nan', which is too short for
# either slice and yields '', converted to NaN by the replace() below.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
train_dates = cases_train['date_confirmation'].astype(str)
cases_train['month'] = train_dates.str[3:5] + train_dates.str[6:10]
cases_train = cases_train.replace("", np.nan)
test_dates = cases_test['date_confirmation'].astype(str)
cases_test['month'] = test_dates.str[3:5] + test_dates.str[6:10]
cases_test = cases_test.replace("", np.nan)

# Order the bars chronologically (year first, then month) instead of by frequency.
train_months = cases_train['month'].value_counts()
train_months = train_months.reindex(sorted(train_months.index, key=lambda m: (m[2:], m[:2])))
test_months = cases_test['month'].value_counts()
test_months = test_months.reindex(sorted(test_months.index, key=lambda m: (m[2:], m[:2])))

plt.figure(figsize=(16, 5))
plt.subplot(1,2,1)
plt.bar(train_months.index, train_months.to_numpy())
plt.title('Train Cases by Confirmation Date')
plt.xlabel('Month')
plt.ylabel('Count')

plt.subplot(1,2,2)
plt.bar(test_months.index, test_months.to_numpy())
plt.title('Test Cases by Confirmation Date')
plt.xlabel('Month')
plt.ylabel('Count')

plt.savefig('plots/task-1.3/Train_Test_Date.svg')

In [None]:
# chronic disease attribute
# Bar labels come from the value_counts() index itself. value_counts() orders
# its result by frequency, so a fixed ['True', 'False'] label list would be
# attached to the wrong bars whenever False is the more common value, and would
# fail outright if only one of the two values occurs.
plt.figure(figsize=(9,4))
plt.subplot(1,2,1)
train_chronic = cases_train['chronic_disease_binary'].value_counts()
plt.bar(train_chronic.index.astype(str), train_chronic.to_numpy())
plt.title('Train Cases by Chronic Disease')
plt.ylabel('Count')

plt.subplot(1,2,2)
test_chronic = cases_test['chronic_disease_binary'].value_counts()
plt.bar(test_chronic.index.astype(str), test_chronic.to_numpy())
plt.title('Test Cases by Chronic Disease')
plt.ylabel('Count')
plt.tight_layout()

plt.savefig('plots/task-1.3/Train_Test_Chronic_Disease.svg')

In [None]:
# outcome group attribute: number of train cases per cleaned outcome group
outcome_counts = cases_train['outcome_group'].value_counts()
plt.bar(outcome_counts.index, outcome_counts)
plt.title('Train Cases by Outcome Group')
plt.ylabel('Count')

plt.savefig('plots/task-1.3/Train_Test_Outcome_Group.svg')

### 1.3.2 location dataset

In [None]:
# Summarise every column (rounded to 2 decimals) and attach a per-column count
# of missing values. DataFrame.append was removed in pandas 2.0, so the extra
# row is added with pd.concat. Rows are selected by label instead of by
# position, so the result does not depend on how many statistics describe() emits.
location_null_counts = pd.DataFrame([cases_location.isna().sum()], index=['null'])
stats_cases_location = pd.concat([cases_location.describe(include="all").round(2), location_null_counts])
stats_cases_location = stats_cases_location.reindex(['count', 'null', 'unique', 'mean', 'std', 'min', 'max'])  # keep [count, null, unique, mean, std, min, max]
stats_cases_location

In [None]:
# country / region attribute
# Countries with at most 15 rows are merged into a single 'Other' bar that holds
# their combined count; repeated 'Other' labels would overlap on one x position
# and show only the tallest bar. Working on the value_counts() Series directly
# also avoids depending on the column names reset_index() produces, which
# differ between pandas versions.
plt.figure(figsize=(16,5))
region_counts = cases_location['Country_Region'].value_counts()
region_labels = pd.Series(region_counts.index, index=region_counts.index).where(region_counts > 15, 'Other')  # summarize others
region_bars = region_counts.groupby(region_labels, sort=False).sum()
plt.bar(region_bars.index, region_bars.to_numpy())
plt.title('Data Points per Country / Region')
plt.ylabel('Count')
plt.xticks(rotation=45)

plt.savefig('plots/task-1.3/Location_Region.svg')

In [None]:
# latitude / longitude attribute: one dot per location row, latitude on x and longitude on y
fig, ax = plt.subplots(figsize=(10,7))
ax.plot(cases_location['Lat'], cases_location['Long_'], '.')
ax.set_title('Data Points by Latitude / Longitude')
ax.set_xlabel('Latitude')
ax.set_ylabel('Longitude')

fig.savefig('plots/task-1.3/Location_Lat_Lon.svg')

In [None]:
# confirmed, recovered, active, deaths attribute
# numeric_only=True keeps text columns out of the per-country aggregation.
values = cases_location.groupby('Country_Region').sum(numeric_only=True)
values = values.nlargest(25, 'Confirmed') #get Top 25 countries

plt.figure(figsize=(16,5))
# Stack the three components on top of each other. Drawing them all from zero
# would hide every smaller bar behind a larger one drawn later.
# NOTE(review): the title assumes Recovered + Active + Deaths adds up to
# Confirmed - confirm against the dataset.
plt.bar(values.index, values['Recovered'], color='green')
plt.bar(values.index, values['Active'], bottom=values['Recovered'], color='orange')
plt.bar(values.index, values['Deaths'], bottom=values['Recovered'] + values['Active'], color='black')
plt.xticks(rotation=45)
plt.legend(['Recovered', 'Active', 'Deaths'])
plt.title('Confirmed Cases by Country')
plt.ylabel('Count')
plt.ticklabel_format(axis='y', style='plain') # prevent scientific notation

plt.savefig('plots/task-1.3/Location_Confirmed.svg')

In [None]:
# incident rate attribute
# numeric_only=True is required on pandas >= 2.0, where mean() raises on
# non-numeric columns instead of silently dropping them.
# NOTE(review): this is an unweighted mean over each country's rows - confirm
# that is the intended per-country figure.
values = cases_location.groupby('Country_Region').mean(numeric_only=True)
values = values.nlargest(25, 'Incident_Rate')  # top 25 countries

plt.figure(figsize=(16,5))
plt.plot(values.index, values['Incident_Rate'])
plt.xticks(rotation=45)
plt.title('Incident Rate by Country')
plt.ylabel('Incident Rate')

plt.savefig('plots/task-1.3/Location_Incident_Rate.svg')

In [None]:
# case_fatality attribute
# numeric_only=True is required on pandas >= 2.0, where mean() raises on
# non-numeric columns instead of silently dropping them.
# NOTE(review): this is an unweighted mean over each country's rows - confirm
# that is the intended per-country figure.
values = cases_location.groupby('Country_Region').mean(numeric_only=True)
values = values.nlargest(25, 'Case_Fatality_Ratio')  # top 25 countries

plt.figure(figsize=(16,5))
plt.plot(values.index, values['Case_Fatality_Ratio'])
plt.xticks(rotation=45)
plt.title('Case Fatality Ratio by Country')
plt.ylabel('Fatality Rate')

plt.savefig('plots/task-1.3/Location_Fatality_Rate.svg')