In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import CategoricalNB, GaussianNB

In [2]:
# Load the pre-processed case records: the labelled training workbook and the
# unlabelled test workbook.
# NOTE(review): the X_val display further down lists an 'Unnamed: 0' column,
# which usually means a saved index was read back as data -- confirm, and
# consider index_col=0 if that column is not meant to be a feature.
train_path = "data/cases_2021_train_processed.xlsx"
test_path = "data/cases_2021_test_processed_unlabelled.xlsx"

train_data = pd.read_excel(train_path)
test_data = pd.read_excel(test_path)

# 1.2: Mapping the features

1. Converted unnecessary float values to integers
2. Converted categorical values that are binary in nature to 0s and 1s
3. One-hot encoded 'province' and 'country'

In [3]:
# Columns that are converted with pd.to_numeric(..., downcast='integer')
# in the next cell.
cols = [
    'age',
    'Confirmed',
    'Deaths',
    'Recovered',
    'Active',
]

In [4]:
# Downcast the columns listed in `cols` to the smallest integer dtype that can
# hold their values.
# The conversion is applied column by column (DataFrame.apply's default axis):
# pd.to_numeric then runs once per column instead of once per row, and the
# downcast is decided per column, so each column gets a single, consistent
# dtype. A column whose values cannot be represented as integers (e.g. it
# contains NaN) is left unchanged by `downcast`.
train_data[cols] = train_data[cols].apply(pd.to_numeric, downcast='integer')
test_data[cols] = test_data[cols].apply(pd.to_numeric, downcast='integer')

In [5]:
# Lookup tables used to integer-encode string categories via Series.map.
# Outcome labels are numbered in the order listed: deceased=0, hospitalized=1,
# nonhospitalized=2.
outcome_groups = {
    label: code
    for code, label in enumerate(('deceased', 'hospitalized', 'nonhospitalized'))
}
sex = dict(male=0, female=1)

In [6]:
# Integer-encode the label and the 'sex' feature with the lookup tables defined
# above. Series.map turns any value that is missing from a table into NaN.
for column, codes in (('outcome_group', outcome_groups), ('sex', sex)):
    train_data[column] = train_data[column].map(codes)

# Rows without a province receive the placeholder value 'Philippines'.
# NOTE(review): presumably the only rows lacking a province are Philippine
# records -- confirm against the raw data.
train_data['province'] = train_data['province'].fillna('Philippines')

# Store the chronic-disease flag as an integer column.
chronic_flag = train_data['chronic_disease_binary']
train_data['chronic_disease_binary'] = chronic_flag.astype(int)


In [7]:
# Apply the same feature encoding to the test frame; 'outcome_group' is not
# mapped for this frame.
# Store the chronic-disease flag as an integer column.
chronic_flag = test_data['chronic_disease_binary']
test_data['chronic_disease_binary'] = chronic_flag.astype(int)

# Rows without a province receive the placeholder value 'Philippines'.
test_data['province'] = test_data['province'].fillna('Philippines')

# Integer-encode 'sex'; values missing from the lookup table become NaN.
test_data['sex'] = test_data['sex'].map(sex)

In [8]:
# Count the training rows per encoded outcome class to inspect class balance.
outcome_counts = train_data['outcome_group'].value_counts()
outcome_counts

1    13241
2     2974
0      997
Name: outcome_group, dtype: int64

In [22]:
# Categorical columns that are one-hot encoded with pd.get_dummies.
dummy_cols = [
    'province',
    'country',
]

In [25]:
# One-hot encode the categorical location columns of both frames.
train_encoded = pd.get_dummies(train_data, columns=dummy_cols)
test_encoded = pd.get_dummies(test_data, columns=dummy_cols)

# get_dummies only creates indicator columns for the categories present in the
# frame it is given, so encoding train and test independently can yield
# different column sets (a province/country seen in only one of the files).
# Align the test frame to the indicator columns learned from the training data:
#   * indicators missing from the test frame are added and filled with 0,
#   * indicators for categories never seen in training are dropped,
#   * every other test column is kept, in its original order.
train_dummy_cols = [c for c in train_encoded.columns if c not in train_data.columns]
test_passthrough_cols = [c for c in test_data.columns if c not in dummy_cols]

test_data = test_encoded.reindex(
    columns=test_passthrough_cols + train_dummy_cols,
    fill_value=0,
)
train_data = train_encoded

Train and validation split - 80% Train / 20% Validation

In [26]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split reproducible. The whole frame is split, so both returned
# frames still contain every column of train_data, including 'outcome_group'.
X_train, X_val = train_test_split(
    train_data,
    test_size=0.2,
    random_state=1,
)

In [27]:
# Display the validation split (the rendered output below is truncated for large frames).
X_val

Unnamed: 0,age,sex,latitude,longitude,date_confirmation,chronic_disease_binary,Confirmed,Deaths,Recovered,Active,...,country_Romania,country_San Marino,country_Singapore,country_South Korea,country_Sudan,country_Taiwan,country_Tanzania,country_Togo,country_Vietnam,country_Zimbabwe
1409,36,1,14.630000,121.03000,2020-04-29,0,747288,13297,603746,130245,...,0,0,0,0,0,0,0,0,0,0
15419,62,1,27.118700,75.64369,2020-04-27,0,333149,2818,321668,8663,...,0,0,0,0,0,0,0,0,0,0
1941,40,0,25.424420,86.13367,2020-04-29,0,265527,1576,262371,1580,...,0,0,0,0,0,0,0,0,0,0
9587,49,0,13.083620,80.28252,2020-05-22,0,886673,12719,858075,15879,...,0,0,0,0,0,0,0,0,0,0
11933,33,1,14.658180,120.94755,2020-04-11,0,747288,13297,603746,130245,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2731,44,0,25.224530,85.23014,2020-05-30,0,265527,1576,262371,1580,...,0,0,0,0,0,0,0,0,0,0
9788,66,0,13.083620,80.28252,2020-05-21,0,886673,12719,858075,15879,...,0,0,0,0,0,0,0,0,0,0
2001,55,1,25.578040,85.07260,2020-04-24,0,265527,1576,262371,1580,...,0,0,0,0,0,0,0,0,0,0
14330,39,1,1.395495,103.89340,2020-02-06,0,60381,30,60149,202,...,0,0,1,0,0,0,0,0,0,0
