In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import CategoricalNB, GaussianNB

In [2]:
# Read the two pre-processed case spreadsheets: the training split and the
# test split (unlabelled, per its file name).
train_data = pd.read_excel(io="data/cases_2021_train_processed.xlsx")
test_data = pd.read_excel(io="data/cases_2021_test_processed_unlabelled.xlsx")

# 1.2: Mapping the features

1. Converted unnecessary float values to integers
2. Converted categorical values that are binary in nature to 0s and 1s
3. Applied one-hot encoding to 'province' and 'country'

In [3]:
# Numeric columns that are passed to pd.to_numeric for integer downcasting.
cols = [
    "age",
    "Confirmed",
    "Deaths",
    "Recovered",
    "Active",
]

In [4]:
# Downcast each numeric column to the smallest integer dtype that can hold it.
# pd.to_numeric is applied column-wise (DataFrame.apply's default, axis=0).
# Applying it row-wise (axis=1) costs one Python-level call per row and decides
# the dtype per row instead of per column, so a single fractional or missing
# value can stop unrelated columns from being downcast.
train_data[cols] = train_data[cols].apply(pd.to_numeric, downcast='integer')
test_data[cols] = test_data[cols].apply(pd.to_numeric, downcast='integer')

In [5]:
# Integer codes used to encode the target label and the 'sex' feature.
outcome_groups = {
    'deceased': 0,
    'hospitalized': 1,
    'nonhospitalized': 2,
}
sex = {
    'male': 0,
    'female': 1,
}

In [6]:
# Encode the training frame in one pass:
#   - outcome_group / sex: string labels -> integer codes (a label that is not a
#     key of the mapping becomes NaN, as Series.map does for unmatched keys)
#   - province: missing values are filled with 'Philippines'
#     (NOTE(review): presumably the rows lacking a province are country-level
#     Philippines records — confirm against the raw data)
#   - chronic_disease_binary: cast to int
train_data = train_data.assign(
    outcome_group=train_data['outcome_group'].map(outcome_groups),
    sex=train_data['sex'].map(sex),
    province=train_data['province'].fillna('Philippines'),
    chronic_disease_binary=train_data['chronic_disease_binary'].astype(int),
)


In [7]:
# Encode the test frame's features: 'sex' to its integer code, missing
# 'province' values to 'Philippines', and 'chronic_disease_binary' to int.
# No label column is touched here.
test_data = test_data.assign(
    sex=test_data['sex'].map(sex),
    province=test_data['province'].fillna('Philippines'),
    chronic_disease_binary=test_data['chronic_disease_binary'].astype(int),
)

In [8]:
# Number of training rows per encoded outcome class (class balance check).
train_data.loc[:, 'outcome_group'].value_counts()

1    13241
2     2974
0      997
Name: outcome_group, dtype: int64

In [9]:
# One-hot encode the categorical location columns.
dummy_cols = ['province', 'country']

# Remember which columns are NOT one-hot encoded, so the generated dummy
# columns can be identified after encoding without relying on name prefixes.
train_base_cols = train_data.columns.drop(dummy_cols)
test_base_cols = test_data.columns.drop(dummy_cols)

train_data = pd.get_dummies(train_data, columns=dummy_cols)
test_data = pd.get_dummies(test_data, columns=dummy_cols)

# Encoding the two frames independently yields different dummy columns whenever
# a province/country occurs in only one split. Align the test frame to the
# training dummies: categories absent from the test split become all-zero
# columns, and categories seen only in the test split are dropped. The test
# frame's other columns are kept as they are.
onehot_cols = train_data.columns.drop(train_base_cols)
test_data = test_data.reindex(columns=[*test_base_cols, *onehot_cols], fill_value=0)

In [10]:
# Remove the 'date_confirmation' column from the training frame.
train_data = train_data.drop(columns=['date_confirmation'])

In [11]:
def printClassificationResults(model, data=None, target='outcome_group',
                               n_splits=5, random_state=1):
    """Print a classification report pooled over shuffled k-fold cross-validation.

    The model is re-fitted on the training part of every fold and used to
    predict the held-out part. The held-out labels and predictions of all folds
    are concatenated and summarised in a single ``classification_report``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place,
        so after the call it holds the fit from the last fold.
    data : pandas.DataFrame, optional
        Frame holding the feature columns plus the ``target`` column. Defaults
        to the notebook-level ``train_data``.
    target : str, default 'outcome_group'
        Name of the label column in ``data``.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.
    random_state : int, default 1
        Seed for the fold shuffling.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    if data is None:
        # Resolved at call time so the function sees the current notebook frame.
        data = train_data

    X = data.drop(columns=target)
    y = data[target]

    # NOTE(review): plain KFold does not preserve class proportions per fold;
    # consider a stratified splitter if the classes are heavily imbalanced.
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    all_labels = []
    all_predictions = []
    for train_index, test_index in kfold.split(X):
        model.fit(X.iloc[train_index], y.iloc[train_index])

        # extend() grows the lists in place instead of rebuilding them per fold.
        all_labels.extend(y.iloc[test_index])
        all_predictions.extend(model.predict(X.iloc[test_index]))

    print(classification_report(all_labels, all_predictions))

In [12]:
# Testing Random Forest
# The forest is seeded so that its randomised training, and therefore the
# printed report, is reproducible across kernel restarts.
random_forest = RandomForestClassifier(random_state=1)
printClassificationResults(random_forest)

              precision    recall  f1-score   support

           0       0.49      0.35      0.41       997
           1       0.98      0.99      0.98     13241
           2       0.85      0.88      0.86      2974

    accuracy                           0.93     17212
   macro avg       0.77      0.74      0.75     17212
weighted avg       0.93      0.93      0.93     17212

