In [221]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import CategoricalNB, GaussianNB, MultinomialNB
from imblearn.combine import SMOTEENN 
from imblearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler 
from imblearn.under_sampling import RandomUnderSampler 
from sklearn.model_selection import GridSearchCV

In [202]:
# Read the pre-processed case data: the training sheet and the unlabelled test sheet.
train_path = "data/cases_2021_train_processed.xlsx"
test_path = "data/cases_2021_test_processed_unlabelled.xlsx"
train_data = pd.read_excel(train_path)
test_data = pd.read_excel(test_path)

# 1.2: Mapping the features

1. Converted unnecessary float values to integers.
2. Converted categorical values that are binary in nature to 0s and 1s.
3. Applied one-hot encoding to 'province' and 'country'.

In [203]:
# Numeric columns that are passed through pd.to_numeric(downcast='integer') below.
cols = ['age', 'Confirmed', 'Deaths', 'Recovered', 'Active']

In [204]:
# Downcast each numeric column to the smallest integer dtype that can hold its values.
# pd.to_numeric is applied column-wise (DataFrame.apply's default axis=0): a row-wise
# apply (axis=1) calls it once per row, which is slow and makes the downcast depend on
# whole rows rather than on each column's own values.
train_data[cols] = train_data[cols].apply(pd.to_numeric, downcast='integer')
test_data[cols] = test_data[cols].apply(pd.to_numeric, downcast='integer')

In [205]:
# Integer codes for the target classes.
outcome_groups = {
    'deceased': 0,
    'hospitalized': 1,
    'nonhospitalized': 2,
}
# Reverse lookup (code -> label), derived from the forward mapping so the two stay in sync.
outcome_groups_inverse = {code: label for label, code in outcome_groups.items()}
# Binary encoding for the 'sex' column.
sex = {
    'male': 0,
    'female': 1,
}

In [206]:
# Encode the training frame: label and sex become integer codes, missing provinces
# are filled with 'Philippines', and the chronic-disease flag is cast to int.
# Values absent from a mapping become NaN (Series.map behaviour).
train_data = train_data.assign(
    outcome_group=train_data['outcome_group'].map(outcome_groups),
    sex=train_data['sex'].map(sex),
    province=train_data['province'].fillna('Philippines'),
    chronic_disease_binary=train_data['chronic_disease_binary'].astype(int),
)


In [207]:
# Apply the same feature encoding to the test frame (it has no label column to map).
test_data = test_data.assign(
    sex=test_data['sex'].map(sex),
    province=test_data['province'].fillna('Philippines'),
    chronic_disease_binary=test_data['chronic_disease_binary'].astype(int),
)

In [208]:
# Class distribution of the encoded target (0=deceased, 1=hospitalized, 2=nonhospitalized).
train_data['outcome_group'].value_counts()

1    13241
2     2974
0      997
Name: outcome_group, dtype: int64

In [209]:
# One-hot encode the location columns of the training frame.
dummy_cols = ['province', 'country']
train_data = pd.get_dummies(train_data, columns=dummy_cols)

# Need to make sure the columns are the same in train and test data:
# align the encoded test frame to the training columns (minus the label),
# filling dummy columns that never occur in the test set with 0.
feature_columns = train_data.columns.drop('outcome_group')
test_data = pd.get_dummies(test_data, columns=dummy_cols).reindex(
    columns=feature_columns, fill_value=0
)

In [210]:
# Encode the confirmation date as a YYYYMMDD integer, then drop the raw date and the
# case-count columns from both frames.
dropped_cols = ['date_confirmation', 'Confirmed', 'Deaths', 'Recovered', 'Active']
for frame in (train_data, test_data):
    frame['date_confirmation_int'] = frame['date_confirmation'].dt.strftime("%Y%m%d").astype(int)
    frame.drop(columns=dropped_cols, inplace=True)

In [211]:
# Split the training frame into the target vector and the feature matrix.
# NOTE: no resampling (SMOTE or otherwise) is applied here, so the class imbalance
# of the raw data is carried into X / y unchanged.
y = train_data['outcome_group']
X = train_data.drop(columns='outcome_group')

In [212]:
# Class counts of the target: the classes are still imbalanced (no resampling was applied),
# so these match the distribution of train_data['outcome_group'].
y.value_counts()

1    13241
2     2974
0      997
Name: outcome_group, dtype: int64

In [213]:
def printClassificationResults(model):
    """Print a classification report pooled over 5-fold cross-validation.

    The module-level ``X`` / ``y`` are split with a shuffled ``KFold`` (seed 1).
    ``model`` is refit on each training fold and its predictions on the held-out
    fold are collected; a single report over all held-out rows is printed.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        Fitted in place; after the call it holds the fit from the last fold.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=1)
    true_labels, predicted_labels = [], []
    for fit_rows, eval_rows in splitter.split(X):
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        fold_predictions = model.predict(X.iloc[eval_rows])

        # Pool the held-out labels and predictions across folds.
        true_labels.extend(y.iloc[eval_rows])
        predicted_labels.extend(fold_predictions)

    print(classification_report(true_labels, predicted_labels))

In [214]:
def createSubmissionFile(model, filename):
    """Fit ``model`` on the full training set and write test predictions to CSV.

    The model is fit on the module-level ``X`` / ``y`` and used to predict
    ``test_data``. Predictions are written to ``submissions/<filename>`` with an
    ``Id`` index column and a ``Prediction`` column holding the class codes
    returned by ``model.predict``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    filename : str
        Name of the CSV file to create inside the ``submissions`` directory.
    """
    import os  # local import: keeps this cell self-contained

    model.fit(X, y)
    predictions = model.predict(test_data)
    submission = pd.DataFrame({'Prediction': predictions})
    submission.index.name = 'Id'
    # DataFrame.to_csv does not create missing parent directories, so make sure
    # the output folder exists before writing.
    os.makedirs('submissions', exist_ok=True)
    submission.to_csv('submissions/{filename}'.format(filename=filename))

In [216]:
# Testing Random Forest
# Hyper-parameter grid for the (currently disabled) grid search below. Keys use the
# `<step>__<param>` form so they address the forest inside the pipeline.
params = {
    "randomforestclassifier__max_depth": list(range(5, 50, 5)),
    "randomforestclassifier__min_samples_split": list(range(2, 12, 2)),
    "randomforestclassifier__min_samples_leaf": list(range(1, 6, 1)),
}
# random_state is fixed so the cross-validation report and the submission file are
# reproducible between runs (same seed as the KFold splitter and the MLP).
# class_weight='balanced' re-weights classes to counter the class imbalance.
random_forest = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(
        max_depth=15,
        min_samples_leaf=2,
        min_samples_split=4,
        class_weight='balanced',
        random_state=1,
    ),
)
search = GridSearchCV(random_forest, params, scoring='f1_macro')
# Best params: max_depth=15, min_samples_leaf=2, min_samples_split=4
# The search is expensive, so it is left disabled; uncomment to re-run it.
#search.fit(X, y)
#print("Best parameter (CV score=%0.3f):" % search.best_score_)
#print(search.best_params_)
printClassificationResults(random_forest)
createSubmissionFile(random_forest, 'random_forest.csv')

              precision    recall  f1-score   support

           0       0.53      0.53      0.53       997
           1       0.98      1.00      0.99     13241
           2       0.89      0.84      0.87      2974

    accuracy                           0.94     17212
   macro avg       0.80      0.79      0.80     17212
weighted avg       0.94      0.94      0.94     17212



In [217]:
# Testing Neural Network
# One hidden layer of 50 units with a fixed seed; inputs are standardised first.
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=500, random_state=1)
neural_network = make_pipeline(StandardScaler(), mlp)
printClassificationResults(neural_network)
createSubmissionFile(neural_network, 'neural_network.csv')

              precision    recall  f1-score   support

           0       0.68      0.37      0.48       997
           1       0.98      1.00      0.99     13241
           2       0.85      0.93      0.89      2974

    accuracy                           0.95     17212
   macro avg       0.84      0.76      0.79     17212
weighted avg       0.94      0.95      0.94     17212



In [None]:
# Testing XGBoost
# Gradient-boosted trees behind the same standardisation step as the other models.
xgb_classifier = XGBClassifier(use_label_encoder=False)
xgboost = make_pipeline(StandardScaler(), xgb_classifier)
printClassificationResults(xgboost)
createSubmissionFile(xgboost, 'xgboost.csv')

In [218]:
# Testing Support Vector Machine
# Linear-kernel SVM on standardised features.
svlinear = make_pipeline(StandardScaler(), SVC(kernel='linear'))
# Evaluate the pipeline defined in this cell; the previous call referenced
# `svmachine`, a name that is never defined and fails on a fresh kernel.
printClassificationResults(svlinear)

              precision    recall  f1-score   support

           0       0.93      0.19      0.32       997
           1       0.98      1.00      0.99     13241
           2       0.82      0.96      0.89      2974

    accuracy                           0.95     17212
   macro avg       0.91      0.72      0.73     17212
weighted avg       0.95      0.95      0.93     17212



In [223]:
# Testing Naive Bayes
# MinMaxScaler is used instead of StandardScaler here: it rescales the training
# features to [0, 1], and MultinomialNB expects non-negative inputs.
naive_bayes = MultinomialNB()
mnb = make_pipeline(MinMaxScaler(), naive_bayes)
printClassificationResults(mnb)

              precision    recall  f1-score   support

           0       0.75      0.08      0.15       997
           1       0.97      0.99      0.98     13241
           2       0.80      0.96      0.88      2974

    accuracy                           0.93     17212
   macro avg       0.84      0.68      0.67     17212
weighted avg       0.93      0.93      0.92     17212

