In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier, XGBRFClassifier, train

In [None]:
# Locations of the pre-processed spreadsheets; the test split carries no labels.
TRAIN_PATH = "data/cases_2021_train_processed.xlsx"
TEST_PATH = "data/cases_2021_test_processed_unlabelled.xlsx"

train_data = pd.read_excel(TRAIN_PATH)
test_data = pd.read_excel(TEST_PATH)

# 1.2: Mapping the features

1. Converted unnecessary float values to integers.
2. Converted categorical values that are binary in nature to 0s and 1s.
3. Applied one-hot encoding to 'province' and 'country'.

In [None]:
# Numeric columns whose values are downcast to integers in the next step.
cols = [
    'age',
    'Confirmed',
    'Deaths',
    'Recovered',
    'Active',
]

In [None]:
# Downcast each numeric column as a whole (DataFrame.apply defaults to column-wise).
# The previous row-wise call (axis=1) invoked pd.to_numeric once per record, which is
# slow on large frames and made the downcast decision per row rather than per column.
# The numeric values are unchanged; a column is only converted when every value fits
# an integer type.
train_data[cols] = train_data[cols].apply(pd.to_numeric, downcast='integer')
test_data[cols] = test_data[cols].apply(pd.to_numeric, downcast='integer')

In [None]:
# Integer codes for the target classes.
outcome_groups = {
    'deceased': 0,
    'hospitalized': 1,
    'nonhospitalized': 2,
}
# Reverse lookup (code -> label), derived from the forward mapping so the two stay in sync.
outcome_groups_inverse = {code: label for label, code in outcome_groups.items()}
# Binary encoding for the 'sex' column.
sex = {
    'male': 0,
    'female': 1,
}

In [None]:
# Encode the training frame: map the target and 'sex' to their integer codes, label
# missing provinces as 'Philippines', and cast the chronic-disease flag to int.
train_data = train_data.assign(
    outcome_group=train_data['outcome_group'].map(outcome_groups),
    sex=train_data['sex'].map(sex),
    province=train_data['province'].fillna('Philippines'),
    chronic_disease_binary=train_data['chronic_disease_binary'].astype(int),
)


In [None]:
# Apply the same feature encoding to the unlabelled test frame (it has no target to map).
test_data = test_data.assign(
    sex=test_data['sex'].map(sex),
    province=test_data['province'].fillna('Philippines'),
    chronic_disease_binary=test_data['chronic_disease_binary'].astype(int),
)

In [None]:
# Class balance of the encoded target in the training data.
outcome_counts = train_data['outcome_group'].value_counts()
outcome_counts

In [None]:
# One-hot encode the high-cardinality categorical columns in both frames.
dummy_cols = ['province', 'country']
train_data = pd.get_dummies(train_data, columns=dummy_cols)
test_data = pd.get_dummies(test_data, columns=dummy_cols)

# Align the test frame with the training features: same columns in the same order,
# with dummy columns unseen in the test split filled with 0. The target column is
# excluded because the test data is unlabelled.
feature_columns = train_data.columns.drop('outcome_group')
test_data = test_data.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Replace the confirmation date with a YYYYMMDD integer feature and drop the raw date
# together with the case-count columns, identically for both frames.
dropped_cols = ['date_confirmation', 'Confirmed', 'Deaths', 'Recovered', 'Active']
for frame in (train_data, test_data):
    frame['date_confirmation_int'] = frame['date_confirmation'].dt.strftime("%Y%m%d").astype(int)
    frame.drop(dropped_cols, axis=1, inplace=True)

In [None]:
# Separate the encoded target from the feature matrix.
target_column = 'outcome_group'
y = train_data[target_column]
X = train_data.drop(columns=[target_column])

In [None]:
# Class distribution of the training labels.
label_counts = y.value_counts()
label_counts

In [None]:
def printClassificationResults(models):
    """Print a pooled 5-fold cross-validation report for a majority-vote ensemble.

    Reads the notebook-level ``X`` and ``y``. In every fold each model in
    ``models`` is fitted on the training rows and predicts the held-out rows;
    the per-row prediction is the mode of the models' votes (when several
    values tie, the first one returned by ``DataFrame.mode`` is used). Labels
    and predictions from all folds are pooled into one classification report.

    Parameters
    ----------
    models : list
        Estimators exposing ``fit(X, y)`` and ``predict(X)``. They are fitted
        in place, so the objects passed in are left fitted on the last fold.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=1)
    true_labels, voted_labels = [], []

    for fit_rows, eval_rows in splitter.split(X):
        X_fit, X_eval = X.iloc[fit_rows], X.iloc[eval_rows]
        y_fit, y_eval = y.iloc[fit_rows], y.iloc[eval_rows]

        # One column of predictions per model, one row per held-out sample.
        votes = pd.DataFrame(index=range(len(y_eval)))
        for position, estimator in enumerate(models):
            estimator.fit(X_fit, y_fit)
            votes[position] = estimator.predict(X_eval)

        majority = votes.mode(axis=1)[0].astype(int)
        true_labels.extend(y_eval)
        voted_labels.extend(majority)

    print(classification_report(true_labels, voted_labels))

In [None]:
def createSubmissionFile(models, filename):
    """Fit the models on all training data and write majority-vote test predictions to CSV.

    Reads the notebook-level ``X``, ``y`` and ``test_data``. Each model is
    fitted on the full training set and predicts every row of ``test_data``;
    the per-row prediction is the mode of the models' votes (when several
    values tie, the first one returned by ``DataFrame.mode`` is used).

    Parameters
    ----------
    models : list
        Non-empty list of estimators exposing ``fit(X, y)`` and ``predict(X)``.
        They are fitted in place.
    filename : str
        Name of the CSV file to create inside the ``submissions`` directory.

    Raises
    ------
    ValueError
        If ``models`` is empty (there would be no votes to aggregate).
    """
    if not models:
        raise ValueError("createSubmissionFile needs at least one model")

    ensemble_predictions = pd.DataFrame(index=range(test_data.shape[0]))

    for i, model in enumerate(models):
        model.fit(X, y)
        ensemble_predictions[i] = model.predict(test_data)

    predictions = ensemble_predictions.mode(axis=1)[0].astype(int)
    submission = pd.DataFrame({'Prediction': predictions})
    submission.index.name = 'Id'

    # to_csv does not create missing directories, so make sure the target exists;
    # otherwise a fresh checkout fails only after all models have been trained.
    output_path = os.path.join('submissions', filename)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    submission.to_csv(output_path)

In [None]:
# Testing Random Forest.
# Best params: max_depth=14, max_samples=0.56, min_samples_leaf=1, min_samples_split=5,
# class_weight='balanced', random_state=1

# Search space for the grid search kept (disabled) at the bottom of this cell; every key
# carries the 'randomforestclassifier__' prefix used to address the classifier step.
params = {
    "randomforestclassifier__max_depth": list(range(12, 15)),
    "randomforestclassifier__min_samples_split": list(range(3, 10, 2)),
    "randomforestclassifier__min_samples_leaf": list(range(1, 3)),
    'randomforestclassifier__max_samples': [pct / 100 for pct in range(50, 70, 3)],
}

rf_classifier = RandomForestClassifier(
    max_depth=14,
    max_samples=0.56,
    min_samples_leaf=1,
    min_samples_split=5,
    class_weight='balanced',
    random_state=1,
)
random_forest = make_pipeline(StandardScaler(), rf_classifier)

printClassificationResults([random_forest])
createSubmissionFile([random_forest], 'random_forest.csv')

# Disabled hyper-parameter search over `params`; uncomment to re-run it.
#search = GridSearchCV(random_forest, params, scoring='f1_macro')
#search.fit(X, y)
#print("Best parameter (CV score=%0.3f):" % search.best_score_)
#print(search.best_params_)

In [None]:
# Testing Neural Network: a single hidden layer of 50 units on standardized features.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(50,),
    max_iter=500,
    random_state=1,
)
neural_network = make_pipeline(StandardScaler(), mlp_classifier)

printClassificationResults([neural_network])
createSubmissionFile([neural_network], 'neural_network.csv')

In [None]:
# Testing XGBoost (note: the estimator used here is XGBRFClassifier, not XGBClassifier).
xgb_rf_classifier = XGBRFClassifier(
    use_label_encoder=False,
    max_depth=15,
    eval_metric="mlogloss",
)
xgboost = make_pipeline(StandardScaler(), xgb_rf_classifier)

printClassificationResults([xgboost])
createSubmissionFile([xgboost], 'xgboost.csv')

In [None]:
# Testing the majority-vote ensemble of Random Forest, Neural Network and XGBoost.
ensemble_models = [random_forest, neural_network, xgboost]
printClassificationResults(ensemble_models)
createSubmissionFile(ensemble_models, 'ensemble.csv')