In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, train
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import seaborn

seaborn.set()

In [None]:
# Load the processed training set and the unlabelled test set from their Excel workbooks.
train_data, test_data = (
    pd.read_excel(workbook)
    for workbook in (
        "data/cases_2021_train_processed.xlsx",
        "data/cases_2021_test_processed_unlabelled.xlsx",
    )
)

# 1.2: Mapping the features

1. Converted unnecessary float values to integers
2. Categorical values that are binary in nature converted to 0's and 1's
3. One-hot encoding done on 'province' and 'country'

In [None]:
# Cast features with floats to integers.
# pd.to_numeric is applied column by column (the default axis of DataFrame.apply)
# rather than row by row: it runs once per column instead of once per row, and each
# column is downcast to an integer dtype on its own. A column that cannot be
# downcast keeps its original dtype.
cols = ['age', 'Confirmed', 'Deaths', 'Recovered', 'Active']
train_data[cols] = train_data[cols].apply(pd.to_numeric, downcast='integer')
test_data[cols] = test_data[cols].apply(pd.to_numeric, downcast='integer')

In [None]:
# Lookup tables that turn the categorical text values into integer codes.
outcome_groups = dict(deceased=0, hospitalized=1, nonhospitalized=2)
sex = dict(male=0, female=1)

In [None]:
# Encode the outcome_group labels as 0, 1, 2 using the outcome_groups lookup table.
encoded_outcome = train_data['outcome_group'].map(outcome_groups)
train_data['outcome_group'] = encoded_outcome

In [None]:
# Encode sex as 0 (male) / 1 (female) in both the training and the test data.
for frame in (train_data, test_data):
    frame['sex'] = frame['sex'].map(sex)

In [None]:
# Fill in province for Philippines
# NOTE(review): every missing province is replaced, whatever the row's country;
# presumably only Philippines rows lack a province - verify against the data.
for frame in (train_data, test_data):
    frame['province'] = frame['province'].fillna('Philippines')

In [None]:
# Cast chronic_disease_binary to int in both the training and the test data.
for frame in (train_data, test_data):
    frame['chronic_disease_binary'] = frame['chronic_disease_binary'].astype(int)

In [None]:
# One-hot encode province and country in both datasets.
dummy_cols = ['province', 'country']
train_data, test_data = (
    pd.get_dummies(frame, columns=dummy_cols) for frame in (train_data, test_data)
)

# Align the test columns with the training columns: columns present only in the
# training data are added to the test data filled with 0, and columns present only
# in the test data are discarded. The label column picked up from the training
# layout is then removed again.
test_data = (
    test_data
    .reindex(columns=train_data.columns, fill_value=0)
    .drop(columns='outcome_group')
)

In [None]:
# Encode date_confirmation as a YYYYMMDD integer stored in a new column.
for frame in (train_data, test_data):
    date_text = frame['date_confirmation'].dt.strftime("%Y%m%d")
    frame['date_confirmation_int'] = date_text.astype(int)

In [None]:
# Remove unnecessary features (the raw date and the case-count columns) from both datasets.
unused_features = ['date_confirmation', 'Confirmed', 'Deaths', 'Recovered', 'Active']
for frame in (train_data, test_data):
    frame.drop(columns=unused_features, inplace=True)

In [None]:
# Plot the distribution of outcome groups.
# Bar positions and bar heights are both taken from one Series so that each count
# stays attached to its own label: unique() returns labels in order of first
# appearance while value_counts() orders them by frequency, so pairing the two
# directly can put a count under the wrong label.
outcome_counts = train_data['outcome_group'].value_counts().sort_index()
plt.bar(outcome_counts.index, outcome_counts.values)
plt.locator_params(axis='x', nbins=4)
plt.title('Distribution of outcome groups')
plt.show()

In [None]:
# Separate the label column (y) from the feature columns (X).
y = train_data['outcome_group']
X = train_data.drop(columns=['outcome_group'])

In [None]:
# Train the model/models using 5-fold validation, and print the validation scores.
def printClassificationResults(models):
    """Cross-validate a majority-vote ensemble and print its classification report.

    Uses the notebook-level ``X`` and ``y``. Each of the 5 shuffled folds
    (``random_state=1``) refits every model on the training part and predicts
    the held-out part; the per-row vote is the row-wise mode of the model
    predictions (when labels tie, the first mode returned by pandas is used).
    The held-out labels and votes of all folds are pooled into one report.

    Args:
        models: sequence of estimators exposing ``fit`` and ``predict``.
            They are refitted in place.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=1)
    all_labels, all_predictions = [], []

    for fit_idx, val_idx in splitter.split(X):
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        # One column of predictions per model, one row per held-out sample.
        votes = pd.DataFrame(index=range(len(y_val)))
        for column, estimator in enumerate(models):
            estimator.fit(X_fit, y_fit)
            votes[column] = estimator.predict(X_val)

        majority = votes.mode(axis=1)[0].astype(int)
        all_labels.extend(y_val)
        all_predictions.extend(majority)

    print(classification_report(all_labels, all_predictions))

In [None]:
# Train the model/models using 5-fold validation, and return the macro f1-score
def getMacroF1(models):
    """Cross-validate a majority-vote ensemble and return its macro-averaged F1.

    Works on the notebook-level ``X`` and ``y`` with 5 shuffled folds
    (``random_state=1``). In every fold each model is refitted and its
    held-out predictions become one vote column; the row-wise mode of the
    votes is the ensemble prediction. Labels and predictions of all folds
    are pooled before the score is computed.

    Args:
        models: sequence of estimators exposing ``fit`` and ``predict``.
            They are refitted in place.

    Returns:
        The ``'f1-score'`` entry of the ``'macro avg'`` row of the
        classification report.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=1)
    pooled_labels = []
    pooled_predictions = []

    for train_index, test_index in folds.split(X):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # Collect the held-out predictions of every model, in model order.
        fold_votes = []
        for model in models:
            model.fit(X_train, y_train)
            fold_votes.append(model.predict(X_test))

        ensemble_predictions = pd.DataFrame(dict(enumerate(fold_votes)), index=range(len(y_test)))
        predictions = ensemble_predictions.mode(axis=1)[0].astype(int)

        pooled_labels += list(y_test)
        pooled_predictions += list(predictions)

    scores = classification_report(pooled_labels, pooled_predictions, output_dict=True)
    macro_scores = scores['macro avg']
    return macro_scores['f1-score']

In [None]:
# Train the model/models on the full training dataset, and create a submission file with the predictions for the test dataset
def createSubmissionFile(models, filename):
    """Fit the models on all training data and save majority-vote test predictions.

    Every model is refitted on the notebook-level ``X`` and ``y`` and predicts
    ``test_data``; the row-wise mode of those predictions is written as the
    ``Prediction`` column of a CSV whose index is named ``Id``.

    Args:
        models: sequence of estimators exposing ``fit`` and ``predict``.
            They are refitted in place.
        filename: name of the CSV file, created inside ``submissions/``.
    """
    votes = pd.DataFrame(index=range(len(test_data)))
    for column, estimator in enumerate(models):
        estimator.fit(X, y)
        votes[column] = estimator.predict(test_data)

    majority = votes.mode(axis=1)[0].astype(int)
    submission = pd.DataFrame({'Prediction': majority}).rename_axis('Id')
    output_path = 'submissions/{filename}'.format(filename=filename)
    submission.to_csv(output_path)

## Random Forest
Best params: max_depth=14, max_samples=0.56, min_samples_leaf=1, min_samples_split=5, class_weight='balanced', random_state=1

In [None]:
# Hyper-parameter grid for the grid search over the pipeline (the search itself is
# commented out at the end of this cell).
params = {
    "randomforestclassifier__max_depth": list(range(13, 16)),
    "randomforestclassifier__min_samples_split": list(range(5, 8)),
    "randomforestclassifier__min_samples_leaf": list(range(1, 3)),
    'randomforestclassifier__max_samples': [pct / 100 for pct in range(33, 38)]
}

# Evaluate the forest with the chosen parameters, then write its test predictions.
forest = RandomForestClassifier(
    max_depth=14,
    max_samples=0.56,
    min_samples_leaf=1,
    min_samples_split=5,
    class_weight='balanced',
    random_state=1,
)
random_forest = make_pipeline(StandardScaler(), forest)
printClassificationResults([random_forest])
createSubmissionFile([random_forest], 'random_forest.csv')

#search = GridSearchCV(random_forest, params, scoring='f1_macro')
#search.fit(X, y)
#print("Best parameter (CV score=%0.3f):" % search.best_score_)
#print(search.best_params_)

In [None]:
# Plot macro F1-score for various values of max_samples (0.1 to 1.0 in steps of 0.1)
plotX = [pct / 100 for pct in range(10, 101, 10)]
plotY = []
for max_samples in plotX:
    forest = RandomForestClassifier(
        max_depth=14,
        max_samples=max_samples,
        min_samples_leaf=1,
        min_samples_split=5,
        class_weight='balanced',
        random_state=1,
    )
    random_forest = make_pipeline(StandardScaler(), forest)
    macroF1 = getMacroF1([random_forest])
    plotY.append(macroF1)

plt.plot(plotX, plotY)
plt.gca().set(
    xlabel='Max_Samples',
    ylabel='Macro F1-Score',
    title='Random Forest Macro F1-Score vs Max_Samples',
)
plt.savefig('plots/random_forest_max_samples.png')

In [None]:
# Plot macro F1-score for various values of max_depth (5 to 50 in steps of 5)
plotX = list(range(5, 51, 5))
plotY = []
for max_depth in plotX:
    depth_limited_forest = RandomForestClassifier(
        max_depth=max_depth,
        max_samples=0.56,
        min_samples_leaf=1,
        min_samples_split=5,
        class_weight='balanced',
        random_state=1,
    )
    random_forest = make_pipeline(StandardScaler(), depth_limited_forest)
    macroF1 = getMacroF1([random_forest])
    plotY.append(macroF1)

plt.plot(plotX, plotY)
depth_axes = plt.gca()
depth_axes.set_xlabel('Max_Depth')
depth_axes.set_ylabel('Macro F1-Score')
depth_axes.set_title('Random Forest Macro F1-Score vs Max_Depth')
plt.savefig('plots/random_forest_max_depth.png')

## XGBoost

In [None]:
# Testing XGBoost

# Base model: XGBoost with its default hyper-parameters behind a standard scaler.
base_xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgboost = make_pipeline(StandardScaler(), base_xgb)
printClassificationResults([xgboost])
# createSubmissionFile(xgboost, 'xgboost.csv')

In [None]:
import warnings

# Silence every warning raised from here on.
warnings.filterwarnings("ignore")

### tuning XGBoost
# Candidate values for the search; the "xgbclassifier__" prefix addresses the
# XGBClassifier step of a pipeline built with make_pipeline.
xg_params = dict(
    xgbclassifier__min_child_weight=[3, 5, 7],
    xgbclassifier__gamma=[0.6, 1, 1.3],
    xgbclassifier__subsample=[0.3, 0.5, 0.7],
    xgbclassifier__colsample_bytree=[0.6, 0.8],
    xgbclassifier__max_depth=[4, 5],
    xgbclassifier__eta=[0.1, 0.3, 0.35],
)

In [None]:
# Searching parameters

# Best params: eta=0.3222, max_depth=4, subsample=0.7, min_child_weight=5, gamma=1, colsample_bytree=0.6
xgboost_cv = make_pipeline(StandardScaler(), XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))

# Exhaustive search over xg_params; constructed but not fitted in this cell.
xgboost_grid = GridSearchCV(xgboost_cv, xg_params)

# Randomized search over 10 sampled combinations of xg_params.
# `xgboost_random` has to be defined here: this cell fits it and the following
# cells read its best_params_ and cv_results_. With its definition commented out,
# the fit below raised NameError on a fresh kernel. random_state fixes which
# combinations are sampled so the search is repeatable.
xgboost_random = RandomizedSearchCV(xgboost_cv, xg_params, refit=True, n_iter=10, random_state=1)
xgboost_random.fit(X, y)

In [None]:
# Display the best parameter combination found by the fitted search.
xgboost_random.best_params_

In [None]:
# Save the per-candidate cross-validation results of the search as a CSV table.
xg_cv_results = pd.DataFrame(data=xgboost_random.cv_results_)
xg_cv_results.to_csv(path_or_buf='results/xgboost_grid_tuning.csv')

In [None]:
# Predict on test data

# XGBoost with the tuned hyper-parameters: evaluate it, then write its test predictions.
tuned_xgb = XGBClassifier(
    eta=0.3222,
    max_depth=4,
    subsample=0.7,
    min_child_weight=5,
    gamma=1,
    colsample_bytree=0.6,
    use_label_encoder=False,
    eval_metric='mlogloss',
)
xgboost_cv = make_pipeline(StandardScaler(), tuned_xgb)
printClassificationResults([xgboost_cv])
createSubmissionFile([xgboost_cv], 'xgboost.csv')

### SUPPORT VECTOR MACHINE

In [None]:
# Testing Support Vector Machine

# Baseline scores for a linear-kernel SVM on standardized features.
linear_svc = SVC(kernel='linear')
svlinear = make_pipeline(StandardScaler(), linear_svc)
printClassificationResults([svlinear])

In [None]:
# Testing SVM with GridsearchCV
# Kernel
# C - penalty on misclassified points; a higher C gives a narrower margin (less regularization)
# Gamma - high gamma will cause overfitting

# SVM best params for 'svc__C':[0.1,1,10],'svc__kernel':['linear'],'svc__gamma': [1, 0.1, 0.01, 0.001, 0.0001]
# {'svc__C': 1, 'svc__gamma': 1, 'svc__kernel': 'linear'}

# svm_tuned = make_pipeline(StandardScaler(), SVC(C=0.5, gamma=1, kernel='linear', class_weight='balanced'))
# svm_grid = GridSearchCV(svm_cv, svm_params)
# printClassificationResults([svm_tuned])
# createSubmissionFile(svm_tuned, 'linear_svm.csv')


In [None]:
#GridSearchCV on SVM
# svm_cv = make_pipeline(StandardScaler(), SVC())
# svm_params = { 'svc__C':[0.1,0.5,1, 1.5, 2],'svc__kernel':['linear'],'svc__gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'svc__class_weight':['balanced']}
# svm_grid = GridSearchCV(svm_cv, svm_params)

In [None]:
# svm_grid.fit(X,y)

In [None]:
# svm_cv_results = pd.DataFrame(svm_grid.cv_results_)
# svm_cv_results.to_csv('results/support_vector_machine.csv')

## Neural Network

In [None]:
# Manually balance the minority class by appending a second copy of every
# outcome_group == 0 row to the training data.
# NOTE(review): the copies are added before the K-fold split done inside the
# scoring helpers, so a row and its copy can land in both the training and the
# validation part of a fold - confirm this is intended.
is_minority = train_data['outcome_group'] == 0
balanced = pd.concat([train_data, train_data[is_minority]])

# Rebind the notebook-level X / y that the helper functions read.
y = balanced['outcome_group']
X = balanced.drop(columns='outcome_group')

# Add the age bucket feature: age divided by 10, rounded up.
for features in (X, test_data):
    features['age_group'] = np.ceil(features['age'] / 10)

In [None]:
# Create a Bagging Classifier of MLP Classifiers (5 scaled MLP pipelines).
neural_network = make_pipeline(StandardScaler(),
                               MLPClassifier(activation='relu', solver='adam', max_iter=500, early_stopping=True))
# The wrapped estimator is passed positionally. Its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed
# in 1.4, but it is the first positional parameter under both names, so this call
# works on old and new releases alike.
bagging = BaggingClassifier(neural_network, n_estimators=5, random_state=0)

printClassificationResults([bagging])
createSubmissionFile([bagging], 'neural_network.csv')

## Ensemble

In [None]:
# Majority vote across the three saved submissions (random forest, XGBoost, neural network).
rf = pd.read_csv('submissions/random_forest.csv')
xg = pd.read_csv('submissions/xgboost.csv')
nn = pd.read_csv('submissions/neural_network.csv')

votes = {'rf': rf['Prediction'], 'xg': xg['Prediction'], 'nn': nn['Prediction']}
ensemble_predictions = pd.DataFrame(data=votes)

# Row-wise mode; column 0 holds the first mode of each row.
predictions = ensemble_predictions.mode(axis=1)[0].astype(int)
submission = pd.DataFrame({'Prediction': predictions}).rename_axis('Id')
submission.to_csv('submissions/ensemble.csv')