In [None]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Allow up to 500 columns when pandas renders a frame.
pd.set_option('display.max_columns', 500)

# Class index -> human-readable outcome label (used for plot legends).
cl_dict = dict(enumerate([
    'Critical condition',
    'Deceased',
    'Hospitalized',
    'Recovered',
    'Recovered (hospitalized)',
]))

def print_full(x):
    """Print ``x`` with pandas' row limit raised so that no rows are elided.

    Parameters
    ----------
    x : object
        Anything printable. When ``len(x)`` is defined it is used as the
        temporary ``display.max_rows``; otherwise a large fallback is used.

    Notes
    -----
    The ``display.max_rows`` value that was active before the call is
    restored afterwards, even if printing raises.
    """
    try:
        row_limit = len(x)
    except TypeError:
        # Objects without a length (e.g. scalars) get a generous fallback.
        row_limit = 1000000
    # option_context restores the previous setting on exit, including on error.
    with pd.option_context('display.max_rows', row_limit):
        print(x)

In [3]:
# Load the preprocessed dataset.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df = pd.read_pickle('./processed-data.pkl')

# 'outcome' is the target; every other column is used as a feature.
y = df['outcome'].values
X = df.drop(columns=['outcome']).values

# Stratified 90/10 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=0,
)

In [None]:
# 8-fold cross-validated accuracy of the random forest on the full dataset (X, y).
# max_depth=2 is the value int(2.038281164266072) evaluated to in the original cell
# (presumably the float came from a hyperparameter search -- TODO confirm).
# random_state pins the forest's internal randomness so the score is reproducible.
rf = RandomForestClassifier(n_estimators=170, max_depth=2, random_state=0)
rf_score = model_selection.cross_val_score(rf, X, y, cv=8, scoring='accuracy')

print('RandomForestClassifier:', rf_score.mean(), rf_score.std())

# Earlier unseeded run with n_estimators=170, max_depth=2:
# ~0.662 mean accuracy, negligible s.d., quick to run.

In [None]:
# Fit the forest on the training split only; X_test / y_test stay held out.
# max_depth=2 is what int(2.038281164266072) evaluated to; random_state makes the
# fitted model (and anything derived from it) reproducible across runs.
clf = RandomForestClassifier(n_estimators=170, max_depth=2, random_state=0)
clf.fit(X_train, y_train)

In [None]:
def plot_multiclass_roc(clf, X_test, y_test, n_classes, figsize=(17, 6), title='RandomForest ROC'):
    """Plot one-vs-rest ROC curves, one per class, for a fitted classifier.

    Parameters
    ----------
    clf : fitted estimator
        Must implement ``predict_proba``. If it exposes ``classes_``, that
        attribute is used to match each probability column to its class label.
    X_test : array-like
        Features passed to ``clf.predict_proba``.
    y_test : array-like
        True class labels for ``X_test``.
    n_classes : int
        Number of probability columns (classes) to draw.
    figsize : tuple, default (17, 6)
        Figure size forwarded to ``plt.subplots``.
    title : str, default 'RandomForest ROC'
        Title of the axes.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    y_score = clf.predict_proba(X_test)
    y_true = np.asarray(y_test)

    # Binarise against the classifier's own class list instead of the labels that
    # happen to occur in y_test: if a class is missing from the test split, dummies
    # built from y_test would have fewer columns and no longer line up with y_score.
    # Falls back to 0..n_classes-1 when the estimator has no `classes_` attribute.
    class_labels = np.asarray(getattr(clf, 'classes_', np.arange(n_classes)))

    # Per-class false/true positive rates and the resulting area under the curve.
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        is_class = (y_true == class_labels[i]).astype(int)
        fpr[i], tpr[i], _ = metrics.roc_curve(is_class, y_score[:, i])
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])

    fig, ax = plt.subplots(figsize=figsize)
    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    for i in range(n_classes):
        # Prefer the readable name from cl_dict; fall back to the raw class label.
        class_name = cl_dict.get(class_labels[i], class_labels[i])
        ax.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f) for %s' % (roc_auc[i], class_name))
    ax.legend(loc="best")
    ax.grid(alpha=.4)
    sns.despine(ax=ax)
    plt.show()

# One ROC curve per outcome class listed in cl_dict (5 entries).
plot_multiclass_roc(clf, X_test, y_test, n_classes=len(cl_dict))

In [4]:
import xgboost as xgb

# 5-fold cross-validated accuracy of a gradient-boosted model on the full dataset.
# The long float hyperparameters are presumably the result of an automated search
# -- TODO confirm and record where they came from.
xgb_clf = xgb.XGBClassifier(
    gamma=0.0642050514904769,
    learning_rate=0.8997456965067644,
    max_depth=6,
    min_child_weight=20,
    n_estimators=478,
    n_jobs=4,
    objective="multi:softprob",
    reg_alpha=0.7141048442066745,
    reg_lambda=0.48953584540314277,
    # NOTE(review): only relevant on XGBoost releases that still ship the label
    # encoder; confirm against the installed version.
    use_label_encoder=False,
    subsample=0.6305093851211034,
    # `verbosity=0` is the supported way to silence XGBoost; the removed `silent`
    # flag that used to accompany it was redundant and has been dropped.
    verbosity=0,
    tree_method="hist",
)

xgb_score = model_selection.cross_val_score(xgb_clf, X, y, cv=5, scoring='accuracy')
print('XGBClassifier:', xgb_score.mean(), xgb_score.std())

XGBClassifier: 0.610825385650932 0.07351635905078435
