In [63]:
import pandas as pd
import numpy as np
from data import CleanedData
from models import nn
import sklearn.metrics
import torch
import matplotlib.pyplot as plt

# For Adaboost
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import pickle

# For Adaboost Tuning
from sklearn import preprocessing
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

# For KNN
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Build the project's CleanedData object from the gzipped joined CSV.
# The keyword flags are forwarded unchanged; their exact effects are defined
# in the project's data module (not visible in this notebook).
data = CleanedData(
    loc='./data/joined.csv.gz',
    test_ratio=0.2,
    impute_data=True,
    convert_non_numerical=True,
    normalize_data=True,
    drop_columns=[
        'source',
        'Last_Update',
        'province',
        'country',
        'latitude',
        'longitude',
    ],
)

  return f(**kwargs)


In [3]:
# Unpack data.test_data into two names
# (presumably test features and test labels -- confirm against the data module).
(X_test, y_test) = data.test_data

In [48]:
# Feature matrix: data.data without the target column and three further
# columns, converted to a NumPy array.
# NOTE(review): data.data may still contain the rows exposed via
# data.test_data -- confirm, otherwise the search below sees test rows.
X = (
    data.data
    .drop(['outcome', 'sex', 'date_confirmation', 'Combined_Key'], axis='columns')
    .to_numpy()
)

# Target vector: the 'outcome' column as a NumPy array.
y = data.data['outcome'].to_numpy()

In [13]:
# Display the feature matrix built above as the cell output.
X

array([[ 43.02504491,  22.694884  ,  71.590923  , ...,   0.        ,
        169.        ,   2.        ],
       [ 94.        , -11.766533  , -76.604498  , ...,   0.        ,
        238.        ,   0.        ],
       [ 40.59487903,  22.694884  ,  71.590923  , ...,   0.        ,
        169.        ,   2.        ],
       ...,
       [ 39.        , -11.00523373, -76.15759133, ...,   0.        ,
        352.        ,   0.        ],
       [ 66.        ,   6.02517268, -74.6520927 , ...,   0.        ,
        102.        ,   0.        ],
       [ 27.77693403,  20.06780113,  79.02780697, ...,   0.        ,
        195.        ,   2.        ]])

In [49]:
# Display the target vector as the cell output.
y

array([1, 2, 1, ..., 2, 2, 1])

In [37]:
def deceased_recall(y_true, y_pred):
    """Return the recall of the first class reported by ``recall_score``.

    ``recall_score`` is called with ``average=None`` so that it yields one
    recall value per class; the value at index 0 is returned.

    NOTE(review): this assumes the class at index 0 is the 'deceased'
    outcome -- confirm against the label encoding used for ``outcome``.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.

    Returns:
        The recall value at index 0 of the per-class recall array.
    """
    per_class_recall = recall_score(y_true, y_pred, average=None)
    first_class_recall = per_class_recall[0]
    return first_class_recall

In [99]:
def overall_recall(y_true, y_pred):
    """Return the micro-averaged recall of ``y_pred`` against ``y_true``.

    NOTE(review): for single-label multiclass targets, micro-averaged recall
    is expected to equal plain accuracy -- confirm whether a separate
    'Accuracy' scorer next to this one is intentional.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.

    Returns:
        The value of ``recall_score`` computed with ``average='micro'``.
    """
    micro_recall = recall_score(y_true, y_pred, average='micro')
    return micro_recall

In [100]:
# Candidate hyper-parameters for the AdaBoost search.
# NOTE(review): the recorded CV results further down show much lower accuracy
# for learning_rate 5 and 10 -- consider whether values above 1 belong here.
param_grid = [
    {
        'n_estimators': [10, 50, 100],
        'learning_rate': [1, 2, 5, 10],
    }
]

# Metrics recorded for every candidate; the keys become the suffixes of the
# corresponding cv_results_ columns.
scoring = {
    'Accuracy': make_scorer(accuracy_score),
    'Overall_Recall': make_scorer(overall_recall),
    'deceased_Recall': make_scorer(deceased_recall),
}

# 4-fold search using all available cores; the final estimator is refit using
# the 'Overall_Recall' metric.
# NOTE(review): AdaBoostClassifier() is created without a random_state, so
# repeated runs may not reproduce identical scores -- confirm if that matters.
grid = GridSearchCV(
    estimator=AdaBoostClassifier(),
    param_grid=param_grid,
    scoring=scoring,
    refit='Overall_Recall',
    n_jobs=-1,
    cv=4,
)

In [101]:
# Run the cross-validated search on the full feature matrix and targets;
# the returned search object is shown as the cell output.
grid.fit(X, y)

GridSearchCV(cv=4, estimator=AdaBoostClassifier(), n_jobs=-1,
             param_grid=[{'learning_rate': [1, 2, 5, 10],
                          'n_estimators': [10, 50, 100]}],
             refit='Overall_Recall',
             scoring={'Accuracy': make_scorer(accuracy_score),
                      'Overall_Recall': make_scorer(overall_recall),
                      'deceased_Recall': make_scorer(deceased_recall)})

In [108]:
# Tabulate the per-candidate cross-validation results and display the table.
results = pd.DataFrame(data=grid.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_Accuracy,split1_test_Accuracy,split2_test_Accuracy,...,mean_test_Overall_Recall,std_test_Overall_Recall,rank_test_Overall_Recall,split0_test_deceased_Recall,split1_test_deceased_Recall,split2_test_deceased_Recall,split3_test_deceased_Recall,mean_test_deceased_Recall,std_test_deceased_Recall,rank_test_deceased_Recall
0,21.244448,0.233944,1.387744,0.026093,1,10,"{'learning_rate': 1, 'n_estimators': 10}",0.809528,0.811665,0.766408,...,0.788615,0.021995,2,0.006,0.0,0.0,0.0,0.0015,0.002598,11
1,105.294025,1.605846,6.334218,0.099496,1,50,"{'learning_rate': 1, 'n_estimators': 50}",0.807282,0.783747,0.72166,...,0.745508,0.053964,5,0.001333,0.008005,0.008,0.326,0.085835,0.138686,8
2,210.577762,4.673932,12.694044,0.103358,1,100,"{'learning_rate': 1, 'n_estimators': 100}",0.803959,0.733624,0.701702,...,0.70425,0.081888,6,0.020667,0.038025,0.034667,0.582,0.16884,0.238627,6
3,21.124957,0.405476,1.359465,0.028504,2,10,"{'learning_rate': 2, 'n_estimators': 10}",0.812693,0.812706,0.812763,...,0.783235,0.051072,3,0.0,0.0,0.0,0.0,0.0,0.0,12
4,104.859853,2.201875,6.700093,0.222943,2,50,"{'learning_rate': 2, 'n_estimators': 50}",0.812693,0.812706,0.812763,...,0.797158,0.026956,1,0.0,0.0,0.0,0.081333,0.020333,0.035218,10
5,210.009666,4.477447,12.758247,0.264183,2,100,"{'learning_rate': 2, 'n_estimators': 100}",0.812693,0.812706,0.812763,...,0.781955,0.053287,4,0.0,0.0,0.0,0.500667,0.125167,0.216795,7
6,18.740341,1.532921,1.372905,0.044796,5,10,"{'learning_rate': 5, 'n_estimators': 10}",0.172774,0.361038,0.340103,...,0.409146,0.216755,7,0.014,0.0,0.095333,0.018667,0.032,0.037205,9
7,87.542291,5.480122,6.320621,0.07958,5,50,"{'learning_rate': 5, 'n_estimators': 50}",0.455202,0.277234,0.389817,...,0.352781,0.073577,8,0.056,0.043362,0.038667,0.742667,0.220174,0.301728,5
8,114.280915,33.700312,11.08394,1.352745,5,100,"{'learning_rate': 5, 'n_estimators': 100}",0.043455,0.454014,0.410121,...,0.350156,0.179487,9,0.809333,0.006004,0.488,0.272667,0.394001,0.294364,1
9,11.469687,1.132709,1.395216,0.036918,10,10,"{'learning_rate': 10, 'n_estimators': 10}",0.026978,0.007658,0.010765,...,0.124459,0.189499,10,0.0,0.0,1.0,0.375333,0.343833,0.408653,2


In [111]:
# Write the results table to a CSV file in the current working directory.
results.to_csv(path_or_buf="output.csv")