In [1]:
%load_ext autoreload

%autoreload 2 
from datetime import timedelta
import pandas as pd
import numpy as np
from data import CleanedData
import sklearn
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import pickle
import matplotlib.pyplot as plts
import time
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [14]:
X = data.data.drop(columns=['outcome', 'sex', 'date_confirmation', 'Combined_Key']).to_numpy()
y = data.data['outcome'].to_numpy()

In [15]:
def overall_recall(y_true, y_pred):
    score = recall_score(y_true, y_pred, average='macro')
    return score

def deceased_recall(y_true, y_pred):
    score = recall_score(y_true, y_pred, average=None)
    return score[0]

In [16]:
grid_params = {
    'n_neighbors': [38,40,42,44],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

## Overall_Recall
#gs = GridSearchCV(
#    KNeighborsClassifier(),
#    grid_params,
#    verbose = 1,
#    cv = 4,
#    n_jobs= -1,
#    scoring=make_scorer(overall_recall, greater_is_better=True)
#    )

# Accuracy
#gs = GridSearchCV(
#    KNeighborsClassifier(),
#    grid_params,
#    verbose = 1,
#    cv = 4,
#    n_jobs= -1,
#    scoring='accuracy'
#    )

scoring = make_scorer(deceased_recall)

# Deceased Recall
gs = GridSearchCV(
    KNeighborsClassifier(),
    grid_params,
    verbose = 1,
    cv = 4,
    n_jobs= -1,
    scoring=scoring,
    refit='deceased_recall'
    )

gs_results = gs.fit(X_train, y_train)
gs_results = gs.fit(X, y)

Fitting 4 folds for each of 24 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed: 14.6min finished


Fitting 4 folds for each of 24 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed: 21.7min finished


In [17]:
gs_results.best_score_

0.02917233711363131

In [18]:
gs_results.best_estimator_

KNeighborsClassifier(metric='euclidean', n_neighbors=38, weights='distance')

In [19]:
gs_results.best_params_

{'metric': 'euclidean', 'n_neighbors': 38, 'weights': 'distance'}

In [20]:
gs_results.cv_results_

{'mean_fit_time': array([64.58770877, 60.2518034 , 59.24603933, 51.14301759, 50.15597928,
        56.9834137 , 50.97933781, 51.89557314, 53.83034408, 54.06148022,
        57.16432977, 65.94005066, 59.06041199, 57.58617723, 57.99922138,
        53.23973244, 52.86612064, 56.48976964, 52.7183131 , 52.5220204 ,
        54.38946033, 49.96923524, 49.87065458, 51.14834929]),
 'std_fit_time': array([ 8.18199606, 14.7865112 ,  7.86131595, 11.68970707, 15.69562905,
        10.35993335, 12.04552228, 12.75817545, 10.15747786, 13.22553548,
        16.26622019, 11.64051829, 12.18236146, 14.03997585,  8.64195733,
        11.22297248, 14.08947219,  9.29974242, 11.42180216, 13.2934202 ,
         9.44331187, 11.47998926, 14.27770746,  3.10663458]),
 'mean_score_time': array([20.1757769 , 22.96391505, 24.03899449, 25.31346542, 24.4475736 ,
        26.79005581, 25.96347296, 24.53499389, 25.15102512, 27.05219209,
        26.30720055, 29.20148289, 28.32852328, 26.66580778, 25.02888513,
        25.28358138, 

In [21]:
compression_opts = dict(method='zip',
                        archive_name='out.csv')

pd.DataFrame(gs_results.cv_results_).to_csv('out.zip', index=False,
          compression=compression_opts)