In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Seed NumPy's global RNG so that anything drawing from it in this process is repeatable.
# NOTE(review): the grid searches below are run with n_jobs > 1; estimators fitted in
# separate worker processes may not see this seed — confirm, or pass random_state explicitly.
np.random.seed(503)

In [2]:
# Metric used to score every grid-search candidate. Alternatives that were considered:
# scoring = 'precision'  # Precision: TP / (TP + FP), or % of predicted positives that are actually positive
# scoring = 'balanced_accuracy'
# scoring = 'roc_auc'
#
# Average precision: average of precision at all possible thresholds.
# 1) We have many donors, so we would like a model whose positive predictions are mostly
#    correct (so we don't waste scarce resources), but we don't need to return ALL condition
#    positives. Precision is a more suitable metric than recall here.
# 2) We don't necessarily care about correctly classifying any individual donor - we care
#    more about the relative likelihood of a donor compared to other donors, so we don't want
#    to pick a single decision threshold, but summarize how well the classifier orders any
#    given donor compared to others.
scoring = 'average_precision'

# Decide whether we're loading a subset or the full set
# dataset_size = 'partial'
dataset_size = 'full'

# Map the chosen dataset size to the feature (X) and target (y) file names
if dataset_size == 'full':
    file_names = {
        'X': 'X_train_full.csv',
        'y': 'y_train_full.csv'
    }
elif dataset_size == 'partial':
    file_names = {
        'X': 'X_train.csv',
        'y': 'y_train.csv'
    }
else:
    # Fail here with a clear message instead of a NameError on file_names when loading data
    raise ValueError("Unknown dataset_size: %r (expected 'full' or 'partial')" % dataset_size)

In [3]:
# Load the dtype specifications stored next to the processed data
with open('../../data/processed/dtypes.json') as in_file:
    non_date_dtypes = json.load(in_file)

with open('../../data/processed/date_types.json') as in_file:
    date_dtypes = json.load(in_file)

# Column names listed in date_types.json.
# NOTE(review): date_cols is not passed to read_csv below (there is no parse_dates argument),
# so nothing is parsed as a date in this cell — confirm the training files need no date parsing.
date_cols = [*date_dtypes]

# Read the features and the target, using the first CSV column as the index.
# The features use the dtypes loaded above; the target is squeezed after loading.
X_train = pd.read_csv(
    '../../data/processed/%s' % file_names['X'],
    index_col=0,
    dtype=non_date_dtypes,
)
y_train = pd.read_csv(
    '../../data/processed/%s' % file_names['y'],
    index_col=0,
).squeeze()

  mask |= (ar1 == a)


In [4]:
# Preview the first rows of the feature matrix as a sanity check on the load
X_train.head()

Unnamed: 0,DaysSinceLastRegistration,DaysSinceFirstRegistration,PastRegistrations,LastDonationLocation_Center,LastDonationType_Platelets,CenterRegistrationProportion,DonationsPerDay,PlateletRegistrationProportion
22582640,209,740,2,0.0,0.0,0.0,0.002703,0.0
13737826,592,592,1,0.0,0.0,0.0,0.001689,0.0
10728157,257,313,2,0.0,0.0,0.0,0.00639,0.0
13802601,153,515,4,0.0,0.0,0.0,0.007767,0.0
18546659,401,401,1,0.0,0.0,0.0,0.002494,0.0


In [5]:
# Preview the first rows of the target as a sanity check on the load
y_train.head()

22582640    0
13737826    0
10728157    0
13802601    0
18546659    0
Name: RegisteredInTargetPeriod, dtype: int64

In [6]:
# This code from: http://www.davidsbatista.net/blog/2018/02/23/model_optimization/

class EstimatorSelectionHelper:
    """Run one ``GridSearchCV`` per estimator and tabulate the per-candidate scores.

    Attributes:
        models: dict mapping an estimator name to an (unfitted) estimator instance.
        params: dict mapping an estimator name to its parameter grid.
        keys: view of the estimator names in ``models``; ``fit`` iterates over it.
        grid_searches: dict mapping an estimator name to its fitted ``GridSearchCV``;
            empty until ``fit`` has been called.
    """

    def __init__(self, models, params):
        """Store the estimators and their parameter grids.

        Args:
            models: dict of estimator name -> estimator instance.
            params: dict of estimator name -> parameter grid. It must contain an
                entry for every key of ``models`` (extra entries are allowed).

        Raises:
            ValueError: if any estimator in ``models`` has no entry in ``params``.
        """
        # Keep the order of `models` so the error message is deterministic
        missing_params = [key for key in models if key not in params]
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Fit a ``GridSearchCV`` for every estimator, one after another.

        Each fitted search is stored in ``self.grid_searches`` under the estimator's
        name. Train scores are always recorded (``return_train_score=True``).

        Args:
            X: training features, passed to ``GridSearchCV.fit``.
            y: training target, passed to ``GridSearchCV.fit``.
            cv, n_jobs, verbose, scoring, refit: forwarded unchanged to ``GridSearchCV``.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Summarize the test scores of every candidate of every fitted grid search.

        Reads the ``split<i>_test_score`` entries of each search's ``cv_results_``
        and prints each estimator name as it is processed.

        Args:
            sort_by: name of the column to sort by, in descending order.

        Returns:
            A DataFrame with one row per (estimator, parameter combination). The
            columns are ``estimator``, ``min_score``, ``mean_score``, ``max_score``
            and ``std_score`` (computed over the CV splits), followed by one column
            per grid parameter; parameters an estimator does not use are NaN.

        Raises:
            ValueError: if no grid search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError("No grid searches have been fitted; call fit() first.")

        rows = []
        for key, gs in self.grid_searches.items():
            print(key)
            results = gs.cv_results_

            # `cv` may be a splitter object or an iterable rather than an int, so take
            # the number of splits from the fitted search; only fall back to `cv` for
            # objects that do not expose `n_splits_`.
            n_splits = getattr(gs, 'n_splits_', None)
            if n_splits is None:
                n_splits = gs.cv

            # Stack to shape (n_splits, n_candidates); each column is one candidate
            split_scores = np.array([
                results["split{}_test_score".format(i)] for i in range(n_splits)
            ])

            for candidate_params, scores in zip(results['params'], split_scores.T):
                rows.append({
                    **candidate_params,
                    'estimator': key,
                    'min_score': np.min(scores),
                    'max_score': np.max(scores),
                    'mean_score': np.mean(scores),
                    'std_score': np.std(scores),
                })

        # Building from a list of dicts keeps the score columns numeric
        df = pd.DataFrame(rows).sort_values([sort_by], ascending=False)

        # Fixed summary columns first, then the grid parameters in order of appearance
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [7]:
# Seed handed to every estimator that accepts one (same value as the np.random.seed call
# in the imports cell). The grid searches are run with n_jobs > 1, so the estimators are
# fitted in worker processes; an explicit random_state keeps the stochastic estimators
# repeatable from run to run instead of relying on the global NumPy seed.
RANDOM_STATE = 503

# Estimators to compare, keyed by the name used in the score summary
models = {
    'LogisticRegression': LogisticRegression(random_state=RANDOM_STATE),
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=RANDOM_STATE),
    'XGBClassifier': XGBClassifier(random_state=RANDOM_STATE),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'GaussianNB': GaussianNB(),  # not seeded here
}

# Parameter grid for each estimator; every key of `models` needs an entry here
params = {
    # NOTE(review): newer scikit-learn releases deprecate the string 'none' in favour of
    # None — confirm against the installed version before upgrading.
    'LogisticRegression': { 'penalty': ['none', 'l2'] },  # Default params: dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1...
    'RandomForestClassifier': { 'n_estimators': [100], 'max_depth': [5] },  # Default params: criterion='gini', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0...
    'AdaBoostClassifier': { 'n_estimators': [100], 'learning_rate': [1] },  # Default params: base_estimator=None (DecisionTreeClassifier w/ max_depth=1), algorithm='SAMME.R'
    'XGBClassifier': { 'learning_rate': [0.5], 'n_estimators': [100], 'max_depth': [5], 'tree_method': ['hist'] },
    'DecisionTreeClassifier': {},  # Default params: criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1...
    'GaussianNB': {},  # sklearn implementation automatically calculates priors
}

In [8]:
# Fit one GridSearchCV per estimator in `models`, scoring every candidate with `scoring`
helper = EstimatorSelectionHelper(models=models, params=params)
helper.fit(X=X_train, y=y_train, n_jobs=8, scoring=scoring)

Running GridSearchCV for LogisticRegression.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of   6 | elapsed:  3.9min finished


Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   3 out of   3 | elapsed: 36.8min finished


Running GridSearchCV for AdaBoostClassifier.
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   3 out of   3 | elapsed: 37.8min finished


Running GridSearchCV for XGBClassifier.
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   3 out of   3 | elapsed:  5.6min finished


Running GridSearchCV for DecisionTreeClassifier.
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   3 out of   3 | elapsed:  5.7min finished


Running GridSearchCV for GaussianNB.
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   3 out of   3 | elapsed:   32.6s finished


In [9]:
# Tabulate the CV test scores of every candidate of every estimator, highest mean score first
helper.score_summary(sort_by='mean_score')

LogisticRegression
RandomForestClassifier
AdaBoostClassifier
XGBClassifier
DecisionTreeClassifier
GaussianNB


Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,penalty,max_depth,n_estimators,learning_rate,tree_method
4,XGBClassifier,0.305453,0.305535,0.305646,8.14249e-05,,5.0,100.0,0.5,hist
3,AdaBoostClassifier,0.285788,0.285867,0.285937,6.11006e-05,,,100.0,1.0,
2,RandomForestClassifier,0.283268,0.283771,0.284144,0.000369363,,5.0,100.0,,
1,LogisticRegression,0.252981,0.253321,0.253598,0.00025551,l2,,,,
0,LogisticRegression,0.252981,0.253311,0.253569,0.000245361,none,,,,
6,GaussianNB,0.181932,0.182073,0.182184,0.000104941,,,,,
5,DecisionTreeClassifier,0.142914,0.143243,0.143441,0.000234323,,,,,


# Results

## Run 1
* Partial dataset: `GradientBoostingClassifier` with `n_estimators` = 128, `learning_rate` = 0.5 had the best performance with a mean average precision of **0.674885**.
* Full dataset: `RandomForestClassifier` with `n_estimators` = 64 had the best performance with a mean average precision of **0.799422**.

## Run 2
* Full dataset: `GradientBoostingClassifier` with `n_estimators` = 128, `learning_rate` = 0.5, `mean_score` = 0.703001

