In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

np.random.seed(503)

In [2]:
# General parameters for the script
target_name = 'RegisteredInTargetPeriod'  # Target variable
features = [
    'DaysSinceLastRegistration', 'DaysSinceFirstRegistration',
    'PastRegistrations', 'LastDonationLocation_Center',
    'LastDonationType_Platelets', 'CenterRegistrationProportion', 'DonationsPerDay',
    'PlateletRegistrationProportion'
]

# scoring = 'precision'  # Precision: TP / (TP + FN), or % of predicted positives that are actually positive
# scoring = 'balanced_accuracy'
# scoring = 'roc_auc'
scoring = 'average_precision'  # Summarizes a precision-recall curve: weighted mean of precisions achieved at each threshold for binary classification, appropriate for imbalanced data

# Decide whether we're loading a subset or the full set
# dataset_size = 'partial'
dataset_size = 'full'

if dataset_size == 'full':
    file_names = {
        'X': 'X_train_full.csv',
        'y': 'y_train_full.csv'
    }
elif dataset_size == 'partial':
    file_names = {
        'X': 'X_train.csv',
        'y': 'y_train.csv'
    }

In [3]:
# Load data
with open('../../data/processed/dtypes.json') as in_file:
    non_date_dtypes = json.load(in_file)

with open('../../data/processed/date_types.json') as in_file:
    date_dtypes = json.load(in_file)

date_cols = list(date_dtypes)

# Read data, specifically parsing date columns as dates and only picking the features + target
X_train = pd.read_csv('../../data/processed/{0}'.format(file_names['X']), dtype=non_date_dtypes, index_col=0)
y_train = pd.read_csv('../../data/processed/{0}'.format(file_names['y']), index_col=0)

In [4]:
X_train.head()

Unnamed: 0,DaysSinceLastRegistration,DaysSinceFirstRegistration,PastRegistrations,LastDonationLocation_Center,LastDonationType_Platelets,CenterRegistrationProportion,DonationsPerDay,PlateletRegistrationProportion
0,1601,1601,1,0.0,1.0,0.0,0.000625,1.0
1,980,1140,2,0.0,0.0,0.5,0.001754,0.0
2,911,1270,3,0.0,0.0,0.0,0.002362,0.0
3,626,957,4,0.0,0.0,0.0,0.00418,0.0
4,649,1007,5,0.0,0.0,0.0,0.004965,0.0


In [5]:
y_train.head()

Unnamed: 0,RegisteredInTargetPeriod
0,0
1,0
2,0
3,0
4,0


In [6]:
# This code from: http://www.davidsbatista.net/blog/2018/02/23/model_optimization/

class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [7]:
models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'GaussianNB': GaussianNB()
}

params = {
    'LogisticRegression': { 'penalty': ['none', 'l2'] },
    'RandomForestClassifier': { 'n_estimators': [32, 64] },
    'AdaBoostClassifier': { 'n_estimators': [32, 64] },
    'GradientBoostingClassifier': { 'n_estimators': [64, 128], 'learning_rate': [0.1, 0.5] },
    'KNeighborsClassifier': { 'n_neighbors': [5, 10] },
    'DecisionTreeClassifier': {},
    'GaussianNB': {}
}

In [8]:
# Run grid search CV across all estimators
helper = EstimatorSelectionHelper(models, params)
helper.fit(X_train, y_train, scoring=scoring, n_jobs=8)

Running GridSearchCV for LogisticRegression.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of   6 | elapsed:   25.3s finished
Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of   6 | elapsed: 13.0min finished
Running GridSearchCV for AdaBoostClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of   6 | elapsed:  5.1min finished
Running GridSearchCV for GradientBoostingClassifier.
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  10 out of  12 | elapsed: 23.0min 

In [9]:
helper.score_summary(sort_by='mean_score')

LogisticRegression
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
KNeighborsClassifier
DecisionTreeClassifier
GaussianNB


Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,penalty,n_estimators,learning_rate,n_neighbors
3,RandomForestClassifier,0.674052,0.799422,0.862289,0.0886498,,64.0,,
2,RandomForestClassifier,0.671488,0.79639,0.859425,0.0883201,,32.0,,
9,GradientBoostingClassifier,0.668471,0.7664,0.815526,0.0692462,,128.0,0.5,
8,GradientBoostingClassifier,0.67432,0.758469,0.800685,0.0595024,,64.0,0.5,
7,GradientBoostingClassifier,0.685567,0.747886,0.779849,0.0440715,,128.0,0.1,
6,GradientBoostingClassifier,0.688606,0.739391,0.765789,0.0359202,,64.0,0.1,
5,AdaBoostClassifier,0.671847,0.721603,0.74768,0.0351965,,64.0,,
4,AdaBoostClassifier,0.654685,0.70606,0.732076,0.0363289,,32.0,,
11,KNeighborsClassifier,0.680187,0.693262,0.69996,0.00924664,,,,10.0
12,DecisionTreeClassifier,0.564528,0.68116,0.739538,0.0824707,,,,


# Results


Partial dataset: `GradientBoostingClassifier` with `n_estimators` = 128, `learning_rate` = 0.5 had the best performance with a mean average precision of **0.674885**.

Full dataset: `RandomForestClassifier` with `n_estimators` = 64 had the best performance with a mean average precision of **0.799422**.