In [1]:
import json
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

np.random.seed(503)

In [2]:
# General parameters for the script
target_name = 'RegisteredInTargetPeriod'  # Target variable
features = [
    'DaysSinceLastRegistration', 'DaysSinceFirstRegistration',
    'PastRegistrations', 'LastDonationLocation_Center',
    'LastDonationType_Platelets', 'CenterRegistrationProportion', 'DonationsPerDay',
    'PlateletRegistrationProportion'
]

# Decide whether we're loading a subset or the full set
# dataset_size = 'partial'
dataset_size = 'full'

# Input CSV names ('X', 'y') and output model path ('model') for each supported dataset size
file_names_by_size = {
    'full': {
        'X': 'X_train_full.csv',
        'y': 'y_train_full.csv',
        'model': '../../models/classifier_full.pkl'
    },
    'partial': {
        'X': 'X_train.csv',
        'y': 'y_train.csv',
        'model': '../../models/classifier.pkl'
    },
}

# Fail here, with a clear message, rather than with a NameError on `file_names`
# in a later cell when dataset_size is mistyped.
if dataset_size not in file_names_by_size:
    raise ValueError(
        "Unknown dataset_size {0!r}; expected one of {1}".format(
            dataset_size, sorted(file_names_by_size)
        )
    )

file_names = file_names_by_size[dataset_size]

In [3]:
# Load data
processed_dir = '../../data/processed'

# Per-column dtypes for the non-date columns; handed to read_csv below
with open('{0}/dtypes.json'.format(processed_dir)) as dtypes_file:
    non_date_dtypes = json.load(dtypes_file)

# Date-column metadata; only its top-level entries are kept, as date_cols
with open('{0}/date_types.json'.format(processed_dir)) as date_types_file:
    date_dtypes = json.load(date_types_file)

date_cols = list(date_dtypes)

# Read the training matrix and target, using the first CSV column as the index.
# NOTE(review): neither `date_cols` nor `features` is applied in this cell, so no
# date parsing or column selection happens here -- every column in the X file is
# loaded. Confirm the CSV already contains exactly the intended model features.
x_path = '{0}/{1}'.format(processed_dir, file_names['X'])
y_path = '{0}/{1}'.format(processed_dir, file_names['y'])

X_train = pd.read_csv(x_path, dtype=non_date_dtypes, index_col=0)
# squeeze() collapses a single-column frame into a Series
y_train = pd.read_csv(y_path, index_col=0).squeeze()

In [4]:
# Optional filter, currently disabled: keep only rows with DaysSinceLastRegistration < 1200
# X_train = X_train[X_train['DaysSinceLastRegistration'] < 1200]

In [5]:
# Train the classifier with the hyper-parameters chosen via cross-validation.
# Alternatives that were tried, kept for reference:
#   clf = RandomForestClassifier(max_depth=15, n_estimators=150, verbose=1, n_jobs=8)
#   clf = AdaBoostClassifier(learning_rate=1, n_estimators=100)
#   clf = LogisticRegression(penalty='l2')
xgb_params = {
    'learning_rate': 0.5,
    'n_estimators': 100,
    'max_depth': 5,
    'n_jobs': 8,
    'tree_method': 'hist',
    'verbosity': 2,  # source of the per-tree INFO lines in the cell output
}
clf = XGBClassifier(**xgb_params)

# Left as the last expression so the fitted estimator's repr is the cell output
clf.fit(X_train, y_train)

[20:25:01] INFO: src/learner.cc:215: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[20:25:08] INFO: src/tree/updater_quantile_hist.cc:63: Generating gmat: 6.68585 sec
[20:25:09] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[20:25:11] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[20:25:12] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[20:25:14] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[20:25:15] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[20:25:17] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[20:25:18] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 ext

[20:26:16] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[20:26:17] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[20:26:17] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[20:26:18] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[20:26:19] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[20:26:20] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 0 pruned nodes, max_depth=5
[20:26:21] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[20:26:22] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[20:26:22] INFO: src/tree/updater_prune.cc:74: tree prun

XGBClassifier(learning_rate=0.5, max_depth=5, n_jobs=8, tree_method='hist',
              verbosity=2)

In [6]:
# Save model
model_path = file_names['model']

# Create the target directory first: a missing folder would otherwise make the
# dump fail only after the fit above has already completed.
model_dir = os.path.dirname(model_path)
if model_dir:  # dirname is '' for a bare file name; makedirs('') would raise
    os.makedirs(model_dir, exist_ok=True)

# Left as the last expression so the list of written files is the cell output
joblib.dump(clf, model_path)

['../../models/classifier_full.pkl']