In [1]:
import json
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Seed NumPy's global random number generator with a fixed value so that
# code drawing from it produces the same values on every fresh run.
np.random.seed(503)

In [2]:
# General parameters for the script
target_name = 'RegisteredInTargetPeriod'  # Target variable
features = [
    'DaysSinceLastRegistration', 'DaysSinceFirstRegistration',
    'PastRegistrations', 'LastDonationLocation_Center',
    'LastDonationType_Platelets', 'CenterRegistrationProportion', 'DonationsPerDay',
    'PlateletRegistrationProportion'
]

# Decide whether we're loading a subset or the full set.
# Valid values are the keys of file_names_by_size below: 'full' or 'partial'.
dataset_size = 'full'

# For each dataset size: the training-data CSV names ('X', 'y') and the path
# the fitted model is written to ('model').
file_names_by_size = {
    'full': {
        'X': 'X_train_full.csv',
        'y': 'y_train_full.csv',
        'model': '../../models/classifier_full.pkl'
    },
    'partial': {
        'X': 'X_train.csv',
        'y': 'y_train.csv',
        'model': '../../models/classifier.pkl'
    },
}

# Fail here, with a clear message, rather than with a NameError on
# `file_names` in a later cell when dataset_size is misspelled.
if dataset_size not in file_names_by_size:
    raise ValueError(
        "Unknown dataset_size {0!r}; expected one of {1}".format(
            dataset_size, sorted(file_names_by_size)
        )
    )

file_names = file_names_by_size[dataset_size]

In [3]:
# Load data

# dtype specification handed to read_csv for the feature table
with open('../../data/processed/dtypes.json') as dtypes_file:
    non_date_dtypes = json.load(dtypes_file)

# Date column metadata; only its top-level entries are kept, as date_cols
with open('../../data/processed/date_types.json') as date_types_file:
    date_dtypes = json.load(date_types_file)

# NOTE(review): date_cols is built here but not passed to read_csv below, so
# no date parsing or column selection is applied when the CSVs are read.
date_cols = [col for col in date_dtypes]

# Read the training features and labels, using the first CSV column as the index
x_path = '../../data/processed/{0}'.format(file_names['X'])
y_path = '../../data/processed/{0}'.format(file_names['y'])

X_train = pd.read_csv(x_path, dtype=non_date_dtypes, index_col=0)
y_train = pd.read_csv(y_path, index_col=0)

In [4]:
# Fit model using parameters selected via cross-validation
clf = GradientBoostingClassifier(n_estimators=128, learning_rate=0.5, verbose=1)
# y_train comes from pd.read_csv and is therefore a DataFrame (a 2-D column);
# flatten it to the 1-D label array that fit() expects instead of relying on
# the estimator to reshape it implicitly.
clf.fit(X_train, np.ravel(y_train))

Iter       Train Loss   Remaining Time 
         1           0.7534           23.30m
         2           0.7273           22.93m
         3           0.7177           22.89m
         4           0.7133           22.76m
         5           0.7108           23.01m
         6           0.7096           22.78m
         7           0.7086           22.44m
         8           0.7078           22.18m
         9           0.7071           21.96m
        10           0.7067           21.78m
        20           0.7039           19.79m
        30           0.7031           18.11m
        40           0.7028           16.33m
        50           0.7024           14.36m
        60           0.7022           12.43m
        70           0.7021           10.56m
        80           0.7020            8.71m
        90           0.7018            6.87m
       100           0.7017            5.06m


GradientBoostingClassifier(learning_rate=0.5, n_estimators=128, verbose=1)

In [5]:
# Save model
model_path = file_names['model']
# Create the target directory if it is missing, so that the dump does not
# fail (and discard the fitted model) just because the folder does not exist.
os.makedirs(os.path.dirname(model_path) or '.', exist_ok=True)
joblib.dump(clf, model_path)

['../../models/classifier_full.pkl']