# Overview

1. Generate cluster assignments on the dataset to use as a feature
2. Fit a variety of models using cross-validation (CV)
3. Evaluate the best CV model on the held-out test set to measure final performance

# Setup

In [1]:
# Import libraries
import json
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.utils import resample

In [2]:
# Load data
def _read_json(path):
    """Parse and return the contents of the JSON file at `path`."""
    with open(path) as handle:
        return json.load(handle)


# dtype specs are stored separately for non-date and date columns
non_date_dtypes = _read_json('../../data/processed/dtypes.json')
date_dtypes = _read_json('../../data/processed/date_types.json')

# Whatever iterating the date spec yields is passed to `parse_dates` below
# (presumably the date column names -- verify against date_types.json)
date_cols = list(date_dtypes)

# Read the processed CSV with explicit dtypes, parsing the date columns as dates
data = pd.read_csv(
    '../../data/processed/data.csv',
    dtype=non_date_dtypes,
    parse_dates=date_cols,
)

# Optional filter (disabled): must have donated within 180 days of each cutoff
# data = data[data['DaysSinceLastRegistration'] <= 180]

In [3]:
# Choose target variable
# Alternative target (disabled): 'RegisteredForPlateletsInTargetPeriod'
target_name = 'RegisteredInTargetPeriod'

# Feature columns selected from `data` for the models below.
# Wider candidate set (disabled):
#   'DaysSinceLastRegistration', 'DaysSinceFirstRegistration', 'PastRegistrations',
#   'LastDonationLocation_Center', 'LastDonationType_WholeBlood',
#   'ModalDonationLocation_Center', 'DonationsPerDay', 'PercentOfTargetPeriodEligible'
feature_names = [
    'DaysSinceLastRegistration',
    'DaysSinceFirstRegistration',
    'PastRegistrations',
]

In [4]:
# Downsample data so that both target classes have the same number of rows

# Separate majority (negative) and minority (positive) targets
data_majority = data[data[target_name] == 0]
data_minority = data[data[target_name] == 1]

# Downsample the majority (without replacement) to the size of the minority.
# The size is taken from the minority frame itself rather than from a second
# value_counts() label lookup, so it always agrees with the filter above.
data_majority_downsampled = resample(
    data_majority,
    replace=False,
    n_samples=len(data_minority),
    random_state=503,
)

# Combine into a new dataset
data_downsampled = pd.concat([data_majority_downsampled, data_minority])

# NOTE(review): balancing is done before the train/test split, so the test set
# is class-balanced as well -- confirm this is the intended evaluation setup.

# Sanity check: both classes should now have equal counts
data_downsampled[target_name].value_counts()

1    159303
0    159303
Name: RegisteredInTargetPeriod, dtype: int64

In [5]:
# Seed shared by the split below and by the estimators in later cells.
# Defined before its first use so the split no longer hard-codes the literal.
randstate = 503

# Feature matrix and response vector from the balanced dataset
X = data_downsampled.loc[:, feature_names]
y = data_downsampled.loc[:, target_name]

# Split data randomly (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=randstate)
print(f"Training feature set size: {X_train.shape}")
print(f"Training response set size: {y_train.shape}")
print(f"Test feature set size: {X_test.shape}")
print(f"Test response set size: {y_test.shape}")

Training feature set size: (254884, 3)
Training response set size: (254884,)
Test feature set size: (63722, 3)
Test response set size: (63722,)


In [6]:
# Fit a 5-cluster k-means model on the training features only
# NOTE(review): the cells shown here fit the classifiers on X_train directly;
# confirm where the cluster assignments are meant to be added as a feature.
kmeans = KMeans(
    n_clusters=5,
    random_state=randstate,
)
kmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=503, tol=0.0001, verbose=0)

In [7]:
# Random forest with tree depth capped at 5, fitted on the training split
clf_rf = RandomForestClassifier(
    max_depth=5,
    random_state=randstate,
)
clf_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=503,
                       verbose=0, warm_start=False)

In [8]:
# Mean accuracy of the random forest on the held-out test split
rf_test_accuracy = clf_rf.score(X_test, y_test)

# Test-set class predictions, kept in `y_pred` for the report cell that follows
y_pred = clf_rf.predict(X_test)

rf_test_accuracy

0.6800790935626628

In [9]:
# Flatten the confusion matrix and unpack its four cells
# NOTE(review): `y_pred` is reassigned by a later prediction cell, so this cell
# reports whichever model's predictions were assigned most recently.
cm_cells = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp = cm_cells
print((tn, fp, fn, tp))

# Per-class precision / recall / F1 summary
report_text = classification_report(y_test, y_pred)
print(report_text)

(22445, 9436, 10950, 20891)
              precision    recall  f1-score   support

           0       0.67      0.70      0.69     31881
           1       0.69      0.66      0.67     31841

    accuracy                           0.68     63722
   macro avg       0.68      0.68      0.68     63722
weighted avg       0.68      0.68      0.68     63722



In [10]:
# AdaBoost ensemble with 50 estimators, fitted on the training split
clf_ab = AdaBoostClassifier(
    n_estimators=50,
    random_state=randstate,
)
clf_ab.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=503)

In [11]:
# Mean accuracy of the AdaBoost model on the held-out test split
ab_test_accuracy = clf_ab.score(X_test, y_test)

# Test-set class predictions (this overwrites the earlier `y_pred`)
y_pred = clf_ab.predict(X_test)

ab_test_accuracy

0.6789962650262076