# Setup

In [1]:
# Import libraries
import json

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import RFECV, SelectKBest, VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.utils import resample

In [2]:
# Load the column -> dtype mappings stored next to the processed data
with open('../../data/processed/dtypes.json') as dtypes_file:
    non_date_dtypes = json.load(dtypes_file)

with open('../../data/processed/date_types.json') as date_types_file:
    date_dtypes = json.load(date_types_file)

# Collect the entries of date_dtypes (presumably a dict keyed by column name,
# like non_date_dtypes) to get the names of the date columns
date_cols = [col_name for col_name in date_dtypes]
print(non_date_dtypes)
print(date_cols)

{'Random_ID': 'int64', 'DaysSinceLastRegistration': 'int64', 'DaysSinceFirstRegistration': 'int64', 'PastRegistrations': 'int64', 'DaysSinceLast2UnitsRBCRegistration': 'float64', 'DaysSinceLastPlasmaApheresisRegistration': 'float64', 'DaysSinceLastPlateletApheresisRegistration': 'float64', 'DaysSinceLastPlateletsandConcurrentPlasmaRegistration': 'float64', 'DaysSinceLastRBCwithPlasmaRegistration': 'float64', 'DaysSinceLastRBCwithPlateletsRegistration': 'float64', 'DaysSinceLastRBCwithPlateletsandPlasmaRegistration': 'float64', 'DaysSinceLastSingleUnitRecoveryRegistration': 'float64', 'DaysSinceLastWholeBloodRegistration': 'float64', 'DaysEligible': 'float64', 'PercentOfTargetPeriodEligible': 'float64', 'LastDonationLocation_Center': 'float64', 'LastDonationType_2UnitsRBC': 'float64', 'LastDonationType_PlasmaApheresis': 'float64', 'LastDonationType_PlateletApheresis': 'float64', 'LastDonationType_PlateletsandConcurrentPlasma': 'float64', 'LastDonationType_RBCwithPlasma': 'float64', 'Las

In [3]:
# Load the processed dataset with the saved dtypes; the columns named in
# date_cols are handed to the parser as dates instead of getting a fixed dtype
data = pd.read_csv(
    '../../data/processed/data.csv',
    dtype=non_date_dtypes,
    parse_dates=date_cols,
)

In [4]:
# Preview the first five rows to sanity-check the load
data.head(n=5)

Unnamed: 0,Random_ID,DaysSinceLastRegistration,DaysSinceFirstRegistration,PastRegistrations,DaysSinceLast2UnitsRBCRegistration,DaysSinceLastPlasmaApheresisRegistration,DaysSinceLastPlateletApheresisRegistration,DaysSinceLastPlateletsandConcurrentPlasmaRegistration,DaysSinceLastRBCwithPlasmaRegistration,DaysSinceLastRBCwithPlateletsRegistration,...,TargetRBCwithPlasmaRegistrations,TargetRBCwithPlateletsRegistrations,TargetRBCwithPlateletsandPlasmaRegistrations,TargetSingleUnitRecoveryRegistrations,TargetWholeBloodRegistrations,TargetPlateletRegistrations,DonationsPerDay,CutoffDate,RegisteredInTargetPeriod,RegisteredForPlateletsInTargetPeriod
0,54260,32,32,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,2016-03-31 23:59:59,0,0
1,54261,308,308,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.003247,2016-03-31 23:59:59,0,0
2,54273,165,165,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.006061,2016-03-31 23:59:59,1,0
3,54330,100,301,2,301.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.006645,2016-03-31 23:59:59,0,0
4,54354,200,200,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.005,2016-03-31 23:59:59,1,0


In [5]:
# Show every column name as a plain Python list
[column for column in data.columns]

['Random_ID',
 'DaysSinceLastRegistration',
 'DaysSinceFirstRegistration',
 'PastRegistrations',
 'DaysSinceLast2UnitsRBCRegistration',
 'DaysSinceLastPlasmaApheresisRegistration',
 'DaysSinceLastPlateletApheresisRegistration',
 'DaysSinceLastPlateletsandConcurrentPlasmaRegistration',
 'DaysSinceLastRBCwithPlasmaRegistration',
 'DaysSinceLastRBCwithPlateletsRegistration',
 'DaysSinceLastRBCwithPlateletsandPlasmaRegistration',
 'DaysSinceLastSingleUnitRecoveryRegistration',
 'DaysSinceLastWholeBloodRegistration',
 'DaysEligible',
 'PercentOfTargetPeriodEligible',
 'LastDonationLocation_Center',
 'LastDonationType_2UnitsRBC',
 'LastDonationType_PlasmaApheresis',
 'LastDonationType_PlateletApheresis',
 'LastDonationType_PlateletsandConcurrentPlasma',
 'LastDonationType_RBCwithPlasma',
 'LastDonationType_RBCwithPlatelets',
 'LastDonationType_RBCwithPlateletsandPlasma',
 'LastDonationType_SingleUnitRecovery',
 'LastDonationType_WholeBlood',
 'Past2UnitsRBCRegistrations',
 'PastPlasmaApheres

In [6]:
# Target to predict. The alternative, platelet-specific target is
# 'RegisteredForPlateletsInTargetPeriod'.
target_name = 'RegisteredInTargetPeriod'

# Columns kept out of the feature set: the identifier, the cutoff date, the
# per-donation-type "days since last registration" columns, and the
# last-donation-type indicators (all except LastDonationType_WholeBlood).
cols_to_exclude = [
    'Random_ID',
    'CutoffDate',
    'DaysSinceLast2UnitsRBCRegistration',
    'DaysSinceLastPlasmaApheresisRegistration',
    'DaysSinceLastPlateletApheresisRegistration',
    'DaysSinceLastPlateletsandConcurrentPlasmaRegistration',
    'DaysSinceLastRBCwithPlasmaRegistration',
    'DaysSinceLastRBCwithPlateletsRegistration',
    'DaysSinceLastRBCwithPlateletsandPlasmaRegistration',
    'DaysSinceLastSingleUnitRecoveryRegistration',
    'DaysSinceLastWholeBloodRegistration',
    'LastDonationType_2UnitsRBC',
    'LastDonationType_PlasmaApheresis',
    'LastDonationType_PlateletApheresis',
    'LastDonationType_PlateletsandConcurrentPlasma',
    'LastDonationType_RBCwithPlasma',
    'LastDonationType_RBCwithPlatelets',
    'LastDonationType_RBCwithPlateletsandPlasma',
    'LastDonationType_SingleUnitRecovery',
]

# Keep a column only if its name does not contain 'Target' (this also drops
# both candidate target columns) and it is not explicitly excluded above.
features = [
    name
    for name in data.columns
    if not ('Target' in name or name in cols_to_exclude)
]
print(features)

['DaysSinceLastRegistration', 'DaysSinceFirstRegistration', 'PastRegistrations', 'DaysEligible', 'LastDonationLocation_Center', 'LastDonationType_WholeBlood', 'Past2UnitsRBCRegistrations', 'PastPlasmaApheresisRegistrations', 'PastPlateletApheresisRegistrations', 'PastPlateletsandConcurrentPlasmaRegistrations', 'PastRBCwithPlasmaRegistrations', 'PastRBCwithPlateletsRegistrations', 'PastRBCwithPlateletsandPlasmaRegistrations', 'PastSingleUnitRecoveryRegistrations', 'PastWholeBloodRegistrations', 'PastCenterRegistrations', 'PastMobileRegistrations', 'ModalDonationLocation_Center', 'DonationsPerDay']


# Downsample data

In [7]:
# Balance the two classes by downsampling the majority class.
# `resample` (sklearn.utils) is imported with the other libraries in the first cell.

# Separate majority (negative) and minority (positive) targets
data_majority = data[data[target_name] == 0]
data_minority = data[data[target_name] == 1]

# Sampling without replacement cannot draw more rows than exist, so fail early
# with a clear message if the negative class is not actually the larger one.
assert len(data_minority) <= len(data_majority), (
    "Expected the negative class (0) to be the majority; cannot downsample it "
    "to the size of the positive class"
)

# Downsample the majority, without replacement, to the size of the minority class
data_majority_downsampled = resample(
    data_majority,
    replace=False,
    n_samples=len(data_minority),
    random_state=503,
)

# Combine into a new dataset
data_downsampled = pd.concat([data_majority_downsampled, data_minority])

# Display the class counts to confirm both classes are now the same size
data_downsampled[target_name].value_counts()

1    159303
0    159303
Name: RegisteredInTargetPeriod, dtype: int64

# Feature selection

In [8]:
# Feature matrix and target vector taken from the class-balanced data
X = data_downsampled[features]
y = data_downsampled[target_name]

In [9]:
# Unregularized logistic regression used as the base estimator for feature selection.
# NOTE(review): newer scikit-learn releases spell this option `penalty=None`;
# confirm against the installed version before changing it.
clf_logreg = LogisticRegression(
    penalty='none',
    random_state=503,
)

In [10]:
# Hold out 20% of the downsampled data for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=503)

# The held-out labels were previously bound to the misspelled name `y_text`,
# which left `y_test` undefined. Keep the old name as an alias in case other
# cells still refer to it.
y_text = y_test

In [11]:
# Recursive feature elimination with cross-validation: remove one feature per
# step and score each candidate subset by accuracy over 2 stratified folds.
# NOTE(review): the features are passed in unscaled and the estimator keeps its
# default iteration limit. Check for convergence warnings, since the recorded
# run ended up retaining every candidate feature.
cv_folds = StratifiedKFold(2)
rfecv = RFECV(
    estimator=clf_logreg,
    step=1,
    cv=cv_folds,
    scoring='accuracy',
)
rfecv.fit(X_train, y_train)

RFECV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False),
      estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                   fit_intercept=True, intercept_scaling=1,
                                   l1_ratio=None, max_iter=100,
                                   multi_class='auto', n_jobs=None,
                                   penalty='none', random_state=503,
                                   solver='lbfgs', tol=0.0001, verbose=0,
                                   warm_start=False),
      min_features_to_select=1, n_jobs=None, scoring='accuracy', step=1,
      verbose=0)

In [12]:
# Report how many features the cross-validated elimination settled on
n_selected = rfecv.n_features_
print(f"Optimal number of features: {n_selected}")

Optimal number of features: 19


In [13]:
# Show the names of the columns that RFECV ranked 1
rank_one_mask = rfecv.ranking_ == 1
X_train.columns[rank_one_mask]

Index(['DaysSinceLastRegistration', 'DaysSinceFirstRegistration',
       'PastRegistrations', 'DaysEligible', 'LastDonationLocation_Center',
       'LastDonationType_WholeBlood', 'Past2UnitsRBCRegistrations',
       'PastPlasmaApheresisRegistrations',
       'PastPlateletApheresisRegistrations',
       'PastPlateletsandConcurrentPlasmaRegistrations',
       'PastRBCwithPlasmaRegistrations', 'PastRBCwithPlateletsRegistrations',
       'PastRBCwithPlateletsandPlasmaRegistrations',
       'PastSingleUnitRecoveryRegistrations', 'PastWholeBloodRegistrations',
       'PastCenterRegistrations', 'PastMobileRegistrations',
       'ModalDonationLocation_Center', 'DonationsPerDay'],
      dtype='object')