In [None]:
# Import libraries
import json
from itertools import chain, combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.utils import resample

In [None]:
# Load the saved column-type specifications for the processed dataset.
# First file: the dtype specification later handed to pandas for the non-date columns.
with open('../data/processed/dtypes.json') as in_file:
    non_date_dtypes = json.loads(in_file.read())

# Second file: iterating the loaded object yields the columns to parse as dates.
with open('../data/processed/date_types.json') as in_file:
    date_dtypes = json.loads(in_file.read())

date_cols = [*date_dtypes]

# Show both specifications, one per line, as a quick sanity check.
print(non_date_dtypes, date_cols, sep='\n')

In [None]:
# Load the processed dataset: non-date columns get the saved dtypes,
# and the columns listed in date_cols are parsed as dates.
data = pd.read_csv(
    '../data/processed/data.csv',
    dtype=non_date_dtypes,
    parse_dates=date_cols,
)

In [None]:
# Target column to predict.
# Alternative target kept for reference:
#   target_name = 'RegisteredForPlateletsInTargetPeriod'
target_name = 'RegisteredInTargetPeriod'

# Candidate predictor columns. The order matters: feature subsets are reported
# later as integer positions into this list.
# Alternative kept for reference (every column except IDs, cutoff date and targets):
#   features = [col for col in list(data.columns) if col not in ('Random_ID', 'CutoffDate') and 'Target' not in col]
features = [
    'DaysSinceLastRegistration',
    'DaysSinceFirstRegistration',
    'PastRegistrations',
    'DaysSinceLastWholeBloodRegistration',
    'PercentOfTargetPeriodEligible',
    'LastDonationLocation_Center',
    'LastDonationType_2UnitsRBC',
    'LastDonationType_PlasmaApheresis',
    'LastDonationType_PlateletApheresis',
    'LastDonationType_PlateletsandConcurrentPlasma',
    'LastDonationType_RBCwithPlasma',
    'LastDonationType_RBCwithPlatelets',
    'LastDonationType_RBCwithPlateletsandPlasma',
    'LastDonationType_SingleUnitRecovery',
    'PastWholeBloodRegistrations',
    'PastCenterRegistrations',
    'PastMobileRegistrations',
    'ModalDonationLocation_Center',
]

In [None]:
def calculate_best_subset(estimator, X, y, max_size=5, cv=3, verbose=True):
    """Exhaustive best-subset feature selection over subsets of up to ``max_size`` columns.

    The search runs in two stages:

    1. For every subset size from 1 to ``min(n_features, max_size)``, every
       combination of columns is fitted and scored on the same data passed in
       (in-sample), and the highest-scoring subset of that size is kept.
    2. The per-size winners are then compared with each other by their mean
       cross-validated score, because in-sample scores are not comparable
       across different subset sizes.

    Parameters
    ----------
    estimator : object
        Must provide ``fit(X, y)`` and ``score(X, y)`` where a higher score is
        better. It is fitted in place repeatedly during the search.
    X : pandas.DataFrame
        Feature matrix; columns are selected by position with ``.iloc``.
    y : array-like
        Target values, passed through unchanged to the estimator.
    max_size : int, default 5
        Largest subset size to try (capped at the number of columns of ``X``).
    cv : int or cross-validation splitter, default 3
        Forwarded to ``cross_val_score`` for the second stage.
    verbose : bool, default True
        When True, print one progress line per subset size and one per
        candidate subset (the original behaviour). Pass False to silence the
        output, which otherwise grows to thousands of lines.

    Returns
    -------
    best_subset : tuple of int or None
        Column positions of the winning subset; None when no subset could be
        evaluated (e.g. ``max_size <= 0``, ``X`` has no columns, or every
        score was NaN).
    best_score : float
        Mean cross-validated score of ``best_subset`` (``-inf`` when
        ``best_subset`` is None).
    best_size_subset : list of tuple of int
        The in-sample winner for each subset size, in increasing size. A size
        for which no subset produced a comparable score is omitted.
    list_scores : list of float
        Mean cross-validated score of each entry of ``best_size_subset``,
        in the same order.
    """
    n_features = X.shape[1]
    largest_size = min(n_features, max_size)

    # Stage 1: best subset of each size, judged by in-sample score.
    best_size_subset = []
    for size in range(1, largest_size + 1):
        if verbose:
            print(f"Trying subsets of size {size}...")
        best_score = -np.inf
        best_subset = None
        for subset in combinations(range(n_features), size):
            if verbose:
                print(f"\tTrying feature subset: {list(subset)}...")
            # Slice once and reuse for both fit and score.
            X_subset = X.iloc[:, list(subset)]
            estimator.fit(X_subset, y)
            score = estimator.score(X_subset, y)
            # Strict '>' keeps the first subset (in combinations order) on ties.
            if score > best_score:
                best_score, best_subset = score, subset
        # A NaN score never compares greater, so a size can end up with no
        # winner; skip it rather than crash on list(None) in stage 2.
        if best_subset is not None:
            best_size_subset.append(best_subset)

    # Stage 2: compare the per-size winners by mean cross-validated score.
    best_score = -np.inf
    best_subset = None
    list_scores = []
    for subset in best_size_subset:
        score = cross_val_score(estimator, X.iloc[:, list(subset)], y, cv=cv).mean()
        list_scores.append(score)
        if score > best_score:
            best_score, best_subset = score, subset

    return best_subset, best_score, best_size_subset, list_scores

In [None]:
# Balance the classes by downsampling the majority (negative) class.

# Separate majority (negative) and minority (positive) targets
data_majority = data[data[target_name] == 0]
data_minority = data[data[target_name] == 1]

# Downsample the majority, without replacement, to the minority row count.
# len(data_minority) replaces value_counts()[1]: an integer key on a Series is
# a label lookup or a positional lookup depending on the index dtype, so the
# old form was fragile; the row count of the positive frame is unambiguous.
data_majority_downsampled = resample(
    data_majority,
    replace=False,
    n_samples=len(data_minority),
    random_state=503,
)

# Combine into a new dataset
data_downsampled = pd.concat([data_majority_downsampled, data_minority])

# Set X and y
X = data_downsampled.loc[:, features]
y = data_downsampled.loc[:, target_name]

# NOTE(review): the train/test split is taken from this downsampled frame, so
# the held-out test set is class-balanced as well — confirm that is intended.
# TODO (original note): may also need to try downsampling before feeding into the function

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=503
)

# Report the shape of each of the four pieces
print("\n".join(
    f"{label} set size: {part.shape}"
    for label, part in [
        ("Training feature", X_train),
        ("Training response", y_train),
        ("Test feature", X_test),
        ("Test response", y_test),
    ]
))

In [None]:
# Instantiate an estimator/classifier: logistic regression with no regularisation penalty.
# NOTE(review): the string 'none' was deprecated in scikit-learn 1.2 and is rejected
# from 1.4 onward, where penalty=None is expected instead — confirm the installed
# scikit-learn version before changing this argument.
clf_logreg = LogisticRegression(penalty='none', random_state=503)

In [None]:
# Run the exhaustive subset search on the training split only, using the
# function's defaults (max_size=5, cv=3). With the 18 columns in `features`
# that is 12,615 candidate subsets, each fitted at least once, so this cell is
# slow and prints one progress line per subset.
best_subset, best_score, best_size_subset, list_scores = calculate_best_subset(clf_logreg, X_train, y_train)

In [None]:
# Winning subset, as column positions into X_train (same order as `features`).
# The trailing inline values are presumably the author's record of an earlier
# run — TODO confirm they still reproduce.
print(best_subset)  # 0, 2, 4, 5, 6

In [None]:
# Mean cross-validated score of the winning subset
print(best_score)

In [None]:
# Best subset found for each subset size (by in-sample score), in increasing size
print(best_size_subset)

In [None]:
# Mean cross-validated score for each entry of best_size_subset, in the same order
print(list_scores)