# Overview

This notebook is used to experiment with various modeling techniques.

In [1]:
# Import libraries
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Load data
with open('../data/processed/dtypes.json') as in_file:
    non_date_dtypes = json.load(in_file)

with open('../data/processed/date_types.json') as in_file:
    date_dtypes = json.load(in_file)

date_cols = list(date_dtypes)
print(non_date_dtypes)
print(date_cols)

{'Random_ID': 'int64', 'DaysSinceLastRegistration': 'int64', 'DaysSinceFirstRegistration': 'int64', 'PastRegistrations': 'int64', 'DaysSinceLast2UnitsRBCRegistration': 'float64', 'DaysSinceLastPlasmaApheresisRegistration': 'float64', 'DaysSinceLastPlateletApheresisRegistration': 'float64', 'DaysSinceLastPlateletsandConcurrentPlasmaRegistration': 'float64', 'DaysSinceLastRBCwithPlasmaRegistration': 'float64', 'DaysSinceLastRBCwithPlateletsRegistration': 'float64', 'DaysSinceLastRBCwithPlateletsandPlasmaRegistration': 'float64', 'DaysSinceLastSingleUnitRecoveryRegistration': 'float64', 'DaysSinceLastWholeBloodRegistration': 'float64', 'Past2UnitsRBCRegistrations': 'float64', 'PastPlasmaApheresisRegistrations': 'float64', 'PastPlateletApheresisRegistrations': 'float64', 'PastPlateletsandConcurrentPlasmaRegistrations': 'float64', 'PastRBCwithPlasmaRegistrations': 'float64', 'PastRBCwithPlateletsRegistrations': 'float64', 'PastRBCwithPlateletsandPlasmaRegistrations': 'float64', 'PastSingleU

In [3]:
data = pd.read_csv('../data/processed/data.csv', dtype=non_date_dtypes, parse_dates=date_cols)

In [4]:
data.head()

Unnamed: 0,Random_ID,DaysSinceLastRegistration,DaysSinceFirstRegistration,PastRegistrations,DaysSinceLast2UnitsRBCRegistration,DaysSinceLastPlasmaApheresisRegistration,DaysSinceLastPlateletApheresisRegistration,DaysSinceLastPlateletsandConcurrentPlasmaRegistration,DaysSinceLastRBCwithPlasmaRegistration,DaysSinceLastRBCwithPlateletsRegistration,...,TargetPlateletApheresisRegistrations,TargetPlateletsandConcurrentPlasmaRegistrations,TargetRBCwithPlasmaRegistrations,TargetRBCwithPlateletsRegistrations,TargetRBCwithPlateletsandPlasmaRegistrations,TargetSingleUnitRecoveryRegistrations,TargetWholeBloodRegistrations,RegisteredInTargetPeriod,CutoffDate,TargetPeriodEndDate
0,54260,32,32,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2016-03-31 23:59:59,2016-04-30 23:59:59
1,54261,308,308,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2016-03-31 23:59:59,2016-04-30 23:59:59
2,54273,165,165,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,2016-03-31 23:59:59,2016-04-30 23:59:59
3,54330,100,301,2,301.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2016-03-31 23:59:59,2016-04-30 23:59:59
4,54354,200,200,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,2016-03-31 23:59:59,2016-04-30 23:59:59


In [5]:
# Construct feature set (remove date/time columns)
# feature_names =list(data.columns).remove(['Random_ID', 'CutoffDate', 'TargetPeriodEndDate'])
feature_names = ['DaysSinceLastRegistration', 'PastRegistrations', 'DaysSinceFirstRegistration']

X = data.loc[:, ['Random_ID'] + feature_names]
y = data.loc[:, 'RegisteredInTargetPeriod']

In [6]:
# Split data randomly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=503)
print(f"Training feature set size: {X_train.shape}")
print(f"Training response set size: {y_train.shape}")
print(f"Test feature set size: {X_test.shape}")
print(f"Test response set size: {y_test.shape}")

Training feature set size: (652622, 4)
Training response set size: (652622,)
Test feature set size: (163156, 4)
Test response set size: (163156,)


In [7]:
# Train model on training data
clf_logreg = LogisticRegression(penalty='none', random_state=503).fit(X_train, y_train)

In [8]:
# Test on test data
clf_logreg.score(X_test, y_test)

0.916913873838535