# Overview

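This notebook explores the raw donation-registration data and documents the findings. It also prototypes, for a single cutoff date and a small sample of donors, the recency/frequency features and the target-period response built from that data.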

# Initial setup

In [1]:
# Import libraries
import glob
import pandas as pd
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split

In [2]:
# Load raw data: read every CSV in the raw-data directory and stack them into one frame
file_path = "../data/raw"
# glob returns paths in arbitrary, OS-dependent order; sorting makes the row order
# (and the index regenerated by ignore_index=True) identical on every run
raw_files = sorted(glob.glob(file_path + "/*.csv"))
if not raw_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(f"No CSV files found in {file_path!r}")

file_list = [pd.read_csv(file, index_col=None, header=0) for file in raw_files]

raw_data = pd.concat(file_list, axis=0, ignore_index=True)

In [3]:
# Report the (rows, columns) shape, then preview the first five rows
print(raw_data.shape)
raw_data.head(n=5)

(3818852, 5)


Unnamed: 0,Random_ID,RegistrationTime,OutCome,DonationType,DonationLocation
0,52156190,6/27/2015 12:47:00,Donation,Whole Blood,Center
1,52825057,2/26/2015 9:53:00,Donation,2 Units RBC,Mobile
2,53025596,9/8/2015 16:49:59,Donation,Whole Blood,Mobile
3,2056692,8/26/2015 12:15:00,Donation,Whole Blood,Mobile
4,52879521,1/26/2015 17:18:00,Incomplete,Whole Blood,Center


# Clean raw data

In [4]:
# List every row that has a missing value in at least one column
raw_data.loc[raw_data.isna().any(axis='columns')]

Unnamed: 0,Random_ID,RegistrationTime,OutCome,DonationType,DonationLocation
9589,53639912,6/23/2015 14:20:59,Donation,Whole Blood,
30664,52877130,4/13/2015 16:49:00,Donation,Whole Blood,
55262,52869439,9/11/2015 13:38:00,Donation,Whole Blood,
236526,53594657,6/23/2015 15:55:00,Donation,Whole Blood,
240689,53165336,8/7/2015 17:19:59,Donation,Whole Blood,
268039,53256162,11,,,
284335,53373866,7/11/2016 17:42,Donation,Whole Blood,
596076,52879414,9/20/2016 14:42,Donation,Whole Blood,
670560,52971104,12/29/2016 13:49,Donation,Whole Blood,
737587,53591355,3/23/2016 14:57,Donation,2 Units RBC,


In [5]:
# Drop rows with nulls.
# Take an explicit copy: later cells assign columns on cleaned_data, and pandas still
# tracks a dropna() result as derived from raw_data, which raises SettingWithCopyWarning
# on those assignments.
cleaned_data = raw_data.dropna(axis=0, how='any').copy()

In [6]:
# Normalise RegistrationTime: timestamps given without seconds get ':00' appended,
# then the whole column is parsed into a datetime type
def _pad_seconds(stamp):
    """Return stamp with ':00' appended when it contains fewer than two colons."""
    return stamp if stamp.count(':') >= 2 else stamp + ':00'


cleaned_data['RegistrationTime'] = pd.to_datetime(
    cleaned_data['RegistrationTime'].apply(_pad_seconds),
    format='%m/%d/%Y %H:%M:%S',
)
cleaned_data['RegistrationTime'].head()

0   2015-06-27 12:47:00
1   2015-02-26 09:53:00
2   2015-09-08 16:49:59
3   2015-08-26 12:15:00
4   2015-01-26 17:18:00
Name: RegistrationTime, dtype: datetime64[ns]

In [7]:
# Preview the cleaned frame (first five rows)
cleaned_data.head(n=5)

Unnamed: 0,Random_ID,RegistrationTime,OutCome,DonationType,DonationLocation
0,52156190,2015-06-27 12:47:00,Donation,Whole Blood,Center
1,52825057,2015-02-26 09:53:00,Donation,2 Units RBC,Mobile
2,53025596,2015-09-08 16:49:59,Donation,Whole Blood,Mobile
3,2056692,2015-08-26 12:15:00,Donation,Whole Blood,Mobile
4,52879521,2015-01-26 17:18:00,Incomplete,Whole Blood,Center


In [8]:
# Distinct DonationType values present after cleaning
cleaned_data.loc[:, 'DonationType'].unique()

array(['Whole Blood', '2 Units RBC', 'RBC with Platelets',
       'Platelet Apheresis', 'RBC with Plasma',
       'RBC with Platelets and Plasma', 'Plasma Apheresis',
       'Platelets and Concurrent Plasma', 'Single Unit Recovery'],
      dtype=object)

# Prototype feature engineering in pure pandas

In [9]:
# Ten donors with the highest number of registrations on record
(
    cleaned_data
    .groupby('Random_ID')
    .agg({'RegistrationTime': 'count'})
    .nlargest(10, columns='RegistrationTime')
)

Unnamed: 0_level_0,RegistrationTime
Random_ID,Unnamed: 1_level_1
54207926,168
53225617,117
50902340,112
51859516,112
52343278,111
52938451,106
53109325,105
2157068,103
51879814,103
53122258,102


In [10]:
# Restrict the cleaned data to three hard-coded donor IDs for prototyping
subset_data = cleaned_data.loc[
    cleaned_data['Random_ID'].isin([52437236, 52948680, 54207926])
]

In [11]:
# Display the sample donors' registrations (pandas truncates the middle rows in the rendered output)
subset_data

Unnamed: 0,Random_ID,RegistrationTime,OutCome,DonationType,DonationLocation
12656,52437236,2015-04-14 12:31:00,Donation,Platelets and Concurrent Plasma,Center
15897,52437236,2015-03-13 12:50:59,Donation,Platelets and Concurrent Plasma,Center
17115,54207926,2015-09-24 09:57:59,Donation,Platelet Apheresis,Center
22328,52948680,2015-03-28 18:35:59,Donation,Whole Blood,Mobile
23600,52948680,2015-10-12 14:22:00,Donation,Platelet Apheresis,Center
...,...,...,...,...,...
3749418,52437236,2019-08-09 11:40:00,Donation,Platelets and Concurrent Plasma,Center
3750831,54207926,2019-08-09 11:40:00,Registration,Platelet Apheresis,Center
3774454,52948680,2019-07-29 14:09:00,Donation,Platelets and Concurrent Plasma,Center
3774945,54207926,2019-07-26 12:21:59,Donation,Platelets and Concurrent Plasma,Center


In [12]:
# Cutoff timestamps ('%Y-%m-%d %H:%M:%S'); history is taken up to and including a cutoff
cutoff_dates = [
    '2016-05-31 23:59:59',
]

In [13]:
# Only the first cutoff is processed for now (TODO: loop over every entry in cutoff_dates)
cutoff = pd.to_datetime(cutoff_dates[0], format='%Y-%m-%d %H:%M:%S')
# Target window: opens one second after the cutoff and closes 30 days after the cutoff
target_start_date = cutoff + pd.Timedelta(1, unit='s')
target_end_date = cutoff + pd.Timedelta(30, unit='D')
print(f"Cutoff date: {cutoff}")
print(f"Target period: {target_start_date} - {target_end_date}")

Cutoff date: 2016-05-31 23:59:59
Target period: 2016-06-01 00:00:00 - 2016-06-30 23:59:59


In [14]:
# Filter records to those that are eligible for the current cutoff date
# (registered at or before the cutoff). The explicit .copy() detaches the result from
# subset_data so the column assignment below does not raise SettingWithCopyWarning.
cutoff_history = subset_data[subset_data['RegistrationTime'] <= cutoff].copy()
# Add CutoffDate as a literal value so we can combine all cutoff date histories
cutoff_history['CutoffDate'] = cutoff
cutoff_history

Unnamed: 0,Random_ID,RegistrationTime,OutCome,DonationType,DonationLocation,CutoffDate
12656,52437236,2015-04-14 12:31:00,Donation,Platelets and Concurrent Plasma,Center,2016-05-31 23:59:59
15897,52437236,2015-03-13 12:50:59,Donation,Platelets and Concurrent Plasma,Center,2016-05-31 23:59:59
17115,54207926,2015-09-24 09:57:59,Donation,Platelet Apheresis,Center,2016-05-31 23:59:59
22328,52948680,2015-03-28 18:35:59,Donation,Whole Blood,Mobile,2016-05-31 23:59:59
23600,52948680,2015-10-12 14:22:00,Donation,Platelet Apheresis,Center,2016-05-31 23:59:59
34127,52948680,2015-11-27 13:25:00,Donation,Platelet Apheresis,Center,2016-05-31 23:59:59
49654,54207926,2015-07-22 13:11:00,Donation,Platelet Apheresis,Center,2016-05-31 23:59:59
60492,52437236,2015-10-30 12:18:00,Donation,Platelet Apheresis,Center,2016-05-31 23:59:59
64096,52948680,2015-10-26 15:55:00,Donation,Platelet Apheresis,Center,2016-05-31 23:59:59
68711,52948680,2015-05-16 08:20:59,Donation,Platelet Apheresis,Center,2016-05-31 23:59:59


In [15]:
# Calculate recency: whole days between each donor's most recent registration and the cutoff
# Calculate time: whole days between each donor's first registration and the cutoff
recency = (
    cutoff_history
    .groupby('Random_ID')['RegistrationTime']
    .agg(['min', 'max'])
    .reset_index()
)
# Columns arrive as Random_ID, min, max; give them descriptive names in one step
recency.columns = ['Random_ID', 'FirstRegistrationTime', 'LastRegistrationTime']
# Vectorised timedelta arithmetic; .dt.days keeps just the days portion
recency['DaysSinceLastRegistration'] = (cutoff - recency['LastRegistrationTime']).dt.days
recency['DaysSinceFirstRegistration'] = (cutoff - recency['FirstRegistrationTime']).dt.days
recency

Unnamed: 0,Random_ID,FirstRegistrationTime,LastRegistrationTime,DaysSinceLastRegistration,DaysSinceFirstRegistration
0,52437236,2015-03-13 12:50:59,2016-05-12 11:02:00,19,445
1,52948680,2015-02-26 14:51:59,2016-05-16 08:35:00,15,460
2,54207926,2015-07-22 13:11:00,2016-05-12 10:28:00,19,314


In [16]:
# Recency per DonationType: days between each donor's latest registration of a type and the cutoff
recency_type = (
    cutoff_history
    .groupby(['Random_ID', 'DonationType'], as_index=False)
    .agg({'RegistrationTime': 'max'})
    .rename(columns={'RegistrationTime': 'LastRegistrationTime'})
)
recency_type['DaysSinceLastRegistration'] = recency_type['LastRegistrationTime'].apply(
    lambda stamp: (cutoff - stamp).days
)
# Reshape to one row per donor and one column per DonationType
recency_type = (
    recency_type
    .pivot(index='Random_ID', columns='DonationType', values='DaysSinceLastRegistration')
    .reset_index()               # Random_ID back to a regular column
    .rename_axis(None, axis=1)   # drop the name of the columns axis
)
recency_type.columns = ['Random_ID'] + [
    'DaysSinceLast' + type_name.replace(' ', '') + 'Registration'
    for type_name in recency_type.columns
    if type_name != 'Random_ID'
]
recency_type

Unnamed: 0,Random_ID,DaysSinceLastPlateletApheresisRegistration,DaysSinceLastPlateletsandConcurrentPlasmaRegistration,DaysSinceLastWholeBloodRegistration
0,52437236,19.0,40.0,
1,52948680,15.0,,310.0
2,54207926,19.0,,


In [17]:
# Determine last registration for eligibility calculations
# Days per DonationType that are compared with the time since the last donation below
# (presumably the required wait between donations of that type -- TODO confirm with the data owner)
eligibility_map = {
    'Whole Blood': 56,
    'Platelets and Concurrent Plasma': 28,
    '2 Units RBC': 112,
    'RBC with Platelets and Plasma': 56,
    'Plasma Apheresis': 28,
    'Platelet Apheresis': 7,
    'RBC with Platelets': 56,
    'Single Unit Recovery': 56,
    'RBC with Plasma': 56
}

# Most recent record with OutCome == "Donation" per donor in the history window.
# .copy() detaches the result from cutoff_history so the column assignments below
# do not raise SettingWithCopyWarning.
last_reg = (
    cutoff_history[cutoff_history['OutCome'] == "Donation"]
    .sort_values('RegistrationTime', ascending=False)
    .groupby(by='Random_ID')
    .head(1)
    .copy()
)
last_reg['DaysSinceLastDonation'] = (cutoff - last_reg['RegistrationTime']).dt.days
# (mapped days - days already elapsed) is the wait remaining after the cutoff; subtracting
# it from 30 gives the days of the target period in which the donor is eligible.
# 30 matches the 30-day offset used to build target_end_date.
last_reg['DaysEligible'] = last_reg.apply(lambda row: 30 - (eligibility_map[row['DonationType']] - row['DaysSinceLastDonation']), axis=1)
# Share of the target period the donor is eligible, clamped to the range 0..1
last_reg['PercentOfTargetPeriodEligible'] = last_reg['DaysEligible'].apply(lambda x: 1 if x > 30 else (0 if x < 0 else x / 30))
last_reg['LastDonationLocation_Center'] = last_reg['DonationLocation'].apply(lambda x: 1 if x == "Center" else 0)
last_reg = last_reg.rename(columns={'DonationType': 'LastDonationType'})
last_reg = pd.get_dummies(last_reg, columns=['LastDonationType'])
# Strip spaces from column names by name rather than by position, so the result does not
# depend on Random_ID being the first column
last_reg = last_reg.rename(columns=lambda col_name: col_name.replace(' ', ''))
# Reassign instead of inplace=True so the cell stays a plain chain of new frames
last_reg = last_reg.drop(['RegistrationTime', 'OutCome', 'DonationLocation', 'DaysSinceLastDonation'], axis=1)
last_reg

Unnamed: 0,Random_ID,CutoffDate,DaysEligible,PercentOfTargetPeriodEligible,LastDonationLocation_Center,LastDonationType_PlateletApheresis
1165671,52948680,2016-05-31 23:59:59,38,1,1,1
404891,52437236,2016-05-31 23:59:59,42,1,1,1
487977,54207926,2016-05-31 23:59:59,42,1,1,1


In [18]:
# Donors who are eligible for at least part of the target period
last_reg.loc[last_reg['PercentOfTargetPeriodEligible'].gt(0)]

Unnamed: 0,Random_ID,CutoffDate,DaysEligible,PercentOfTargetPeriodEligible,LastDonationLocation_Center,LastDonationType_PlateletApheresis
1165671,52948680,2016-05-31 23:59:59,38,1,1,1
404891,52437236,2016-05-31 23:59:59,42,1,1,1
487977,54207926,2016-05-31 23:59:59,42,1,1,1


In [19]:
# Frequency: how many registrations each donor has in the history period
frequency = (
    cutoff_history
    .groupby('Random_ID', as_index=False)
    .agg({'RegistrationTime': 'count'})
    .rename(columns={'RegistrationTime': 'PastRegistrations'})
)
frequency

Unnamed: 0,Random_ID,PastRegistrations
0,52437236,14
1,52948680,23
2,54207926,6


In [20]:
# Frequency per DonationType: history-period registration counts, one column per type
type_counts = (
    cutoff_history
    .groupby(['Random_ID', 'DonationType'], as_index=False)
    .agg({'RegistrationTime': 'count'})
    .rename(columns={'RegistrationTime': 'TotalRegistrations'})
)
frequency_type = (
    type_counts
    .pivot(index='Random_ID', columns='DonationType', values='TotalRegistrations')
    .reset_index()               # Random_ID back to a regular column
    .rename_axis(None, axis=1)   # drop the name of the columns axis
)
del type_counts  # intermediate only; keep the notebook namespace unchanged
frequency_type.columns = ['Random_ID'] + [
    'Past' + type_name.replace(' ', '') + 'Registrations'
    for type_name in frequency_type.columns
    if type_name != 'Random_ID'
]
frequency_type

Unnamed: 0,Random_ID,PastPlateletApheresisRegistrations,PastPlateletsandConcurrentPlasmaRegistrations,PastWholeBloodRegistrations
0,52437236,10.0,4.0,
1,52948680,21.0,,2.0
2,54207926,6.0,,


In [21]:
# TODO: Calculate monetary (not implemented yet): total volume of donation in the history period?

In [22]:
# Registrations per donor and location type within the history period
location_counts = (
    cutoff_history
    .groupby(['Random_ID', 'DonationLocation'], as_index=False)
    .agg({'RegistrationTime': 'count'})
    .rename(columns={'RegistrationTime': 'TotalRegistrations'})
)
location_counts

Unnamed: 0,Random_ID,DonationLocation,TotalRegistrations
0,52437236,Center,14
1,52948680,Center,21
2,52948680,Mobile,2
3,54207926,Center,6


In [23]:
# Reshape the per-location counts into one feature column per location type
frequency_location = (
    location_counts
    .pivot(index='Random_ID', columns='DonationLocation', values='TotalRegistrations')
    .reset_index()
    .rename_axis(None, axis=1)
)
frequency_location.columns = ['Random_ID'] + [
    'Past' + location_name.replace(' ', '') + 'Registrations'
    for location_name in frequency_location.columns
    if location_name != 'Random_ID'
]
frequency_location

Unnamed: 0,Random_ID,PastCenterRegistrations,PastMobileRegistrations
0,52437236,14.0,
1,52948680,21.0,2.0
2,54207926,6.0,


In [24]:
# Calculate modal location per Random_ID: the location with the most registrations.
# The secondary sort key makes ties deterministic: when a donor has the same count at two
# locations, the alphabetically first DonationLocation is chosen. Sorting on the count
# alone leaves the order of tied rows unspecified.
modal_location = (
    location_counts
    .sort_values(['TotalRegistrations', 'DonationLocation'], ascending=[False, True])
    .groupby(by='Random_ID')
    .head(1)
    .rename(columns={'DonationLocation': 'ModalDonationLocation'})
)
modal_location

Unnamed: 0,Random_ID,ModalDonationLocation,TotalRegistrations
1,52948680,Center,21
0,52437236,Center,14
3,54207926,Center,6


In [25]:
# One-hot encode the modal location, then discard the count column
modal_one_hot = pd.get_dummies(modal_location).drop(columns=['TotalRegistrations'])
modal_one_hot

Unnamed: 0,Random_ID,ModalDonationLocation_Center
1,52948680,1
0,52437236,1
3,54207926,1


In [26]:
# Keep only registrations inside the target period (both endpoints inclusive)
cutoff_target = subset_data.loc[
    subset_data['RegistrationTime'].between(target_start_date, target_end_date)
]
cutoff_target

Unnamed: 0,Random_ID,RegistrationTime,OutCome,DonationType,DonationLocation
284231,52437236,2016-06-02 14:01:00,Donation,Platelet Apheresis,Center
511342,52437236,2016-06-15 10:38:00,Incomplete,Platelet Apheresis,Center
788793,54207926,2016-06-29 10:03:00,Registration,Platelet Apheresis,Center
1080584,52948680,2016-06-11 09:14:00,Donation,Platelet Apheresis,Center
1189641,54207926,2016-06-09 10:06:00,Donation,Platelet Apheresis,Center
1196958,52948680,2016-06-01 13:08:00,Donation,Platelet Apheresis,Center


In [27]:
# Base target measure: total registrations per donor within the target period
response = (
    cutoff_target
    .groupby('Random_ID', as_index=False)
    .agg({'RegistrationTime': 'count'})
    .rename(columns={'RegistrationTime': 'TargetRegistrations'})
)
response

Unnamed: 0,Random_ID,TargetRegistrations
0,52437236,2
1,52948680,2
2,54207926,2


In [28]:
# Sub-targets: target-period registrations per donor, one column per DonationType
response_type = (
    cutoff_target
    .groupby(['Random_ID', 'DonationType'], as_index=False)
    .agg({'RegistrationTime': 'count'})
    .rename(columns={'RegistrationTime': 'TotalRegistrations'})
    .pivot(index='Random_ID', columns='DonationType', values='TotalRegistrations')
    .reset_index()               # Random_ID back to a regular column
    .rename_axis(None, axis=1)   # drop the name of the columns axis
)
response_type.columns = ['Random_ID'] + [
    'Target' + type_name.replace(' ', '') + 'Registrations'
    for type_name in response_type.columns
    if type_name != 'Random_ID'
]

In [29]:
# Display the per-DonationType target counts
response_type

Unnamed: 0,Random_ID,TargetPlateletApheresisRegistrations
0,52437236,2
1,52948680,2
2,54207926,2


In [34]:
# Target-period registrations per donor, counting only the DonationTypes whose name mentions platelets
response_platelets = (
    cutoff_target[
        cutoff_target['DonationType'].isin([
            'RBC with Platelets',
            'Platelet Apheresis',
            'RBC with Platelets and Plasma',
            'Platelets and Concurrent Plasma',
        ])
    ]
    .groupby('Random_ID', as_index=False)
    .agg({'RegistrationTime': 'count'})
    .rename(columns={'RegistrationTime': 'TargetPlateletRegistrations'})
)
response_platelets

Unnamed: 0,Random_ID,PlateletRegistrations
0,52437236,2
1,52948680,2
2,54207926,2


In [30]:
# Combine datasets: one row per donor with history features plus the target-period response.
# Left joins keep every donor present in recency; a donor with no registration in the
# target period ends up with NaN in TargetRegistrations.
data = (
    pd.merge(recency, frequency, how='left', on='Random_ID')
    .merge(response, how='left', on='Random_ID')
)
# Flag donors with at least one registration in the target period. This has to be derived
# from TargetRegistrations: PastRegistrations counts history-period registrations and is
# populated for every donor in recency, so testing it would set the flag to 1 for everyone.
data['RegisteredInTargetPeriod'] = data['TargetRegistrations'].notna().astype(int)
data['CutoffDate'] = cutoff
data['TargetPeriodStartDate'] = target_start_date
data['TargetPeriodEndDate'] = target_end_date
data

Unnamed: 0,Random_ID,FirstRegistrationTime,LastRegistrationTime,DaysSinceLastRegistration,DaysSinceFirstRegistration,PastRegistrations,TargetRegistrations,RegisteredInTargetPeriod,CutoffDate,TargetPeriodStartDate,TargetPeriodEndDate
0,52437236,2015-03-13 12:50:59,2016-05-12 11:02:00,19,445,14,2,1,2016-05-31 23:59:59,2016-06-01,2016-06-30 23:59:59
1,52948680,2015-02-26 14:51:59,2016-05-16 08:35:00,15,460,23,2,1,2016-05-31 23:59:59,2016-06-01,2016-06-30 23:59:59
2,54207926,2015-07-22 13:11:00,2016-05-12 10:28:00,19,314,6,2,1,2016-05-31 23:59:59,2016-06-01,2016-06-30 23:59:59


In [31]:
# Row count per DonationType in the raw (uncleaned) data, most frequent first
raw_data.loc[:, 'DonationType'].value_counts()

Whole Blood                        3231266
Platelet Apheresis                  277174
2 Units RBC                         183740
RBC with Plasma                      54387
Platelets and Concurrent Plasma      44286
RBC with Platelets and Plasma        11863
RBC with Platelets                    6056
Single Unit Recovery                  5480
Plasma Apheresis                      4599
Name: DonationType, dtype: int64