# Overview

This notebook processes the data.

# Setup

In [1]:
# Import libraries
import glob, os, json
import pandas as pd
import numpy as np

In [2]:
# Load raw data
file_path = '../../data/raw'
raw_files = glob.glob(file_path + '/*.csv')

file_list = []

for file in raw_files:
    file_list.append(pd.read_csv(file, index_col=None, header=0))

raw_data = pd.concat(file_list, axis=0, ignore_index=True)

In [3]:
print(raw_data.shape)
raw_data.head()

(3818852, 5)


Unnamed: 0,Random_ID,RegistrationTime,OutCome,DonationType,DonationLocation
0,52156190,6/27/2015 12:47:00,Donation,Whole Blood,Center
1,52825057,2/26/2015 9:53:00,Donation,2 Units RBC,Mobile
2,53025596,9/8/2015 16:49:59,Donation,Whole Blood,Mobile
3,2056692,8/26/2015 12:15:00,Donation,Whole Blood,Mobile
4,52879521,1/26/2015 17:18:00,Incomplete,Whole Blood,Center


In [4]:
# Decide whether we're loading a subset or the full set
# dataset_size = 'partial'
dataset_size = 'full'

# Clean raw data

In [5]:
# Show where we have NaNs/nulls
raw_data[raw_data.isnull().any(axis=1)]

Unnamed: 0,Random_ID,RegistrationTime,OutCome,DonationType,DonationLocation
9589,53639912,6/23/2015 14:20:59,Donation,Whole Blood,
30664,52877130,4/13/2015 16:49:00,Donation,Whole Blood,
55262,52869439,9/11/2015 13:38:00,Donation,Whole Blood,
236526,53594657,6/23/2015 15:55:00,Donation,Whole Blood,
240689,53165336,8/7/2015 17:19:59,Donation,Whole Blood,
268039,53256162,11,,,
284335,53373866,7/11/2016 17:42,Donation,Whole Blood,
596076,52879414,9/20/2016 14:42,Donation,Whole Blood,
670560,52971104,12/29/2016 13:49,Donation,Whole Blood,
737587,53591355,3/23/2016 14:57,Donation,2 Units RBC,


In [6]:
# Drop rows with nulls
cleaned_data = raw_data.dropna(axis=0, how='any')

In [7]:
# Clean up RegistrationTime: pad time with '00' seconds if only HH:MM is shown, then convert to a datetime type
cleaned_data['RegistrationTime'] = cleaned_data['RegistrationTime'].apply(lambda c: c + ':00' if c.count(':') < 2 else c)
cleaned_data['RegistrationTime'] = pd.to_datetime(cleaned_data['RegistrationTime'], format='%m/%d/%Y %H:%M:%S')
cleaned_data['RegistrationTime'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


0   2015-06-27 12:47:00
1   2015-02-26 09:53:00
2   2015-09-08 16:49:59
3   2015-08-26 12:15:00
4   2015-01-26 17:18:00
Name: RegistrationTime, dtype: datetime64[ns]

In [8]:
cleaned_data.head()

Unnamed: 0,Random_ID,RegistrationTime,OutCome,DonationType,DonationLocation
0,52156190,2015-06-27 12:47:00,Donation,Whole Blood,Center
1,52825057,2015-02-26 09:53:00,Donation,2 Units RBC,Mobile
2,53025596,2015-09-08 16:49:59,Donation,Whole Blood,Mobile
3,2056692,2015-08-26 12:15:00,Donation,Whole Blood,Mobile
4,52879521,2015-01-26 17:18:00,Incomplete,Whole Blood,Center


In [9]:
cleaned_data.to_csv('../../data/processed/cleaned_data.csv', index=False, header=True)

# Generate features

1. Establish cutoff dates for data subsets
2. Process the data so that we have, for each cutoff date, one record per donor consisting of:
  * Recency: how long ago the donor donated
  * Frequency: how many times the donor donated
  * Time: how long the donor has been a donor
  * Modal location: the location type (e.g. mobile vs. center) that the donor donated in most often
  * Target: whether or not the donor donated in the target period
  * Additional breakdowns of recency, frequency, location, and target by donation type

In [10]:
# Establish cutoff dates for the "chunks" of our final dataset
target_window_size = 30
final_cutoff = pd.to_datetime('2019-08-20 23:59:59') + pd.Timedelta(days=-target_window_size)  # 8/20/2019 is the final resgistration in the dataset

cutoff_dates = sorted([cutoff_date for cutoff_date
    in [pd.to_datetime(final_cutoff, format='%Y-%m-%d %H:%M:%S') + pd.Timedelta(days=-target_window_size * i) for i in range(60)]
    if cutoff_date.year >= 2015])

if dataset_size == 'partial':
    del cutoff_dates[:-4]  # Remove all but the first four dates
    
cutoff_dates

[Timestamp('2015-01-13 23:59:59'),
 Timestamp('2015-02-12 23:59:59'),
 Timestamp('2015-03-14 23:59:59'),
 Timestamp('2015-04-13 23:59:59'),
 Timestamp('2015-05-13 23:59:59'),
 Timestamp('2015-06-12 23:59:59'),
 Timestamp('2015-07-12 23:59:59'),
 Timestamp('2015-08-11 23:59:59'),
 Timestamp('2015-09-10 23:59:59'),
 Timestamp('2015-10-10 23:59:59'),
 Timestamp('2015-11-09 23:59:59'),
 Timestamp('2015-12-09 23:59:59'),
 Timestamp('2016-01-08 23:59:59'),
 Timestamp('2016-02-07 23:59:59'),
 Timestamp('2016-03-08 23:59:59'),
 Timestamp('2016-04-07 23:59:59'),
 Timestamp('2016-05-07 23:59:59'),
 Timestamp('2016-06-06 23:59:59'),
 Timestamp('2016-07-06 23:59:59'),
 Timestamp('2016-08-05 23:59:59'),
 Timestamp('2016-09-04 23:59:59'),
 Timestamp('2016-10-04 23:59:59'),
 Timestamp('2016-11-03 23:59:59'),
 Timestamp('2016-12-03 23:59:59'),
 Timestamp('2017-01-02 23:59:59'),
 Timestamp('2017-02-01 23:59:59'),
 Timestamp('2017-03-03 23:59:59'),
 Timestamp('2017-04-02 23:59:59'),
 Timestamp('2017-05-

In [11]:
# Eligibility windows for different donation types
eligibility_map = {
    'Whole Blood': 56,
    'Platelets and Concurrent Plasma': 28,
    '2 Units RBC': 112,
    'RBC with Platelets and Plasma': 56,
    'Plasma Apheresis': 28,
    'Platelet Apheresis': 7,
    'RBC with Platelets': 56,
    'Single Unit Recovery': 56,
    'RBC with Plasma': 56
}

In [12]:
if dataset_size == 'partial':
    filename = 'data.csv'
elif dataset_size == 'full':
    filename = 'full_data.csv'

# Remove old processed file
try:
    os.remove(f'../../data/processed/{filename}')
except FileNotFoundError:
    print("Processed data file does not exist - proceeding.")

In [13]:
cutoff_subsets = []
iteration = 0
include_header = True

for cutoff in cutoff_dates:
    # cutoff = pd.to_datetime(cutoff_date, format='%Y-%m-%d %H:%M:%S')  # Convert to datetime object for easier time comparisons
    target_start_date = cutoff + pd.Timedelta(seconds=1)  # Midnight of the day after cutoff
    target_end_date = cutoff + pd.Timedelta(days=target_window_size)  # 11:59:59 PM of the nth day after cutoff
    print(f"Generating data for cutoff date of {cutoff}, target period of {target_start_date} - {target_end_date}...")

    # Filter data down to eligible registrations
    cutoff_history = cleaned_data[cleaned_data['RegistrationTime'] <= cutoff]

    # Calculate recency: difference between most recent donation date per donor, and the current cutoff date
    # Calculate time: total days since first registration
    recency = cutoff_history.groupby(by='Random_ID', as_index=False).agg({'RegistrationTime': ['min', 'max']})
    recency.columns = recency.columns.droplevel(0)
    recency.columns = ['Random_ID', 'FirstRegistrationTime', 'LastRegistrationTime']
    recency['DaysSinceLastRegistration'] = (cutoff - recency['LastRegistrationTime']).dt.days  # Just capture the days portion
    recency['DaysSinceFirstRegistration'] = (cutoff - recency['FirstRegistrationTime']).dt.days
    recency.drop(['FirstRegistrationTime', 'LastRegistrationTime'], inplace=True, axis=1)  # Drop unneeded date/time columns
    
    # Calculate recency by DonationType
    recency_type = cutoff_history.groupby(by=['Random_ID', 'DonationType'], as_index=False) \
                                 .agg({'RegistrationTime': 'max'}).rename(columns={'RegistrationTime': 'LastRegistrationTime'})
    recency_type['DaysSinceLastRegistration'] = recency_type['LastRegistrationTime'].apply(lambda c: (cutoff - c).days)
    recency_type = recency_type.pivot(index='Random_ID', columns='DonationType', values='DaysSinceLastRegistration') \
                               .reset_index().rename_axis(None, axis=1)  # Make Random_ID a column; remove index name
    recency_type.columns= ['Random_ID'] + ['DaysSinceLast' + col_name.replace(' ', '') + 'Registration' for col_name in recency_type.columns if col_name != 'Random_ID']

    # Calculate eligibility stats and last registration features
    # Based on days since last donation, and type of last donation
    last_reg = cutoff_history[cutoff_history['OutCome'] == "Donation"].sort_values('RegistrationTime', ascending=False).groupby(by='Random_ID').head(1)
    last_reg['DaysSinceLastDonation'] = last_reg['RegistrationTime'].apply(lambda c: (cutoff - c).days)
    last_reg['DaysEligible'] = last_reg.apply(lambda row: target_window_size - (eligibility_map[row['DonationType']] - row['DaysSinceLastDonation']), axis=1)
    last_reg['PercentOfTargetPeriodEligible'] = last_reg['DaysEligible'].apply(lambda x: 1 if x > target_window_size else (0 if x < 0 else x / target_window_size))
    last_reg['LastDonationLocation_Center'] = last_reg['DonationLocation'].apply(lambda x: 1 if x == "Center" else 0)
    last_reg = last_reg.rename(columns={'DonationType': 'LastDonationType'})
    last_reg['LastDonationType_Platelets'] = last_reg['LastDonationType'].apply(lambda x: 1 if x in ['RBC with Platelets', 'Platelet Apheresis',
                                                                                                     'RBC with Platelets and Plasma', 'Platelets and Concurrent Plasma'] else 0)
    # last_reg = pd.get_dummies(last_reg, columns=['LastDonationType'])
    last_reg.columns = ['Random_ID'] + [col_name.replace(' ', '') for col_name in last_reg.columns if col_name != 'Random_ID']
    # last_reg.drop(['RegistrationTime', 'OutCome', 'DonationLocation', 'DaysSinceLastDonation', 'LastDonationType_WholeBlood'], axis=1, inplace=True)
    last_reg.drop(['RegistrationTime', 'OutCome', 'DonationLocation', 'DaysSinceLastDonation', 'LastDonationType'], axis=1, inplace=True)

    # Calculate frequency: number of registrations for donation in the history period
    frequency = cutoff_history.groupby(by='Random_ID', as_index=False).agg({'RegistrationTime': 'count'}).rename(columns={'RegistrationTime': 'PastRegistrations'})    

    # Calculate frequency by DonationType
    frequency_type = cutoff_history.groupby(by=['Random_ID', 'DonationType'], as_index=False) \
                                   .agg({'RegistrationTime': 'count'}).rename(columns={'RegistrationTime': 'TotalRegistrations'}) \
                                   .pivot(index='Random_ID', columns='DonationType', values='TotalRegistrations') \
                                   .reset_index().rename_axis(None, axis=1)  # Make Random_ID a column; remove index name
    frequency_type.columns = ['Random_ID'] + ['Past' + col_name.replace(' ', '') + 'Registrations' for col_name in frequency_type.columns if col_name != 'Random_ID']

    # Calculate frequency just for platelet products
    frequency_platelets = cutoff_history[cutoff_history['DonationType'].isin(['RBC with Platelets', 'Platelet Apheresis',
                                                                              'RBC with Platelets and Plasma', 'Platelets and Concurrent Plasma'])] \
                                                                       .groupby(by='Random_ID', as_index=False) \
                                                                       .agg({'RegistrationTime': 'count'}).rename(columns={'RegistrationTime': 'PastPlateletRegistrations'}) 

    # Filter data down to target period
    cutoff_target = cleaned_data[(cleaned_data['RegistrationTime'] >= target_start_date) & (cleaned_data['RegistrationTime'] <= target_end_date)]

    # Calculate base measure for target: how many total registrations each donor had in the target period
    response = cutoff_target.groupby(by='Random_ID', as_index=False).agg({'RegistrationTime': 'count'}).rename(columns={'RegistrationTime': 'TargetRegistrations'})
    
    # Calculate sub-targets: how many registrations of each DonationType each donor had in the target period
    response_type = cutoff_target.groupby(by=['Random_ID', 'DonationType'], as_index=False) \
                                 .agg({'RegistrationTime': 'count'}).rename(columns={'RegistrationTime': 'TotalRegistrations'}) \
                                 .pivot(index='Random_ID', columns='DonationType', values='TotalRegistrations') \
                                 .reset_index().rename_axis(None, axis=1)
    response_type.columns = ['Random_ID'] + ['Target' + col_name.replace(' ', '') + 'Registrations' for col_name in response_type.columns if col_name != 'Random_ID']    

    # Calculate target for platelet donations (combined):
    response_platelets = cutoff_target[cutoff_target['DonationType'].isin(['RBC with Platelets', 'Platelet Apheresis',
                                                                           'RBC with Platelets and Plasma', 'Platelets and Concurrent Plasma'])] \
                                                                    .groupby(by='Random_ID', as_index=False) \
                                                                    .agg({'RegistrationTime': 'count'}).rename(columns={'RegistrationTime': 'TargetPlateletRegistrations'})     

    # Calculate registrations per location type
    location_counts = cutoff_history.groupby(by=['Random_ID', 'DonationLocation'], as_index=False) \
                                    .agg({'RegistrationTime': 'count'}).rename(columns={'RegistrationTime': 'TotalRegistrations'})
    
    # Pivot to add as features
    frequency_location = location_counts.pivot(index='Random_ID', columns='DonationLocation', values='TotalRegistrations').reset_index().rename_axis(None, axis=1)
    frequency_location.columns = ['Random_ID'] + ['Past' + col_name.replace(' ', '') + 'Registrations' for col_name in frequency_location.columns if col_name != 'Random_ID']
    frequency_location.fillna(0, inplace=True)
    frequency_location['CenterRegistrationProportion'] = frequency_location['PastCenterRegistrations'] / (frequency_location['PastMobileRegistrations'] + frequency_location['PastCenterRegistrations'])
    

    # Calculate modal location per Random_ID
    modal_location = location_counts.sort_values('TotalRegistrations', ascending=False).groupby(by='Random_ID').head(1).rename(columns={'DonationLocation': 'ModalDonationLocation'})
    # Represent modal location as dummy variables - dropping the dummy for mobile locations to prevent multicollinearity
    modal_dummies = pd.get_dummies(modal_location).drop(['TotalRegistrations', 'ModalDonationLocation_Mobile'], axis=1)

    # Combine datasets to create a subset for the current cutoff date
    # cutoff_subset = pd.merge(recency, frequency, how='left') \
    #                   .merge(recency_type, how='left') \
    #                   .merge(last_reg, how='left') \
    #                   .merge(frequency_type, how='left') \
    #                   .merge(frequency_location, how='left') \
    #                   .merge(frequency_platelets, how='left') \
    #                   .merge(modal_dummies, how='left') \
    #                   .merge(response, how='left') \
    #                   .merge(response_type, how='left') \
    #                   .merge(response_platelets, how='left')
    cutoff_subset = pd.merge(recency, frequency, how='left') \
                      .merge(last_reg, how='left') \
                      .merge(frequency_location, how='left') \
                      .merge(frequency_platelets, how='left') \
                      .merge(modal_dummies, how='left') \
                      .merge(response, how='left')

    # Add more features based on metadata or interactions    
    nonzero_time_mask = (cutoff_subset['DaysSinceFirstRegistration'] != 0)
    cutoff_subset['DonationsPerDay'] = 1
    nonzero_time_subset = cutoff_subset[nonzero_time_mask]
    cutoff_subset.loc[nonzero_time_mask, 'DonationsPerDay'] = nonzero_time_subset['PastRegistrations'] / nonzero_time_subset['DaysSinceFirstRegistration']
    cutoff_subset['PlateletRegistrationProportion'] = cutoff_subset['PastPlateletRegistrations'] / cutoff_subset['PastRegistrations']
    cutoff_subset['CutoffDate'] = cutoff
    cutoff_subset['RegisteredInTargetPeriod'] = cutoff_subset['TargetRegistrations'].apply(lambda x: 0 if pd.isna(x) or x == 0 else 1)
    # cutoff_subset['RegisteredForPlateletsInTargetPeriod'] = cutoff_subset['TargetPlateletRegistrations'].apply(lambda x: 0 if pd.isna(x) or x == 0 else 1)
    # cutoff_subset['TargetPeriodEndDate'] = target_end_date  # Target period start date is implicitly midnight of the day after the cutoff date    

    # Replace NaNs (the result of outer joins) with 0
    cutoff_subset.fillna(0, inplace=True)

    # Drop ineligible donors
#     cutoff_subset = cutoff_subset[(cutoff_subset['PercentOfTargetPeriodEligible'] > 0) & (cutoff_subset['DaysSinceLastRegistration'] <= 12*target_window_size)]
    cutoff_subset = cutoff_subset[(cutoff_subset['PercentOfTargetPeriodEligible'] > 0)]

    # Add the current subset to the list to combine later
    cutoff_subsets.append(cutoff_subset)

    # Append to CSV
    if iteration > 0:
        include_header = False
    
    cutoff_subset.to_csv(f'../../data/processed/{filename}', index=False, header=include_header, mode='a')
    iteration += 1

# Combine all cutoff subsets together for processed data
data = pd.concat(cutoff_subsets)

Generating data for cutoff date of 2015-01-13 23:59:59, target period of 2015-01-14 00:00:00 - 2015-02-12 23:59:59...
Generating data for cutoff date of 2015-02-12 23:59:59, target period of 2015-02-13 00:00:00 - 2015-03-14 23:59:59...
Generating data for cutoff date of 2015-03-14 23:59:59, target period of 2015-03-15 00:00:00 - 2015-04-13 23:59:59...
Generating data for cutoff date of 2015-04-13 23:59:59, target period of 2015-04-14 00:00:00 - 2015-05-13 23:59:59...
Generating data for cutoff date of 2015-05-13 23:59:59, target period of 2015-05-14 00:00:00 - 2015-06-12 23:59:59...
Generating data for cutoff date of 2015-06-12 23:59:59, target period of 2015-06-13 00:00:00 - 2015-07-12 23:59:59...
Generating data for cutoff date of 2015-07-12 23:59:59, target period of 2015-07-13 00:00:00 - 2015-08-11 23:59:59...
Generating data for cutoff date of 2015-08-11 23:59:59, target period of 2015-08-12 00:00:00 - 2015-09-10 23:59:59...
Generating data for cutoff date of 2015-09-10 23:59:59, 

In [14]:
# Save data types as a dict for reading
#dtypes_dict = data.dtypes.to_frame('dtypes').reset_index().set_index('index')['dtypes'].astype(str).to_dict()
data_dtypes = data.dtypes.to_frame('dtypes').reset_index().set_index('index')['dtypes'].astype(str)
non_date_dtypes = data_dtypes[data_dtypes != 'datetime64[ns]'].to_dict()
date_dtypes = data_dtypes[data_dtypes == 'datetime64[ns]'].to_dict()

with open('../../data/processed/dtypes.json', 'w') as out_file:
    json.dump(non_date_dtypes, out_file)

with open('../../data/processed/date_types.json', 'w') as out_file:
    json.dump(date_dtypes, out_file)

print(non_date_dtypes)
print(date_dtypes)

{'Random_ID': 'int64', 'DaysSinceLastRegistration': 'int64', 'DaysSinceFirstRegistration': 'int64', 'PastRegistrations': 'int64', 'DaysEligible': 'float64', 'PercentOfTargetPeriodEligible': 'float64', 'LastDonationLocation_Center': 'float64', 'LastDonationType_Platelets': 'float64', 'PastCenterRegistrations': 'float64', 'PastMobileRegistrations': 'float64', 'CenterRegistrationProportion': 'float64', 'PastPlateletRegistrations': 'float64', 'ModalDonationLocation_Center': 'uint8', 'TargetRegistrations': 'float64', 'DonationsPerDay': 'float64', 'PlateletRegistrationProportion': 'float64', 'RegisteredInTargetPeriod': 'int64'}
{'CutoffDate': 'datetime64[ns]'}
