# Overview

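This notebook loads the raw donation-registration CSV files, cleans them (drops rows with missing values and parses `RegistrationTime`), and then, for each cutoff date, generates per-donor recency/frequency features from the history up to the cutoff plus target labels from the 30 days that follow it. The combined result is written to `../data/processed/data.csv`.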

In [1]:
# Import libraries
import glob, os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load raw data: read every CSV in the raw-data folder and stack them into one frame
file_path = '../data/raw'

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes the
# row labels assigned by ignore_index=True reproducible across runs and machines
raw_files = sorted(glob.glob(file_path + '/*.csv'))

if not raw_files:
    # Fail with an actionable message instead of pd.concat's generic "No objects to concatenate"
    raise FileNotFoundError(f"No CSV files found in '{file_path}'")

# One DataFrame per file; the first line of each file is used as the header
file_list = [pd.read_csv(file, index_col=None, header=0) for file in raw_files]

# Stack the per-file frames vertically and renumber the rows 0..n-1
raw_data = pd.concat(file_list, axis=0, ignore_index=True)

In [3]:
# Sanity check: print the (rows, columns) of the combined frame, then display its first rows
print(raw_data.shape)
raw_data.head()

(3818852, 5)


Unnamed: 0,Random_ID,RegistrationTime,OutCome,DonationType,DonationLocation
0,52156190,6/27/2015 12:47:00,Donation,Whole Blood,Center
1,52825057,2/26/2015 9:53:00,Donation,2 Units RBC,Mobile
2,53025596,9/8/2015 16:49:59,Donation,Whole Blood,Mobile
3,2056692,8/26/2015 12:15:00,Donation,Whole Blood,Mobile
4,52879521,1/26/2015 17:18:00,Incomplete,Whole Blood,Center


# Clean raw data

In [4]:
# List every record that is missing a value in at least one column
has_missing_value = raw_data.isna().any(axis='columns')
raw_data[has_missing_value]

Unnamed: 0,Random_ID,RegistrationTime,OutCome,DonationType,DonationLocation
9589,53639912,6/23/2015 14:20:59,Donation,Whole Blood,
30664,52877130,4/13/2015 16:49:00,Donation,Whole Blood,
55262,52869439,9/11/2015 13:38:00,Donation,Whole Blood,
236526,53594657,6/23/2015 15:55:00,Donation,Whole Blood,
240689,53165336,8/7/2015 17:19:59,Donation,Whole Blood,
268039,53256162,11,,,
284335,53373866,7/11/2016 17:42,Donation,Whole Blood,
596076,52879414,9/20/2016 14:42,Donation,Whole Blood,
670560,52971104,12/29/2016 13:49,Donation,Whole Blood,
737587,53591355,3/23/2016 14:57,Donation,2 Units RBC,


In [5]:
# Drop rows with nulls (any row missing at least one value is removed).
# Take an explicit copy so that later column assignments on cleaned_data act on an
# independent frame: this avoids pandas' SettingWithCopyWarning and leaves raw_data untouched.
cleaned_data = raw_data.dropna(axis=0, how='any').copy()

In [6]:
# Clean up RegistrationTime: pad time with ':00' seconds if only HH:MM is shown, then convert to a datetime type.
# The dtype guard makes the cell safe to re-run: once the column is already datetime there is
# nothing left to parse (the string operations below would fail on Timestamp values).
registration_text = cleaned_data['RegistrationTime']

if not pd.api.types.is_datetime64_any_dtype(registration_text):
    # Values with fewer than two ':' separators carry no seconds component
    missing_seconds = registration_text.str.count(':') < 2
    registration_padded = registration_text.mask(missing_seconds, registration_text + ':00')

    # assign() builds a new frame rather than writing into one derived from raw_data
    cleaned_data = cleaned_data.assign(
        RegistrationTime=pd.to_datetime(registration_padded, format='%m/%d/%Y %H:%M:%S')
    )

cleaned_data['RegistrationTime'].head()

0   2015-06-27 12:47:00
1   2015-02-26 09:53:00
2   2015-09-08 16:49:59
3   2015-08-26 12:15:00
4   2015-01-26 17:18:00
Name: RegistrationTime, dtype: datetime64[ns]

In [7]:
# Preview the cleaned frame to confirm RegistrationTime now displays as a parsed datetime
cleaned_data.head()

Unnamed: 0,Random_ID,RegistrationTime,OutCome,DonationType,DonationLocation
0,52156190,2015-06-27 12:47:00,Donation,Whole Blood,Center
1,52825057,2015-02-26 09:53:00,Donation,2 Units RBC,Mobile
2,53025596,2015-09-08 16:49:59,Donation,Whole Blood,Mobile
3,2056692,2015-08-26 12:15:00,Donation,Whole Blood,Mobile
4,52879521,2015-01-26 17:18:00,Incomplete,Whole Blood,Center


# Generate features

In [8]:
# Cutoff timestamps (last second of a quarter) that split the data into "chunks":
# registrations up to a cutoff are history, registrations just after it form the target.
cutoff_dates = [
    '2016-03-31 23:59:59',
    '2016-06-30 23:59:59',
    # Later quarter-ends, currently disabled (the notebook does not say why):
    # '2016-09-30 23:59:59', '2016-12-31 23:59:59',
    # '2017-03-31 23:59:59', '2017-06-30 23:59:59', '2017-09-30 23:59:59', '2017-12-31 23:59:59',
    # '2018-03-31 23:59:59', '2018-06-30 23:59:59', '2018-09-30 23:59:59', '2018-12-31 23:59:59',
    # '2019-03-31 23:59:59', '2019-06-30 23:59:59',
]

In [9]:
# Delete any processed output left over from a previous run
stale_output_path = '../data/processed/data.csv'

try:
    os.remove(stale_output_path)
except FileNotFoundError:
    # Nothing to clean up; report it and carry on
    print("Processed data file does not exist - proceeding.")

In [10]:
# Build one per-donor feature/target table for each cutoff date, then combine the tables
# and write the result to disk once.
processed_data_path = '../data/processed/data.csv'
target_period_days = 30  # Length of the target window that follows each cutoff


def _count_registrations(registrations, by, count_name):
    """Count the rows of `registrations` in each `by` group.

    Returns a flat frame holding the grouping column(s) plus the count in `count_name`.
    """
    return registrations.groupby(by=by, as_index=False) \
                        .agg({'RegistrationTime': 'count'}) \
                        .rename(columns={'RegistrationTime': count_name})


def _spread_per_donor(long_frame, columns, values, prefix, suffix):
    """Pivot `long_frame` to one row per Random_ID and one column per distinct value of `columns`.

    Each new column is named `prefix + <value with spaces removed> + suffix` and holds `values`.
    Donors without a row for a given value get NaN in that column.
    """
    wide = long_frame.pivot(index='Random_ID', columns=columns, values=values) \
                     .reset_index().rename_axis(None, axis=1)  # Make Random_ID a column; remove index name
    wide.columns = ['Random_ID'] + [prefix + name.replace(' ', '') + suffix for name in wide.columns[1:]]
    return wide


cutoff_subsets = []

for cutoff_date in cutoff_dates:
    cutoff = pd.to_datetime(cutoff_date, format='%Y-%m-%d %H:%M:%S')  # Convert to datetime object for easier time comparisons
    target_start_date = cutoff + pd.Timedelta(seconds=1)  # Midnight of the day after cutoff
    target_end_date = cutoff + pd.Timedelta(days=target_period_days)  # 11:59:59 PM of the 30th day after cutoff
    print(f"Generating data for cutoff date of {cutoff}, target period of {target_start_date} - {target_end_date}...")

    # Filter data down to eligible registrations (everything up to and including the cutoff)
    cutoff_history = cleaned_data[cleaned_data['RegistrationTime'] <= cutoff]

    # Calculate recency: whole days between each donor's most recent registration and the cutoff
    # Calculate time: whole days between each donor's first registration and the cutoff
    recency = cutoff_history.groupby(by='Random_ID', as_index=False).agg({'RegistrationTime': ['min', 'max']})
    recency.columns = ['Random_ID', 'FirstRegistrationTime', 'LastRegistrationTime']  # Flatten the two-level header
    recency['DaysSinceLastRegistration'] = (cutoff - recency['LastRegistrationTime']).dt.days  # Just capture the days portion
    recency['DaysSinceFirstRegistration'] = (cutoff - recency['FirstRegistrationTime']).dt.days

    # Calculate recency by DonationType for eligibility calculations
    recency_type = cutoff_history.groupby(by=['Random_ID', 'DonationType'], as_index=False) \
                                 .agg({'RegistrationTime': 'max'}) \
                                 .rename(columns={'RegistrationTime': 'LastRegistrationTime'})
    recency_type['DaysSinceLastRegistration'] = (cutoff - recency_type['LastRegistrationTime']).dt.days
    recency_type = _spread_per_donor(recency_type, 'DonationType', 'DaysSinceLastRegistration',
                                     prefix='DaysSinceLast', suffix='Registration')

    # Calculate frequency: number of registrations for donation in the history period
    frequency = _count_registrations(cutoff_history, 'Random_ID', 'PastRegistrations')

    # Calculate frequency by DonationType
    frequency_type = _spread_per_donor(
        _count_registrations(cutoff_history, ['Random_ID', 'DonationType'], 'TotalRegistrations'),
        'DonationType', 'TotalRegistrations', prefix='Past', suffix='Registrations')

    # Filter data down to target period (both ends inclusive)
    cutoff_target = cleaned_data[(cleaned_data['RegistrationTime'] >= target_start_date)
                                 & (cleaned_data['RegistrationTime'] <= target_end_date)]

    # Calculate base measure for target: how many total registrations each donor had in the target period
    response = _count_registrations(cutoff_target, 'Random_ID', 'TargetRegistrations')

    # Calculate sub-targets: how many registrations of each DonationType each donor had in the target period
    response_type = _spread_per_donor(
        _count_registrations(cutoff_target, ['Random_ID', 'DonationType'], 'TotalRegistrations'),
        'DonationType', 'TotalRegistrations', prefix='Target', suffix='Registrations')

    # Calculate registrations per location type
    location_counts = _count_registrations(cutoff_history, ['Random_ID', 'DonationLocation'], 'TotalRegistrations')

    # Pivot to add as features
    frequency_location = _spread_per_donor(location_counts, 'DonationLocation', 'TotalRegistrations',
                                           prefix='Past', suffix='Registrations')

    # Calculate modal location per Random_ID: the location with the highest count.
    # A stable sort makes ties deterministic (the tied location that comes first in location_counts wins).
    # NOTE(review): this frame also carries its 'TotalRegistrations' column (the count at the modal
    # location) into the merged output under that generic name - kept to preserve the output schema.
    modal_location = location_counts.sort_values('TotalRegistrations', ascending=False, kind='mergesort') \
                                    .groupby(by='Random_ID').head(1) \
                                    .rename(columns={'DonationLocation': 'ModalDonationLocation'})

    # Combine datasets to create a subset for the current cutoff date.
    # Join explicitly on Random_ID so an accidental column-name overlap can never become a join key.
    cutoff_subset = recency.merge(frequency, how='left', on='Random_ID') \
                           .merge(recency_type, how='left', on='Random_ID') \
                           .merge(frequency_type, how='left', on='Random_ID') \
                           .merge(frequency_location, how='left', on='Random_ID') \
                           .merge(modal_location, how='left', on='Random_ID') \
                           .merge(response, how='left', on='Random_ID') \
                           .merge(response_type, how='left', on='Random_ID')

    # Binary target: 1 when the donor has at least one registration in the target period.
    # Must be derived before the NaNs are filled below.
    cutoff_subset['RegisteredInTargetPeriod'] = cutoff_subset['TargetRegistrations'].notna().astype('int64')
    cutoff_subset['CutoffDate'] = cutoff
    cutoff_subset['TargetPeriodStartDate'] = target_start_date
    cutoff_subset['TargetPeriodEndDate'] = target_end_date

    # Replace NaNs (donors with no matching row in a left join or pivot) with 0
    # NOTE(review): this also sets DaysSinceLast<Type>Registration to 0 for donors who never
    # registered for that type, which reads the same as "registered on the cutoff day" - confirm intended.
    cutoff_subset = cutoff_subset.fillna(0)

    # Add the current subset to the list to combine later
    cutoff_subsets.append(cutoff_subset)

# Combine all cutoff subsets together for processed data.
# Subsets can have different pivot-derived columns; concat aligns them by name and the
# trailing fillna(0) covers columns that a subset lacked. Rows are renumbered 0..n-1 so
# the combined frame has no duplicate row labels.
data = pd.concat(cutoff_subsets, ignore_index=True, sort=False).fillna(0)

# Write the combined table once, with a single header row. Appending inside the loop with
# header=True repeated the header for every cutoff and could misalign columns between chunks.
os.makedirs(os.path.dirname(processed_data_path), exist_ok=True)
data.to_csv(processed_data_path, index=False)

Generating data for cutoff date of 2016-03-31 23:59:59, target period of 2016-04-01 00:00:00 - 2016-04-30 23:59:59...
Generating data for cutoff date of 2016-06-30 23:59:59, target period of 2016-07-01 00:00:00 - 2016-07-30 23:59:59...
