In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

In [2]:
# General parameters for the script
target_name = 'RegisteredInTargetPeriod'  # Target variable
features = [
    'DaysSinceLastRegistration', 'DaysSinceFirstRegistration',
    'PastRegistrations', 'DaysEligible', 'LastDonationLocation_Center',
    'LastDonationType_Platelets', 'ModalDonationLocation_Center',
    'CenterRegistrationProportion', 'DonationsPerDay'
]

# Decide whether we're loading a subset or the full set.
# Valid values: 'partial' (reads data.csv) or 'full' (reads full_data.csv).
dataset_size = 'partial'

# Map each dataset size to (input file name, suffix appended to output file names)
_dataset_files = {
    'full': ('full_data.csv', '_full'),
    'partial': ('data.csv', ''),
}

# Fail here with a clear message instead of leaving file_name / file_suffix
# undefined, which would only surface later as a NameError in another cell.
if dataset_size not in _dataset_files:
    raise ValueError(
        f"dataset_size must be one of {sorted(_dataset_files)}, got {dataset_size!r}"
    )

file_name, file_suffix = _dataset_files[dataset_size]

In [3]:
# Load the column type metadata stored next to the processed data
with open('../../data/processed/dtypes.json') as dtypes_file:
    non_date_dtypes = json.load(dtypes_file)
with open('../../data/processed/date_types.json') as date_types_file:
    date_dtypes = json.load(date_types_file)

# Names of the columns to parse as dates
# (presumably date_types.json is a mapping keyed by column name -- TODO confirm)
date_cols = [col for col in date_dtypes]

# Read the data with explicit dtypes, parsing the date columns as dates.
# All columns are loaded here; the feature/target selection happens in a later cell.
data = pd.read_csv(
    f'../../data/processed/{file_name}',
    dtype=non_date_dtypes,
    parse_dates=date_cols,
)

In [4]:
# Display the column labels of the loaded data
data.columns

Index(['Random_ID', 'DaysSinceLastRegistration', 'DaysSinceFirstRegistration',
       'PastRegistrations', 'DaysEligible', 'PercentOfTargetPeriodEligible',
       'LastDonationLocation_Center', 'LastDonationType_Platelets',
       'PastCenterRegistrations', 'PastMobileRegistrations',
       'CenterRegistrationProportion', 'ModalDonationLocation_Center',
       'TargetRegistrations', 'DonationsPerDay', 'CutoffDate',
       'RegisteredInTargetPeriod'],
      dtype='object')

In [5]:
# Take the final (most recent) cutoff date as a holdout set for demo purposes.
# max() is used instead of unique()[-1]: unique() returns values in order of
# appearance, so its last element is only the latest date when the rows happen
# to be sorted by CutoffDate.
holdout_cutoff = data['CutoffDate'].max()

# Only the ID, the target and the selected features are kept in the outputs
selected_cols = ['Random_ID', target_name] + features
is_holdout = data['CutoffDate'] == holdout_cutoff

# Save the holdout rows to CSV
data.loc[is_holdout, selected_cols].to_csv(f'../../data/processed/holdout{file_suffix}.csv')

# Keep all other cutoff date sets for training/testing the model.
# NOTE: this overwrites `data` and drops 'CutoffDate', so the data-loading cell
# must be re-run before this cell can be executed again.
data = data.loc[~is_holdout, selected_cols]

In [6]:
# Separate majority (negative) and minority (positive) targets
data_majority = data[data[target_name] == 0]
data_minority = data[data[target_name] == 1]

# Number of majority (negative) rows to keep per minority (positive) row
target_ratio = 2

# Size the majority sample from the minority frame itself. The previous
# value_counts()[1] lookup recomputed the same count and depended on an integer
# label being present in the index (KeyError when there are no positive rows).
n_majority_samples = int(np.ceil(len(data_minority) * target_ratio))

# Downsample the majority without replacement; the fixed seed keeps it reproducible
data_majority_downsampled = resample(
    data_majority,
    replace=False,
    n_samples=n_majority_samples,
    random_state=503,
)

# Combine into a new dataset
data_downsampled = pd.concat([data_majority_downsampled, data_minority])

# Show the resulting class counts
data_downsampled[target_name].value_counts()

0    217646
1    108823
Name: RegisteredInTargetPeriod, dtype: int64

In [7]:
# Hold out 20% of the downsampled data for testing; the fixed seed keeps the split reproducible
train, test = train_test_split(
    data_downsampled,
    test_size=0.2,
    random_state=503,
)

# Persist both partitions to CSV (pandas also writes the DataFrame index by default)
train.to_csv(f'../../data/processed/train{file_suffix}.csv')
test.to_csv(f'../../data/processed/test{file_suffix}.csv')