# Overview

This notebook contains code to pre-process data for time series analysis.

# Setup

In [1]:
# Import libraries
import json
import pandas as pd
import numpy as np
import seaborn as sb
from datetime import timedelta
import matplotlib.pyplot as plt

In [2]:
# Read data, specifically parsing date columns as dates
data = pd.read_csv('../../data/processed/cleaned_data.csv', parse_dates=['RegistrationTime'])

In [3]:
data.head()

Unnamed: 0,Random_ID,RegistrationTime,OutCome,DonationType,DonationLocation
0,52156190,2015-06-27 12:47:00,Donation,Whole Blood,Center
1,52825057,2015-02-26 09:53:00,Donation,2 Units RBC,Mobile
2,53025596,2015-09-08 16:49:59,Donation,Whole Blood,Mobile
3,2056692,2015-08-26 12:15:00,Donation,Whole Blood,Mobile
4,52879521,2015-01-26 17:18:00,Incomplete,Whole Blood,Center


# Form time series dataset (count of new donors per N-day period)

In [4]:
pd.to_datetime('2019-08-20 23:59:59') + pd.Timedelta(days=-30) + pd.Timedelta(seconds=1)

Timestamp('2019-07-22 00:00:00')

In [5]:
# Establish cutoff dates for the "chunks" of our final dataset
target_window_size = 30
final_cutoff = pd.to_datetime('2019-08-20 23:59:59') + pd.Timedelta(days=-target_window_size)  # 8/20/2019 is the final resgistration in the dataset
final_target_start = final_cutoff + pd.Timedelta(seconds=1)

cutoff_dates = sorted([cutoff_date for cutoff_date
    in [pd.to_datetime(final_target_start, format='%Y-%m-%d %H:%M:%S') + pd.Timedelta(days=-target_window_size * i) for i in range(60)]
    if cutoff_date.year >= 2015])

if dataset_size == 'partial':
    del cutoff_dates[4:]  # Remove all but the first four dates

NameError: name 'dataset_size' is not defined

In [None]:
target_windows = pd.concat([
    pd.Series(cutoff_dates),
    pd.Series(cutoff_dates) + pd.Timedelta(days=target_window_size-1, hours=23, minutes=59, seconds=59)
], axis=1)
target_windows.columns = ['StartDatetime', 'EndDatetime']
target_windows['NewDonors'] = 0
target_windows.head()

In [None]:
# Group by Random_ID to get each person's first registration time (i.e. when they were a new donor)
new_donors = data.groupby('Random_ID', as_index=False).agg({'RegistrationTime': 'min'})
new_donors

In [None]:
def calculate_donor_count(row, raw_data):
    # Calculates number of new donors using raw data DataFrame
    window_subset = raw_data[(raw_data['RegistrationTime'] >= row['StartDatetime']) & (raw_data['RegistrationTime'] <= row['EndDatetime'])]
    return len(window_subset.index)

# Populate NewDonors
target_windows['NewDonors'] = target_windows.apply(lambda row: calculate_donor_count(row, new_donors), axis=1)
target_windows.head()

In [None]:
# Convert to a time series-friendly DataFrame with just date as an index and a single variable
series = target_windows.copy().drop(['EndDatetime'], axis=1).set_index('StartDatetime').squeeze()
# series = series[(series != 0) & (series.index.year != 2015)]
series

In [None]:
series.to_csv('../../data/processed/new_donors.csv')

In [None]:
import matplotlib.dates as mdates

# years = mdates.YearLocator()
# months = mdates.MonthLocator()
window_fmt = mdates.DateFormatter('%m/%d/%Y')

fig = plt.figure(figsize=(15,10))
axes = plt.axes()
sb.lineplot(data=series)
plt.xticks(series.index, rotation=90)
axes.xaxis.set_major_formatter(window_fmt)
plt.show()

In [None]:
series_no_outliers = series[series.index != pd.to_datetime('2016-06-07')]

In [None]:
sb.boxplot(series_no_outliers.index.month, series_no_outliers)