In [None]:
#preprocessing SANBS data

import pandas as pd
import numpy as np
import datetime as dt
from datetime import timedelta

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

warnings.filterwarnings('ignore')

# Load the raw SANBS extract; the trailing expression displays its row count.
df = pd.read_csv("../1_data/private/SANBSdata2.csv")
df.shape[0]

In [None]:
# Merge with donation history: adds the rbc_loss_last_12_months and
# rbc_loss_last_24_months columns from don_hist2.csv.

# Visit_Date is rendered as ISO-8601 text (e.g. 2019-03-26T00:00:00Z) so it
# can serve as a join key.
# presumably don_hist2.csv stores Visit_Date in exactly this text form -- verify
visit_stamp = pd.to_datetime(df['Visit_Date'])
df['Visit_Date'] = visit_stamp.dt.strftime('%Y-%m-%dT%H:%M:%SZ')

don_hist = pd.read_csv("../1_data/private/don_hist2.csv")
history_cols = ['DonorID', 'Visit_Date', 'rbc_loss_last_12_months', 'rbc_loss_last_24_months']

# Left join: every row of df is kept; rows without a match get NaN in the
# rbc_loss columns.
df = df.merge(don_hist[history_cols], how='left', on=["DonorID", "Visit_Date"])



In [None]:
# Drop therapeutic donations.
df = df.loc[df['Outcome'] != 'THERAPEUTIC'].copy()

# Add OUTCOME_TYPE, which classifies each visit as:
#   'low hgb'        - deferral in the Haematocrit/Haemoglobin category
#   'completed'      - successful donation
#   'other deferral' - deferral in any other (or missing) category
#   'no draw'        - everything else
is_deferral = df['Outcome'] == 'DEFERRAL'
is_hgb_category = df['category'] == 'Haematocrit/Haemoglobin'

df.loc[is_deferral & is_hgb_category, 'OUTCOME_TYPE'] = 'low hgb'
df.loc[df['Outcome'] == 'SUCCESSFUL DONATION', 'OUTCOME_TYPE'] = 'completed'
df.loc[is_deferral & ~is_hgb_category, 'OUTCOME_TYPE'] = 'other deferral'

# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form does not update df when pandas
# Copy-on-Write is active.
df['OUTCOME_TYPE'] = df['OUTCOME_TYPE'].fillna("no draw")

# Remove rows with a missing visit date.
df = df.loc[df['Visit_Date'].notna()].copy()

# Convert Visit_Date to timezone-naive datetimes. Unit-less astype("datetime64")
# is rejected by pandas >= 2, and text ending in 'Z' parses as UTC-aware, so the
# values are parsed as UTC and the timezone is then stripped. Later arithmetic
# against naive datetimes therefore keeps working.
df["Visit_Date"] = pd.to_datetime(df["Visit_Date"], utc=True).dt.tz_localize(None)

# Order rows by donor and visit order; the shift()-based "next visit" logic in
# later cells depends on this ordering.
df = df.sort_values(by=['DonorID', 'don_Sequence', 'Visit_Date'], axis=0)


In [None]:
# Ratio of APH rows and of CCP rows to the number of completed donations.
# NOTE(review): the numerators count every APH/CCP row regardless of
# OUTCOME_TYPE, while the denominator counts completed donations only --
# confirm this is intended.
for product in ("APH", "CCP"):
    product_rows = df.loc[df.donation_product == product]
    completed_rows = df.loc[df.OUTCOME_TYPE == 'completed']
    print(len(product_rows) / len(completed_rows))

In [None]:
## Donation frequency per year = completed donations / distinct donors.

# Calendar year of each visit, stored as int32.
df['Year'] = df['Visit_Date'].dt.strftime('%Y').astype('int32')


def _frequency_by_year(visits):
    """Return a per-Year table of donation count, distinct donors and their ratio."""
    table = visits.groupby('Year')['DonorID'].agg(['count', 'nunique'])
    table.columns = ['total_donations', 'unique_donors']
    table['donation_frequency'] = table['total_donations'] / table['unique_donors']
    return table


# Only completed donations enter the frequency calculation.
df2 = df.loc[df.OUTCOME_TYPE == 'completed']

# Printed in turn for: WB only, WB + APH, WB + APH + CCP.
for products in (['WB'], ['WB', 'APH'], ['WB', 'APH', 'CCP']):
    df1 = df2.loc[df2.donation_product.isin(products)]
    result = _frequency_by_year(df1)
    print(result)


In [None]:
# Drop non-WB visits: only rows whose donation_product is exactly 'WB' remain
# (rows with a missing donation_product are dropped as well).
not_whole_blood = df['donation_product'] != 'WB'
df.drop(index=df.index[not_whole_blood], inplace=True)

In [None]:

# CENSORED marks whether the following row belongs to the same donor:
#   1 = a later visit exists (not censored), 0 = no later visit (right-censored).
# Uses shift(-1), so it assumes rows are already ordered by donor and visit.
next_donor = df['DonorID'].shift(-1)
df['CENSORED'] = np.where(df['DonorID'] == next_donor, 1, 0)

# time_to_return, in days:
#   - a later visit exists -> gap between this visit and the next one
#   - otherwise            -> gap between this visit and the end of data
#                             collection (2022-12-31)
one_day = np.timedelta64(1, 'D')
study_end = dt.datetime(2022, 12, 31)

days_to_next_visit = (df['Visit_Date'].shift(-1) - df['Visit_Date']) / one_day
days_to_study_end = (study_end - df["Visit_Date"]) / one_day

df['time_to_return'] = np.where(df['CENSORED'] == 1, days_to_next_visit, days_to_study_end)


In [None]:
# Remove rows whose visit date is later than the recorded Def_end
# (def_end < visit_date).
#
# On a top-to-bottom run Def_end has not been converted yet and may still be
# raw text from the CSV, so it is parsed here into timezone-naive datetimes
# before comparing. The Def_end column itself is left unchanged.
# Rows with a missing Def_end compare as False and are kept.
def_end_dates = pd.to_datetime(df['Def_end'], utc=True).dt.tz_localize(None)
visit_after_def_end = df['Visit_Date'] > def_end_dates
df.drop(index=df.index[visit_after_def_end], inplace=True)


In [None]:
# Remove permanent deferrals, plus rows typed 'other deferral' whose
# Deferral_permanent field says 'NoDeferral'.
is_permanent = df['Deferral_permanent'] == 'Perm'
other_without_deferral = (df['OUTCOME_TYPE'] == 'other deferral') & (df['Deferral_permanent'] == 'NoDeferral')

# The two conditions cannot both hold for one row, so a single drop removes
# the same rows as two consecutive drops.
df.drop(index=df.index[is_permanent | other_without_deferral], inplace=True)



In [None]:
# Shift time_to_return so that it is measured from a later reference point
# than the visit itself: 56 days after a completed donation, or Def_end after
# a deferral.
# NOTE(review): this cell is not idempotent -- re-running it copies the already
# shifted values into time_to_return2 and shifts them again.

# Keep the unshifted visit-to-visit gap.
df['time_to_return2'] = df['time_to_return']

# Completed donations: subtract 56 days.
# presumably the minimum inter-donation interval -- confirm
df['time_to_return'] = np.where(df['OUTCOME_TYPE'] == 'completed', df['time_to_return2'] - 56, df['time_to_return'])

# Alternative (disabled): restore the unshifted value when the shift goes negative.
#df['time_to_return']=np.where((df['time_to_return'] < 0)& (df.OUTCOME_TYPE=='completed') , df['time_to_return2'], df['time_to_return'])

# Deferrals: convert Def_end to timezone-naive datetimes first. Unit-less
# astype("datetime64") is rejected by pandas >= 2, so the column is parsed
# explicitly; utc=True plus tz_localize(None) yields naive values whether or
# not the text carries a timezone marker.
df["Def_end"] = pd.to_datetime(df["Def_end"], utc=True).dt.tz_localize(None)
df['def_Year'] = df['Def_end'].dt.year

# Length of the deferral in days, measured from the visit.
deferral_days = (df['Def_end'] - df['Visit_Date']) / np.timedelta64(1, 'D')

# 'low hgb' and 'other deferral' rows are shifted the same way, but only when
# the deferral ends before 2023.
deferral_ends_in_window = df['OUTCOME_TYPE'].isin(['low hgb', 'other deferral']) & (df['def_Year'] < 2023)
df['time_to_return'] = np.where(deferral_ends_in_window, df['time_to_return2'] - deferral_days, df['time_to_return'])

# Alternative (disabled): drop rows with a negative shifted time_to_return.
#df.drop(df[(df.time_to_return<0)].index, inplace =True)

# For the sensitivity analysis: clamp negative values to 0 instead of dropping.
df['time_to_return'] = df['time_to_return'].clip(lower=0)



In [None]:
# first_time: 1 = first-time donor visit, 0 = repeat.
# A row counts as first-time when it is the donor's first row in df and its
# don_Sequence is 1. The previous row's don_Sequence is deliberately NOT
# consulted: at a donor boundary that row belongs to a different donor, and
# testing it would mark a first-time donor as repeat whenever the preceding
# donor's last row also has don_Sequence 1.
is_first_row_of_donor = df['DonorID'] != df['DonorID'].shift(1)
df['first_time'] = np.where((df['don_Sequence'] == 1.0) & is_first_row_of_donor, 1, 0)

# high_school_age: 1 when Visit_Age <= 20, else 0.
df['high_school_age'] = np.where(df['Visit_Age'] <= 20, 1, 0)

# Donor history: cumulative lifetime donations is taken directly from don_Sequence.
df['cum_lifetime_donations'] = df['don_Sequence']


In [None]:
# Fixed-Mobile Donations Analysis

fixed = df["Fixed"]

# Remove site categories that are neither fixed nor mobile. Every remaining
# category other than 'Mobile Clinic' (e.g. administrative site, apheresis
# clinic) is treated as fixed.
excluded_sites = ['Lapsed Clinic', 'Processing Lab', 'Unallocated', 'Laboratorium', 'Doctor']
df.drop(index=df.index[df['Fixed'].isin(excluded_sites)], inplace=True)

# Fixed_mobile: mobile is 0, fixed is 1.
df['Fixed_mobile'] = np.where(df['Fixed'] == 'Mobile Clinic', 0, 1)

# FIXED_NEXT: site type of the donor's next row, or the sentinel 10 when the
# next row belongs to another donor (or there is no next row).
same_donor_next = df['DonorID'] == df['DonorID'].shift(-1)
df["FIXED_NEXT"] = np.where(same_donor_next, df['Fixed_mobile'].shift(-1), 10)

# fixed_mobile_pattern: <this visit><next visit>
#   FF fixed->fixed, MM mobile->mobile, FM fixed->mobile, MF mobile->fixed,
#   'last' when the donor has no later row.
# Built in a single assignment; the previous fillna(inplace=True) on the column
# selection does not update df when pandas Copy-on-Write is active.
this_site = df['Fixed_mobile']
next_site = df['FIXED_NEXT']
df['fixed_mobile_pattern'] = np.select(
    [
        (this_site == 1) & (next_site == 1),
        (this_site == 0) & (next_site == 0),
        (this_site == 1) & (next_site == 0),
        (this_site == 0) & (next_site == 1),
    ],
    ['FF', 'MM', 'FM', 'MF'],
    default='last',
)



In [None]:
# Event codes for cause-specific cumulative incidence functions:
#   0 = no later visit (pattern 'last')
#   1 = next visit at a fixed site (patterns MF, FF)
#   2 = every other pattern (i.e. return to mobile)
pattern = df['fixed_mobile_pattern']
df['competing_events'] = np.select(
    [pattern == "last", pattern.isin(["MF", "FF"])],
    [0, 1],
    default=2,
)

In [None]:
# Recode ABO_RH: collapse subtype / weak / H-L variants and upper-case codes
# into eight labels. Codes not listed below are left unchanged.
abo_rh_variants = {
    'Apos': ['A2POS', 'wAPOS', 'APOS'],
    'Aneg': ['A2NEG', 'ANEGL', 'wANEG', 'ANEG'],
    'Bpos': ['BwPOS', 'BPOS'],
    'Bneg': ['BNEG'],
    'ABpos': ['A2BPOS', 'wABPOS', 'ABPOS'],
    'ABneg': ['wABNEG', 'ABNEG'],
    'Opos': ['OPOSH', 'OPOSL', 'OPOS'],
    'Oneg': ['ONEGH', 'ONEG'],
}

for label, raw_codes in abo_rh_variants.items():
    df.loc[df['ABO_RH'].isin(raw_codes), 'ABO_RH'] = label


df['ABO_RH'].value_counts()



In [None]:
# Number of rows per race value.
df['race'].value_counts()

In [None]:
# Check whether a donor's next visit is at the same MobileID.
# Flags are produced for mobile visits only, fixed visits only, and overall.
#
# The next row must belong to the same donor. Without that check, the last row
# of one donor and the first row of the next donor would count as a "return"
# whenever they share a MobileID. Relies on rows being ordered by donor and
# visit, as in the other shift(-1)-based columns.
same_donor_next = df['DonorID'] == df['DonorID'].shift(-1)
same_site_next = same_donor_next & (df['MobileID'] == df['MobileID'].shift(-1))

df.loc[(df.Fixed_mobile == 0) & same_site_next, 'return_to_same_ID_by_dn'] = 'mm'  # mobile only
df.loc[(df.Fixed_mobile == 1) & same_site_next, 'return_to_same_ID_by_dn'] = 'ff'  # fixed only
df.loc[same_site_next, 'return_to_same_ID'] = 1  # overall


In [None]:
# Add opportunities to donate in the next 12 months (count_in_next_12_months).
# The table is computed in 00_mobileID.ipynb and saved as mID_table.csv.
df_mID = pd.read_csv("../1_data/private/mID_table.csv")

# Parse the join key into timezone-naive datetimes so it matches
# df['Visit_Date']. Unit-less astype("datetime64") is rejected by pandas >= 2.
df_mID["Visit_Date"] = pd.to_datetime(df_mID["Visit_Date"], utc=True).dt.tz_localize(None)

# Left join on site and date: rows of df without a match get NaN.
# NOTE(review): if mID_table.csv holds more than one row per
# (MobileID, Visit_Date), this join duplicates visits -- confirm key uniqueness.
df = pd.merge(df, df_mID[['MobileID', 'Visit_Date', 'count_in_next_12_months']], how='left', on=["MobileID", "Visit_Date"])


In [None]:
# Band count_in_next_12_months into Opp_to_donate labels. Values matching no
# band (including missing counts) are left as NaN.
opportunities = df.count_in_next_12_months
opportunity_bands = [
    (opportunities == 1, "Annual_1"),
    (opportunities == 2, "Bi_Annual_2"),
    ((opportunities >= 3) & (opportunities <= 6), "3_6"),
    ((opportunities > 6) & (opportunities <= 12), "7_12"),
    (opportunities > 12, "12_more"),
]
for in_band, label in opportunity_bands:
    df.loc[in_band, 'Opp_to_donate'] = label

df['Opp_to_donate'].value_counts()

In [None]:
# Days elapsed between each visit and the donor's earliest visit in df.
# The earliest visit itself gets 0. A per-donor diff() would instead give the
# gap to the previous visit and NaN on the first row, which is not what the
# column name describes and would make any "<= 365" filter discard every
# donor's first visit.
# NOTE(review): "first" means the earliest Visit_Date present in df for that
# donor, after the filtering done so far -- confirm that is the intended anchor.
df = df.sort_values(by=['DonorID', 'Visit_Date'])
first_visit_date = df.groupby('DonorID')['Visit_Date'].transform('min')
df['TimeSinceFirstDonation'] = (df['Visit_Date'] - first_visit_date).dt.days

In [None]:
# For every donor with a first_time == 1 row, collect the sequence of
# OUTCOME_TYPE values over the rows where TimeSinceFirstDonation <= 365.
# No filter on hemoglobin deferral is applied here. Rows whose
# TimeSinceFirstDonation is NaN are excluded, since NaN <= 365 is False.
first_time_comp = df.loc[df['first_time'] == 1]

in_cohort = df['DonorID'].isin(first_time_comp['DonorID'])
within_year = df['TimeSinceFirstDonation'] <= 365
filtered_df = df.loc[in_cohort & within_year]

# One entry per donor: the list of outcomes in row order.
outcomes_by_donor = filtered_df.groupby('DonorID')["OUTCOME_TYPE"].apply(list)

# Spread each sequence across integer columns 0, 1, 2, ... (one row per donor,
# positional index; shorter sequences are padded with NaN).
outcome_sequences = outcomes_by_donor.reset_index(drop=True).apply(tuple)
grouped_filtered_df = outcome_sequences.apply(pd.Series)


In [None]:
group_df2 = grouped_filtered_df

# Split donors by the outcome in column 0 of their sequence.
first_outcome = group_df2[0]
comp = group_df2.loc[first_outcome == 'completed']
hgb = group_df2.loc[first_outcome == 'low hgb']

# Consecutive pairs, using columns 0 and 1 of the sequence.
comp2 = comp.loc[comp[1] == 'completed']    # completed, then completed
hgb2 = hgb.loc[hgb[1] == 'low hgb']         # low hgb, then low hgb
comp_hgb = comp.loc[comp[1] == 'low hgb']   # completed, then low hgb

# Probability that a donor in each cohort has a further visit, i.e. a
# non-missing entry in column 2 of the sequence.
return_share = {}
for key, cohort in (('cc', comp2), ('hh', hgb2), ('ch', comp_hgb)):
    num = cohort[2].notna().sum()
    return_share[key] = num / len(cohort)

prob_f_cc = return_share['cc']  # after two completed donations
prob_f_hh = return_share['hh']  # after two hgb deferrals
prob_f_ch = return_share['ch']  # after a completed donation, then a hgb deferral


In [None]:
# Low-hgb deferral rates by sex, for first-time, repeat and all visits.


def _low_hgb_rates(visits):
    """Return [female, male, all] shares of rows with OUTCOME_TYPE 'low hgb'."""
    deferred = visits.OUTCOME_TYPE == 'low hgb'
    rates = []
    for sex_code in ('F', 'M'):
        of_sex = visits.sex == sex_code
        rates.append(len(visits.loc[of_sex & deferred]) / len(visits.loc[of_sex]))
    rates.append(len(visits.loc[deferred]) / len(visits))
    return rates


first = df.query("first_time == 1")
repeat = df.query("first_time == 0")

# Column-oriented table: a label column, then one column per donor group.
table = {'-': ['Female', 'Male', 'All']}
for group_label, visits in (('First Time', first), ('Repeat', repeat), ('All', df)):
    female, male, alls = _low_hgb_rates(visits)
    table[group_label] = [female, male, alls]


In [None]:
# Summary statistics to report.
outcome_counts = df.OUTCOME_TYPE.value_counts()
n_visits = len(df)

is_fixed = df['Fixed_mobile'] == 1
is_mobile = df['Fixed_mobile'] == 0
is_low_hgb = df['OUTCOME_TYPE'] == "low hgb"
n_fixed = len(df.loc[is_fixed])
n_mobile = len(df.loc[is_mobile])

print("-----------total deferrals------------")
print(outcome_counts)  # rows per outcome type
print(n_visits)  # total observations

print('-----------total counts of outcomes---------')
print(outcome_counts)

print("-----------deferral rate by fixed mobile---------")
print(len(df.loc[is_low_hgb & is_fixed]) / n_fixed)    # fixed sites
print(len(df.loc[is_low_hgb & is_mobile]) / n_mobile)  # mobile sites

print('------------% of fixed/mobile--------------')
print(n_fixed / n_visits)
print(n_mobile / n_visits)

print('--------------deferral rate table by sex -----------')
# `table` is the dict of rates built in the deferral-rate cell.
table = pd.DataFrame.from_dict(table)
print(table)

print('--------donor loyalty to mobileID--------------')
print(len(df.loc[df.return_to_same_ID_by_dn == 'mm']) / n_mobile)  # mobile only
print(len(df.loc[df.return_to_same_ID_by_dn == 'ff']) / n_fixed)   # fixed only
print(len(df.loc[df.return_to_same_ID == 1]) / n_visits)           # overall


print('---------total donors---------------')
print(df.DonorID.nunique())
print('---------total visits---------------')
print(n_visits)


In [None]:
# Save the preprocessed frame as CSV without the row index.
# No columns are dropped in this cell; every column of df is written.
df.to_csv("../1_data/private/preprocessed_data.csv", index=False)
