# Data Prep

The data sources are a REDCap audit logging file, a REDCap report, and a CSV extract from Clarity. 

## Imports

In [1]:
import pandas as pd
from datetime import datetime
from datetime import date
import numpy as np

# cap the displayed width of each column at 75 characters
pd.set_option('display.max_colwidth', 75)

# REDCap audit log (REDCap > Sidebar > Logging > Export all logging (CSV))
log = pd.read_csv('data/raw/UCIInternalREDCapPCORIProject_Logging_2021-01-13_2027.csv')

# REDCap "report" extract of useful data (REDCap > Sidebar > Data Exports, Reports, and Stats)
redcap_extract = pd.read_csv('data/raw/UCIInternalREDCapPCO-AdminExportForRecrui_DATA_2021-01-21_1832.csv')

# Clarity extract used for intervention tracking
clarity_extract = pd.read_csv('data/raw/20201229_PCORIWeeklyExtract.csv')

# mailing lists for the 2nd round (duplicate baseline research surveys), one file per list A-D
round_2_mailing_a, round_2_mailing_b, round_2_mailing_c, round_2_mailing_d = (
    pd.read_csv(mailing_list_path)
    for mailing_list_path in (
        'data/raw/2.5.20_mailing_list_A.csv',
        'data/raw/2.5.20_mailing_list_B.csv',
        'data/raw/2.5.20_mailing_list_C.csv',
        'data/raw/2.5.20_mailing_list_D.csv',
    )
)

## Data Cleaning

### Log: Split into round 1 / 2 / 3 call dataframes

In [2]:
def select_call_updates(updates, contact_date_field):
    """Return a copy of the rows of `updates` whose change list mentions `contact_date_field`.

    updates: DataFrame of REDCap log rows with a
        'List of Data Changes OR Fields Exported' column.
    contact_date_field: field name to look for, e.g. "contact1_dt".
    Rows whose change list is missing are excluded (na=False).
    """
    mentions_field = updates['List of Data Changes OR Fields Exported'].str.contains(contact_date_field, na=False)
    return updates[mentions_field].copy()


# make a new df with only REDCap Patient Record Updates
# na=False: a log row with a missing 'Action' is treated as "not an update" instead of
# raising when the mask is used for boolean indexing (consistent with the filters below)
log_updates = log[log['Action'].str.contains("Updated", na=False)].copy()

# make a new df with only REDCap Patient Record Updates + Phone Call 1 / 2 / 3 Attempts
# assumes that RA updating the contact date field in Internal REDCap = call completed
log_updates_call1 = select_call_updates(log_updates, "contact1_dt")
log_updates_call2 = select_call_updates(log_updates, "contact2_dt")
log_updates_call3 = select_call_updates(log_updates, "contact3_dt")

round_1_update_count = log_updates_call1.shape[0]
round_2_update_count = log_updates_call2.shape[0]
round_3_update_count = log_updates_call3.shape[0]

print("# of Round 1 Call Updates: ", round_1_update_count)
print("# of Round 2 Call Updates: ", round_2_update_count)
print("# of Round 3 Call Updates: ", round_3_update_count)
print("\nTotal Call Updates: ", round_1_update_count + round_2_update_count + round_3_update_count)

# of Round 1 Call Updates:  790
# of Round 2 Call Updates:  509
# of Round 3 Call Updates:  287

Total Call Updates:  1586


### Log: obtain study ID

In [3]:
# the study ID is taken as the third space-separated token of the 'Action' string
# NOTE(review): presumably 'Action' looks like "Updated Record <study_id>" -- confirm that
# no variant of the action text puts an extra token before the ID
for call_log in (log_updates_call1, log_updates_call2, log_updates_call3):
    action_tokens = call_log['Action'].str.split(' ')
    call_log['study_id'] = action_tokens.str.get(2)

### Log: handle duplicates

In [4]:
def _count_rows_with_repeated_id(frame):
    """Number of rows in `frame` whose study_id occurs on more than one row."""
    return frame[frame['study_id'].duplicated(keep=False)].shape[0]


# parse the log timestamp strings into datetimes (in place, on each round's frame)
for call_log in (log_updates_call1, log_updates_call2, log_updates_call3):
    call_log['Time / Date'] = pd.to_datetime(call_log['Time / Date'])

# order each round's updates chronologically, oldest first
log_updates_call1, log_updates_call2, log_updates_call3 = (
    call_log.sort_values(by=['Time / Date'])
    for call_log in (log_updates_call1, log_updates_call2, log_updates_call3)
)

# how many rows share a study ID with another row, before de-duplication
print("original round 1 duplicates: ", _count_rows_with_repeated_id(log_updates_call1))
print("original round 2 duplicates: ", _count_rows_with_repeated_id(log_updates_call2))
print("original round 3 duplicates: ", _count_rows_with_repeated_id(log_updates_call3))

# keep one row per study ID: the last one in chronological order (the most recent update)
# NOTE(review): rows with identical timestamps have no guaranteed relative order after
# sort_values with the default sort kind, so ties are broken arbitrarily
unique_log_updates_call1, unique_log_updates_call2, unique_log_updates_call3 = (
    call_log.drop_duplicates(subset=['study_id'], keep='last')
    for call_log in (log_updates_call1, log_updates_call2, log_updates_call3)
)

# same check after de-duplication (expected to be 0 for every round)
print("\nnew round 1 duplicates: ", _count_rows_with_repeated_id(unique_log_updates_call1))
print("new round 2 duplicates: ", _count_rows_with_repeated_id(unique_log_updates_call2))
print("new round 3 duplicates: ", _count_rows_with_repeated_id(unique_log_updates_call3))

round_1_call_count = unique_log_updates_call1.shape[0]
round_2_call_count = unique_log_updates_call2.shape[0]
round_3_call_count = unique_log_updates_call3.shape[0]

print("\n# of Round 1 Calls: ", round_1_call_count)
print("# of Round 2 Calls: ", round_2_call_count)
print("# of Round 3 Calls: ", round_3_call_count)
print("\nTotal Calls: ", round_1_call_count + round_2_call_count + round_3_call_count)

original round 1 duplicates:  0
original round 2 duplicates:  7
original round 3 duplicates:  2

new round 1 duplicates:  0
new round 2 duplicates:  0
new round 3 duplicates:  0

# of Round 1 Calls:  790
# of Round 2 Calls:  505
# of Round 3 Calls:  286

Total Calls:  1581


### Log: extract useful data from strings

In [5]:
# log column holding the comma-separated list of field changes for each update
CHANGES_COLUMN = 'List of Data Changes OR Fields Exported'

# call outcome codes -> real words, per UCI's REDCap Codebook
CALL_OUTPUT_LABELS = {
    '1': "No answer/unable to leave VM/busy/disconnected",
    '2': "Left a message",
    '3': "Call back later",
    '4': "Hasn't received packet yet, call back in one week",
    '5': "Hasn't received packet and team needs to resend in the mail",
    '6': "Send link to the survey",
    '7': "Completed survey by phone",
    '8': "Patient refused",
    '9': "Deceased",
    '10': "Other Notes",
}

# verbal consent codes -> real words
VERBAL_CONSENT_LABELS = {
    '0': "No Verbal Consent",
    '1': "Yes Verbal Consent",
}


def split_log_changes(unique_updates):
    """Split each row's change list on ',' into one column per fragment.

    Returns a DataFrame with the 'study_id' column followed by the integer-named
    fragment columns produced by str.split(expand=True); the index of
    `unique_updates` is preserved.

    NOTE: free-text values that themselves contain a comma are split as well, so
    only the fragment that still carries the field name (the text before the first
    comma) is picked up by extract_round_updates.
    """
    fragments = unique_updates[CHANGES_COLUMN].str.split(',', expand=True)
    return pd.concat([unique_updates['study_id'], fragments], axis=1)


def _decode_field(values, field, labels):
    """Replace "<field> = '<code>'" in `values` with the label for that code.

    Leading whitespace is optional, so a field that is the first fragment of the
    change list (no space in front of it) is decoded too. Values whose code is not
    in `labels` are left untouched.
    """
    patterns = {rf"\s*{field} = '{code}'": label for code, label in labels.items()}
    return values.replace(patterns, regex=True)


def extract_round_updates(split_updates, round_number):
    """Pick the date, outcome, notes and verbal-consent fragments for one call round.

    split_updates: output of split_log_changes for that round.
    round_number: 1, 2 or 3; selects the 'contact<N>_*' field names.

    Returns a DataFrame indexed like `split_updates` with the columns
    'contact<N>_dt' (datetime), 'contact<N>_output' (decoded label),
    'contact<N>_nt' (raw fragment) and 'verbal_yn' (decoded label).
    """
    date_field = f'contact{round_number}_dt'
    output_field = f'contact{round_number}_output'
    notes_field = f'contact{round_number}_nt'
    consent_field = 'verbal_yn'

    updates = pd.DataFrame(index=split_updates.index)
    for field in (date_field, output_field, notes_field, consent_field):
        # blank out every cell that does not mention the field, then forward fill across
        # the row and take the last column: this picks the matching fragment regardless
        # of which column it landed in (the last match wins if there are several)
        matching = split_updates.apply(
            lambda column: column.where(column.str.contains(field, na=False))
        )
        updates[field] = matching.ffill(axis=1).iloc[:, -1]

    # pull the date out of the "contact<N>_dt = '...'" fragment; assigning by column name
    # (rather than .iloc[:, 0] = ...) guarantees the column becomes datetime64
    updates[date_field] = pd.to_datetime(
        updates[date_field].str.extract(r'(\d{1,4}-\d{1,2}-\d{1,2})')[0]
    )

    updates[output_field] = _decode_field(updates[output_field], output_field, CALL_OUTPUT_LABELS)
    updates[consent_field] = _decode_field(updates[consent_field], consent_field, VERBAL_CONSENT_LABELS)
    return updates


def build_clean_round(unique_updates, round_updates, round_number):
    """Combine study ID, caller, log timestamp and parsed updates for one call round.

    Columns are renamed to study_id, caller_username_<N>, call_timestamp_<N>,
    call_date_<N>, call_output_<N>, call_notes_<N>, call_verbal_consent_<N>.
    """
    clean = pd.concat(
        [unique_updates[['study_id', 'Username', 'Time / Date']], round_updates],
        axis=1,
    )
    clean.columns = [
        'study_id',
        f'caller_username_{round_number}',
        f'call_timestamp_{round_number}',
        f'call_date_{round_number}',
        f'call_output_{round_number}',
        f'call_notes_{round_number}',
        f'call_verbal_consent_{round_number}',
    ]
    return clean


# extract individual updates from long string ('List of Data Changes OR Fields Exported'), based on delimiter ','
split_unique_log_updates_call1 = split_log_changes(unique_log_updates_call1)
split_unique_log_updates_call2 = split_log_changes(unique_log_updates_call2)
split_unique_log_updates_call3 = split_log_changes(unique_log_updates_call3)

# extract updates specific to round 1 / 2 / 3 phone calls
updates_round_1 = extract_round_updates(split_unique_log_updates_call1, 1)
updates_round_2 = extract_round_updates(split_unique_log_updates_call2, 2)
updates_round_3 = extract_round_updates(split_unique_log_updates_call3, 3)

# create new dataframes with relevant columns for round 1 / 2 / 3 calls
clean_round_1 = build_clean_round(unique_log_updates_call1, updates_round_1, 1)
clean_round_2 = build_clean_round(unique_log_updates_call2, updates_round_2, 2)
clean_round_3 = build_clean_round(unique_log_updates_call3, updates_round_3, 3)

print("column names: ", clean_round_1.columns.values.tolist())
print("shape of round 1 dataset: ", clean_round_1.shape)
print("shape of round 2 dataset: ", clean_round_2.shape)
print("shape of round 3 dataset: ", clean_round_3.shape)

column names:  ['study_id', 'caller_username_1', 'call_timestamp_1', 'call_date_1', 'call_output_1', 'call_notes_1', 'call_verbal_consent_1']
shape of round 1 dataset:  (790, 7)
shape of round 2 dataset:  (505, 7)
shape of round 3 dataset:  (286, 7)


### Log: (UCI ONLY) shift incorrect datetimes

In [6]:
# UCI-Only Correction: timestamps created during 2/3 - 2/5 are 8 hours ahead, due to a REDCap upgrade error
# only the timestamps that come from the REDCap audit log are affected, not the "call_date" which is input by users
# NOTE(review): the original note gives the incorrect range as 02/03/2020 5:30pm to 02/06/2020 1:50am,
#               but the lower bound used below is 16:30 (4:30pm) -- confirm which one is intended
# NOTE: this cell modifies clean_round_1 / 2 / 3 in place; running it a second time shifts again any
#       timestamp that still falls inside the window after the first shift
BAD_WINDOW_START = '2020-02-03 16:30:00'
BAD_WINDOW_END = '2020-02-06 02:00:00'
EIGHT_HOURS_EARLIER = pd.DateOffset(hours=-8)


def _rows_in_bad_window(frame, timestamp_column):
    """Index labels of rows whose timestamp lies strictly inside the incorrect window."""
    timestamps = frame[timestamp_column]
    inside_window = (timestamps > BAD_WINDOW_START) & (timestamps < BAD_WINDOW_END)
    return frame[inside_window].index.values.tolist()


# store indices of calls logged inside the incorrect range, then move those timestamps 8 hrs earlier
round_1_mistakes_index = _rows_in_bad_window(clean_round_1, 'call_timestamp_1')
clean_round_1.loc[round_1_mistakes_index, 'call_timestamp_1'] += EIGHT_HOURS_EARLIER

round_2_mistakes_index = _rows_in_bad_window(clean_round_2, 'call_timestamp_2')
clean_round_2.loc[round_2_mistakes_index, 'call_timestamp_2'] += EIGHT_HOURS_EARLIER

round_3_mistakes_index = _rows_in_bad_window(clean_round_3, 'call_timestamp_3')
clean_round_3.loc[round_3_mistakes_index, 'call_timestamp_3'] += EIGHT_HOURS_EARLIER

### REDCap Extract: convert number coding to strings

In [7]:
# NOTE: the columns "gift_card_type" and "survery_completed_method" are read in as floats,
#       which is problematic when converting to strings (codes would carry a decimal part);
#       Int64 lets null values coexist with ints
for nullable_int_column in ('gift_card_type', 'survery_completed_method'):
    redcap_extract[nullable_int_column] = redcap_extract[nullable_int_column].astype('Int64')

# every cell becomes its str() representation (missing values become strings as well)
redcap_extract = redcap_extract.applymap(str)

# the date columns go back to datetime
for date_column in ('survey_completed_dt', 'consent_received_dt', 'hipaa_received_dt', 'consent_mailed_dt'):
    redcap_extract[date_column] = pd.to_datetime(redcap_extract[date_column])

# number codes -> strings, keyed by column POSITION in the extract
# NOTE(review): positions 2 / 3 / 6 / 10 presumably correspond to survey_language_sent,
#               hipaa_sent_yn, survery_completed_method and gift_card_type -- confirm against the
#               export, since a change in the report's column order would silently recode the wrong column
# NOTE: replacements are regex substring matches, applied one code at a time in the order listed
code_labels_by_position = {
    2: [("0", "English"), ("1", "Spanish")],
    3: [("0", "No HIPAA Sent"), ("1", "Yes HIPAA Sent")],
    6: [("1", "Paper"), ("2", "Phone"), ("3", "Email")],
    10: [("1", "e-Gift Card"), ("2", "Physical Gift Card"), ("3", "Target Gift Card"),
         ("4", "Patient Declined Gift Card")],
}
for position, code_labels in code_labels_by_position.items():
    for code, label in code_labels:
        redcap_extract.iloc[:, position] = redcap_extract.iloc[:, position].replace({code: label}, regex=True)

# add date that 1st bulk mailing was dropped off at USPS
redcap_extract['survey_mailing_date_1'] = pd.to_datetime('11/13/2019')

# NOTE: may want to add some additional items from Internal REDCap
useful_redcap_cols = [
    'study_id',
    'survey_language_sent',
    'hipaa_sent_yn',
    'survey_completed_dt',
    'consent_received_dt',
    'survery_completed_method',
    'hipaa_received_dt',
    'caregiver_name',
    'consent_mailed_dt',
    'gift_card_type',
    'opt_out_pat_dt',
    'opted_out_patient_reasons',
    'opted_out_patient_other',
    'opted_out_patient_transcription',
    'survey_mailing_date_1',
]

# filter() keeps only the listed columns that exist in the extract
clean_redcap_extract = redcap_extract.filter(useful_redcap_cols).copy()

### Round 2 Mailing: merge mailing lists

In [8]:
# stack the 4 mailing lists that were sent to the mail vendor
all_mailings = [round_2_mailing_a, round_2_mailing_b, round_2_mailing_c, round_2_mailing_d]
all_round_2_mailings = pd.concat(all_mailings)

# every row gets the date the 2nd bulk mailing was dropped off at USPS
all_round_2_mailings['survey_mailing_date_2'] = pd.to_datetime('2/18/2020')

# keep only the necessary columns, with study_id as a string
useful_round_2_mailing_cols = ['study_id', 'survey_mailing_date_2']
clean_round_2_mailings = (
    all_round_2_mailings
    .filter(useful_round_2_mailing_cols)
    .copy()
    .astype({'study_id': str})
)

### Clarity Extract: calculate age and convert data types

In [9]:
# birth and death dates become datetimes
for date_column in ('birth_date', 'death_date'):
    clarity_extract[date_column] = pd.to_datetime(clarity_extract[date_column])

# study_id becomes a string, for consistency with the other datasets
study_id_as_text = clarity_extract['study_id'].astype(str)
clarity_extract['study_id'] = study_id_as_text

# calculate age in completed years as of today's date, unless patient is dead (then use deceased date)
def calculate_age(dob, dod, today=None):
    """Return the age in completed years.

    dob: date of birth (anything with .year / .month / .day, e.g. a pandas Timestamp).
    dod: date of death, or a missing value (NaT / None / NaN) if the patient is alive.
    today: reference date used when `dod` is missing; defaults to date.today().
        Pass a fixed date to make the result reproducible.

    Returns an int, or NaN when `dob` is missing.
    """
    if pd.isna(dob):
        return float('nan')

    if pd.isna(dod):
        end = today if today is not None else date.today()
    else:
        end = dod

    # subtracting years alone overstates the age by one whenever the birthday
    # has not yet been reached in the end year
    birthday_not_reached = (end.month, end.day) < (dob.month, dob.day)
    return end.year - dob.year - int(birthday_not_reached)

# one age per patient, from the birth / death date pair on each row
clarity_extract['age'] = [
    calculate_age(birth_date, death_date)
    for birth_date, death_date in zip(clarity_extract['birth_date'], clarity_extract['death_date'])
]


# TODO: ask Rick for marital status, religion
useful_clarity_cols = [
    'study_id',
    'sex',
    'race',
    'ethnicity',
    'age',
    'spoken_language',
    'written_language',
]

# filter() keeps only the listed columns that exist in the extract
clean_clarity_extract = clarity_extract.filter(useful_clarity_cols).copy()

## Join REDCap Extract, REDCap Log, and Clarity Extract

In [10]:
# one patient per line with all call data in columns: round 1 calls, then round 2 and round 3 alongside
# NOTE: these are left joins on round 1, so a study_id that appears only in round 2 or round 3
#       does not bring its call data into the result
clean_1_2_calls = clean_round_1.merge(clean_round_2, on='study_id', how='left')
clean_all_calls = clean_1_2_calls.merge(clean_round_3, on='study_id', how='left')

# outer join: patients who were called plus patients found in the REDCap report / extract
calls_and_redcap = clean_all_calls.merge(clean_redcap_extract, on='study_id', how='outer')

# attach the 2nd round mailing date where there is one
calls_and_redcap_and_mailing = calls_and_redcap.merge(clean_round_2_mailings, on='study_id', how='left')

# attach Clarity data for patients who were called and / or returned surveys
all_recruit = calls_and_redcap_and_mailing.merge(clean_clarity_extract, on='study_id', how='left')

print("Total # of patients who received recruitment attempts: ", len(all_recruit))
#print("\n\nColumns: ", all_recruit.columns.values.tolist())

Total # of patients who received recruitment attempts:  892


In [11]:
# sanity check: list the dtype of every column in the merged recruitment dataset
print(all_recruit.dtypes)

study_id                                   object
caller_username_1                          object
call_timestamp_1                   datetime64[ns]
call_date_1                        datetime64[ns]
call_output_1                              object
call_notes_1                               object
call_verbal_consent_1                      object
caller_username_2                          object
call_timestamp_2                   datetime64[ns]
call_date_2                        datetime64[ns]
call_output_2                              object
call_notes_2                               object
call_verbal_consent_2                      object
caller_username_3                          object
call_timestamp_3                   datetime64[ns]
call_date_3                        datetime64[ns]
call_output_3                              object
call_notes_3                               object
call_verbal_consent_3                      object
survey_language_sent                       object


In [12]:
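# export is disabled: uncomment the line below to write the merged recruitment cohort to data/processed/
#all_recruit.to_csv('data/processed/UCI_recruitment_cohort.csv', index=False)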