# Data Prep

The data source is a REDCap audit logging file, which can be exported by going to REDCap > Sidebar > Logging > Export all logging (CSV). 

## Imports

In [46]:
import pandas as pd
from datetime import datetime
import numpy as np

pd.options.display.max_colwidth = 75

# import logging file from REDCap (REDCap > Sidebar > Logging > Export all logging (CSV))
log = pd.read_csv('data/raw/PCORIACPRecruitment_Logging_2020-12-26_0615.csv',encoding = "ISO-8859-1")

# import patient demographics from Clarity/other repository
Patient_Demographics = pd.read_csv('data/raw/demographics.csv',encoding = "ISO-8859-1")

In [2]:
## Functions

In [3]:
def general_missingness(dataframe,dataframe_name):
    print("Length of %s table: %d" % (dataframe_name,len(dataframe)))
    print()
    print( "Count Missing in Each Column:")
    print(dataframe.isnull().sum())

def print_counts_per_column(dataframe,column_list):
    for col_name in column_list:
        print('Column Name: %s' % col_name)
        print(dataframe[col_name].value_counts())
        print()

## Data Cleaning

### Split logging file into round 1 / 2 / 3 call dataframes

In [4]:
# make a new df with only REDCap Patient Record Updates
log_updates = log[log['Action'].str.contains("Updated")].copy()

# make a new df with only REDCap Patient Record Updates + Phone Call 1 / 2 / 3 Attempts
# assumes that RA updating the contact date field in Internal REDCAp = call completed
log_updates_call1 = log_updates[log_updates['List of Data Changes OR Fields Exported'].str.contains("contact1_dt", na=False)].copy()
log_updates_call2 = log_updates[log_updates['List of Data Changes OR Fields Exported'].str.contains("contact2_dt", na=False)].copy()
log_updates_call3 = log_updates[log_updates['List of Data Changes OR Fields Exported'].str.contains("contact3_dt", na=False)].copy()

log_updates_caregiver_call1 = log_updates[log_updates['List of Data Changes OR Fields Exported'].str.contains("caregiver_contact1_dt_vad", na=False)].copy()
log_updates_caregiver_call2 = log_updates[log_updates['List of Data Changes OR Fields Exported'].str.contains("caregiver_contact2_dt_vad", na=False)].copy()
log_updates_caregiver_call3 = log_updates[log_updates['List of Data Changes OR Fields Exported'].str.contains("caregiver_contact3_dt_vad", na=False)].copy()

print("# of Round 1 Call Updates: ", log_updates_call1.shape[0])
print("# of Round 2 Call Updates: ", log_updates_call2.shape[0])
print("# of Round 3 Call Updates: ", log_updates_call3.shape[0])
print("\nTotal Call Updates: ", log_updates_call1.shape[0] + log_updates_call2.shape[0] + log_updates_call3.shape[0])
print("# of Round 1 Caregiver Call Updates: ", log_updates_caregiver_call1.shape[0])
print("# of Round 2 Caregiver Call Updates: ", log_updates_caregiver_call2.shape[0])
print("# of Round 3 Caregiver Call Updates: ", log_updates_caregiver_call3.shape[0])


# of Round 1 Call Updates:  2773
# of Round 2 Call Updates:  370
# of Round 3 Call Updates:  29

Total Call Updates:  3172
# of Round 1 Caregiver Call Updates:  4
# of Round 2 Caregiver Call Updates:  0
# of Round 3 Caregiver Call Updates:  0


### Obtain study ID

In [5]:
# extract study ID from long string ('Action')
log_updates_call1['study_id'] = log_updates_call1['Action'].str.split(' ').str[2]
log_updates_call2['study_id'] = log_updates_call2['Action'].str.split(' ').str[2]
log_updates_call3['study_id'] = log_updates_call3['Action'].str.split(' ').str[2]

### Handle duplicates

In [6]:
# convert dtype from string to datetime
log_updates_call1['Time / Date'] = pd.to_datetime(log_updates_call1['Time / Date'])
log_updates_call2['Time / Date'] = pd.to_datetime(log_updates_call2['Time / Date'])
log_updates_call3['Time / Date'] = pd.to_datetime(log_updates_call3['Time / Date'])

# sort calls by date from oldest to newest
log_updates_call1 = log_updates_call1.sort_values(by=['Time / Date'])
log_updates_call2 = log_updates_call2.sort_values(by=['Time / Date'])
log_updates_call3 = log_updates_call3.sort_values(by=['Time / Date'])

# check for duplicate updates to 1 study ID
print("original round 1 duplicates: ", log_updates_call1[log_updates_call1['study_id'].duplicated(keep=False)].shape[0])
print("original round 2 duplicates: ", log_updates_call2[log_updates_call2['study_id'].duplicated(keep=False)].shape[0])
print("original round 3 duplicates: ", log_updates_call3[log_updates_call3['study_id'].duplicated(keep=False)].shape[0])

# when there are multiple updates to 1 study ID, only take the most recent update
unique_log_updates_call1 = log_updates_call1.drop_duplicates(subset=['study_id'], keep='last')
unique_log_updates_call2 = log_updates_call2.drop_duplicates(subset=['study_id'], keep='last')
unique_log_updates_call3 = log_updates_call3.drop_duplicates(subset=['study_id'], keep='last')

# check for duplicate updates to 1 study ID
print("\nnew round 1 duplicates: ", unique_log_updates_call1[unique_log_updates_call1['study_id'].duplicated(keep=False)].shape[0])
print("new round 2 duplicates: ", unique_log_updates_call2[unique_log_updates_call2['study_id'].duplicated(keep=False)].shape[0])
print("new round 3 duplicates: ", unique_log_updates_call3[unique_log_updates_call3['study_id'].duplicated(keep=False)].shape[0])

print("\n# of Round 1 Calls: ", unique_log_updates_call1.shape[0])
print("# of Round 2 Calls: ", unique_log_updates_call2.shape[0])
print("# of Round 3 Calls: ", unique_log_updates_call3.shape[0])
print("\nTotal Calls: ", unique_log_updates_call1.shape[0] + unique_log_updates_call2.shape[0] + unique_log_updates_call3.shape[0])

original round 1 duplicates:  40
original round 2 duplicates:  2
original round 3 duplicates:  0

new round 1 duplicates:  0
new round 2 duplicates:  0
new round 3 duplicates:  0

# of Round 1 Calls:  2752
# of Round 2 Calls:  369
# of Round 3 Calls:  29

Total Calls:  3150


### Extract useful data from strings

In [7]:
# extract individual updates from long string ('List of Data Changes OR Fields Exported'), based on delimiter ','
split_unique_log_updates_call1 = pd.concat([unique_log_updates_call1['study_id'], 
                                     unique_log_updates_call1['List of Data Changes OR Fields Exported'].str.split(',', expand=True)],
                                    axis=1,)
split_unique_log_updates_call2 = pd.concat([unique_log_updates_call2['study_id'],
                                     unique_log_updates_call2['List of Data Changes OR Fields Exported'].str.split(',', expand=True)],
                                    axis=1,)
split_unique_log_updates_call3 = pd.concat([unique_log_updates_call3['study_id'],
                                     unique_log_updates_call3['List of Data Changes OR Fields Exported'].str.split(',', expand=True)],
                                    axis=1,)



In [8]:


# extract updates specific to round 1 / 2 / 3 phone calls 
# make a boolean mask of all columns in the round 1 / 2 / 3 dataframe, True for matching strings
# forward fill rows with matching strings and take only the last value 
# basically picks out the matching value regardless of column location and places it into the correct column

# round 1
df = split_unique_log_updates_call1.copy()
strings = ['contact1_dt', 'contact1_output', 'contact1_nt', 'verbal_yn']
updates_round_1 = pd.DataFrame()

for s in strings:
    for col in df: 
        df[col] = df[col].mask(~df[col].str.contains(s, na=False))
    updates_round_1[s] = df.ffill(axis=1).iloc[:, -1]
    df = split_unique_log_updates_call1.copy()

In [10]:


# extracts date from 'contact1_dt' output
updates_round_1.iloc[:,0] = pd.to_datetime(updates_round_1.iloc[:,0].str.extract('(\d{1,4}-\d{1,2}-\d{1,2})')[0])

# convert call code outputs to real words, per UCI's REDCap Codebook 
updates_round_1.iloc[:,1] = updates_round_1.iloc[:,1].replace({"contact1_output = '1'": "No answer/unable to leave VM/busy/disconnected"}, regex=True)
updates_round_1.iloc[:,1] = updates_round_1.iloc[:,1].replace({"contact1_output = '2'": "Left a message"}, regex=True)
updates_round_1.iloc[:,1] = updates_round_1.iloc[:,1].replace({"contact1_output = '3'": "Call back later"}, regex=True)
updates_round_1.iloc[:,1] = updates_round_1.iloc[:,1].replace({"contact1_output = '4'": "Hasn't received packet yet, call back in one week"}, regex=True)
updates_round_1.iloc[:,1] = updates_round_1.iloc[:,1].replace({"contact1_output = '5'": "Hasn't received packet and team needs to resend in the mail"}, regex=True)
updates_round_1.iloc[:,1] = updates_round_1.iloc[:,1].replace({"contact1_output = '6'": "Send link to the survey"}, regex=True)
updates_round_1.iloc[:,1] = updates_round_1.iloc[:,1].replace({"contact1_output = '7'": "Completed survey by phone"}, regex=True)
updates_round_1.iloc[:,1] = updates_round_1.iloc[:,1].replace({"contact1_output = '8'": "Patient refused"}, regex=True)
updates_round_1.iloc[:,1] = updates_round_1.iloc[:,1].replace({"contact1_output = '9'": "Deceased"}, regex=True)
updates_round_1.iloc[:,1] = updates_round_1.iloc[:,1].replace({"contact1_output = '10'": "Other Notes"}, regex=True)

updates_round_1.iloc[:,3] = updates_round_1.iloc[:,3].replace({"verbal_yn = '0'": "No Verbal Consent"}, regex=True)
updates_round_1.iloc[:,3] = updates_round_1.iloc[:,3].replace({"verbal_yn = '1'": "Yes Verbal Consent"}, regex=True)
    
    

In [12]:

# round 2
df = split_unique_log_updates_call2.copy()
strings = ['contact2_dt', 'contact2_output', 'contact2_nt', 'verbal_yn']
updates_round_2 = pd.DataFrame()

for s in strings:
    for col in df: 
        df[col] = df[col].mask(~df[col].str.contains(s, na=False))
    updates_round_2[s] = df.ffill(axis=1).iloc[:, -1]
    df = split_unique_log_updates_call2.copy()

updates_round_2.iloc[:,0] = pd.to_datetime(updates_round_2.iloc[:,0].str.extract('(\d{1,4}-\d{1,2}-\d{1,2})')[0])

updates_round_2.iloc[:,1] = updates_round_2.iloc[:,1].replace({"contact2_output = '1'": "No answer/unable to leave VM/busy/disconnected"}, regex=True)
updates_round_2.iloc[:,1] = updates_round_2.iloc[:,1].replace({"contact2_output = '2'": "Left a message"}, regex=True)
updates_round_2.iloc[:,1] = updates_round_2.iloc[:,1].replace({"contact2_output = '3'": "Call back later"}, regex=True)
updates_round_2.iloc[:,1] = updates_round_2.iloc[:,1].replace({"contact2_output = '4'": "Hasn't received packet yet, call back in one week"}, regex=True)
updates_round_2.iloc[:,1] = updates_round_2.iloc[:,1].replace({"contact2_output = '5'": "Hasn't received packet and team needs to resend in the mail"}, regex=True)
updates_round_2.iloc[:,1] = updates_round_2.iloc[:,1].replace({"contact2_output = '6'": "Send link to the survey"}, regex=True)
updates_round_2.iloc[:,1] = updates_round_2.iloc[:,1].replace({"contact2_output = '7'": "Completed survey by phone"}, regex=True)
updates_round_2.iloc[:,1] = updates_round_2.iloc[:,1].replace({"contact2_output = '8'": "Patient refused"}, regex=True)
updates_round_2.iloc[:,1] = updates_round_2.iloc[:,1].replace({"contact2_output = '9'": "Deceased"}, regex=True)
updates_round_2.iloc[:,1] = updates_round_2.iloc[:,1].replace({"contact2_output = '10'": "Other Notes"}, regex=True)

updates_round_2.iloc[:,3] = updates_round_2.iloc[:,3].replace({"verbal_yn = '0'": "No Verbal Consent"}, regex=True)
updates_round_2.iloc[:,3] = updates_round_2.iloc[:,3].replace({"verbal_yn = '1'": "Yes Verbal Consent"}, regex=True)



In [13]:

# round 3
df = split_unique_log_updates_call3.copy()
strings = ['contact3_dt', 'contact3_output', 'contact3_nt', 'verbal_yn']
updates_round_3 = pd.DataFrame()

for s in strings:
    for col in df: 
        df[col] = df[col].mask(~df[col].str.contains(s, na=False))
    updates_round_3[s] = df.ffill(axis=1).iloc[:, -1]
    df = split_unique_log_updates_call3.copy()




In [14]:

updates_round_3.iloc[:,0] = pd.to_datetime(updates_round_3.iloc[:,0].str.extract('(\d{1,4}-\d{1,2}-\d{1,2})')[0])


In [16]:
updates_round_3.iloc[:,1] = updates_round_3.iloc[:,1].replace({"contact3_output = '1'": "No answer/unable to leave VM/busy/disconnected"}, regex=True)
updates_round_3.iloc[:,1] = updates_round_3.iloc[:,1].replace({"contact3_output = '2'": "Left a message"}, regex=True)
updates_round_3.iloc[:,1] = updates_round_3.iloc[:,1].replace({"contact3_output = '3'": "Call back later"}, regex=True)
updates_round_3.iloc[:,1] = updates_round_3.iloc[:,1].replace({"contact3_output = '4'": "Hasn't received packet yet, call back in one week"}, regex=True)
updates_round_3.iloc[:,1] = updates_round_3.iloc[:,1].replace({"contact3_output = '5'": "Hasn't received packet and team needs to resend in the mail"}, regex=True)
updates_round_3.iloc[:,1] = updates_round_3.iloc[:,1].replace({"contact3_output = '6'": "Send link to the survey"}, regex=True)
updates_round_3.iloc[:,1] = updates_round_3.iloc[:,1].replace({"contact3_output = '7'": "Completed survey by phone"}, regex=True)
updates_round_3.iloc[:,1] = updates_round_3.iloc[:,1].replace({"contact3_output = '8'": "Patient refused"}, regex=True)
updates_round_3.iloc[:,1] = updates_round_3.iloc[:,1].replace({"contact3_output = '9'": "Deceased"}, regex=True)
updates_round_3.iloc[:,1] = updates_round_3.iloc[:,1].replace({"contact3_output = '10'": "Other Notes"}, regex=True)
# neded to force str datatype on verbal_yn because all the entries were NaN
updates_round_3.iloc[:,3] = updates_round_3.iloc[:,3].astype(str) 
updates_round_3.iloc[:,3] = updates_round_3.iloc[:,3].replace({"verbal_yn = '0'": "No Verbal Consent"}, regex=True)
updates_round_3.iloc[:,3] = updates_round_3.iloc[:,3].replace({"verbal_yn = '1'": "Yes Verbal Consent"}, regex=True)

In [17]:
# create new dataframes with relevant columns for round 1 / 2 / 3 calls 

# round 1
clean_round_1 = split_unique_log_updates_call1['study_id'].copy()
clean_round_1 = pd.concat((clean_round_1, unique_log_updates_call1['Username']), axis=1)
clean_round_1 = pd.concat((clean_round_1, unique_log_updates_call1['Time / Date']), axis=1)
clean_round_1 = pd.concat((clean_round_1, updates_round_1), axis=1)
clean_round_1.columns = ['study_id', 'username_1', 'timestamp_1', 'call_date_1', 'call_output_1', 'call_notes_1', 'verbal_consent_1']


# round 2 
clean_round_2 = split_unique_log_updates_call2['study_id'].copy()
clean_round_2 = pd.concat((clean_round_2, unique_log_updates_call2['Username']), axis=1)
clean_round_2 = pd.concat((clean_round_2, unique_log_updates_call2['Time / Date']), axis=1)
clean_round_2 = pd.concat((clean_round_2, updates_round_2), axis=1)
clean_round_2.columns = ['study_id', 'username_2', 'timestamp_2', 'call_date_2', 'call_output_2', 'call_notes_2', 'verbal_consent_2']


# round 3
clean_round_3 = split_unique_log_updates_call3['study_id'].copy()
clean_round_3 = pd.concat((clean_round_3, unique_log_updates_call3['Username']), axis=1)
clean_round_3 = pd.concat((clean_round_3, unique_log_updates_call3['Time / Date']), axis=1)
clean_round_3 = pd.concat((clean_round_3, updates_round_3), axis=1)
clean_round_3.columns = ['study_id', 'username_3', 'timestamp_3', 'call_date_3', 'call_output_3', 'call_notes_3', 'verbal_consent_3']


print("column names: ", clean_round_1.columns.values.tolist())
print("shape of round 1 dataset: ", clean_round_1.shape)
print("shape of round 2 dataset: ", clean_round_2.shape)
print("shape of round 3 dataset: ", clean_round_3.shape)

column names:  ['study_id', 'username_1', 'timestamp_1', 'call_date_1', 'call_output_1', 'call_notes_1', 'verbal_consent_1']
shape of round 1 dataset:  (2752, 7)
shape of round 2 dataset:  (369, 7)
shape of round 3 dataset:  (29, 7)


### Merge round 1 / 2 / 3 dataframes on study ID

In [18]:
# merge round 1 / 2 / 3 calls so there is 1 patient per line with all call data in columns 
clean_1_2_calls = pd.merge(clean_round_1, clean_round_2, how="left", on='study_id')
clean_all_calls = pd.merge(clean_1_2_calls, clean_round_3, how="left", on='study_id')

#print("Example output for each patient:\n\n", clean_all_calls.iloc[200,:])


# TODO: merge Clarity extract to obtain other useful data (age, sex, race, ethnicity, etc.)

In [19]:
clean_all_calls.head()

Unnamed: 0,study_id,username_1,timestamp_1,call_date_1,call_output_1,call_notes_1,verbal_consent_1,username_2,timestamp_2,call_date_2,call_output_2,call_notes_2,verbal_consent_2,username_3,timestamp_3,call_date_3,call_output_3,call_notes_3,verbal_consent_3
0,1000003,jsanz,2019-07-12 06:58:00,NaT,,,,,NaT,NaT,,,,,NaT,NaT,,,
1,1000029534,jantoniolopez,2019-09-05 13:13:00,2019-09-05,Completed survey by phone,contact1_nt = 'Patient signed consent and HIPAA forms and will mail ba...,Yes Verbal Consent,,NaT,NaT,,,,,NaT,NaT,,,
2,1000030950,kmsantos,2019-10-08 08:10:00,2019-10-07,Completed survey by phone,contact1_nt = 'Pt called in to complete survey and scored 19/20 on dep...,,,NaT,NaT,,,,,NaT,NaT,,,
3,1000030428,adepaolisdickey,2019-10-23 13:46:00,2019-10-23,Other Notes,contact1_nt = 'States he sent in survey already',,,NaT,NaT,,,,,NaT,NaT,,,
4,1000029439,adepaolisdickey,2019-10-23 13:50:00,2019-10-23,Call back later,contact1_nt = 'States just returned from dialysis and is not feeling w...,,,NaT,NaT,,,,,NaT,NaT,,,


In [43]:
## Demographics

In [58]:
# standarize demographics headers
Patient_Demographics.columns = Patient_Demographics.columns.str.lower()# get counts
print('Patient_Demographics: ', format(Patient_Demographics.shape[0],  ',d'))

# convert study_id to object to match clean_all_calls.study_id
# clean_all_calls.dtypes
Patient_Demographics['study_id'] = Patient_Demographics['study_id'].astype(object)
Patient_Demographics.dtypes

Patient_Demographics:  5,116


study_id                       object
gender                         object
race                           object
ethnicity                      object
marital_status                 object
religion                       object
calc_current_age_death_age    float64
svi_socio_econ                float64
svi_hcomp_lang                float64
svi_mino_lang                 float64
svi_htyp_trans                float64
svi_total                     float64
dtype: object

In [48]:
 general_missingness(Patient_Demographics,'Patient_Demographics')

Length of Patient_Demographics table: 5116

Count Missing in Each Column:
study_id                         0
gender                           5
race                             6
ethnicity                        5
marital_status                   5
religion                         5
calc_current_age_death_age       0
svi_socio_econ                2594
svi_hcomp_lang                2594
svi_mino_lang                 2594
svi_htyp_trans                2594
svi_total                     2594
dtype: int64


In [30]:
# peek at demographics table
Patient_Demographics.head()

Unnamed: 0_level_0,gender,race,ethnicity,marital_status,religion,calc_current_age_death_age,svi_socio_econ,svi_hcomp_lang,svi_mino_lang,svi_htyp_trans,svi_total
study_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000026253,Male,White or Caucasian,Not Hispanic or Latino,Widowed,Jewish,87.0,,,,,
1000026254,Male,White or Caucasian,Not Hispanic or Latino,Married,Presbyterian,62.0,0.0271,0.6434,0.0444,0.1711,0.0964
1000026255,Female,Other,Not Hispanic or Latino,Single,Patient Refused,999.0,,,,,
1000026257,Female,White or Caucasian,Not Hispanic or Latino,Widowed,Catholic,999.0,0.0347,0.1145,0.2684,0.2873,0.0947
1000026258,Male,White or Caucasian,Not Hispanic or Latino,Married,,83.0,,,,,


In [49]:
# Create field list
column_list = list(Patient_Demographics)
# Print counts per column
print_counts_per_column(Patient_Demographics,column_list)


Column Name: study_id
1000032255    1
1000026663    1
1000028702    1
1000026655    1
1000030753    1
             ..
1000031514    1
1000029469    1
1000031518    1
1000027424    1
1000030208    1
Name: study_id, Length: 5116, dtype: int64

Column Name: gender
Male      2608
Female    2503
Name: gender, dtype: int64

Column Name: race
White or Caucasian                           3248
Other                                         682
Asian                                         492
Black or African American                     480
Patient Refused                               102
Multiple Races                                 58
Unknown                                        30
American Indian or Alaska Native               13
Native Hawaiian or Other Pacific Islander       5
Name: race, dtype: int64

Column Name: ethnicity
Not Hispanic or Latino                  4205
Hispanic or Latino                       568
Patient Refused                          139
Mexican, Mexican American, C

In [41]:
# Clean up demo df before merging

In [64]:
# Add demographic and socio-economic indicators to call df
clean_all_calls_demo = pd.merge(clean_all_calls,Patient_Demographics, how="left", on='study_id')

In [65]:
clean_all_calls_demo.head()

Unnamed: 0,study_id,username_1,timestamp_1,call_date_1,call_output_1,call_notes_1,verbal_consent_1,username_2,timestamp_2,call_date_2,...,race,ethnicity,marital_status,religion,calc_current_age_death_age,svi_socio_econ,svi_hcomp_lang,svi_mino_lang,svi_htyp_trans,svi_total
0,1000003,jsanz,2019-07-12 06:58:00,NaT,,,,,NaT,NaT,...,,,,,,,,,,
1,1000029534,jantoniolopez,2019-09-05 13:13:00,2019-09-05,Completed survey by phone,contact1_nt = 'Patient signed consent and HIPAA forms and will mail ba...,Yes Verbal Consent,,NaT,NaT,...,,,,,,,,,,
2,1000030950,kmsantos,2019-10-08 08:10:00,2019-10-07,Completed survey by phone,contact1_nt = 'Pt called in to complete survey and scored 19/20 on dep...,,,NaT,NaT,...,,,,,,,,,,
3,1000030428,adepaolisdickey,2019-10-23 13:46:00,2019-10-23,Other Notes,contact1_nt = 'States he sent in survey already',,,NaT,NaT,...,,,,,,,,,,
4,1000029439,adepaolisdickey,2019-10-23 13:50:00,2019-10-23,Call back later,contact1_nt = 'States just returned from dialysis and is not feeling w...,,,NaT,NaT,...,,,,,,,,,,


In [66]:
clean_all_calls_demo.dtypes

study_id                              object
username_1                            object
timestamp_1                   datetime64[ns]
call_date_1                   datetime64[ns]
call_output_1                         object
call_notes_1                          object
verbal_consent_1                      object
username_2                            object
timestamp_2                   datetime64[ns]
call_date_2                   datetime64[ns]
call_output_2                         object
call_notes_2                          object
verbal_consent_2                      object
username_3                            object
timestamp_3                   datetime64[ns]
call_date_3                   datetime64[ns]
call_output_3                         object
call_notes_3                          object
verbal_consent_3                      object
gender                                object
race                                  object
ethnicity                             object
marital_st

In [67]:
clean_all_calls_demo.shape[0]

2752