# 03. Clinical Preprocessing

<p>This notebook imports and organizes clinical data for the patient samples that have been evaluated for HIV-associated neurocognitive disorders and assigned a Global Deficit Score (GDS). The data come from two files: a csv data dump from REDCap and a file containing the results of the neurocognitive assessments. All of the patients in this study belong to the DrexelMed CARES Cohort.</p>
<p><b>Input:</b></p>
<ul>
<li>REDCAP data csv file
<li>Drexel CARES neurocognitive impairment file
</ul>
<p><b>Output:</b></p>
<ul>
<li>csv file containing all clinical information
</ul>

# Import Requirements

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Make the helper modules kept in ../code importable from this notebook.
# The path is appended only once so re-running the cell does not keep growing
# sys.path.
module_directory = '../code'
modules = sys.path
if modules.count(module_directory) == 0:
    sys.path.append(module_directory)

import clinical_preprocess as cp

# Import Data

In [3]:
# Both raw inputs are read from the same folder.
raw_clinical_dir = '../data/raw_clinical'

# REDCap export (csv).
redcap_path = raw_clinical_dir + '/DrexelMedCARESCohort_DATA_LABELS_2015-10-06_1434.csv'
# Neurocognitive assessment results, including the GDS (xlsx).
gds_path = raw_clinical_dir + '/Drexel HIV NP Data select standardized 072815 - fixed.xlsx'

# Data clean-up

In [4]:
# Mapping from the REDCap export's column headers to the short names used in
# the rest of the analysis. Core per-visit fields come first.
cols = {
    'PatientID': 'Patient',
    'Patient visit number': 'Visit',
    'Date of visit': 'DateOfVisit',
    'Initial CD4 count (cells/uL)': 'iCD4',
    'Nadir CD4 count (cells/uL)': 'nCD4',
    'Latest CD4 count (cells/uL)': 'CD4',
    'Initial CD8 count (cells/uL)': 'iCD8',
    'Nadir CD8 count (cells/uL)': 'nCD8',
    'Latest CD8 count (cells/uL)': 'CD8',
    'Initial viral load (copies/mL)': 'iVL',
    'Peak viral load (copies/mL)': 'pVL',
    'Latest viral load': 'VL',
    'Total Modified Hopkins Dementia Score': 'TMHDS',
    'Gender': 'Gender',
    'Age': 'Age',
    'Years seropositive': 'Years_seropositive',
}

# Checkbox-style fields appear as one column per option, named
# '<Field> (choice=<option>)'. Each pair is (option text, short column name).
race_choices = [
    ('Asian', 'Race_Asian'),
    ('American Indian/Alaska Native', 'Race_Native_American'),
    ('Black or African American', 'Race_Black'),
    ('Native Hawaiian or other Pacific Islander', 'Race_Native_Hawaiian'),
    ('White', 'Race_White'),
    ('More than one race', 'Race_Multiple'),
    ('Unknown', 'Race_Unknown'),
]
exposure_choices = [
    ('Men who have sex with men (MSM)', 'Exposure_MSM'),
    ('Injection drug use (IDU)', 'Exposure_IDU'),
    ('Blood transfusion', 'Exposure_blood_transfusion'),
    ('Heterosexual', 'Exposure_heterosexual'),
    ('Hemophilia', 'Exposure_hemophilia'),
    ('Perinatal', 'Exposure_perinatal'),
    ('MSM and IDU', 'Exposure_MSM_and_IDU'),
    ('Heterosexual and IDU', 'Exposure_heterosexual_and_IDU'),
    ('Other', 'Exposure_other'),
    ('ND', 'Exposure_unknown'),
]

cols.update(('Race (choice={})'.format(choice), short_name)
            for choice, short_name in race_choices)
cols.update(('Exposure Category (choice={})'.format(choice), short_name)
            for choice, short_name in exposure_choices)
cols['Current ART status'] = 'ART'

# Column order of the cleaned clinical table: identifiers and demographics,
# then lab values, then the race and exposure indicator columns, then the
# remaining scores.
col_order = (
    ['Patient', 'Visit', 'DateOfVisit', 'Age', 'Gender', 'ART',
     'VL', 'iVL', 'pVL', 'CD4', 'iCD4', 'nCD4', 'CD8', 'iCD8', 'nCD8']
    + [short_name for _, short_name in race_choices]
    + [short_name for _, short_name in exposure_choices]
    + ['Years_seropositive', 'TMHDS']
)



In [5]:
# Load the REDCap export. Patient IDs are cast to str so they compare equal to
# the IDs in the neurocognitive table during matching and merging.
redcap_df = cp.import_redcap_data(redcap_path)
redcap_df['PatientID'] = redcap_df['PatientID'].astype(str)
print('REDCAP:', redcap_df.shape)

# Load the neurocognitive results. The previous call passed `headline=1`, which
# is not a read_excel parameter and makes current pandas raise a TypeError.
# The default header (first row) is what the rest of the notebook relies on:
# the 'PatientID', 'VisitDate' and 'GDS' columns are looked up by name below.
GDS_df = pd.read_excel(gds_path)
GDS_df['PatientID'] = GDS_df['PatientID'].astype(str)
print('GDS:', GDS_df.shape)

# Keep only the REDCap fields listed in `cols` (renamed to their short names)
# and arrange them in the order given by `col_order`.
clin_df = cp.select_clinical_parameters(redcap_df, cols)
clin_df = clin_df[col_order]
print('CLIN:', clin_df.shape)
display(clin_df.head())

REDCAP: (3055, 433)
GDS: (197, 20)
CLIN: (3055, 34)


Unnamed: 0,Patient,Visit,DateOfVisit,Age,Gender,ART,VL,iVL,pVL,CD4,...,Exposure_blood_transfusion,Exposure_heterosexual,Exposure_hemophilia,Exposure_perinatal,Exposure_MSM_and_IDU,Exposure_heterosexual_and_IDU,Exposure_other,Exposure_unknown,Years_seropositive,TMHDS
0,A0001,R00,2006-09-12,51.0,Male,on,1515,3892,65000,384,...,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Checked,11,
1,A0001,R01,2007-08-15,52.0,Male,on,80,3892,65000,724,...,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,12,
2,A0001,R02,2008-06-04,53.0,Male,on,80,3892,65000,573,...,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,13,1.0
3,A0001,R03,2008-11-11,53.0,Male,on,<48,3892,65000,858,...,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,13,5.0
4,A0001,R04,2009-11-10,54.0,Male,on,<48,3892,65000,689,...,Unchecked,Checked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,14,8.5


In [6]:
# A neurocognitive assessment is linked to a clinical visit when the two dates
# are fewer than this many days apart.
MAX_DAYS_APART = 7


def find_matching_visit(patient_visits, neuro_date, max_days=MAX_DAYS_APART):
    """Return the label of the clinical visit closest to ``neuro_date``.

    Parameters
    ----------
    patient_visits : DataFrame
        One patient's clinical visits; must contain 'Visit' and 'DateOfVisit'.
    neuro_date : datetime-like
        Date of the neurocognitive assessment.
    max_days : int
        A visit only qualifies when it is fewer than ``max_days`` days away.

    Returns
    -------
    The 'Visit' value of the nearest qualifying visit (the first one on a tie,
    so a same-day visit always wins), or None when no visit qualifies.
    """
    best_visit, best_gap = None, None
    # Rows with a missing label or date cannot be matched, so they are skipped.
    for _, visit_row in patient_visits.dropna().iterrows():
        gap = abs((visit_row.DateOfVisit - neuro_date).days)
        if gap < max_days and (best_gap is None or gap < best_gap):
            best_visit, best_gap = visit_row.Visit, gap
    return best_visit


# Exactly one entry is appended per assessment so that `visits` always lines up
# with the rows of GDS_df. Appending zero entries (no visit in the window) or
# several (more than one visit in the window) would either break the column
# assignment below or silently shift visits onto the wrong patients.
visits = []
unmatched = []
for _, neuro_row in GDS_df.iterrows():
    same_patient = clin_df.Patient == neuro_row['PatientID']
    patient_visits = clin_df.loc[same_patient, ['Patient', 'Visit', 'DateOfVisit']]
    visit = find_matching_visit(patient_visits, neuro_row['VisitDate'])
    if visit is None:
        unmatched.append((neuro_row['PatientID'], neuro_row['VisitDate']))
        visit = np.nan
    visits.append(visit)

# Make assessments that could not be linked visible instead of dropping them.
if unmatched:
    print('No clinical visit within', MAX_DAYS_APART, 'days for:', unmatched)

assert len(visits) == len(GDS_df)
GDS_df['Visit'] = visits
print(GDS_df.shape)
GDS_df[['PatientID', 'Visit', 'VisitDate', 'GDS']].head()

(197, 20)


Unnamed: 0,PatientID,Visit,VisitDate,GDS
0,A0001,R09,2014-11-10,0.583333
1,A0002,R11,2013-10-22,2.5
2,A0004,R10,2014-11-10,0.272727
3,A0005,R05,2013-12-12,0.75
4,A0008,R07,2014-08-05,0.5


# Merge Data

In [7]:
# Attach the neurocognitive results to the matching clinical visit.
# The GDS table names its patient key 'PatientID'. Renaming it to 'Patient'
# lets both tables share one key column, so a row that exists only in the GDS
# table keeps its patient identifier after the outer merge. Merging with
# left_on/right_on and then dropping 'PatientID' would leave such rows with a
# missing 'Patient'.
fullclinical_df = pd.merge(clin_df,
                           GDS_df.rename(columns={'PatientID': 'Patient'}),
                           on=['Patient', 'Visit'],
                           how='outer')

print(fullclinical_df.shape)
display(fullclinical_df.head())

(3055, 52)


Unnamed: 0,Patient,Visit,DateOfVisit,Age,Gender,ART,VL,iVL,pVL,CD4,...,TrailB_Heaton_T,LetterFluencyFL_SENAS_T,CategoryFluency_SENAS_T,ROCF_CNNS_T,WorkingMemory_SENAS_T,WordListLearning_SENAS_T,BVMTimmed_CNNS_T,BVMTdelay_CNNS_T,BVMTrecog_CNNS_T,GDS
0,A0001,R00,2006-09-12,51.0,Male,on,1515,3892,65000,384,...,,,,,,,,,,
1,A0001,R01,2007-08-15,52.0,Male,on,80,3892,65000,724,...,,,,,,,,,,
2,A0001,R02,2008-06-04,53.0,Male,on,80,3892,65000,573,...,,,,,,,,,,
3,A0001,R03,2008-11-11,53.0,Male,on,<48,3892,65000,858,...,,,,,,,,,,
4,A0001,R04,2009-11-10,54.0,Male,on,<48,3892,65000,689,...,,,,,,,,,,


# Fix Errors

In [8]:
# Lab-value columns that contain free-text entries (e.g. '<48') and therefore
# need to be converted to plain numbers.
incorrect_columns = ['iCD4','nCD4','CD4','iCD8','nCD8','CD8','iVL','pVL','VL']


def clean_lab_value(item):
    """Convert one raw lab entry to a float, or NaN when it cannot be parsed.

    Numbers are returned as floats. For strings, the characters '<', '>', ','
    and '?' are removed and the first whitespace-separated token is parsed, so
    a censored entry such as '<48' becomes 48.0. Empty strings, strings whose
    first token is not numeric, and values of any other type give NaN, so that
    every input yields exactly one output value.
    """
    if isinstance(item, (int, float, np.integer, np.floating)):
        return float(item)
    if not isinstance(item, str):
        # Unexpected type: report it, but still return a placeholder so the
        # cleaned column keeps the same length as the original one.
        print(item, type(item))
        return np.nan
    for symbol in '<>,?':
        item = item.replace(symbol, '')
    tokens = item.split()
    if not tokens:
        # Nothing left after stripping the symbols (e.g. '' or '?').
        return np.nan
    try:
        return float(tokens[0])
    except ValueError:
        return np.nan


for colname in incorrect_columns:
    fullclinical_df[colname] = fullclinical_df[colname].map(clean_lab_value)

print('Full Clinical:', fullclinical_df.shape)
fullclinical_df.head()

Full Clinical: (3055, 52)


Unnamed: 0,Patient,Visit,DateOfVisit,Age,Gender,ART,VL,iVL,pVL,CD4,...,TrailB_Heaton_T,LetterFluencyFL_SENAS_T,CategoryFluency_SENAS_T,ROCF_CNNS_T,WorkingMemory_SENAS_T,WordListLearning_SENAS_T,BVMTimmed_CNNS_T,BVMTdelay_CNNS_T,BVMTrecog_CNNS_T,GDS
0,A0001,R00,2006-09-12,51.0,Male,on,1515.0,3892.0,65000.0,384.0,...,,,,,,,,,,
1,A0001,R01,2007-08-15,52.0,Male,on,80.0,3892.0,65000.0,724.0,...,,,,,,,,,,
2,A0001,R02,2008-06-04,53.0,Male,on,80.0,3892.0,65000.0,573.0,...,,,,,,,,,,
3,A0001,R03,2008-11-11,53.0,Male,on,48.0,3892.0,65000.0,858.0,...,,,,,,,,,,
4,A0001,R04,2009-11-10,54.0,Male,on,48.0,3892.0,65000.0,689.0,...,,,,,,,,,,


# Save clinical information in file

In [9]:
# Write the merged and cleaned clinical table to disk, without the row index.
output_path = '../data/processed_clinical/full_clinical.csv'
fullclinical_df.to_csv(output_path, index=False)