In [2]:
import pandas as pd
# Notebook-wide display settings, applied in a single call:
# show up to 500 columns and 500 rows, up to 400 characters per cell,
# and left-justify the column headers.
pd.set_option(
    'display.max_columns', 500,
    'display.max_rows', 500,
    'display.max_colwidth', 400,
    'colheader_justify', 'left',
)

In [3]:
# Data dictionary: one row per documented variable.
dd = pd.read_csv('../utils/clinical_data_dict.csv')

# The three core data sets (clinical, blood, imaging), all read as latin1-encoded CSV.
clinical_df, blood_df, imaging_df = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in (
        '../../data/core_data_set_20200211_adalab_clinical.csv',
        '../../data/core_data_set_20200211_adalab_blood.csv',
        '../../data/core_data_set_20200211_adalab_imaging.csv',
    )
)

In [13]:
# Column names of the clinical and blood frames combined.
# Imaging columns are not included: only 3 of its 1091 columns are explained
# in the data dictionary.
all_cols = {*clinical_df.columns, *blood_df.columns}
dd_cols = {*dd["Progenitor-Variable"].values}

# All variables from clinical and blood data that have no entry in the data dictionary.
all_cols - dd_cols

{'LDL_HDL_ratio',
 'Status_3Monate',
 'T1Creatinine_mg_dl',
 'T1Glucose_mg_dl',
 'T1Glucose_mmolL',
 'T1Glucose_mmolL_Reanalysis',
 'T1HDL_mg_dl',
 'T1HDL_mmolL',
 'T1HDL_mmolL_Reanalysis',
 'T1HbA1c_Percentage',
 'T1LDL_mg_dl',
 'T1MCH',
 'T1MCHC',
 'T1TotalCholest_mg_dl',
 'T1Triglyc_mg_dl',
 'T1Triglyc_mmolL_Reanalysis',
 'T1Triglycerides_mmolL',
 'T1UricAcid_mg_dl',
 'T1_IL18_pgml_Boraschi',
 'T1_MDA',
 'T1_NTproBNP',
 'T1_S100A12_plasma',
 'T1_Troponin',
 'T1_Volk_IL8_pgml',
 'T1nonHDL_mg_dl',
 'anesthComb',
 'anesthReg',
 'diabetes_any',
 'localisation_ThrAbdPlv',
 'localisation_intracranial',
 'op0270_cat_v2',
 'preopAnaemia',
 'subject'}

## Custom Accessor to display the Data Dictionary on Demand

In [3]:
@pd.api.extensions.register_dataframe_accessor("ada")
class AdaAccessor:
    """DataFrame accessor (``df.ada``) that shows data-dictionary entries on demand.

    The data dictionary is read from ``DD_PATH`` and indexed by its
    "Progenitor-Variable" column. The accessor only relies on its own copy of
    the dictionary, so it does not depend on any notebook-level variable.

    Attributes:
        dd: the data dictionary, indexed by "Progenitor-Variable".
        _missings: list of the frame's columns that had no dictionary entry
            when the accessor was created.
    """

    # Location of the data dictionary CSV; kept as a class attribute so it can
    # be overridden (e.g. ``AdaAccessor.DD_PATH = ...``) without editing the class.
    DD_PATH = '../utils/clinical_data_dict.csv'

    def __init__(self, df):
        """Load the data dictionary and record which columns of ``df`` it lacks.

        Args:
            df: the DataFrame this accessor is attached to.
        """
        self._obj = df
        self.dd = pd.read_csv(self.DD_PATH).set_index("Progenitor-Variable")
        # Compare against the accessor's own dictionary (self.dd), not a global
        # `dd` variable: the global only exists if another cell has been run.
        documented = df.columns.isin(self.dd.index)
        self._missings = df.columns[~documented].to_list()

    @property
    def description(self):
        """Return the data-dictionary rows for every column that has an entry.

        Rows follow the column order of the DataFrame. Columns without a
        dictionary entry are left out. The lookup is done against the current
        columns, so columns added after the accessor was created do not raise
        a ``KeyError``.
        """
        columns = self._obj.columns
        documented = columns[columns.isin(self.dd.index)].to_list()
        return self.dd.loc[documented, :]


In [9]:
# Data-dictionary rows for the blood_df columns that have a dictionary entry.
blood_df.ada.description

Unnamed: 0_level_0,Variable,Name,Units/ Categorization,Description,Comment
Progenitor-Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
T1_Proinsulinintakt,T1_Proinsulinintakt,"Proinsulin, intact at baseline",pmol/L,Immundiagnostik,
T1_oxLDL,T1_oxLDL,oxidized LDL at baseline,ng/ml,Immundiagnostik,
T1_Nitrotyrosin,T1_Nitrotyrosin,Nitrotyrosin at baseline,nM,Immundiagnostik,
T1_Calprotectinn,T1_Calprotectin,Calprotectin at baseline,ng/mL,Immundiagnostik,
T1_Zonulin_N1200,T1_Zonulin_N1200,Zonulin at baseline,ng/mL,Immundiagnostik,
T1_hArginin,T1_hArginin,Homoarginine at baseline,µmol/l,Immundiagnostik,
T1_ADMA,T1_ADMA,Asymmetric dimethylarginine at baseline,µmol/l,Immundiagnostik,
T1_SDMA,T1_SDMA,Symmetric dimethylarginine at baseline,µmol/l],Immundiagnostik,
T1_KNYAcid,T1_KNYAcid,Kynurenine-Acid at baseline,nM,Immundiagnostik,
T1_NTproBNP_MissingRepl,T1_NTproBNP_MissingRepl,N-terminal pro-brain natriuretic peptide at baseline,pmol/l,Immundiagnostik,"""<3"" = missing replaced by 2.9"


In [10]:
# Data-dictionary rows for the imaging_df columns that have a dictionary entry.
imaging_df.ada.description

Unnamed: 0_level_0,Variable,Name,Units/ Categorization,Description,Comment
Progenitor-Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BrainVol_cm3_pre,BrainVol_cm3_pre,Brainvolume preoperative,cm³,Sum of voxels in SPM12 grey and white matter maps (whole brain tissue),Method described1
BFCS_Vol_cm3_pre,BFCS_Vol_mm3_pre,Basal forebrain cholinergic system volume preoperative,mm³,"Sum of voxels in Zaborszkys map of the basal forebrain (CH12, CH3, CH4, CH4p)","Method described1,2"
NBM_Vol_cm3_pre,NBM_Vol_mm3_pre,Nucleus basalis Meynert volume preoperative,mm³,"Sum of voxels in Zaborszkys regions most likely corresponding to the Nucleus basalis of Meynert (magnocellularis)(CH4, CH4p)","Method described1,2"


In [11]:
# Keep the data-dictionary rows for the clinical_df columns in `meta`, then display them.
meta = clinical_df.ada.description
meta

Unnamed: 0_level_0,Variable,Name,Units/ Categorization,Description,Comment
Progenitor-Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dm0030_v1,dm0030_v1,Age,Years,age of the patient at the timepoint of study inclusion,
ie0072_v1,ie0072_v1,MMSE score,Points,MMSE score at Inclusion,
op0270_v2,op0270_v2,Duration of anaesthesia (min),minutes,Time between start and end of anesthesia,
LOSdays,LOSdays,Days in hospital,days,Time between hospital discharge and hospital admission,
localisation,localisation,Site of surgery,"- 1 = intracranial - 2 = intrathoracic, -abdominal or pelvic - 3 = peripheral",Site of Surgery according to Operationen- und Prozedurenschlüssel (OPS-Code),
icd0300_v1,icd0300_v1,ASA physical status,- 1 = ASA I - 2 = ASA II - 3 = ASA III - 4 = ASA IV,Physical Status according to American Society of Anestesiologists Physical Status (ASA PS),
complication,complication,"Occurrence of postoperative complications (no, any, inhouse death)",- 1 = no complication - 2 = complication excluding death (any deviation from normal postoperative course) - 3 = death,Postoperative Complications,"Timeframe: from recovery room to discharge or inhouse death Data source: patient files, medical records (paper and electronic)"
icd0031_v1,icd0031_v1,Arterial hypertension,- 0 = no - 1 = yes,Arterial hypertension as known comorbidity at the timepoint of study inclusion,From medical records and patient’s interview
icd0041_v1,icd0041_v1,Coronary artery disease,- 0 = no - 1 = yes,Coronary artery disease as known comorbidity at the timepoint of study inclusion,From medical records and information from the patient
icd0121_v1,icd0121_v1,non-insulin dependent diabetes mellitus,- 0 = no - 1 = yes,"Diabetes mellitus (with dietary treatment or treatment with oral diabetes agents only, NIDDM) as known comorbidity at the timepoint of study inclusion",From medical records and information from the patient
