## NOTE: This notebook was adapted from `explore_clinicaldata.ipynb` in the ai-readi-notebooks repository

# What is a "concept ID"?

In the AIREADI dataset, OMOP concept IDs are assigned to various variables such as HbA1c, visual acuity, and answers to questionnaires for standardization purposes. For more details, see the OMOP Clinical Data Structure documentation: https://docs.aireadi.org/docs/1/dataset/clinical-data/OMOP-Clinical-Data-Structure/. Finding concept IDs is essential when searching for specific values.

To explore which values are included, you can refer to the OMOP Mapping Table for clinical data (https://docs.aireadi.org/v1-omopTable) or the Data Domain Table for clinical lab tests (https://docs.aireadi.org/v1-dataDomainTable).

# How to find concept IDs?

To identify concept IDs, you can use files such as condition_occurrence.csv, measurement.csv, and observation.csv within the clinical_data section of the dataset.

The following example code shows how to find concept IDs.

In [60]:
import os
import stat

from tqdm import tqdm

import time
import pandas as pd

In [61]:
# custom path -- change to match your file structure
# The location can also be supplied through the AI_READI_DATA_ROOT environment
# variable, so the notebook runs on another machine without editing this cell.
_DEFAULT_DATA_ROOT = "C:\\Users\\preet\\Documents\\AI_READI\\"
data_root = os.environ.get("AI_READI_DATA_ROOT", _DEFAULT_DATA_ROOT)

# Later cells append file names directly to data_root, so make sure it ends
# with a path separator even when the override was given without one.
if not data_root.endswith(("/", "\\")):
    data_root += os.sep

In [62]:
# Load the participant manifest (TSV) and the OMOP measurement table (CSV).
# Both paths are built with os.path.join so loading does not depend on
# data_root ending in a path separator.
participants_df = pd.read_csv(os.path.join(data_root, 'participants.tsv'), sep='\t')
measurement_df = pd.read_csv(os.path.join(data_root, "clinical_data", "measurement.csv"))



In [63]:
# Report the dimensions and the column names of each loaded table.
for df in (participants_df, measurement_df):
    print(df.shape, df.columns, sep="\n")

(1067, 15)
Index(['participant_id', 'clinical_site', 'study_group', 'age',
       'study_visit_date', 'recommended_split', 'cardiac_ecg', 'clinical_data',
       'environment', 'retinal_flio', 'retinal_oct', 'retinal_octa',
       'retinal_photography', 'wearable_activity_monitor',
       'wearable_blood_glucose'],
      dtype='object')
(114807, 25)
Index(['measurement_id', 'person_id', 'measurement_concept_id',
       'measurement_date', 'measurement_datetime', 'measurement_time',
       'measurement_type_concept_id', 'operator_concept_id', 'value_as_number',
       'value_as_concept_id', 'unit_concept_id', 'range_low', 'range_high',
       'provider_id', 'visit_occurrence_id', 'visit_detail_id',
       'measurement_source_value', 'measurement_source_concept_id',
       'unit_source_value', 'unit_source_concept_id', 'value_source_value',
       'measurement_event_id', 'meas_event_field_concept_id',
       'qualifier_concept_id', 'qualifier_source_value'],
      dtype='object')


In [64]:
# Preview the first rows of the participants table.
participants_df.head()

Unnamed: 0,participant_id,clinical_site,study_group,age,study_visit_date,recommended_split,cardiac_ecg,clinical_data,environment,retinal_flio,retinal_oct,retinal_octa,retinal_photography,wearable_activity_monitor,wearable_blood_glucose
0,1001,UW,pre_diabetes_lifestyle_controlled,69,2023-07-27,train,True,True,True,True,True,True,True,False,True
1,1002,UW,healthy,69,2023-08-01,train,True,True,True,True,True,True,True,False,True
2,1003,UW,oral_medication_and_or_non_insulin_injectable_...,82,2023-08-02,train,True,True,True,True,True,True,True,False,True
3,1004,UW,oral_medication_and_or_non_insulin_injectable_...,61,2023-08-08,val,True,True,True,True,True,True,True,False,True
4,1005,UW,insulin_dependent,58,2023-08-08,val,True,True,True,True,True,True,True,False,True


In [65]:
# Preview the first rows of the measurement table.
measurement_df.head()

Unnamed: 0,measurement_id,person_id,measurement_concept_id,measurement_date,measurement_datetime,measurement_time,measurement_type_concept_id,operator_concept_id,value_as_number,value_as_concept_id,...,visit_detail_id,measurement_source_value,measurement_source_concept_id,unit_source_value,unit_source_concept_id,value_source_value,measurement_event_id,meas_event_field_concept_id,qualifier_concept_id,qualifier_source_value
0,20452,7289,2005200055,2024-05-14,2024-05-14 00:00:00,,32862,0,40.0,0,...,0,"viaosmsf, Snellen fraction - Mesopic VA - OS",0,,0,,0,0,0.0,
1,111043,7043,3017250,2024-02-29,2024-02-29 00:00:00,00:00:00,32856,4172703,28.3,0,...,0,Urine Creatinine (mg/dL),0,mg/DL,0,28.3,0,0,,
2,42372,1102,2005200182,2024-01-09,2024-01-09 00:00:00,,32862,0,10.58,0,...,0,"lbscat_wbc, White Blood Cells (WBC) - x10E3/µL",0,,0,,0,0,0.0,
3,32175,4243,2005200056,2024-06-26,2024-06-26 00:00:00,,32862,0,45.0,0,...,0,"viaodmlog, LLVA Letter Score - Mesopic VA - OD",0,,0,,0,0,45876703.0,Right eye
4,54995,1311,2005200012,2024-06-04,2024-06-04 00:00:00,,32862,0,25.0,0,...,0,"viaodpsf, Snellen fraction - Photopic VA - OD",0,,0,,0,0,0.0,


In [66]:
# Find concept IDs in measurement.csv
# Keep the first row seen for each distinct source value; its
# measurement_concept_id is the ID reported for that source value.
# Rows with a missing source value are skipped so that sorting cannot fail on
# a mix of strings and NaN.
first_concept_by_source = (
    measurement_df
    .dropna(subset=['measurement_source_value'])
    .drop_duplicates(subset='measurement_source_value', keep='first')
    .set_index('measurement_source_value')['measurement_concept_id']
)

# Print in dict-literal form so the lines can be pasted into a mapping.
measurement_sorted_list = sorted(first_concept_by_source.index)
for value in measurement_sorted_list:
    concept_id = first_concept_by_source.loc[value]
    print(f"'{value}': {concept_id},")

'A/G Ratio': 4288601,
'ALT (IU/L)': 3006923,
'AST (IU/L)': 3013721,
'Albumin (g/dL)': 3024561,
'Alkaline Phosphatase (IU/L)': 3035995,
'BUN (mg/dL)': 3013682,
'BUN/Creatinine ratio': 4112223,
'Bilirubin, Total (mg/dL)': 3024128,
'C-Peptide (ng/mL)': 3010084,
'CRP - HS (mg/L)': 3010156,
'Calcium (mg/dL)': 3006906,
'Carbon Dioxide, Total (mEq/L)': 3015632,
'Chloride (mEq/L)': 3014576,
'Creatinine (mg/dL)': 3016723,
'Globulin, Total (g/dL)': 3021886,
'Glucose (mg/dL)': 3004501,
'HDL Cholesterol (mg/dL)': 3007070,
'HbA1c (%)': 3004410,
'INSULIN (ng/mL)': 3016244,
'LDL Cholesterol Calculation (mg/dL)': 3028288,
'NT-proBNP (pg/mL)': 3029187,
'Potassium (mEq/L)': 3023103,
'Protein, Total (g/dL)': 3020630,
'Sodium (mEq/L)': 3019550,
'Total Cholesterol (mg/dL)': 3027114,
'Triglycerides (mg/dL)': 3022192,
'Troponin-T (ng/L)': 40769783,
'Urine Albumin (mg/dL)': 3001802,
'Urine Creatinine (mg/dL)': 3017250,
'bmi_vsorres, BMI': 4245997,
'bp1_diabp_vsorres, Diastolic (mmHg)': 3012888,
'bp1_sysbp_vso

# How to make CSV files for analysis

### Example: making a CSV file from variables listed in measurement.csv

In [67]:
# Measurements to extract: output column label -> measurement_concept_id.
concept_ids = {'HbA1c (%)': 3004410}

def parse_measurement(df, concept_id):
    """Select the rows of a measurement table that carry one concept ID.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``measurement_concept_id`` column.
    concept_id
        Value to match against ``measurement_concept_id``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels and columns.
    """
    is_requested_concept = df['measurement_concept_id'].eq(concept_id)
    return df.loc[is_requested_concept]


# Start the analysis table from the participant demographics, keeping only
# the columns needed downstream (in this display order).
demographic_columns = ['participant_id', 'age', 'study_group', 'recommended_split', 'clinical_site']
final_df = participants_df[demographic_columns].copy()

# Attach one value column per requested concept to the demographics table.
# NOTE(review): how='right' keeps every measurement row and only those
# participants that have one; with more than one entry in concept_ids each
# pass restricts the table to the latest concept's rows -- confirm this is
# intended before adding more concepts.
for column_label, concept_id in concept_ids.items():
    concept_values = (
        parse_measurement(measurement_df, concept_id)
        .rename(columns={'value_as_number': column_label})
        .rename(columns={'person_id': 'participant_id'})
        [['participant_id', column_label]]
    )
    final_df = final_df.merge(concept_values, on='participant_id', how='right')

# A participant may have several measurements; keep the last row for each.
final_df = final_df.drop_duplicates(subset='participant_id', keep='last')
print(final_df.shape)
final_df.head()

(1027, 6)


Unnamed: 0,participant_id,age,study_group,recommended_split,clinical_site,HbA1c (%)
0,4188,82,oral_medication_and_or_non_insulin_injectable_...,train,UCSD,5.7
1,1251,46,healthy,train,UW,5.5
2,1252,53,insulin_dependent,val,UW,5.9
3,7183,73,healthy,train,UAB,5.8
4,7047,71,pre_diabetes_lifestyle_controlled,val,UAB,5.6


In [68]:
# Discard rows with any missing value, then rows whose HbA1c is recorded as 0.
final_df.dropna(inplace=True)
zero_hba1c_labels = final_df.index[(final_df['HbA1c (%)'] == 0).to_numpy()]
final_df.drop(index=zero_hba1c_labels, inplace=True)


In [69]:
# Show the (rows, columns) size of the table after cleaning.
final_df.shape

(1016, 6)

In [70]:
# Write the cleaned HbA1c table to a CSV file in the current working directory.
output_csv = "HbA1c_all_patients.csv"
final_df.to_csv(output_csv)