In [1]:
import os

In [2]:
from vanna.remote import VannaDefault

In [3]:
api_key = os.environ.get('VANNA_API_KEY')

In [6]:
vn = VannaDefault(model='omop', api_key=api_key)

In [7]:
# Check the models available in the account
vn._model

'omop'

In [8]:
import pandas as pd

In [9]:
from pathlib import Path

### Get the ddl, documentation and queries for training the model

In [None]:
# Get the ddl files
ddl = Path('../omop-schema/omop_cdm_schema.ddl').read()


In [10]:
omop_ddl_url = 'https://raw.githubusercontent.com/OHDSI/CommonDataModel/main/ddl/5.4/postgresql/OMOPCDM_postgresql_5.4_ddl.sql'

In [11]:
import requests

In [12]:
ddl = requests.get(omop_ddl_url).text

In [15]:
table_info = pd.read_csv('https://raw.githubusercontent.com/OHDSI/CommonDataModel/main/inst/csv/OMOP_CDMv5.4_Table_Level.csv', encoding='latin1')

In [13]:
field_info = pd.read_csv('https://raw.githubusercontent.com/OHDSI/CommonDataModel/main/inst/csv/OMOP_CDMv5.4_Field_Level.csv', encoding='latin1')

In [16]:
table_info

Unnamed: 0,cdmTableName,schema,isRequired,conceptPrefix,measurePersonCompleteness,measurePersonCompletenessThreshold,validation,tableDescription,userGuidance,etlConventions
0,person,CDM,Yes,,No,,,This table serves as the central identity mana...,All records in this table are independent Pers...,All Persons in a database needs one record in ...
1,observation_period,CDM,Yes,,Yes,0.0,,This table contains records which define spans...,"For each Person, one or more OBSERVATION_PERIO...",Each Person needs to have at least one OBSERVA...
2,visit_occurrence,CDM,No,VISIT_,Yes,0.0,,This table contains Events where Persons engag...,The configuration defining the Visit are descr...,Visits can be derived easily if the source dat...
3,visit_detail,CDM,No,VISIT_DETAIL_,Yes,0.0,,The VISIT_DETAIL table is an optional table us...,The configuration defining the Visit Detail is...,It is not mandatory that the VISIT_DETAIL tabl...
4,condition_occurrence,CDM,No,CONDITION_,Yes,0.0,,This table contains records of Events of a Per...,Conditions are defined by Concepts from the Co...,Source codes and source text fields mapped to ...
5,drug_exposure,CDM,No,DRUG_,Yes,0.0,,This table captures records about the exposure...,The purpose of records in this table is to ind...,Information about quantity and dose is provide...
6,procedure_occurrence,CDM,No,PROCEDURE_,Yes,0.0,,This table contains records of activities or p...,"Lab tests are not a procedure, if something is...","When dealing with duplicate records, the ETL m..."
7,device_exposure,CDM,No,DEVICE_,Yes,0.0,,The Device domain captures information about a...,The distinction between Devices or supplies an...,Source codes and source text fields mapped to ...
8,measurement,CDM,No,MEASUREMENT_,Yes,0.0,,The MEASUREMENT table contains records of Meas...,Measurements are predominately lab tests with ...,Only records where the source value maps to a ...
9,observation,CDM,No,OBSERVATION_,Yes,0.0,,The OBSERVATION table captures clinical facts ...,Observations differ from Measurements in that ...,Records whose Source Values map to any domain ...


In [19]:
field_info

Unnamed: 0,cdmTableName,cdmFieldName,isRequired,cdmDatatype,userGuidance,etlConventions,isPrimaryKey,isForeignKey,fkTableName,fkFieldName,fkDomain,fkClass,unique DQ identifiers
0,person,person_id,Yes,integer,It is assumed that every person with a differe...,Any person linkage that needs to occur to uniq...,Yes,No,,,,,
1,person,gender_concept_id,Yes,integer,This field is meant to capture the biological ...,Use the gender or sex value present in the dat...,No,Yes,CONCEPT,CONCEPT_ID,Gender,,
2,person,year_of_birth,Yes,integer,Compute age using year_of_birth.,"For data sources with date of birth, the year ...",No,No,,,,,
3,person,month_of_birth,No,integer,,For data sources that provide the precise date...,No,No,,,,,
4,person,day_of_birth,No,integer,,For data sources that provide the precise date...,No,No,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,cohort_definition,cohort_definition_description,No,varchar(MAX),A complete description of the cohort.,,No,No,,,,,
428,cohort_definition,definition_type_concept_id,Yes,integer,Type defining what kind of Cohort Definition t...,,No,Yes,CONCEPT,CONCEPT_ID,,,
429,cohort_definition,cohort_definition_syntax,No,varchar(MAX),Syntax or code to operationalize the Cohort De...,,No,No,,,,,
430,cohort_definition,subject_concept_id,Yes,integer,This field contains a Concept that represents ...,,No,Yes,CONCEPT,CONCEPT_ID,,,


In [44]:
table_info.head(3)

Unnamed: 0,cdmTableName,schema,isRequired,conceptPrefix,measurePersonCompleteness,measurePersonCompletenessThreshold,validation,tableDescription,userGuidance,etlConventions
0,person,CDM,Yes,,No,,,This table serves as the central identity mana...,All records in this table are independent Pers...,All Persons in a database needs one record in ...
1,observation_period,CDM,Yes,,Yes,0.0,,This table contains records which define spans...,"For each Person, one or more OBSERVATION_PERIO...",Each Person needs to have at least one OBSERVA...
2,visit_occurrence,CDM,No,VISIT_,Yes,0.0,,This table contains Events where Persons engag...,The configuration defining the Visit are descr...,Visits can be derived easily if the source dat...


In [46]:
field_info.head(3)

Unnamed: 0,cdmTableName,cdmFieldName,isRequired,cdmDatatype,userGuidance,etlConventions,isPrimaryKey,isForeignKey,fkTableName,fkFieldName,fkDomain,fkClass,unique DQ identifiers
0,person,person_id,Yes,integer,It is assumed that every person with a differe...,Any person linkage that needs to occur to uniq...,Yes,No,,,,,
1,person,gender_concept_id,Yes,integer,This field is meant to capture the biological ...,Use the gender or sex value present in the dat...,No,Yes,CONCEPT,CONCEPT_ID,Gender,,
2,person,year_of_birth,Yes,integer,Compute age using year_of_birth.,"For data sources with date of birth, the year ...",No,No,,,,,


In [17]:
# Get the documentation files

doc = ''

for i, row in table_info.iterrows():
    table_name = row['cdmTableName']
    schema = row['schema'].lower().replace('vocab', 'vocabularies')
    tableDescription =row['tableDescription']
    userGuidance = row['tableDescription']

    fields = ''
    for i, field_row in field_info[field_info.cdmTableName == table_name].iterrows():
        field_name = field_row['cdmFieldName']
        field_help = field_row['userGuidance']
        field_required = field_row['isRequired']
        field_type = field_row['cdmDatatype']
        field_doc = f'''
        Field name: {field_name} of type {field_type}
        {field_name} field is {'not' if field_required == 'No' else ''} required
        {field_help}
         '''
        fields += field_doc

    table_doc = f'''
    Table {table_name}
    Description for table {table_name}
    {tableDescription} 
    User guidance for table {table_name}
    userGuidance
    {table_name} fields
    {fields}
    =======================================
    '''

    doc += table_doc
    

In [18]:
print(doc)


    Table person
    Description for table person
    This table serves as the central identity management for all Persons in the database. It contains records that uniquely identify each person or patient, and some demographic information. 
    User guidance for table person
    userGuidance
    person fields
    
        Field name: person_id of type integer
        person_id field is  required
        It is assumed that every person with a different unique identifier is in fact a different person and should be treated independently.
         
        Field name: gender_concept_id of type integer
        gender_concept_id field is  required
        This field is meant to capture the biological sex at birth of the Person. This field should not be used to study gender identity issues.
         
        Field name: year_of_birth of type integer
        year_of_birth field is  required
        Compute age using year_of_birth.
         
        Field name: month_of_birth of type integer


In [19]:
# Get the query files

# Specify the directory you want to crawl
directory = Path('../omop-schema/queries')

# Retrieve all .md files in the directory and its subdirectories
md_files = directory.rglob('*.md')

# Convert to a list if you need to use it multiple times
md_files_list = list(md_files)

queries=''
# Print the paths of all .md files
for md_file in md_files_list:
    queries += Path(md_file).read_text()


In [20]:
print(queries)

<!---
Group:person
Name:PE06 Number of patients grouped by year of birth
Author: Alberto Labarga
CDM Version: 5.4
-->

# PE06: Number of patients grouped by year of birth

## Description
Counts the year of birth (year_of_birth) across all person records. All existing values for year of birth are summarized.

## Query
```sql
SELECT
  year_of_birth,
  COUNT(person_id) AS num_persons
FROM cdm.person
GROUP BY year_of_birth
ORDER BY year_of_birth
;
```

## Input

None

## Output

|  Field |  Description |
| --- | --- |
|  year_of_birth |  Year of birth of the patient |
|  num_persons |  Number of patients in the dataset of specific year of birth |

## Example output record

| Field |  Value |
| --- | --- |
|  year_of_birth |  1950 |
|  num_persons |  389019 |

## Documentation
https://ohdsi.github.io/CommonDataModel/cdm54.html#PERSON
<!---
Group:person
Name:PE03 Number of patients grouped by gender
Author: Alberto Labarga
CDM Version: 5.4
-->

# PE03: Number of patients grouped by gender

#

In [None]:
doc

In [23]:
%%time

# Train the model with the ddl
vn.train(ddl=ddl, documentation=doc, sql=queries)

Adding documentation....
CPU times: user 7.12 ms, sys: 1.84 ms, total: 8.96 ms
Wall time: 893 ms


''

In [21]:
vn.connect_to_postgres(
    host="84.88.186.203",
    dbname="hospital_edge",
    user="postgres",
    password="pgrulez",
    port=5432
) # Connect to your database here



In [25]:
vn.ask('How many patients are in the database?')

SELECT COUNT(*) AS total_patients
FROM patients;
