In [1]:
# Requirements
collection_name = "omop_test_synthea"

# Optional
files = ['observation.tsv', 'conditions.tsv']

In [2]:
import subprocess
import os
import pandas
from IPython.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))

# https://support.terra.bio/hc/en-us/articles/360046617372-Accessing-data-from-the-workspace-Bucket-in-a-notebook#h_e2733953-ecfe-4ffe-86f6-bab3eb0cf4fc
BILLING_PROJECT_ID = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE = os.environ['WORKSPACE_NAME']
bucket = os.environ['WORKSPACE_BUCKET']

# Copies the repo into an editable dir.
!rsync -a --exclude='.git' /opt/ftd-omop-validator/ /home/jupyter/ftd-omop-validator/
!PYTHONPATH=home/jupyter/ftd-omop-validator/src

# The custom img creates an env for python3.12 includes pip. Use these versions when running the validation script.
py39 = "/opt/py39_venv/bin/python"
pip39 = "/opt/py39_venv/bin/pip"
base = "/home/jupyter"

# Frequently used filepaths
fov = "/home/jupyter/ftd-omop-validator/src/ftd_omop_validator"
script_loc = "ftd-omop-validator/src/ftd_omop_validator/omop_file_validator.py"
csv_dir = f"/home/jupyter/uploads/{collection_name}"
results = "/home/jupyter/ftd-omop-validator/src/ftd_omop_validator/data/output/results.csv"
results_html = "/home/jupyter/ftd-omop-validator/src/ftd_omop_validator/data/output/results.html"


In [3]:
# Copy all files from the workspace bucket to the notebook disk
!mkdir -p {csv_dir}
!gcloud storage cp --recursive $bucket/* {base}

Copying gs://fc-7b54f13c-b2af-420d-a76d-e63c27660750/notebooks/dbt_test_implementation.ipynb to file:///home/jupyter/notebooks/dbt_test_implementation.ipynb
Copying gs://fc-7b54f13c-b2af-420d-a76d-e63c27660750/notebooks/how_to_312_custom_env.ipynb to file:///home/jupyter/notebooks/how_to_312_custom_env.ipynb
Copying gs://fc-7b54f13c-b2af-420d-a76d-e63c27660750/notebooks/omop_validation.ipynb to file:///home/jupyter/notebooks/omop_validation.ipynb
⠛ Completed files 0 | 0B                                                       Copying gs://fc-7b54f13c-b2af-420d-a76d-e63c27660750/uploads/cleaned_patients.csv to file:///home/jupyter/uploads/cleaned_patients.csv
Copying gs://fc-7b54f13c-b2af-420d-a76d-e63c27660750/uploads/omop_test_synthea/conditions.tsv to file:///home/jupyter/uploads/omop_test_synthea/conditions.tsv
Copying gs://fc-7b54f13c-b2af-420d-a76d-e63c27660750/uploads/omop_test_synthea/observation.tsv to file:///home/jupyter/uploads/omop_test_synthea/observation.tsv
  Completed fil

In [4]:
# Optional: Sanity check - Views the tables listed in the `files` variable
print('Available files')
!ls {csv_dir}


print('View files')
for table in files:
    view = f"{csv_dir}/{table}"
    e = pandas.read_csv(view)
    print(table)
    print(e.head(2))

Available files
conditions.tsv	observation.tsv
View files
observation.tsv
  DATE\tPATIENT\tENCOUNTER\tCATEGORY\tCODE\tDESCRIPTION\tVALUE\tUNITS\tTYPE
0  2024-02-10T14:06:25Z\tb037c279-fd7b-58a1-45c3-...                       
1  2024-02-10T14:06:25Z\tb037c279-fd7b-58a1-45c3-...                       
conditions.tsv
  START\tSTOP\tPATIENT\tENCOUNTER\tSYSTEM\tCODE\tDESCRIPTION
0  2023-11-11\t2024-04-20\t82b58f87-e223-122b-9ba...        
1  2024-02-10\t2024-03-16\tb037c279-fd7b-58a1-45c...        


In [5]:
subprocess.run([py39, script_loc, "-c", csv_dir], cwd=base)

  df = pd.read_csv(f,


/home/jupyter/uploads/omop_test_synthea
Found .tsv file /home/jupyter/uploads/omop_test_synthea/conditions.tsv
"conditions" is not a valid OMOP table
Finished processing conditions.tsv

Printing to /home/jupyter/ftd-omop-validator/src/ftd_omop_validator/data/output/results.csv
/home/jupyter/uploads/omop_test_synthea
Found .tsv file /home/jupyter/uploads/omop_test_synthea/observation.tsv
Parsing CSV file for OMOP table "observation"
CSV file for "observation" parsed successfully. Please check for errors in the results files.
Finished processing observation.tsv

Printing to /home/jupyter/ftd-omop-validator/src/ftd_omop_validator/data/output/results.csv


CompletedProcess(args=['/opt/py39_venv/bin/python', 'ftd-omop-validator/src/ftd_omop_validator/omop_file_validator.py', '-c', '/home/jupyter/uploads/omop_test_synthea'], returncode=0)

In [6]:
pandas.read_csv(results)

Unnamed: 0,File Name,Table Name,Message,Column Name,Actual,Expected
0,conditions.tsv,Conditions,"""conditions"" is not a valid OMOP table",,,
1,observation.tsv,Observation,Please add/fix incorrect headers at the top of...,,['DATE\tPATIENT\tENCOUNTER\tCATEGORY\tCODE\tDE...,"['observation_id', 'person_id', 'observation_c..."
2,observation.tsv,Observation,Incorrect number of columns on line 2: ['2024-...,,,
3,observation.tsv,Observation,Column not in table definition,DATE\tPATIENT\tENCOUNTER\tCATEGORY\tCODE\tDESC...,DATE\tPATIENT\tENCOUNTER\tCATEGORY\tCODE\tDESC...,
4,observation.tsv,Observation,Column missing in file,observation_id,,observation_id
5,observation.tsv,Observation,Column missing in file,person_id,,person_id
6,observation.tsv,Observation,Column missing in file,observation_concept_id,,observation_concept_id
7,observation.tsv,Observation,Column missing in file,observation_date,,observation_date
8,observation.tsv,Observation,Column missing in file,observation_datetime,,observation_datetime
9,observation.tsv,Observation,Column missing in file,observation_type_concept_id,,observation_type_concept_id


In [7]:
print(f'To see the html version, click on the jupyter logo at the top of this notebook and navigate here:\n"{results_html}"')

To see the html version, click on the jupyter logo at the top of this notebook and navigate here:
"/home/jupyter/ftd-omop-validator/src/ftd_omop_validator/data/output/results.html"
