In [204]:
import synapseclient
import synapseutils
from synapseclient import Project, File, Folder
from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns
import pandas as pd
import itertools

In [205]:
# Create a Synapse client and authenticate it for the rest of the notebook.
# login() is called without arguments, so no credentials are written into this file.
# NOTE(review): presumably this relies on locally cached/configured credentials — confirm.
syn = synapseclient.Synapse()
syn.login()



Welcome, Victor Baham!



In [206]:
# Remove pandas' display truncation for both axes so frames are rendered in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [214]:
# Load the AD data model straight from the "main" branch of the GitHub repository.
AD_MODEL_CSV_URL = 'https://raw.githubusercontent.com/adknowledgeportal/data-models/main/AD.model.csv'
ad_model_df = pd.read_csv(AD_MODEL_CSV_URL)

In [378]:
# Display the loaded data model for visual inspection.
# NOTE(review): 'display.max_rows' is set to None earlier in this notebook, so this
# renders every row of the frame; consider ad_model_df.head() if a preview is enough.
ad_model_df

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,columnType,module
0,days,"Age measured in days, a period of 24 hours.",,,,,ageAssessmentUnits,,http://purl.obolibrary.org/obo/NCIT_C25301,,string,experimentalData
1,gestational weeks,Gestational age (written with both weeks and d...,,,,,ageAssessmentUnits,,http://purl.obolibrary.org/obo/NCIT_C81253,,string,experimentalData
2,months,Age measured in calendar months (approximately...,,,,,ageAssessmentUnits,,http://purl.obolibrary.org/obo/NCIT_C29846,,string,experimentalData
3,PCW,Post-Conception Weeks,,,,,ageAssessmentUnits,,https://embryology.med.unsw.edu.au/embryology/...,,string,experimentalData
4,weeks,"Age measured in weeks, a period of 7 consecuti...",,,,,ageAssessmentUnits,,http://purl.obolibrary.org/obo/NCIT_C29844,,string,experimentalData
5,years,Age measured in years (approximately 365 days).,,,,,ageAssessmentUnits,,http://purl.obolibrary.org/obo/NCIT_C29848,,string,experimentalData
6,days,"Age measured in days, a period of 24 hours.",,,,,ageDeathUnits,,http://purl.obolibrary.org/obo/NCIT_C25301,,string,demographics
7,gestational weeks,Gestational age (written with both weeks and d...,,,,,ageDeathUnits,,http://purl.obolibrary.org/obo/NCIT_C81253,,string,demographics
8,months,Age measured in calendar months (approximately...,,,,,ageDeathUnits,,http://purl.obolibrary.org/obo/NCIT_C29846,,string,demographics
9,PCW,Post-Conception Weeks,,,,,ageDeathUnits,,https://embryology.med.unsw.edu.au/embryology/...,,string,demographics


In [234]:
# Add an "admID" column that identifies each data-model row, then explode the
# frame on "Valid Values" so that every row holds a single (key, valid value) pair.
#
# The ID is "adm" followed by the row index zero-padded to 8 digits
# (e.g. "adm00000000").
ad_model_df['admID'] = ['adm' + str(row_idx).zfill(8) for row_idx in ad_model_df.index]

# Split only the cells that are still comma-separated strings. Cells that are
# missing (NaN) or were already split into lists by a previous run are left
# untouched, so this cell can be re-run safely. Unlike re-reading the CSV,
# this keeps the "admID" column that later cells read from ad_model_df.
ad_model_df['Valid Values'] = ad_model_df['Valid Values'].map(
    lambda cell: cell.split(",") if isinstance(cell, str) else cell
)

# One output row per valid value; rows without valid values are kept with a
# missing value. Exploded rows repeat the "admID" of the row they came from.
new_df = ad_model_df.explode('Valid Values', ignore_index=True)

In [358]:
# Gather every valid value so that rows whose "Attribute" is itself a valid
# value can be dropped later on.
stringified_valid_values = ad_model_df['Valid Values'].astype(str)
mega = ",".join(stringified_valid_values)

# Stringify the exploded column and trim leading whitespace left over from the
# comma split; entries that stringify to 'nan' are filtered out.
valid_vals = [
    candidate
    for candidate in new_df['Valid Values'].astype(str).map(str.lstrip).tolist()
    if str(candidate) != 'nan'
]

In [332]:
# Map values to descriptions and value descriptions.

# Build the membership collections once. Rebuilding the parent list for every
# attribute (and scanning valid_vals as a list) made these lookups quadratic.
parent_names = set(ad_model_df['Parent'].astype(str))
valid_val_set = set(valid_vals)

# "values" are the attributes that never appear in the "Parent" column.
values = [attr for attr in ad_model_df['Attribute'].astype(str).tolist()
          if attr not in parent_names]

# Attribute -> Description lookup. Attribute names can repeat under different
# parents; when they do, the description of the last occurrence wins.
atbr_to_desc = dict(zip(ad_model_df['Attribute'], ad_model_df['Description']))

# Split the lookup by whether the attribute is also listed as a valid value.
val_to_desc = {attr: atbr_to_desc.get(attr) for attr in values if attr not in valid_val_set}
val_to_val_desc = {attr: atbr_to_desc.get(attr) for attr in values if attr in valid_val_set}

In [354]:
# Fill in the value description column conditionally.

# Trim the leading whitespace left over from splitting on ",". Only string
# cells are touched, so missing valid values stay missing instead of being
# turned into the literal text 'nan' (which would end up in the export).
new_df['Valid Values'] = new_df['Valid Values'].map(
    lambda cell: cell.lstrip() if isinstance(cell, str) else cell
)

# Look up every valid value's description in one vectorized pass instead of
# re-scanning the whole frame once per row. Rows whose valid value is not in
# valid_vals keep the empty-string default; a valid value with no matching
# Attribute gets a missing description.
is_known_value = new_df['Valid Values'].isin(valid_vals)
new_df['valueDescription'] = new_df['Valid Values'].map(atbr_to_desc).where(is_known_value, '')

In [377]:
# Remove the rows whose "Attribute" is itself one of the valid values, then
# write the remaining rows to an Excel workbook.
attributes = list(new_df['Attribute'].astype(str))
atrb_in_vv = list(filter(lambda name: name in valid_vals, attributes))

is_value_row = new_df['Attribute'].isin(atrb_in_vv)
fin_df = new_df[~is_value_row]
fin_df.to_excel('ad_model_table.xlsx')

### Tentative column mapping (ad_model_df column → annotation_modules_df column):

#### ad_model_df['Parent'] → annotation_modules_df['key']
#### ad_model_df['Attribute'] → annotation_modules_df['value']
#### ad_model_df['columnType'] → annotation_modules_df['columnType']
#### ad_model_df['Source'] → annotation_modules_df['source']
#### ad_model_df['module'] → annotation_modules_df['module']

### If the attribute is a valid value:
#### ad_model_df['Description'] → annotation_modules_df['valueDescription']

### If the attribute is not a valid value:
#### ad_model_df['Description'] → annotation_modules_df['description']

In [23]:
# Fetch the Synapse entity 'syn53010627' and query all of its rows.
annotation_modules_schema = syn.get('syn53010627')
annotation_modules_results = syn.tableQuery(f"SELECT * from {annotation_modules_schema.id}")
# The query result exposes a local file path, which is read back as CSV.
annotation_modules_df = pd.read_csv(annotation_modules_results.filepath)

# Attach the data-model row IDs. Assigning a Series aligns on the index, not on position.
# NOTE(review): this requires ad_model_df to carry an 'admID' column at this point —
# confirm the cell that creates it has run and the frame was not reloaded afterwards.
annotation_modules_df['admID'] = ad_model_df['admID']
# Bare expression so the notebook renders the resulting frame.
annotation_modules_df

Unnamed: 0,ROW_ID,ROW_VERSION,key,description,columnType,maximumSize,value,valueDescription,source,module,admID
0,,,,,,,,,,,adm00000000
1,,,,,,,,,,,adm00000001
2,,,,,,,,,,,adm00000002
3,,,,,,,,,,,adm00000003
4,,,,,,,,,,,adm00000004
5,,,,,,,,,,,adm00000005
6,,,,,,,,,,,adm00000006
7,,,,,,,,,,,adm00000007
8,,,,,,,,,,,adm00000008
9,,,,,,,,,,,adm00000009


In [23]:
# Copy the data-model columns into the annotation-modules frame.
# Keys are target columns in annotation_modules_df, values are the source
# columns in ad_model_df; the assignments run in the order listed.
column_mapping = {
    'value': 'Attribute',
    'key': 'Parent',
    'columnType': 'columnType',
    'source': 'Source',
    'module': 'module',
}
for target_col, source_col in column_mapping.items():
    annotation_modules_df[target_col] = ad_model_df[source_col]