### Author - Ajaya Kumar Sahoo

#### Download the latest AOP-Wiki XML file from https://aopwiki.org/downloads
#### Before using the downloaded XML file, manually modify the following:
```xml
<data xmlns="http://www.aopkb.org/aop-xml"> to <data xmlns="">

```

#### This code parses the AOP-Wiki XML file and extracts the AOP, KE and KER data

In [1]:
import xml.etree.ElementTree as et
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Read the AOP-Wiki XML data file.
tree = et.parse('aop-wiki-xml_edited') # change the file name

# Drop any '{namespace}' prefix from the element tags so that the un-prefixed
# paths used by findall() in the cells below match. For a file whose xmlns
# attribute was already blanked by hand this loop changes nothing.
for element in tree.iter():
    if isinstance(element.tag, str) and element.tag.startswith('{'):
        element.tag = element.tag.split('}', 1)[1]

In [3]:
# Get the root element of the parsed tree; the cells below query it with findall()
# and by iterating over its direct children.
root = tree.getroot()
root

# Vendor-specific data

## Key-Events

In [4]:
# Reference table for key events: XML reference id ('id') and AOP-Wiki id ('aop-wiki-id').
ke_reference_nodes = root.findall('./vendor-specific/key-event-reference')

Key_event_id = pd.DataFrame(
    {
        'ref': [node.get('id') for node in ke_reference_nodes],
        'iden': [node.get('aop-wiki-id') for node in ke_reference_nodes],
    },
    columns=['ref', 'iden'],
)
print(Key_event_id.shape)
Key_event_id.head()

In [6]:
# Lookup used by later cells: key-event reference id -> AOP-Wiki id.
ke_ref_to_id_map_dict = {
    ref: iden for ref, iden in zip(Key_event_id['ref'], Key_event_id['iden'])
}
print(len(ke_ref_to_id_map_dict))

#ke_ref_to_id_map_dict

## Key-Events-Relationship

In [9]:
# Reference table for key-event relationships: XML reference id and AOP-Wiki id.
ker_reference_nodes = root.findall('./vendor-specific/key-event-relationship-reference')

Key_event_relationship_id = pd.DataFrame(
    {
        'ref': [node.get('id') for node in ker_reference_nodes],
        'iden': [node.get('aop-wiki-id') for node in ker_reference_nodes],
    },
    columns=['ref', 'iden'],
)
print(Key_event_relationship_id.shape)
Key_event_relationship_id.head()

In [10]:
# Lookup used by later cells: key-event-relationship reference id -> AOP-Wiki id.
ker_ref_to_id_map_dict = {
    ref: iden
    for ref, iden in zip(Key_event_relationship_id['ref'], Key_event_relationship_id['iden'])
}
print(len(ker_ref_to_id_map_dict))

#ker_ref_to_id_map_dict

## AOP

In [11]:
# Reference table for AOPs: XML reference id and AOP-Wiki id.
aop_reference_nodes = root.findall('./vendor-specific/aop-reference')

aop_id = pd.DataFrame(
    {
        'ref': [node.get('id') for node in aop_reference_nodes],
        'iden': [node.get('aop-wiki-id') for node in aop_reference_nodes],
    },
    columns=['ref', 'iden'],
)
print(aop_id.shape)
aop_id.head()

In [12]:
# Lookup used by later cells: AOP reference id -> AOP-Wiki id.
aop_ref_to_id_map_dict = {
    ref: iden for ref, iden in zip(aop_id['ref'], aop_id['iden'])
}
print(len(aop_ref_to_id_map_dict))

#aop_ref_to_id_map_dict

## Stressor

In [13]:
# Reference table for stressors: XML reference id and AOP-Wiki id.
stressor_reference_nodes = root.findall('./vendor-specific/stressor-reference')

stressor_id = pd.DataFrame(
    {
        'ref': [node.get('id') for node in stressor_reference_nodes],
        'iden': [node.get('aop-wiki-id') for node in stressor_reference_nodes],
    }
)
print(stressor_id.shape)
stressor_id.head()


## Chemical

In [14]:
# Reference table for chemicals: XML reference id and AOP-Wiki id.
chemical_reference_nodes = root.findall('./vendor-specific/chemical-reference')

chemical_id = pd.DataFrame(
    {
        'ref': [node.get('id') for node in chemical_reference_nodes],
        'iden': [node.get('aop-wiki-id') for node in chemical_reference_nodes],
    },
    columns=['ref', 'iden'],
)
print(chemical_id.shape)
chemical_id.head()

In [15]:
# Reference ids that appear in both the chemical and the stressor reference tables.
set(chemical_id['ref']) & set(stressor_id['ref'])

## Taxonomy

In [16]:
# Reference table for taxonomies: XML reference id and AOP-Wiki id.
taxonomy_reference_nodes = root.findall('./vendor-specific/taxonomy-reference')

taxonomy_id = pd.DataFrame(
    {
        'ref': [node.get('id') for node in taxonomy_reference_nodes],
        'iden': [node.get('aop-wiki-id') for node in taxonomy_reference_nodes],
    },
    columns=['ref', 'iden'],
)
print(taxonomy_id.shape)
taxonomy_id.head()

In [17]:
# Lookup used by later cells: taxonomy reference id -> AOP-Wiki id.
taxonomy_ref_to_id_map_dict = {
    ref: iden for ref, iden in zip(taxonomy_id['ref'], taxonomy_id['iden'])
}
print(len(taxonomy_ref_to_id_map_dict))

#taxonomy_ref_to_id_map_dict

### Taxonomy details

In [18]:
# Read every field from its own <taxonomy> node so the columns stay aligned
# even when a node lacks one of the child elements. A missing or empty child
# gives None (`or None` turns findtext's '' for an empty element into None).
taxonomy_nodes = root.findall('./taxonomy')
taxonomy_details = pd.DataFrame({'ref': [node.get('id') for node in taxonomy_nodes],
                                 'source_id': [node.findtext('source-id') or None for node in taxonomy_nodes],
                                 'source': [node.findtext('source') or None for node in taxonomy_nodes],
                                 'name': [node.findtext('name') or None for node in taxonomy_nodes],
                                })
print(taxonomy_details.shape)
taxonomy_details.head()


In [19]:
# Lookup used by later cells: taxonomy reference id -> taxonomy name.
taxonomy_ref_to_name_map_dict = {
    ref: name for ref, name in zip(taxonomy_details['ref'], taxonomy_details['name'])
}
print(len(taxonomy_ref_to_name_map_dict))

#taxonomy_ref_to_name_map_dict

# Chemical Data

In [20]:
# Read every field from its own <chemical> node so the columns stay aligned
# even when a chemical lacks one of the identifiers. A missing or empty child
# gives None (`or None` turns findtext's '' for an empty element into None).
chemical_nodes = root.findall('./chemical')
chemical_details = pd.DataFrame({'chem_ref': [node.get('id') for node in chemical_nodes],
                                 'casid': [node.findtext('casrn') or None for node in chemical_nodes],
                                 'name': [node.findtext('preferred-name') or None for node in chemical_nodes],
                                 'dsstox': [node.findtext('dsstox-id') or None for node in chemical_nodes],
                                 'indigo_inchi': [node.findtext('indigo-inchi-key') or None for node in chemical_nodes],
                                 'chem_inchi': [node.findtext('jchem-inchi-key') or None for node in chemical_nodes]})
print(chemical_details.shape)
chemical_details.head()


In [21]:
# Getting the chemical synonyms: chemical reference id -> '|'-joined synonyms

synomap = {}

for chemical_node in root.findall('./chemical'):
    synomap[chemical_node.get('id')] = '|'.join(
        synonym.text.strip().replace('\n','')
        for synonym in chemical_node.findall('./synonyms/synonym')
        if synonym.text  # a <synonym> without text has text None and is skipped
    )
print(len(synomap))


In [22]:
# Add the '|'-joined synonyms of each chemical as a 'Synonyms' column.
chemical_details = chemical_details.assign(
    Synonyms=chemical_details['chem_ref'].map(synomap)
)
print(chemical_details.shape)
chemical_details.head()

# Stressor Data

In [23]:
# Read every field from its own <stressor> node so the columns stay aligned
# even when a stressor lacks one of the child elements. A missing or empty
# child gives None (`or None` turns findtext's '' for an empty element into None).
stressor_nodes = root.findall('./stressor')
stressor_details = pd.DataFrame({'stress_ref': [node.get('id') for node in stressor_nodes],
                                 'stressor_name': [node.findtext('name') or None for node in stressor_nodes],
                                 'description': [node.findtext('description') or None for node in stressor_nodes],
                                 'exposure_characterization': [node.findtext('exposure-characterization') or None for node in stressor_nodes],
                                 'creation_timestamp': [node.findtext('creation-timestamp') or None for node in stressor_nodes],
                                 'last_modification_timestamp': [node.findtext('last-modification-timestamp') or None for node in stressor_nodes]
                                })

print(stressor_details.shape)
stressor_details.head()


In [24]:
# Get the chemical id and user term from the stressor id

# One row per <chemical-initiator> found under a stressor's <chemicals> element.
stressor_chemical_links = []
for stressor_node in root.findall('./stressor'):
    # Named stress_ref (not stressor_id) so the stressor_id reference table
    # built in the Stressor section is not overwritten by this loop.
    stress_ref = stressor_node.get('id')
    for initiator in stressor_node.findall('./chemicals/chemical-initiator'):
        stressor_chemical_links.append(
            [stress_ref, initiator.get('chemical-id'), initiator.get('user-term')]
        )

temp_df = pd.DataFrame(stressor_chemical_links,columns=['stress_id','chem_id','user_term'])
print(temp_df.shape)
temp_df.tail()

In [25]:
# getting the stressor chemical mapping: stressor reference id -> CAS id of its chemical

# First CAS id listed for each chemical reference, built once instead of
# re-scanning chemical_details for every stressor-chemical row.
casid_by_chem_ref = (
    chemical_details.drop_duplicates(subset='chem_ref')
    .set_index('chem_ref')['casid']
    .to_dict()
)

# A stressor linked to several chemicals keeps the CAS id of its last link.
# A chemical reference that is absent from chemical_details maps to None
# instead of raising IndexError.
stressor_ref_chemical_ref_mapping = {
    stressor: casid_by_chem_ref.get(chem_ref)
    for stressor, chem_ref in zip(temp_df['stress_id'], temp_df['chem_id'])
}

print(len(stressor_ref_chemical_ref_mapping))

In [26]:
# Attach the chemical-initiator rows (chemical id, user term) to the stressor details.
# The outer join keeps rows that have a match on only one side.
stressor_details_merged = pd.merge(
    stressor_details,
    temp_df,
    how='outer',
    left_on='stress_ref',
    right_on='stress_id',
)
print(stressor_details_merged.shape)
stressor_details_merged = stressor_details_merged.replace(np.nan,'',regex=True)
stressor_details_merged.head()


In [27]:
# Join the stressor rows to the chemical details on the chemical reference id.
# The inner join keeps only stressor rows whose chemical id is in chemical_details.
stressor_chemical_details = pd.merge(
    stressor_details_merged,
    chemical_details,
    how='inner',
    left_on='chem_id',
    right_on='chem_ref',
)
print(stressor_chemical_details.shape)
stressor_chemical_details.head()

In [28]:
# Keep only the columns needed downstream, then remove newlines and tabs from their text.
kept_columns = [
    'stress_ref', 'stressor_name', 'description',
    'exposure_characterization', 'creation_timestamp',
    'last_modification_timestamp', 'chem_ref', 'casid',
    'name', 'dsstox', 'indigo_inchi', 'chem_inchi', 'Synonyms',
]
stressor_chemical_details = pd.DataFrame(stressor_chemical_details[kept_columns])
print(stressor_chemical_details.shape)

for col in kept_columns:
    without_newlines = stressor_chemical_details[col].str.replace('\n','')
    stressor_chemical_details[col] = without_newlines.str.replace('\t','')
stressor_chemical_details.head()

In [29]:
# Number of distinct stressor reference ids in the stressor-chemical table.
len(pd.unique(stressor_chemical_details['stress_ref']))

In [30]:
# Lookup: stressor reference id -> 'name' column of its chemical.
stress_ref_to_chemname_map = {
    stress_ref: chem_name
    for stress_ref, chem_name in zip(stressor_chemical_details['stress_ref'],
                                     stressor_chemical_details['name'])
}
len(stress_ref_to_chemname_map)

# AOP

In [31]:
# Reference id and title of every <aop> node. The title is read from its own
# node so the two columns stay aligned; a missing or empty <title> gives None.
aop_nodes = root.findall('./aop')
aop_title_status = pd.DataFrame({'aop_ref': [node.get('id') for node in aop_nodes],
                                 'aop_title': [node.findtext('title') or None for node in aop_nodes],
                                })

print(aop_title_status.shape)

# Getting all the status information

shortname = {}
wikistatus = {}
oecdstatus = {}
saaopstatus = {}

# (target dict, path of the element(s) whose text is '|'-joined into it)
status_sources = [
    (shortname, './short-name'),
    (wikistatus, './status/wiki-status'),
    (oecdstatus, './status/oecd-status'),
    (saaopstatus, './status/saaop-status'),
]

for aop_node in aop_nodes:
    ref = aop_node.get('id')
    for target, path in status_sources:
        # Elements without text (text is None) are skipped instead of
        # raising AttributeError on .strip().
        target[ref] = '|'.join(ele.text.strip().replace('\n','')
                               for ele in aop_node.findall(path) if ele.text)
print(len(shortname),len(wikistatus),len(oecdstatus),len(saaopstatus))

aop_title_status['short_name'] = aop_title_status['aop_ref'].map(shortname)
aop_title_status['wiki_status'] = aop_title_status['aop_ref'].map(wikistatus)
aop_title_status['oecd_status'] = aop_title_status['aop_ref'].map(oecdstatus)
aop_title_status['saaop_status'] = aop_title_status['aop_ref'].map(saaopstatus)

print(aop_title_status.shape)

# Remove newlines, surrounding whitespace and tabs from every column.
for col in aop_title_status.columns:
    aop_title_status[col] = aop_title_status[col].str.replace('\n','')
    aop_title_status[col] = aop_title_status[col].str.strip().str.replace('\t','')

aop_title_status.head()

In [32]:
# Add the AOP-Wiki id that corresponds to each AOP reference id.
aop_title_status = aop_title_status.assign(
    aop_iden=aop_title_status['aop_ref'].map(aop_ref_to_id_map_dict)
)
print(aop_title_status.shape)
aop_title_status.head()

In [37]:
# Saving the AOP title status file
#aop_title_status.to_csv('AOP-wiki-data/parsed_data/AOP_title_status.csv',sep='\t',index=None,encoding='UTF-8')

# Key Events

In [33]:
# Reference id, title and biological organization level (BOL) of every
# <key-event> node. Each field is read from its own node so the columns stay
# aligned; a missing or empty child gives None.
key_event_nodes = root.findall('./key-event')
key_events_details = pd.DataFrame({'ref': [node.get('id') for node in key_event_nodes],
                                   'title': [node.findtext('title') or None for node in key_event_nodes],
                                   'BOL': [node.findtext('biological-organization-level') or None for node in key_event_nodes], # BOL - biological organization level
                                  })
print(key_events_details.shape)

# Getting the short name, organ source id, organ source

shortname = {}
organsourceid = {}
organsource = {}

# (target dict, path of the element(s) whose text is '|'-joined into it)
ke_text_sources = [
    (shortname, './short-name'),
    (organsourceid, './organ-term/source-id'),
    (organsource, './organ-term/source'),
]

for ke_node in key_event_nodes:
    ref = ke_node.get('id')
    for target, path in ke_text_sources:
        # Elements without text (text is None) are skipped instead of
        # raising AttributeError on .strip().
        target[ref] = '|'.join(ele.text.strip().replace('\n','')
                               for ele in ke_node.findall(path) if ele.text)

key_events_details['short_name'] = key_events_details['ref'].map(shortname)
key_events_details['organ_source_id'] = key_events_details['ref'].map(organsourceid)
key_events_details['organ_source'] = key_events_details['ref'].map(organsource)


print(key_events_details.shape)

# Remove newlines, surrounding whitespace and tabs from every column.
for col in key_events_details.columns:
    key_events_details[col] = key_events_details[col].str.replace('\n','')
    key_events_details[col] = key_events_details[col].str.strip().str.replace('\t','')

print(key_events_details.shape)
key_events_details.head()

In [34]:
# Collect the action / object / process reference ids of every key event.
# A key event with no <biological-event> contributes a single row of empty strings.
key_event_nodes_for_events = root.findall('./key-event')

noGO = 0
KE_action_object_process = []
for ke_node in key_event_nodes_for_events:
    ke_ref = ke_node.get('id')
    biological_events = ke_node.findall('./biological-events/biological-event')
    if not biological_events:
        noGO += 1
        KE_action_object_process.append([ke_ref, '', '', ''])
        continue
    for event in biological_events:
        # A missing attribute is stored as ''.
        KE_action_object_process.append([
            ke_ref,
            event.get('action-id', ''),
            event.get('object-id', ''),
            event.get('process-id', ''),
        ])

count = len(key_event_nodes_for_events)
print('Total number of KEs:', count)
print('Total number of KEs which donot have any GO ids:',noGO)

KE_action_object_process_df = pd.DataFrame(
    KE_action_object_process,
    columns=['ref_id', 'action', 'object_id', 'process'],
)
print(KE_action_object_process_df.shape)
KE_action_object_process_df.head()

In [35]:
# Join the key-event details with their action / object / process reference ids,
# keep the columns used below, and strip whitespace, tabs and newlines.
KE_info = pd.merge(
    key_events_details,
    KE_action_object_process_df,
    how='inner',
    left_on='ref',
    right_on='ref_id',
)
print(KE_info.shape)
KE_info = pd.DataFrame(KE_info[['ref','title','BOL','action','object_id','process']])
print(KE_info.shape)
for col in KE_info.columns:
    KE_info[col] = KE_info[col].str.strip().str.replace('\t','').str.replace('\n','')
KE_info.head()

In [36]:
# Getting action name and other information

Action_info = []
for action_node in root.findall('./biological-action'):
    # '|'-join the stripped text of each child element. Children without
    # text (text is None) are skipped instead of raising AttributeError.
    iden, source, name = (
        '|'.join(ele.text.strip().replace('\n','')
                 for ele in action_node.findall(path) if ele.text)
        for path in ('./source-id', './source', './name')
    )
    Action_info.append([action_node.get('id'),iden,source,name])
Action_info_df = pd.DataFrame(Action_info,columns= ['ref_id','iden','source','name'])
print(Action_info_df.shape)
Action_info_df.head()

In [37]:
# Getting object name and other information

Object_info = []
for object_node in root.findall('./biological-object'):
    # '|'-join the stripped text of each child element. Children without
    # text (text is None) are skipped instead of raising AttributeError.
    iden, source, name = (
        '|'.join(ele.text.strip().replace('\n','')
                 for ele in object_node.findall(path) if ele.text)
        for path in ('./source-id', './source', './name')
    )
    Object_info.append([object_node.get('id'),iden,source,name])
Object_info_df = pd.DataFrame(Object_info,columns= ['ref_id','iden','source','name'])
print(Object_info_df.shape)
Object_info_df.head()

In [38]:
# Getting Process name and other information

Process_info = []
for process_node in root.findall('./biological-process'):
    # '|'-join the stripped text of each child element. Children without
    # text (text is None) are skipped instead of raising AttributeError.
    iden, source, name = (
        '|'.join(ele.text.strip().replace('\n','')
                 for ele in process_node.findall(path) if ele.text)
        for path in ('./source-id', './source', './name')
    )
    Process_info.append([process_node.get('id'),iden,source,name])
Process_info_df = pd.DataFrame(Process_info,columns= ['ref_id','iden','source','name'])
print(Process_info_df.shape)
# Remove surrounding whitespace, tabs and newlines from every column.
for col in Process_info_df.columns:
    Process_info_df[col] = Process_info_df[col].str.strip().str.replace('\t','')
    Process_info_df[col] = Process_info_df[col].str.strip().str.replace('\n','')
Process_info_df.head()


In [39]:
def Check_value_return(val):
    """Return the stripped first value of `val`, or '' when there is none.

    `val` is used below with the pandas Series returned by a `.loc` lookup.
    An empty selection, or a first value that has no `.strip()` method
    (for example None or NaN), gives ''.
    """
    try:
        return val.iloc[0].strip()
    except (IndexError, AttributeError):
        # IndexError: nothing was selected; AttributeError: value is not a string.
        return ''
    
    
# For every key-event row, resolve the reference ids into the AOP-Wiki id of the
# key event and the names / identifiers of its action, object and process.
# Lookups that find nothing give '' (see Check_value_return).
for row_label in KE_info.index:
    ke_ref = KE_info.at[row_label, 'ref']
    action_ref = KE_info.at[row_label, 'action']
    object_ref = KE_info.at[row_label, 'object_id']
    process_ref = KE_info.at[row_label, 'process']

    ke_matches = Key_event_id['ref'] == ke_ref
    action_matches = Action_info_df['ref_id'] == action_ref
    object_matches = Object_info_df['ref_id'] == object_ref
    process_matches = Process_info_df['ref_id'] == process_ref

    # Insertion order below is the order in which the new columns are created.
    resolved = {
        'KE_iden': Check_value_return(Key_event_id.loc[ke_matches, 'iden']),
        'action name': Check_value_return(Action_info_df.loc[action_matches, 'name']),
        'object_iden': Check_value_return(Object_info_df.loc[object_matches, 'iden']),
        'object_name': Check_value_return(Object_info_df.loc[object_matches, 'name']),
        'process_iden': Check_value_return(Process_info_df.loc[process_matches, 'iden']),
        'process_source': Check_value_return(Process_info_df.loc[process_matches, 'source']),
        'process_name': Check_value_return(Process_info_df.loc[process_matches, 'name']),
    }
    for column, value in resolved.items():
        KE_info.at[row_label, column] = value
print(KE_info.shape)
KE_info

In [41]:
# Keep the resolved identifier / name columns in their output order.
ke_output_columns = [
    'ref', 'KE_iden', 'title', 'BOL', 'action name',
    'object_iden', 'object_name',
    'process_iden', 'process_source', 'process_name',
]
KE_info = KE_info.loc[:, ke_output_columns].copy()

print(KE_info.shape)
KE_info.head()

In [46]:
# Write the key-event details as tab-separated text without the index column.
ke_details_path = 'Key_events_details.csv'
KE_info.to_csv(ke_details_path, sep='\t', index=None, encoding='UTF-8') # output file for KEs and their info


### KE - Taxonomy and evidence links

In [42]:
# For every key event, list its taxonomic applicability: one row per
# (key event, taxonomy) pair together with the '|'-joined evidence text.

ke_taxonomy = []
for ke_node in root.findall('./key-event'):
    for tax in ke_node.findall('./applicability/taxonomy'):
        tax_id = tax.get('taxonomy-id')
        # <evidence> elements without text (text is None) are skipped;
        # str.join raises TypeError when given None.
        evidence = '|'.join(ev.text for ev in tax.findall('./evidence') if ev.text)
        ke_taxonomy.append([ke_node.get('id'),tax_id,evidence])

ke_taxonomy_df = pd.DataFrame(ke_taxonomy,columns= ['ke_ref','tax_ref','tax_evidence'])
print(ke_taxonomy_df.shape)
ke_taxonomy_df.head()

In [43]:
# Translate the key-event and taxonomy reference ids into AOP-Wiki ids and
# taxonomy names, then keep only the translated columns and the evidence.
ke_taxonomy_df = ke_taxonomy_df.assign(ke_id=ke_taxonomy_df['ke_ref'].map(ke_ref_to_id_map_dict))
print(ke_taxonomy_df.shape)
ke_taxonomy_df = ke_taxonomy_df.assign(tax_id=ke_taxonomy_df['tax_ref'].map(taxonomy_ref_to_id_map_dict))
print(ke_taxonomy_df.shape)
ke_taxonomy_df = ke_taxonomy_df.assign(tax_name=ke_taxonomy_df['tax_ref'].map(taxonomy_ref_to_name_map_dict))
print(ke_taxonomy_df.shape)
ke_taxonomy_df = ke_taxonomy_df.loc[:, ['ke_id','tax_id','tax_name','tax_evidence']]
print(ke_taxonomy_df.shape)
ke_taxonomy_df.head()

In [49]:
#save file
#ke_taxonomy_df.to_csv('KE_taxonomy_details.tsv',sep='\t',index=None)

# Key Events relationship 

In [44]:
# Reference id plus upstream / downstream key-event reference of every
# <key-event-relationship> node. Each field is read from its own node so the
# columns stay aligned; a missing or empty child gives None.
ker_relationship_nodes = root.findall('./key-event-relationship')
key_events_relationship_details = pd.DataFrame({'ref': [node.get('id') for node in ker_relationship_nodes],
                                   'upstream_id': [node.findtext('title/upstream-id') or None for node in ker_relationship_nodes],
                                   'downstream_id': [node.findtext('title/downstream-id') or None for node in ker_relationship_nodes],
                                  })
print(key_events_relationship_details.shape)
key_events_relationship_details.head()


In [45]:
# Key-event references used by any relationship (upstream or downstream),
# and how many of them are present in key_events_details.
iden = set(key_events_relationship_details['upstream_id']) | set(key_events_relationship_details['downstream_id'])
print(len(iden))
len(iden & set(key_events_details['ref']))

In [46]:
# Translate the relationship and key-event reference ids into AOP-Wiki ids,
# then keep the reference id and the translated columns.
key_events_relationship_details = key_events_relationship_details.assign(
    ker_id=key_events_relationship_details['ref'].map(ker_ref_to_id_map_dict)
)
print(key_events_relationship_details.shape)
key_events_relationship_details = key_events_relationship_details.assign(
    upstream_ke_id=key_events_relationship_details['upstream_id'].map(ke_ref_to_id_map_dict)
)
print(key_events_relationship_details.shape)
key_events_relationship_details = key_events_relationship_details.assign(
    downstream_ke_id=key_events_relationship_details['downstream_id'].map(ke_ref_to_id_map_dict)
)
print(key_events_relationship_details.shape)
key_events_relationship_details = key_events_relationship_details.loc[:, ['ref','ker_id','upstream_ke_id','downstream_ke_id']]
print(key_events_relationship_details.shape)
key_events_relationship_details.head()

In [53]:
# Write the KER table as tab-separated text without the index column.
ker_details_path = 'KER_details.tsv'
key_events_relationship_details.to_csv(ker_details_path, sep='\t', index=None) # output file for KERs

## AOP - MIE

In [47]:
# Key-event reference of every molecular initiating event listed under any AOP.
mie_nodes = root.findall('./aop/molecular-initiating-event')
mie = pd.DataFrame({'mie_ref': [node.get('key-event-id') for node in mie_nodes]})
print(mie.shape)
mie.head()

In [48]:
# Reference id of every <aop> node; later cells use it to pick the AOP nodes under root.
aop_ref_values = [node.get('id') for node in root.findall('./aop')]
aop_refs = pd.DataFrame({'aop_ref': aop_ref_values})
print(aop_refs.shape)
aop_refs.head()

In [49]:
# One row per (AOP, molecular initiating event). AOPs without an MIE
# contribute no rows.

# Built once: set membership replaces rebuilding and scanning a list for
# every node under root.
known_aop_refs = set(aop_refs['aop_ref'])

aop_mie_map = []
for node in root:
    if node.get('id') not in known_aop_refs:
        continue
    # The loop variable is mie_node (not mie) so the `mie` DataFrame built
    # in the earlier cell is not overwritten.
    for mie_node in node.findall('./molecular-initiating-event'):
        aop_mie_map.append([node.get('id'), mie_node.get('key-event-id')])

aop_mie_map_df = pd.DataFrame(aop_mie_map,columns= ['aop_ref','mie_ref'])
print(aop_mie_map_df.shape)
aop_mie_map_df.head()

In [50]:
# Number of distinct AOPs that have at least one MIE row.
len(pd.unique(aop_mie_map_df['aop_ref']))

In [51]:
# Number of distinct MIE references across all AOPs.
len(pd.unique(aop_mie_map_df['mie_ref']))

In [52]:
# Translate the AOP and MIE reference ids into AOP-Wiki ids.
aop_mie_map_df = aop_mie_map_df.assign(aop_id=aop_mie_map_df['aop_ref'].map(aop_ref_to_id_map_dict))
print(aop_mie_map_df.shape)
aop_mie_map_df = aop_mie_map_df.assign(mie_id=aop_mie_map_df['mie_ref'].map(ke_ref_to_id_map_dict))
print(aop_mie_map_df.shape)
aop_mie_map_df = aop_mie_map_df.loc[:, ['aop_ref','aop_id','mie_id']]
print(aop_mie_map_df.shape)
aop_mie_map_df.head()

## AOP - KEs

In [53]:
# One row per (AOP, key event). AOPs without key events contribute no rows.

# Built once: set membership replaces rebuilding and scanning a list for
# every node under root.
known_aop_refs = set(aop_refs['aop_ref'])

aop_ke_map = []
for node in root:
    if node.get('id') not in known_aop_refs:
        continue
    for ke_node in node.findall('./key-events/key-event'):
        aop_ke_map.append([node.get('id'), ke_node.get('key-event-id')])

aop_ke_map_df = pd.DataFrame(aop_ke_map,columns= ['aop_ref','ke_ref'])
print(aop_ke_map_df.shape)
aop_ke_map_df.head()

In [54]:
# Number of distinct AOPs that have at least one key-event row.
len(pd.unique(aop_ke_map_df['aop_ref']))

In [55]:
# Translate the AOP and key-event reference ids into AOP-Wiki ids.
aop_ke_map_df = aop_ke_map_df.assign(aop_id=aop_ke_map_df['aop_ref'].map(aop_ref_to_id_map_dict))
print(aop_ke_map_df.shape)
aop_ke_map_df = aop_ke_map_df.assign(ke_id=aop_ke_map_df['ke_ref'].map(ke_ref_to_id_map_dict))
print(aop_ke_map_df.shape)
aop_ke_map_df = aop_ke_map_df.loc[:, ['aop_ref','aop_id','ke_id']]
print(aop_ke_map_df.shape)
aop_ke_map_df.head()

## AOP - AO

In [56]:
# One row per (AOP, adverse outcome). AOPs without an adverse outcome
# contribute no rows.

# Built once: set membership replaces rebuilding and scanning a list for
# every node under root.
known_aop_refs = set(aop_refs['aop_ref'])

aop_ao_map = []
for node in root:
    if node.get('id') not in known_aop_refs:
        continue
    for ao_node in node.findall('./adverse-outcome'):
        aop_ao_map.append([node.get('id'), ao_node.get('key-event-id')])

aop_ao_map_df = pd.DataFrame(aop_ao_map,columns= ['aop_ref','ao_ref'])
print(aop_ao_map_df.shape)
aop_ao_map_df.head()

In [57]:
# Number of distinct AOPs that have at least one adverse-outcome row.
len(pd.unique(aop_ao_map_df['aop_ref']))

In [58]:
# Translate the AOP and adverse-outcome reference ids into AOP-Wiki ids.
aop_ao_map_df = aop_ao_map_df.assign(aop_id=aop_ao_map_df['aop_ref'].map(aop_ref_to_id_map_dict))
print(aop_ao_map_df.shape)
aop_ao_map_df = aop_ao_map_df.assign(ao_id=aop_ao_map_df['ao_ref'].map(ke_ref_to_id_map_dict))
print(aop_ao_map_df.shape)
aop_ao_map_df = aop_ao_map_df.loc[:, ['aop_ref','aop_id','ao_id']]
print(aop_ao_map_df.shape)
aop_ao_map_df.head()

## AOP-KER

In [59]:
# One row per (AOP, key-event relationship) with the relationship's adjacency,
# quantitative-understanding value and evidence. AOPs without relationships
# contribute no rows.

# Built once: set membership replaces rebuilding and scanning a list for
# every node under root.
known_aop_refs = set(aop_refs['aop_ref'])

aop_ker_map = []
for node in root:
    if node.get('id') not in known_aop_refs:
        continue
    for ker_node in node.findall('./key-event-relationships/relationship'):
        # '|'-join the text of each child element. Children without text
        # (text is None) are skipped; str.join raises TypeError on None.
        adjacency, quantitative_understading_value, evidence = (
            '|'.join(el.text for el in ker_node.findall(path) if el.text)
            for path in ('./adjacency', './quantitative-understanding-value', './evidence')
        )
        aop_ker_map.append([node.get('id'),ker_node.get('id'),adjacency,quantitative_understading_value,evidence])

aop_ker_map_df = pd.DataFrame(aop_ker_map,columns= ['aop_ref','ker_ref','adjacency','quantitative_understanding','evidence'])
print(aop_ker_map_df.shape)
aop_ker_map_df.head()

In [60]:
# Translate the AOP and KER reference ids into AOP-Wiki ids.
aop_ker_map_df = aop_ker_map_df.assign(aop_id=aop_ker_map_df['aop_ref'].map(aop_ref_to_id_map_dict))
print(aop_ker_map_df.shape)
aop_ker_map_df = aop_ker_map_df.assign(ker_id=aop_ker_map_df['ker_ref'].map(ker_ref_to_id_map_dict))
print(aop_ker_map_df.shape)
aop_ker_map_df = aop_ker_map_df.loc[:, ['aop_ref','aop_id','ker_id','adjacency','quantitative_understanding','evidence']]
print(aop_ker_map_df.shape)
aop_ker_map_df.head()

In [79]:
# Write the AOP-KER table as tab-separated text without the index column.
aop_ker_path = 'AOP_KER_info.tsv'
aop_ker_map_df.to_csv(aop_ker_path, sep='\t', index=None) # output file

## AOP-taxonomy

In [61]:
# One row per (AOP, taxonomy) with the '|'-joined evidence text. AOPs without
# taxonomic applicability contribute no rows.

# Built once: set membership replaces rebuilding and scanning a list for
# every node under root.
known_aop_refs = set(aop_refs['aop_ref'])

aop_tax_map = []
for node in root:
    if node.get('id') not in known_aop_refs:
        continue
    for tax_node in node.findall('./applicability/taxonomy'):
        # <evidence> elements without text (text is None) are skipped;
        # str.join raises TypeError when given None.
        evidence = '|'.join(ev.text for ev in tax_node.findall('./evidence') if ev.text)
        aop_tax_map.append([node.get('id'),tax_node.get('taxonomy-id'),evidence])

aop_tax_map_df = pd.DataFrame(aop_tax_map,columns= ['aop_ref','taxonomy_ref','evidence'])
print(aop_tax_map_df.shape)
aop_tax_map_df.head()

In [62]:
# Translate the AOP reference id into its AOP-Wiki id and the taxonomy
# reference id into the taxonomy name.
aop_tax_map_df = aop_tax_map_df.assign(aop_id=aop_tax_map_df['aop_ref'].map(aop_ref_to_id_map_dict))
print(aop_tax_map_df.shape)
aop_tax_map_df = aop_tax_map_df.assign(tax_name=aop_tax_map_df['taxonomy_ref'].map(taxonomy_ref_to_name_map_dict))
print(aop_tax_map_df.shape)
aop_tax_map_df = aop_tax_map_df.loc[:, ['aop_ref','aop_id','tax_name','evidence']]
print(aop_tax_map_df.shape)
aop_tax_map_df.head()

In [85]:
# Create the output folder first: to_csv does not create missing directories.
Path('ParsedData').mkdir(parents=True, exist_ok=True)
aop_tax_map_df.to_csv('ParsedData/AOP_taxonomy_info.tsv',sep='\t',index=None) # output file

## AOP-Stressor

In [63]:
# One row per (AOP, stressor) with the '|'-joined evidence text. AOPs without
# stressors contribute no rows.

# Built once: set membership replaces rebuilding and scanning a list for
# every node under root.
known_aop_refs = set(aop_refs['aop_ref'])

aop_stress_map = []
for node in root:
    if node.get('id') not in known_aop_refs:
        continue
    for stress_node in node.findall('./aop-stressors/aop-stressor'):
        # <evidence> elements without text (text is None) are skipped;
        # str.join raises TypeError when given None.
        evidence = '|'.join(ev.text for ev in stress_node.findall('./evidence') if ev.text)
        aop_stress_map.append([node.get('id'),stress_node.get('stressor-id'),evidence])

aop_stress_map_df = pd.DataFrame(aop_stress_map,columns= ['aop_ref','stressor_ref','evidence'])
print(aop_stress_map_df.shape)
aop_stress_map_df.head()

In [64]:
# AOP-stressor rows whose stressor appears in the stressor-chemical table.
chemical_stress_refs = stressor_chemical_details['stress_ref'].tolist()
has_chemical_stressor = aop_stress_map_df['stressor_ref'].isin(chemical_stress_refs)
aops_with_chemical_stressors = pd.DataFrame(aop_stress_map_df[has_chemical_stressor])
print(aops_with_chemical_stressors.shape)
aops_with_chemical_stressors.head()


In [65]:
# Translate the AOP reference id into its AOP-Wiki id and the stressor
# reference id into the name of its chemical.
aops_with_chemical_stressors = aops_with_chemical_stressors.assign(
    aop_id=aops_with_chemical_stressors['aop_ref'].map(aop_ref_to_id_map_dict)
)
print(aops_with_chemical_stressors.shape)
aops_with_chemical_stressors = aops_with_chemical_stressors.assign(
    stress_name=aops_with_chemical_stressors['stressor_ref'].map(stress_ref_to_chemname_map)
)
print(aops_with_chemical_stressors.shape)
aops_with_chemical_stressors = aops_with_chemical_stressors.loc[:, ['aop_ref','aop_id','stress_name','evidence']]
print(aops_with_chemical_stressors.shape)
aops_with_chemical_stressors.head()

## Getting an overall AOP file with MIE, KEs, AOs, KERs, stressors, taxonomies

In [66]:
def check_empty(string):
    """Return `string` unchanged; an empty string stays ''.

    No placeholder is substituted for empty input.
    """
    return '' if string == '' else string

In [67]:
# Add the CAS id of each stressor's chemical; values with no match become ''.
aop_stress_map_df = aop_stress_map_df.assign(
    casid=aop_stress_map_df['stressor_ref'].map(stressor_ref_chemical_ref_mapping)
)

print(aop_stress_map_df.shape)
aop_stress_map_df = aop_stress_map_df.replace(np.nan,'',regex=True)

aop_stress_map_df.head()

In [68]:
# Assemble one summary row per AOP: identifiers, title, statuses and the
# '|'-joined MIEs, KEs, AOs, KERs, stressor names, CAS ids and taxonomy names.

def _joined_for_aop(frame, column, aop_ref):
    """'|'-join the `column` values of the rows of `frame` whose 'aop_ref' equals `aop_ref`."""
    return "|".join(frame.loc[frame['aop_ref'] == aop_ref, column].tolist())


# Built once: set membership replaces rebuilding and scanning a list for
# every node under root.
known_aop_refs = set(aop_refs['aop_ref'])

aop_mie_ke_ao_ker_stressor = []
for node in root:
    aop_ref = node.attrib.get('id')
    if aop_ref not in known_aop_refs:
        continue

    # Single lookup of this AOP's title/status row (IndexError if it is absent).
    status_row = aop_title_status.loc[aop_title_status['aop_ref'] == aop_ref].iloc[0]
    aop_iden = status_row['aop_iden']
    aop_title = status_row['aop_title']
    oecd_status = status_row['oecd_status']
    saaop_status = status_row['saaop_status']

    all_mies = check_empty(_joined_for_aop(aop_mie_map_df, 'mie_id', aop_ref))
    all_kes = check_empty(_joined_for_aop(aop_ke_map_df, 'ke_id', aop_ref))
    all_aos = check_empty(_joined_for_aop(aop_ao_map_df, 'ao_id', aop_ref))
    all_kers = check_empty(_joined_for_aop(aop_ker_map_df, 'ker_id', aop_ref))
    # Filtered on the frame's own 'aop_ref' column rather than on a mask
    # built from aop_stress_map_df, so no index alignment is relied upon.
    all_stressors = check_empty(_joined_for_aop(aops_with_chemical_stressors, 'stress_name', aop_ref))

    # De-duplicate the CAS ids in first-seen order and drop empty ones.
    # A set would make the order of the joined ids vary between runs.
    casids = aop_stress_map_df.loc[aop_stress_map_df['aop_ref'] == aop_ref, 'casid'].tolist()
    all_casid = check_empty("|".join(casid for casid in dict.fromkeys(casids) if casid != ''))

    all_taxonomy = check_empty(_joined_for_aop(aop_tax_map_df, 'tax_name', aop_ref))

    aop_mie_ke_ao_ker_stressor.append([aop_ref,aop_iden,aop_title,oecd_status,saaop_status,all_mies,all_kes,all_aos,all_kers,all_stressors,all_casid,all_taxonomy])

AOP_data = pd.DataFrame(aop_mie_ke_ao_ker_stressor,columns = ['AOP_ref','AOP_iden','aop_title','oecd_status','SAAOP_status','MIEs','KEs','AOs','KERs',"Stressors","chemicalID","Taxonomies"])
print(AOP_data.shape)
AOP_data.head()

In [107]:
# Create the output folder first: to_csv does not create missing directories.
Path('ParsedData').mkdir(parents=True, exist_ok=True)
AOP_data.to_csv('ParsedData/AOP_metadata_Title_OECDStatus_added_CAS_added.tsv',sep='\t',index=None) # output file