### Author - Ajaya Kumar Sahoo

#### This code creates the stressor-AOP edge list file from the chemical-KE data and the AOP-KE information

In [1]:
import pandas as pd
import numpy as np
import networkx as nx

In [4]:
# Chemical-KE mapping curated through a data-centric integration:
# 1st column - chemical id, 2nd column - KE id
Chemical_KE_mapping = pd.read_csv('Chemical_KE_mapping.tsv', sep='\t', dtype=str)
print(Chemical_KE_mapping.shape)

# Give the two columns fixed names, then keep each chemical-KE pair only once
Chemical_KE_mapping.columns = ['Chemical', 'KEID']
Chemical_KE_mapping = Chemical_KE_mapping.drop_duplicates()
print(Chemical_KE_mapping.shape)

Chemical_KE_mapping.head()


In [5]:
## High confidence AOP list with MIE, AO, KE, KER information

# Input file layout, column by column:
#   1st - high confidence AOP identifier
#   2nd - MIE identifiers ('|' separated)
#   3rd - KE identifiers ('|' separated), excluding MIEs and AOs
#   4th - AO identifiers ('|' separated)
#   5th - total KEs = MIEs + KEs + AOs ('|' separated)
#   6th - KER identifiers ('|' separated)
HighCAOPs = pd.read_csv('high_confidence_AOP.tsv', sep='\t', dtype=str)
print(HighCAOPs.shape)

# Missing entries become empty strings, so every field is a string that can be split on '|'
HighCAOPs = HighCAOPs.replace(np.nan, '', regex=True)
print(HighCAOPs.shape)

HighCAOPs.columns = [
    'AOP_iden',
    'MIEs',
    'KEs',
    'AOs',
    'total_KEs',
    'KERs',
]
print(HighCAOPs.shape)

HighCAOPs.head()


In [6]:
# Get the chemicals for each of these AOPs

# For every AOP, collect the chemicals mapped to any of its events (total_KEs).
# The identifiers are sorted before joining: iterating a set of strings has no
# stable order between interpreter sessions, which would otherwise make the
# row order of the exploded table differ from run to run.
HighCAOPs['chemical'] = [
    '|'.join(sorted(set(
        Chemical_KE_mapping.loc[
            Chemical_KE_mapping['KEID'].isin(aop_events.split('|')), 'Chemical'
        ]
    )))
    for aop_events in HighCAOPs['total_KEs']
]

print(HighCAOPs.shape)

# One row per (AOP, chemical) pair. An AOP without any mapped chemical keeps a
# single row whose 'chemical' is the empty string ('' splits to ['']).
HighCAOPs['chemical'] = HighCAOPs['chemical'].str.split('|')
HighCAOPs = HighCAOPs.explode('chemical').reset_index(drop=True)

print(HighCAOPs.shape)

HighCAOPs.head()



In [7]:
## checking how many high confidence AOPs have chemical stressor mapping available 

# Rows with a non-empty 'chemical' entry, i.e. AOP-stressor pairs that exist
stressor_linked = HighCAOPs[HighCAOPs['chemical'] != '']

print('Number of high confidence AOPs linked to the stressors:',len(set(stressor_linked['AOP_iden'])))
print('Number of stressors linked to the high confidence AOPs:',len(set(stressor_linked['chemical'])))



In [10]:
print(Chemical_KE_mapping.shape)

# Collapse the mapping to one row per chemical, with its distinct KE ids joined
# by '|'. The ids are sorted so the joined string is the same on every run.
# NOTE: this rebinds Chemical_KE_mapping; code that expects one KE id per row
# (e.g. filtering with KEID.isin(...)) must run before this cell.
Chemical_KE_mapping = (
    Chemical_KE_mapping
    .groupby(['Chemical'])
    .agg(lambda ke_ids: '|'.join(sorted(set(ke_ids))))
    .reset_index()
)

# Lookup table: chemical id -> '|'-separated KE ids
Chemical_KE_mapping_dict = dict(zip(Chemical_KE_mapping['Chemical'], Chemical_KE_mapping['KEID']))

print(len(Chemical_KE_mapping_dict))


In [11]:
def Get_mapped_events(row):
    '''
    Report which events of an AOP are linked with the stressor of this row.

    Reads 'chemical', 'MIEs', 'KEs' and 'AOs' from row (the last three are
    '|'-separated identifier strings) and looks the chemical up in
    Chemical_KE_mapping_dict.

    Returns a 4-tuple of '|'-joined strings:
    (mapped MIEs, mapped KEs, mapped AOs, all mapped events).
    All four are empty strings when the row has no chemical.
    '''
    stressor = row['chemical']
    event_groups = [
        row['MIEs'].split('|'),
        row['KEs'].split('|'),
        row['AOs'].split('|'),
    ]

    # No stressor on this row: nothing can be mapped
    if stressor == '':
        return ('', '', '', '')

    stressor_events = Chemical_KE_mapping_dict[stressor].split('|')

    # Events shared between the stressor and each group (MIEs, KEs, AOs in turn)
    mie_hits, ke_hits, ao_hits = (
        '|'.join(list(set(stressor_events).intersection(set(group))))
        for group in event_groups
    )

    # Union of the three groups; '' appears whenever a group had no hit and is dropped
    combined = set(mie_hits.split('|') + ke_hits.split('|') + ao_hits.split('|')) - {''}
    all_hits = '|'.join(list(combined))

    return (mie_hits, ke_hits, ao_hits, all_hits)

In [28]:
# Applying the function to the dataframe

# Evaluate Get_mapped_events once per row and reuse the returned tuple for all
# four columns, instead of recomputing it separately for each column.
mapped_events = [Get_mapped_events(row) for _, row in HighCAOPs.iterrows()]

# Column names in the same order as the tuple returned by Get_mapped_events
mapped_columns = ['MIEs_mapped', 'KEs_mapped', 'AOs_mapped', 'totalKEs_mapped']
for position, column in enumerate(mapped_columns):
    HighCAOPs[column] = [events[position] for events in mapped_events]

print(HighCAOPs.shape)
HighCAOPs.head()

#### Defining the edge attributes (https://link.springer.com/article/10.1007/s00204-024-03825-z)

edge weight = coverage score

coverage score is defined as (number of KEs mapped) / (total number of KEs)

evidence level

     Level 1 relevance - Only KE mapped
     
     Level 2 relevance - Only AO mapped but not MIE mapped
     
     Level 3 relevance - Only MIE mapped but not AO mapped
     
     Level 4 relevance - Both MIE and AO mapped
     
     Level 5 relevance - a directed path exists between the mapped MIE and the mapped AO

In [12]:
## Taking the high confidence AOPs which got mapped to any stressor

# Keep only the AOP-stressor rows, i.e. rows whose 'chemical' is not empty
has_stressor = HighCAOPs['chemical'] != ''
HighCAOPs_mapped_AOPs = HighCAOPs.loc[has_stressor].copy()

print(HighCAOPs_mapped_AOPs.shape)
print(len(set(HighCAOPs_mapped_AOPs['AOP_iden'])))
HighCAOPs_mapped_AOPs.head()


In [13]:
# computing the coverage score for each stressor-AOP associations

def Get_coverage_score(row):
    '''
    Get the coverage score of a stressor-AOP association.

    The score is (number of mapped KEs) / (total number of KEs of the AOP),
    rounded to two decimals. Both counts come from the '|'-separated strings
    row['totalKEs_mapped'] and row['total_KEs'].

    Returns 0.0 when nothing is mapped or when the AOP lists no KEs.
    '''
    # ''.split('|') yields [''], so empty tokens must be dropped before counting;
    # otherwise an empty mapping would be counted as one mapped KE.
    total_KE_mapped = [ke for ke in row['totalKEs_mapped'].split('|') if ke != '']
    total_KE = [ke for ke in row['total_KEs'].split('|') if ke != '']

    if not total_KE:
        return 0.0

    coverage_score = round((len(total_KE_mapped)/len(total_KE)),2)
    return coverage_score

In [14]:
# Row-wise coverage score for every stressor-AOP association
HighCAOPs_mapped_AOPs['Coverage score'] = HighCAOPs_mapped_AOPs.apply(Get_coverage_score, axis=1)

print(HighCAOPs_mapped_AOPs.shape)
HighCAOPs_mapped_AOPs.head()



In [15]:
# getting the KER details file

# KER details file: KER identifier as 1st column, upstream KE identifier as
# 2nd column and downstream KE identifier as 3rd column
ker_details_path = 'KER_details.tsv'
KER_details = pd.read_csv(ker_details_path, sep='\t', dtype=str)

print(KER_details.shape)
KER_details.head()


In [16]:
# Get the level of relevance

def Get_evidence_category(row):
    '''
    Get the level of relevance of a stressor-AOP association.

    Reads the '|'-separated strings row['MIEs_mapped'], row['AOs_mapped'] and,
    when both an MIE and an AO are mapped, row['KERs']. The KERs are looked up
    in KER_details (columns 'ker_id', 'upstream_ke_id', 'downstream_ke_id') to
    build the directed graph of the AOP.

    Returns one of:
        'Level 5' - a directed path exists from a mapped MIE to a mapped AO
        'Level 4' - both MIE and AO are mapped, but no such path exists
        'Level 3' - MIE mapped, AO not mapped
        'Level 2' - AO mapped, MIE not mapped
        'Level 1' - neither MIE nor AO mapped (only KEs)

    No coverage-score threshold is applied here.
    '''
    MIEs_mapped = set(row['MIEs_mapped'].split('|')) - {''}
    AOs_mapped = set(row['AOs_mapped'].split('|')) - {''}

    if MIEs_mapped and not AOs_mapped:
        return 'Level 3' # only MIEs are mapped but not AO
    if AOs_mapped and not MIEs_mapped:
        return 'Level 2' # only AOs are mapped but not MIE
    if not MIEs_mapped and not AOs_mapped:
        return 'Level 1' # only KEs are mapped

    # Both an MIE and an AO are mapped: build the AOP graph from its KERs
    KERs = list(set(row['KERs'].split('|')) - {''})
    aop_kers = KER_details[KER_details['ker_id'].isin(KERs)]

    graph = nx.DiGraph()
    graph.add_edges_from(zip(aop_kers['upstream_ke_id'], aop_kers['downstream_ke_id']))

    for MIE in MIEs_mapped:
        # An event that appears in none of the AOP's KERs is not a node of the
        # graph; networkx raises NodeNotFound for such nodes, so skip them.
        if MIE not in graph:
            continue
        for AO in AOs_mapped:
            if AO == MIE or AO not in graph:
                # the trivial zero-length path from an event to itself is not counted
                continue
            # Only existence matters, so the paths are not enumerated
            if nx.has_path(graph, MIE, AO):
                return 'Level 5'

    return 'Level 4' # no directed path between any mapped MIE and mapped AO



In [17]:
# Row-wise level of relevance for every stressor-AOP association
HighCAOPs_mapped_AOPs['Level of relevance'] = HighCAOPs_mapped_AOPs.apply(Get_evidence_category, axis=1)

print(HighCAOPs_mapped_AOPs.shape)

HighCAOPs_mapped_AOPs.head()


In [18]:
# Edge list: stressor, AOP, edge weight (coverage score) and evidence level.
# The level is stored upstream in the 'Level of relevance' column; selecting
# 'Evidence Level' directly raised a KeyError because no such column exists.
# It is renamed here so the edge list carries the 'Evidence Level' header.
Stressor_AOP_edgelist = (
    HighCAOPs_mapped_AOPs[['chemical', 'AOP_iden', 'Coverage score', 'Level of relevance']]
    .rename(columns={'Level of relevance': 'Evidence Level'})
)

print(Stressor_AOP_edgelist.shape)

Stressor_AOP_edgelist.head()

In [19]:
# Output file: tab-separated edge list, written without the row index
Stressor_AOP_edgelist.to_csv(
    'Stressor_AOP_edgelist.tsv',
    sep='\t',
    index=False,
)