### Author - Ajaya Kumar Sahoo

#### This code filters the active ToxCast assay endpoints for chemicals using the ‘cytotoxicity-associated bursts’ approach


In [2]:
import numpy as np
import pandas as pd
import math

In [3]:
## Load the final mc5-6 file from ToxCast
## (https://www.epa.gov/comptox-tools/exploring-toxcast-data; https://clowder.edap-cluster.com/spaces/647f710ee4b08a6b394e426b)
## Use the latest data release from ToxCast.
toxcast = (
    pd.read_csv(
        'mc5-6_winning_model_fits-flags_invitrodb_v4_1_SEPT2023.csv',
        dtype=str,  # every column is read as text
    )
    # missing values become empty strings
    .replace(np.nan, '', regex=True)
)

print(toxcast.shape)
toxcast.head()


In [11]:
# Load the fits of all models (mc4 file) from ToxCast
# (https://www.epa.gov/comptox-tools/exploring-toxcast-data; https://clowder.edap-cluster.com/spaces/647f710ee4b08a6b394e426b)
# Use the latest data release from ToxCast.
# No dtype is forced here, so pandas infers the column types.
allfits = pd.read_csv(
    '../../../../ToxCast/V4.1/INVITRODB_V4_1_SUMMARY/mc4_all_model_fits_invitrodb_v4_1_SEPT2023.csv'
)

print(allfits.shape)
allfits.head()



In [4]:
# Show every column name of the all-model-fits table.
print(list(allfits.columns))

In [5]:
# Determine the sign of the winning model's fitted top for every toxcast row:
#   +1 -> '<model>_top' in allfits is positive
#   -1 -> '<model>_top' in allfits is negative
#    0 -> no winning model, or the top is zero / not comparable (e.g. NaN)
# NOTE(review): allfits is looked up with the toxcast row number, so this
# assumes both files list the same fits in the same row order -- TODO confirm
# (joining on a shared fit identifier would be safer if both files carry one).
sign_column = []
for i, model in enumerate(toxcast['modl']):  # iterate through the rows in toxcast df
    # Missing values in toxcast were replaced by '' when the file was loaded,
    # so an empty model name must be handled like NaN / 'none'; otherwise the
    # non-existent column '_top' would be looked up and raise KeyError.
    if pd.isna(model) or model in ('', 'none'):
        sign_column.append(0)
        continue

    # column value to check in the allfits dataframe
    top = allfits.at[i, model + '_top']
    if top > 0:
        sign_column.append(1)
    elif top < 0:
        sign_column.append(-1)
    else:
        sign_column.append(0)

print(toxcast.shape)
toxcast['sign_of_model'] = sign_column
print(toxcast.shape)

In [6]:
# Keep only the columns that the following cells work with.
toxcast_f = toxcast[
    [
        'casn', 'chnm', 'dsstox_substance_id', 'aeid', 'aenm', 'nconc',
        'modl', 'hitc', 'sign_of_model', 'ac50', 'conc_unit',
    ]
].copy()

print(toxcast_f.shape)
toxcast_f.head()

In [7]:
# Load the chemical list used to filter the toxcast data
# (tab-separated, all columns read as text, missing values -> '').
chemicals = (
    pd.read_csv('chemicals.tsv', sep='\t', dtype=str)
    .replace(np.nan, '', regex=True)
)

print(chemicals.shape)

chemicals.head()

In [8]:
## Keep the toxcast rows whose CAS number appears in the chemical list
## (the empty string is excluded so blank CAS entries never match).
toxcast_common = toxcast_f.loc[
    toxcast_f['casn'].isin(set(chemicals['CAS']) - {''})
].copy()

print(toxcast_common.shape)

toxcast_common.head()

In [74]:
# hitc was read as text and missing values were replaced by '' when the
# toxcast file was loaded; map '' back to NaN first so the float conversion
# does not raise ValueError on blanks (other malformed values still raise).
toxcast_common['hitc'] = toxcast_common['hitc'].replace('', np.nan).astype(float)

In [9]:
# Number of distinct CAS numbers left after filtering by the chemical list.
toxcast_common['casn'].nunique(dropna=False)

In [10]:
# Keep the active chemical-endpoint pairs: hitc (hit call) >= 0.9.
# The cutoff is based on the readme file given with the ToxCast v4.1 data.
toxcast_common_active = toxcast_common.loc[toxcast_common['hitc'] >= 0.9].copy()

print(toxcast_common_active.shape)
toxcast_common_active.head()

In [12]:
## Load the cytotoxicity values for chemicals (all columns read as text).
## Use the latest data release from ToxCast.
cytotox = pd.read_excel(
    'cytotox_invitrodb_v4_1_SEPT2023.xlsx',
    dtype=str,
)
print(cytotox.shape)

# Lookup table: CAS number -> cytotox_median_log (still as text).
chemical_cytotox = dict(
    zip(cytotox['casn'], cytotox['cytotox_median_log'])
)

print(len(chemical_cytotox))


cytotox.head()

In [13]:
# List the distinct global_mad values present in the cytotoxicity table.
pd.unique(cytotox['global_mad'])

In [14]:
## Computing Z scores
## The formula follows https://academic.oup.com/toxsci/article/152/2/323/2578946?login=true

def get_z_score(row):
    '''
    Return the z score of one row, rounded to two decimals.

    z = [-log10(AC50) - (-cytotox_median_log)] / global_mad

    The row must provide 'casn' and 'ac50'. The cytotoxicity median is looked
    up in the module-level dict chemical_cytotox and global_mad is read from
    row label 1 of the module-level cytotox table. Rows whose CAS number has
    no cytotoxicity value get a z score of 0.
    '''
    cas_number = row['casn']
    if cas_number not in chemical_cytotox:  # no cytotoxicity value available
        return 0

    neg_log_ac50 = -math.log10(float(row['ac50']))
    median_log = float(chemical_cytotox[cas_number])
    global_mad = float(cytotox.at[1, 'global_mad'])
    return round((neg_log_ac50 + median_log) / global_mad, 2)

In [15]:
# Compute the z score row by row and store it in a new 'zscore' column.
toxcast_common_active['zscore'] = toxcast_common_active.apply(get_z_score, axis=1)

print(toxcast_common_active.shape)
toxcast_common_active.head()

In [16]:
# Keep the rows whose absolute z score is >= 3.
# The cutoff follows https://academic.oup.com/toxsci/article/152/2/323/2578946?login=true
# NOTE(review): abs() also keeps rows with zscore <= -3, i.e. rows whose
# -log10(AC50) is below -cytotox_median_log by the formula in get_z_score --
# confirm that keeping both tails is intended.
toxcast_common_active_zscore = toxcast_common_active.loc[
    toxcast_common_active['zscore'].abs() >= 3
].copy()


print(toxcast_common_active_zscore.shape)
toxcast_common_active_zscore.head()

In [17]:
## Load the assay endpoint annotations
## (all columns read as text, missing values -> ''). Use the latest data release from ToxCast.
endpoint_annotations = (
    pd.read_excel('assay_annotations_invitrodb_v4_1_SEPT2023.xlsx', dtype=str)
    .replace(np.nan, '', regex=True)
)

print(endpoint_annotations.shape)

endpoint_annotations.head()


In [18]:
# Keep only the annotation columns that are used further on.
endpoint_annotations_sliced = endpoint_annotations[
    ['aeid', 'assay_component_endpoint_desc', 'intended_target_type', 'signal_direction']
].copy()
print(endpoint_annotations_sliced.shape)

endpoint_annotations_sliced.head()



In [19]:
## Keep the annotations of the endpoints (aeid) that remain after the z score filter.
endpoint_annotations_common_aeid = endpoint_annotations_sliced.loc[
    endpoint_annotations_sliced['aeid'].isin(set(toxcast_common_active_zscore['aeid']))
].copy()

print(endpoint_annotations_common_aeid.shape)

endpoint_annotations_common_aeid.head()



In [20]:
# Load the assay endpoint -> gene mappings (all columns read as text).
# Use the latest data release from ToxCast.
endpoint_gene = pd.read_excel(
    'assay_gene_mappings_invitrodb_v4_1_SEPT2023.xlsx',
    dtype=str,
)

print(endpoint_gene.shape)

# Missing values -> '', then keep only the gene identifier columns.
endpoint_gene = endpoint_gene.replace(np.nan, '', regex=True)[
    ['aeid', 'entrez_gene_id', 'official_full_name', 'official_symbol', 'uniprot_accession_number']
].copy()

print(endpoint_gene.shape)


endpoint_gene.head()

In [21]:
# Group the endpoint gene table by aeid: one row per aeid, with the distinct
# non-empty values of every other column joined by '|'.
# dict.fromkeys() removes duplicates while keeping first-seen order. Joining
# through a set would give an arbitrary order that can differ between columns
# and between runs, so the joined ids, names and symbols would not line up.
endpoint_gene_grouped = (
    endpoint_gene.groupby(['aeid'])
    .agg(lambda x: '|'.join(value for value in dict.fromkeys(x) if value != ''))
    .reset_index()
)

print(endpoint_gene_grouped.shape)

endpoint_gene_grouped.head()


In [22]:
## Merge the assay annotations with the grouped gene mapping.
## A left merge keeps the annotations of endpoints that have no gene mapping;
## an inner merge would drop them, and such endpoints would later appear
## without description, target type and signal direction.
endpoint_details = endpoint_annotations_sliced.merge(endpoint_gene_grouped, on='aeid', how='left')

# Endpoints without a gene mapping get '' in the gene columns, matching the
# empty-string convention used for missing values in the annotation table.
endpoint_details = endpoint_details.fillna('')

print(endpoint_details.shape)

endpoint_details.head()

In [23]:
### Attach the assay details to the filtered toxcast rows.
### Left merge: every filtered toxcast row is kept.
toxcast_common_endpoints = pd.merge(
    toxcast_common_active_zscore,
    endpoint_details,
    on='aeid',
    how='left',
)

print(toxcast_common_endpoints.shape)

toxcast_common_endpoints.head()



In [24]:
# Remove newline characters from the endpoint descriptions.
toxcast_common_endpoints['assay_component_endpoint_desc'] = (
    toxcast_common_endpoints['assay_component_endpoint_desc'].str.replace('\n', '')
)


print(toxcast_common_endpoints.shape)

toxcast_common_endpoints.head()



In [117]:
## Getting the response direction for bidirectional assay endpoints

def get_response(row):
    '''
    Return the response direction of one row.

    The row must provide 'signal_direction' and 'sign_of_model'.
    For a 'bidirectional' endpoint the result is 'gain' when sign_of_model
    is 1 and 'loss' when it is -1. Any other sign (e.g. 0, meaning no
    direction could be determined) gives '' instead of being reported as
    'loss'. Endpoints that are not bidirectional always give ''.
    '''
    signal_direction = row['signal_direction']
    sign = row['sign_of_model']

    if signal_direction != 'bidirectional':
        return ''
    if sign == 1:
        return 'gain'
    if sign == -1:
        return 'loss'
    return ''

In [25]:
# Derive the response direction row by row and store it in a new 'response' column.
toxcast_common_endpoints['response'] = toxcast_common_endpoints.apply(get_response, axis=1)

print(toxcast_common_endpoints.shape)

toxcast_common_endpoints.head()

In [26]:
# Select and order the columns of the final table.
toxcast_common_response = toxcast_common_endpoints[
    [
        'casn', 'chnm', 'dsstox_substance_id', 'aeid', 'aenm', 'nconc', 'modl',
        'hitc', 'ac50', 'conc_unit', 'zscore', 'response', 'assay_component_endpoint_desc',
        'intended_target_type', 'entrez_gene_id', 'official_full_name', 'official_symbol',
        'uniprot_accession_number',
    ]
].copy()
print(toxcast_common_response.shape)

toxcast_common_response.head()


In [121]:
# Write the final table as a tab-separated file without the row index
# (index=False is the documented way to omit it).
toxcast_common_response.to_csv('toxcast_response.tsv', sep='\t', index=False, encoding='UTF-8')  # output file