### Author - Ajaya Kumar Sahoo

#### This notebook builds the list of chemical-gene-phenotype-disease tetramers from the following data:
#### Chemical-gene, chemical-phenotype, chemical-disease and gene-disease associations from CTD - https://ctdbase.org/downloads/
#### Gene-phenotype (GO term) associations from the NCBI Gene resource - https://ftp.ncbi.nih.gov/gene/DATA/


In [2]:
import pandas as pd
import numpy as  np

In [3]:
# Load the table of chemicals of interest. Everything is read as text and
# missing cells are turned into empty strings.
chemicals = pd.read_csv('chemicals.tsv', sep='\t', dtype=str).replace(np.nan, '', regex=True)
print(chemicals.shape)

# The 'iden' column (CAS identifiers) is the only one needed by the
# membership filters in the following cells.
chemical_list = chemicals['iden'].tolist()

chemicals.head()

## Chemical - gene interaction from CTD

In [4]:
# Chemical-gene interactions: CTD_chem_gene_ixns.tsv, downloaded from CTD.
# The file is read without a header row and with '#' as the comment marker,
# so the column names are assigned by hand afterwards.
# NOTE(review): with comment='#', pandas also discards the rest of a data line
# after any '#' inside a field - confirm that no CTD field contains '#'.
chem_gene_columns = [
    'ChemicalName', 'ChemicalID', 'CasRN', 'GeneSymbol', 'GeneID', 'GeneForms',
    'Organism', 'OrganismID', 'Interaction', 'InteractionActions', 'PubMedIDs',
]
chem_gene = pd.read_csv('CTD_chem_gene_ixns.tsv', sep='\t', comment='#', dtype=str, header=None)
print(chem_gene.shape)
chem_gene.columns = chem_gene_columns
print(chem_gene.shape)

# Keep only the interactions whose CAS number appears in the chemical list.
chem_gene_in_list = chem_gene['CasRN'].isin(chemical_list)
chem_gene_common = pd.DataFrame(chem_gene[chem_gene_in_list])
print(chem_gene_common.shape)

chem_gene_common.head()


## chemical phenotype interaction from CTD 

In [5]:
# Chemical-phenotype interactions: CTD_pheno_term_ixns.tsv, downloaded from CTD.
# Read without a header row ('#' lines are treated as comments); column names
# are assigned by hand afterwards.
# NOTE(review): with comment='#', pandas also discards the rest of a data line
# after any '#' inside a field - confirm that no CTD field contains '#'.
# NOTE(review): the last column is named 'some' - presumably a placeholder for
# an extra/unnamed trailing field; confirm against the file's own header.
chem_pheno_columns = [
    'chemicalname', 'chemicalid', 'casrn', 'phenotypename', 'phenotypeid',
    'comentionedterms', 'organism', 'organismid', 'interaction',
    'interactionactions', 'anatomyterms', 'inferencegenesymbols', 'pubmedids',
    'some',
]
chem_pheno = pd.read_csv('CTD_pheno_term_ixns.tsv', sep='\t', comment='#', dtype=str, header=None)
print(chem_pheno.shape)
chem_pheno.columns = chem_pheno_columns
print(chem_pheno.shape)

# Keep only the interactions whose CAS number appears in the chemical list.
chem_pheno_in_list = chem_pheno['casrn'].isin(chemical_list)
chem_pheno_common = pd.DataFrame(chem_pheno[chem_pheno_in_list])
print(chem_pheno_common.shape)

chem_pheno_common.head()



## Chemical disease from CTD

In [6]:
# Chemical-disease associations: CTD_chemicals_diseases.tsv, downloaded from CTD.
# Read without a header row ('#' lines are treated as comments); column names
# are assigned by hand afterwards.
# NOTE(review): with comment='#', pandas also discards the rest of a data line
# after any '#' inside a field - confirm that no CTD field contains '#'.
chem_dis_columns = [
    'ChemicalName', 'ChemicalID', 'CasRN', 'DiseaseName', 'DiseaseID',
    'DirectEvidence', 'InferenceGeneSymbol', 'InferenceScore', 'OmimIDs',
    'PubMedIDs',
]
chem_dis = pd.read_csv('CTD_chemicals_diseases.tsv', sep='\t', comment='#', dtype=str, header=None)
print(chem_dis.shape)
chem_dis.columns = chem_dis_columns
print(chem_dis.shape)

# Keep only the associations whose CAS number appears in the chemical list.
chem_dis_in_list = chem_dis['CasRN'].isin(chemical_list)
chem_dis_common = pd.DataFrame(chem_dis[chem_dis_in_list])
print(chem_dis_common.shape)

chem_dis_common.head()


In [7]:
# Notebook inspection only: the distinct InferenceScore values found on rows
# whose DirectEvidence is exactly 'marker/mechanism'. The result is displayed,
# not stored.
set(chem_dis[chem_dis['DirectEvidence'] == 'marker/mechanism']['InferenceScore'])

In [9]:
# Notebook inspection only: list the distinct DirectEvidence labels present in
# the full chemical-disease table (displayed, not stored).
chem_dis['DirectEvidence'].unique()

## Gene disease from CTD

In [25]:
# Gene-disease associations: CTD_genes_diseases.tsv, downloaded from CTD.
# Read without a header row ('#' lines are treated as comments); column names
# are assigned by hand afterwards.
# NOTE(review): with comment='#', pandas also discards the rest of a data line
# after any '#' inside a field - confirm that no CTD field contains '#'.
gene_dis_columns = [
    'GeneSymbol', 'GeneID', 'DiseaseName', 'DiseaseID', 'DirectEvidence',
    'InferenceChemicalName', 'InferenceScore', 'OmimIDs', 'PubMedIDs',
]
gene_dis = pd.read_csv('CTD_genes_diseases.tsv', sep='\t', comment='#', dtype=str, header=None)
print(gene_dis.shape)
gene_dis.columns = gene_dis_columns
print(gene_dis.shape)

# Keep only the associations for genes that occur in the filtered
# chemical-gene interactions.
gene_ids_for_disease = list(chem_gene_common['GeneID'].unique())
gene_dis_common = pd.DataFrame(gene_dis[gene_dis['GeneID'].isin(gene_ids_for_disease)])
print(gene_dis_common.shape)

gene_dis_common.head()


## Gene-GO term (phenotype) associations (from NCBI Gene)

In [10]:
# Gene-GO annotations: the gene2go file, downloaded from NCBI Gene. It is read
# with its own header row, all values as text.
gene_GO = pd.read_csv('gene2go', sep='\t', dtype=str)
print(gene_GO.shape)

# Keep only the annotations for genes that occur in the filtered
# chemical-gene interactions.
gene_ids_for_go = list(chem_gene_common['GeneID'].unique())
gene_GO_common = pd.DataFrame(gene_GO[gene_GO['GeneID'].isin(gene_ids_for_go)])
print(gene_GO_common.shape)

gene_GO_common.head()

In [11]:
# Experimental evidence codes for GO terms, as listed in
# https://geneontology.org/docs/guide-go-evidence-codes/
# Only gene-GO annotations backed by one of these codes are kept.
GO_evidence = [
    'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP',
    'HTP', 'HDA', 'HMP', 'HGI', 'HEP',
]

has_experimental_evidence = gene_GO_common['Evidence'].isin(GO_evidence)
gene_GO_common_exp_evidence = pd.DataFrame(
    gene_GO_common[has_experimental_evidence].reset_index(drop=True)
)
print(gene_GO_common_exp_evidence.shape)

gene_GO_common_exp_evidence.head()


In [12]:
# Notebook inspection only: number of distinct genes that still have at least
# one GO annotation after the experimental-evidence filter.
len(list(gene_GO_common_exp_evidence['GeneID'].unique()))

In [13]:
# Notebook inspection only: number of distinct genes in the filtered
# chemical-gene interactions.
len(list(chem_gene_common['GeneID'].unique()))

In [14]:
# Chemical-disease associations: keep only the rows whose DirectEvidence is
# exactly 'marker/mechanism', then blank out any missing values.
print(chem_dis_common.shape)
print(chem_dis_common['DirectEvidence'].unique())

chem_dis_is_direct = chem_dis_common['DirectEvidence'] == 'marker/mechanism'
chem_dis_common_direct = pd.DataFrame(chem_dis_common[chem_dis_is_direct].reset_index(drop=True))
chem_dis_common_direct = chem_dis_common_direct.replace(np.nan, '', regex=True)
print(chem_dis_common_direct.shape)

chem_dis_common_direct.head()

In [15]:
# Notebook inspection only: number of distinct diseases with direct
# ('marker/mechanism') chemical evidence.
len(chem_dis_common_direct['DiseaseID'].unique())

In [16]:
# Gene-disease associations: keep only the rows with direct evidence, then
# blank out any missing values.
# NOTE(review): unlike the chemical-disease filter, this one also accepts the
# combined label 'marker/mechanism|therapeutic' - confirm the asymmetry is
# intended.
print(gene_dis_common.shape)
print(gene_dis_common['DirectEvidence'].unique())

gene_direct_labels = ['marker/mechanism', 'marker/mechanism|therapeutic']
gene_dis_is_direct = gene_dis_common['DirectEvidence'].isin(gene_direct_labels)
gene_dis_common_direct = pd.DataFrame(gene_dis_common[gene_dis_is_direct])
gene_dis_common_direct = gene_dis_common_direct.replace(np.nan, '', regex=True)
print(gene_dis_common_direct.shape)

gene_dis_common_direct.head()

In [17]:
# Notebook inspection only: list the distinct DirectEvidence labels present in
# the filtered gene-disease table (displayed, not stored).
gene_dis_common['DirectEvidence'].unique()

In [18]:
#gene_dis_common_direct.head()

In [19]:
# Chemical-gene (CG) pairs: every distinct (CAS number, GeneID) combination
# found in the filtered chemical-gene interactions.
unique_chem_gene_pairs = set(zip(chem_gene_common['CasRN'], chem_gene_common['GeneID']))
chem_gene_tuple = tuple(unique_chem_gene_pairs)
print(len(chem_gene_tuple))


In [20]:
# Chemical-gene-phenotype map.
# For every (chemical, gene) pair, collect the phenotype ids that are linked
# both to the chemical (CTD chemical-phenotype data) and to the gene
# (experimentally supported gene-GO annotations).

# Index the phenotype ids once per chemical and once per gene, so that each
# pair needs only two dictionary lookups instead of two full DataFrame scans.
phenotypes_by_chemical = {}
for cas, phenotype_id in zip(chem_pheno_common['casrn'], chem_pheno_common['phenotypeid']):
    phenotypes_by_chemical.setdefault(cas, set()).add(phenotype_id)

go_terms_by_gene = {}
for gene_id, go_id in zip(gene_GO_common_exp_evidence['GeneID'], gene_GO_common_exp_evidence['GO_ID']):
    go_terms_by_gene.setdefault(gene_id, set()).add(go_id)

# Shared empty fallback for chemicals/genes that have no annotations at all.
no_phenotype_ids = frozenset()

chem_gene_phenotype = {}
for ele in chem_gene_tuple:
    chemical_phenotypes = phenotypes_by_chemical.get(ele[0], no_phenotype_ids)
    gene_phenotypes = go_terms_by_gene.get(ele[1], no_phenotype_ids)
    chem_gene_phenotype[ele] = list(chemical_phenotypes & gene_phenotypes)
print(len(chem_gene_phenotype))



In [21]:
# Chemical-gene-disease map.
# For every (chemical, gene) pair, collect the disease ids that are linked
# both to the chemical and to the gene through direct-evidence associations.

# Index the disease ids once per chemical and once per gene, so that each
# pair needs only two dictionary lookups instead of two full DataFrame scans.
diseases_by_chemical = {}
for cas, disease_id in zip(chem_dis_common_direct['CasRN'], chem_dis_common_direct['DiseaseID']):
    diseases_by_chemical.setdefault(cas, set()).add(disease_id)

diseases_by_gene = {}
for gene_id, disease_id in zip(gene_dis_common_direct['GeneID'], gene_dis_common_direct['DiseaseID']):
    diseases_by_gene.setdefault(gene_id, set()).add(disease_id)

# Shared empty fallback for chemicals/genes that have no direct associations.
no_disease_ids = frozenset()

chem_gene_disease = {}
for ele in chem_gene_tuple:
    chemical_diseases = diseases_by_chemical.get(ele[0], no_disease_ids)
    gene_diseases = diseases_by_gene.get(ele[1], no_disease_ids)
    chem_gene_disease[ele] = list(chemical_diseases & gene_diseases)
print(len(chem_gene_disease))


In [22]:
def Get_combinations(lis):
    """Expand one (chemical, gene) entry into all of its tetramers.

    ``lis`` is indexed as ``[(chemical, gene), phenotypes, diseases]``.
    Returns a list of ``[chemical, gene, phenotype, disease]`` lists, one for
    every phenotype/disease pairing; phenotypes vary slowest. The result is
    empty when either the phenotypes or the diseases are empty.
    """
    pair = lis[0]
    chemical, gene = pair[0], pair[1]
    phenotypes, diseases = lis[1], lis[2]
    return [
        [chemical, gene, phenotype, disease]
        for phenotype in phenotypes
        for disease in diseases
    ]

In [23]:
# Expand every (chemical, gene) pair that has at least one shared phenotype
# and at least one shared disease into its tetramers (one group per pair).
tetramers = [
    Get_combinations([pair, chem_gene_phenotype[pair], chem_gene_disease[pair]])
    for pair in chem_gene_disease
    if chem_gene_phenotype[pair] and chem_gene_disease[pair]
]

# Flatten the per-pair groups into a single list of
# [chemical, gene, phenotype, disease] entries.
tetramers_list = [tetramer for group in tetramers for tetramer in group]

# One list per tetramer position, used for the summary counts below.
# NOTE: this rebinds chemical_list, which earlier in the notebook held the
# CAS identifiers read from the chemical table.
chemical_list = [tetramer[0] for tetramer in tetramers_list]
gene_list = [tetramer[1] for tetramer in tetramers_list]
phenotype_list = [tetramer[2] for tetramer in tetramers_list]
disease_list = [tetramer[3] for tetramer in tetramers_list]

print('Number of total tetramers:', len(tetramers_list))
print('Number of total chemicals:', len(set(chemical_list)))
print('Number of total genes:', len(set(gene_list)))
print('Number of total phenotypes:', len(set(phenotype_list)))
print('Number of total diseases:', len(set(disease_list)))



In [24]:
# Write the tetramers to a tab-separated file: a header row followed by one
# chemical / gene / phenotype / disease row per tetramer.
import os

output_path = 'Tetramers/Tetramer_list.tsv'  # output file for tetramer list

# Create the output folder if it is missing, so the write does not fail with
# FileNotFoundError after all the preceding computation.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# The context manager closes the file even if one of the writes raises.
with open(output_path, 'w') as f1:
    f1.write('Chemical' + '\t' + 'Gene' + '\t' + 'Phenotype' + '\t' + 'Disease' + '\n')
    f1.writelines('\t'.join(ele[:4]) + '\n' for ele in tetramers_list)