Goal: check whether all of the drug combinations with known toxicity that we're using also have drug-target and pathway information (i.e., fall in the intersection of the toxicity, target, and pathway datasets).

In [27]:
# Import everything needed
from scipy import stats
from toxicity_ranking import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scikit_posthocs as sp

In [28]:
# Load the DDInter and DrugComb tables through the project helpers and intersect them.
# NOTE(review): the helpers come from `toxicity_ranking`; their exact filtering rules are not
# visible in this notebook -- see the printed shapes/counts in the cell output.
ddinter_df = get_ddinter_data()
drugcomb_df = get_drug_comb_data(bliss=True, loewe=True, hsa=True, zip=True)
(
    drug_syntox_df,
    major_pairs,
    moderate_pairs,
    minor_pairs,
    unknown_pairs,
) = find_drugcomb_ddinter_intersect(drugcomb_df, ddinter_df)

# Keep only the pairs whose toxicity category does not mention 'Unknown'
has_unknown_toxicity = drug_syntox_df['toxicity_category'].str.contains('Unknown')
drug_syntox_df = drug_syntox_df[~has_unknown_toxicity]

  drugcomb_df = pd.read_csv('data/DrugComb/drugcomb_summary_v_1_5.csv', sep=',', index_col=False)


Original shape of drugcomb data:  (1432351, 26)
Final shape of filtered drugcomb data:  (123882, 26)
Number of drugs in common between drugcomb and ddinter [lowercase enforced]:  486
Major pairs in both DrugComb and in DDInter:  335
Moderate pairs in both DrugComb and in DDInter:  1027
Minor pairs in both DrugComb and in DDInter:  59
Unknown toxicity pairs in both DrugComb and in DDInter:  573
Total common pairs:  1994
Total known pairs:  1421


In [29]:
# Locations of the processed DrugBank drug-target table and the two Reactome pathway tables
drug_target_fp = 'data_processed/drugbank_drug_targets.csv'
reactome_lowest_pw_fp = 'data_processed/reactome_lowest_pathways_homo_sapiens.csv'
reactome_all_pw_fp = 'data_processed/reactome_all_pathways_homo_sapiens.csv'

# Read the three tables in one pass, in the same order as the paths above
drug_target_df, lowest_pw_df, all_pw_df = [
    pd.read_csv(csv_fp)
    for csv_fp in (drug_target_fp, reactome_lowest_pw_fp, reactome_all_pw_fp)
]

# Lower-case the drug names; presumably this matches the lower-cased names used in the
# DrugComb/DDInter intersection ("lowercase enforced" in the earlier output) -- confirm.
drug_target_df = drug_target_df.assign(drug_name=drug_target_df['drug_name'].str.lower())

drug_target_df

Unnamed: 0,drug_name,target_name,target_DrugBank_ID,GenBank_Protein_ID,GenBank_Gene_ID,UniProtKB_ID,GenAtlas_ID,HGNC_ID
0,lepirudin,Prothrombin,BE0000048,339641.0,M17262,P00734,F2,HGNC:3535
1,cetuximab,Epidermal growth factor receptor,BE0000767,757924.0,X00588,P00533,EGFR,HGNC:3236
2,cetuximab,Low affinity immunoglobulin gamma Fc region re...,BE0000901,31322.0,X16863,O75015,FCGR3B,HGNC:3620
3,cetuximab,Complement C1q subcomponent subunit A,BE0002094,4894854.0,AF135157,P02745,C1QA,HGNC:1241
4,cetuximab,Complement C1q subcomponent subunit B,BE0002095,573114.0,X03084,P02746,C1QB,HGNC:1242
...,...,...,...,...,...,...,...,...
19430,lotilaner,"Gaba-gated chloride channel, putative",BE0010256,,,E0W492,,
19431,(r)-9b,Activated CDC42 kinase 1,BE0000772,8850245.0,L13738,Q07912,TNK2,HGNC:19297
19432,lovotibeglogene autotemcel,,BE0010962,,,,,
19433,efbemalenograstim alfa,Granulocyte colony-stimulating factor receptor,BE0000793,31697.0,X55721,Q99062,CSF3R,HGNC:2439


Check:
- Whether all drugs in the DrugComb–DDInter intersection with known toxicity have drug target information

In [30]:
# Every distinct drug that appears in either column of the known-toxicity pair table
all_syntox_drugs = set(drug_syntox_df['drug_row'].values).union(set(drug_syntox_df['drug_col'].values))
print("Num of drugs in the syntox intersection with KNOWN toxicity: " + str(len(all_syntox_drugs)))

# Do all these drugs in syntox have drug target information?
# A single set difference replaces filtering the whole target table once per drug.
# Missing names are dropped from the target side so they can never count as a match
# (the previous `==` comparison never matched NaN either).
drugs_with_target_rows = set(drug_target_df['drug_name'].dropna())
# Sort (by string form, so a stray non-string value cannot break the sort) to make the
# printed list reproducible between kernel restarts instead of following set order.
drugs_with_no_drug_target_info = sorted(all_syntox_drugs - drugs_with_target_rows, key=str)

print("Num of drugs in syntox intersection WITHOUT target info: " + str(len(drugs_with_no_drug_target_info)))
print(drugs_with_no_drug_target_info)

Num of drugs in the syntox intersection with KNOWN toxicity: 280
Num of drugs in syntox intersection WITHOUT target info: 9
['trimethobenzamide', 'phensuximide', 'metaxalone', 'lumefantrine', 'artemether', 'ixazomib', 'chlorphenesin', 'ivermectin', 'bendamustine']


Some of the missing drug targets were found manually, either through the DrugBank database or in the literature. No targets could be found for:
- Phensuximide
- Chlorphenesin

In [None]:
# Manually curated drug-target rows for drugs that had no entry in the DrugBank target table.
# Each dict uses the same column names as `drug_target_df` (minus the saved index column).
drug_target_data = [
    {
        'drug_name': 'trimethobenzamide', 
        'target_name': 'D(2) dopamine receptor',
        # NOTE(review): every other target ID in this list/table is of the form 'BE…';
        # 'DB…' looks like a DrugBank *drug* accession rather than a target ID -- verify.
        'target_DrugBank_ID': 'DB00662',
        'GenBank_Protein_ID': '181432',
        'GenBank_Gene_ID': 'M30625',
        'UniProtKB_ID': 'P14416',
        'GenAtlas_ID': 'DRD2',
        'HGNC_ID': 'HGNC:3023',
    },
    {
        'drug_name': 'ixazomib', 
        'target_name': 'Proteasome subunit beta type-5',
        'target_DrugBank_ID': 'BE0002348',
        'GenBank_Protein_ID': '558526',
        'GenBank_Gene_ID': 'D29011',
        'UniProtKB_ID': 'P28074',
        'GenAtlas_ID': 'PSMB5',
        'HGNC_ID': 'HGNC:9542',
    },
    {
        'drug_name': 'metaxalone', 
        'target_name': 'Amine oxidase [flavin-containing] A',
        'target_DrugBank_ID': 'BE0002198',
        'GenBank_Protein_ID': '187353',
        'GenBank_Gene_ID': 'M68840',
        'UniProtKB_ID': 'P21397',
        'GenAtlas_ID': 'MAOA',
        'HGNC_ID': 'HGNC:6833',
    },
    {
        # Non-protein target: no database identifiers, so this drug has a target row
        # but no UniProt ID.
        'drug_name': 'bendamustine',
        'target_name': 'DNA',
        'target_DrugBank_ID': np.nan,
        'GenBank_Protein_ID': np.nan,
        'GenBank_Gene_ID': np.nan,
        'UniProtKB_ID': np.nan,
        'GenAtlas_ID': np.nan,
        'HGNC_ID': np.nan,
    },
    {
        'drug_name': 'ivermectin',
        'target_name': 'Glutamate-gated chloride channel subunit beta',
        'target_DrugBank_ID': 'BE0027480',
        'GenBank_Protein_ID': np.nan,
        'GenBank_Gene_ID': np.nan,
        'UniProtKB_ID': 'Q17328',
        'GenAtlas_ID': np.nan,
        'HGNC_ID': np.nan,
    },
    {
        'drug_name': 'ivermectin',
        'target_name': 'Glutamate-gated chloride channel alpha',
        'target_DrugBank_ID': 'BE0027481',
        'GenBank_Protein_ID': np.nan,
        'GenBank_Gene_ID': np.nan,
        'UniProtKB_ID': 'G5EBR3',
        'GenAtlas_ID': np.nan,
        'HGNC_ID': np.nan,
    },
    {
        'drug_name': 'artemether',
        'target_name': 'Sodium/potassium-transporting ATPase subunit alpha-1',
        'target_DrugBank_ID': 'BE0000732',
        'GenBank_Protein_ID': '219942',
        'GenBank_Gene_ID': 'D00099',
        'UniProtKB_ID': 'P05023',
        'GenAtlas_ID': 'ATP1A1',
        'HGNC_ID': 'HGNC:799',
    },
    {
        'drug_name': 'lumefantrine',
        'target_name': 'Sodium/potassium-transporting ATPase subunit alpha-1',
        'target_DrugBank_ID': 'BE0000732',
        'GenBank_Protein_ID': '219942',
        'GenBank_Gene_ID': 'D00099',
        'UniProtKB_ID': 'P05023',
        'GenAtlas_ID': 'ATP1A1',
        'HGNC_ID': 'HGNC:799',
    }
]

manually_added_drug_target_df = pd.DataFrame(drug_target_data)

# Append the manual rows to the target table, but only for drugs that are not in it yet.
# On the first run none of these drugs is present, so every manual row is added; on a
# re-run of this cell nothing is added, so the rows are never duplicated.
drugs_already_in_target_table = set(drug_target_df['drug_name'].dropna())
manual_rows_to_add = manually_added_drug_target_df[
    ~manually_added_drug_target_df['drug_name'].isin(drugs_already_in_target_table)
]
if not manual_rows_to_add.empty:
    drug_target_df = pd.concat([drug_target_df, manual_rows_to_add], ignore_index=True)

# Do all these drugs in syntox have drug target information?
# Recompute the coverage check against the current target table with one set difference
# instead of filtering the table once per drug. Missing names on the target side are dropped
# so they can never count as a match; the result is sorted (by string form) so the printed
# list is reproducible between kernel restarts.
drugs_with_no_drug_target_info = sorted(
    all_syntox_drugs - set(drug_target_df['drug_name'].dropna()),
    key=str,
)

print("Num of drugs in syntox intersection WITHOUT target info: " + str(len(drugs_with_no_drug_target_info)))
print(drugs_with_no_drug_target_info) # Should be only 2 drugs now

# Also want to know how many of the intersection drugs with targets have UniProt IDs.
# A drug "has UniProt info" if at least one of its target rows carries a non-null
# UniProtKB_ID; everything else (including drugs with no target rows at all) does not.
# Computed in one vectorised pass rather than filtering the target table per drug.
target_row_has_uniprot = drug_target_df['UniProtKB_ID'].notna()
drugs_with_uniprot_info = set(drug_target_df.loc[target_row_has_uniprot, 'drug_name'].dropna())
drugs_with_no_uniprot_info = all_syntox_drugs - drugs_with_uniprot_info

print("Num of drugs in syntox intersection WITHOUT UniProt info: " + str(len(drugs_with_no_uniprot_info)))
print(drugs_with_no_uniprot_info) # Should include the 2 drugs without target info

# Restrict the target table to the known-toxicity drugs that have at least one target row,
# then persist that subset for later use.
syntoxtarg_drugs = all_syntox_drugs.difference(drugs_with_no_drug_target_info)
in_syntoxtarg = drug_target_df['drug_name'].isin(syntoxtarg_drugs)
syntoxtarg_df = drug_target_df[in_syntoxtarg]
syntoxtarg_df.to_csv('data_processed/syntoxtarg.csv', index=False)

Num of drugs in syntox intersection WITHOUT target info: 2
['phensuximide', 'chlorphenesin']
Num of drugs in syntox intersection WITHOUT UniProt info: 13
{'methoxsalen', 'trabectedin', 'praziquantel', 'busulfan', 'phensuximide', 'bendamustine', 'niclosamide', 'temozolomide', 'thiotepa', 'chlorphenesin', 'chlorambucil', 'tinidazole', 'altretamine'}


Check:
- Whether all drugs in the DrugComb–DDInter intersection with known toxicity have pathway information

In [36]:
drugs_with_no_lowest_pathway_info = []
drugs_with_no_all_pathway_info = []

# Read in the processed Reactome pathway tables (lowest-level and all-level pathways)
lowest_pw_df = pd.read_csv('data_processed/reactome_lowest_pathways_homo_sapiens.csv')
all_pw_df = pd.read_csv('data_processed/reactome_all_pathways_homo_sapiens.csv')

syntox_drugs_with_uniprot_targets = all_syntox_drugs - drugs_with_no_uniprot_info
print(len(syntox_drugs_with_uniprot_targets))


def _join_targets_to_pathways(targets_df, pathways_df):
    """Pair every drug-target row with every pathway row sharing its UniProtKB_ID.

    Parameters
    ----------
    targets_df : DataFrame with columns 'drug_name', 'target_name', 'UniProtKB_ID'
        (rows with a missing UniProtKB_ID must already be removed).
    pathways_df : DataFrame with at least 'UniProtKB_ID', 'Reactome_ID', 'Pathway_Name'.

    Returns
    -------
    DataFrame with columns
    ['drug_name', 'target_name', 'UniProtKB_ID', 'Reactome_ID', 'Pathway_Name'],
    one row per (target row, matching pathway row); targets without any pathway are dropped.
    """
    joined = targets_df.merge(
        pathways_df[['UniProtKB_ID', 'Reactome_ID', 'Pathway_Name']],
        on='UniProtKB_ID',
        how='inner',
    )
    return joined[['drug_name', 'target_name', 'UniProtKB_ID', 'Reactome_ID', 'Pathway_Name']]


# Target rows of the drugs of interest. Rows without a UniProt ID are excluded up front:
# they can never match a pathway, and a merge would otherwise pair NaN keys with each other.
syntox_target_rows = drug_target_df.loc[
    drug_target_df['drug_name'].isin(syntox_drugs_with_uniprot_targets)
    & drug_target_df['UniProtKB_ID'].notna(),
    ['drug_name', 'target_name', 'UniProtKB_ID'],
]

# One merge per pathway table replaces growing the result frames row by row inside nested
# iterrows loops (which is quadratic and also shadowed the builtin `iter`).
syntoxtarg_lowestpw_df = _join_targets_to_pathways(syntox_target_rows, lowest_pw_df)
syntoxtarg_allpw_df = _join_targets_to_pathways(syntox_target_rows, all_pw_df)

syntoxtarg_lowestpw_df.to_csv('data_processed/syntoxtarg_lowestpw.csv', index=False)
syntoxtarg_allpw_df.to_csv('data_processed/syntoxtarg_allpw.csv', index=False)

# Now we have the syntox-target-pathway data for the intersection of drugs with known toxicity
# How many unique drugs are in this syntoxtarg intersection?
print("Lowest pw unique drugs in syntoxtarg: " + str(len(set(syntoxtarg_lowestpw_df['drug_name'].values))))
print("All pw unique drugs in syntoxtarg: " + str(len(set(syntoxtarg_allpw_df['drug_name'].values))))

267
Lowest pw unique drugs in syntoxtarg: 249
All pw unique drugs in syntoxtarg: 249


For the whole syntoxtarg data, for both the lowest-level and the all-level pathway intersections: what is the range (min and max per drug) of the number of targets and of the number of pathways?

In [39]:
# Per-drug counts of distinct targets and distinct pathways, computed once for each table
targets_per_drug_lowest = syntoxtarg_lowestpw_df.groupby('drug_name')['target_name'].nunique()
targets_per_drug_all = syntoxtarg_allpw_df.groupby('drug_name')['target_name'].nunique()
pathways_per_drug_lowest = syntoxtarg_lowestpw_df.groupby('drug_name')['Pathway_Name'].nunique()
pathways_per_drug_all = syntoxtarg_allpw_df.groupby('drug_name')['Pathway_Name'].nunique()

# Largest number of targets any single drug has
print("Lowest pw max targets per drug: " + str(targets_per_drug_lowest.max()))
print("All pw max targets per drug: " + str(targets_per_drug_all.max()))

# Largest number of pathways any single drug has
print("Lowest pw max pathways per drug: " + str(pathways_per_drug_lowest.max()))
print("All pw max pathways per drug: " + str(pathways_per_drug_all.max()))

# Top three drugs by number of targets
print("Drugs with most targets (lowest pw):")
print(targets_per_drug_lowest.nlargest(3))

print("\nDrugs with most targets (all pw):")
print(targets_per_drug_all.nlargest(3))

# Top three drugs by number of pathways
print("\nDrugs with most pathways (lowest pw):")
print(pathways_per_drug_lowest.nlargest(3))

print("\nDrugs with most pathways (all pw):")
print(pathways_per_drug_all.nlargest(3))

Lowest pw max targets per drug: 216
All pw max targets per drug: 216
Lowest pw max pathways per drug: 700
All pw max pathways per drug: 1055
Drugs with most targets (lowest pw):
drug_name
fostamatinib    216
aripiprazole     38
zonisamide       29
Name: target_name, dtype: int64

Drugs with most targets (all pw):
drug_name
fostamatinib    216
aripiprazole     38
zonisamide       29
Name: target_name, dtype: int64

Drugs with most pathways (lowest pw):
drug_name
fostamatinib    700
dasatinib       188
ponatinib       181
Name: Pathway_Name, dtype: int64

Drugs with most pathways (all pw):
drug_name
fostamatinib    1055
dasatinib        349
ponatinib        297
Name: Pathway_Name, dtype: int64
