# DDINTER VERSION #
Filter databases to all have DDInter toxicity, DrugComb combinations, DrugBank targets, Reactome pathways, and STRING IDs

In [1]:
# Import everything needed
from scipy import stats
from preprocessing_functions import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scikit_posthocs as sp

In [2]:
# Assume that known DDInter-DrugComb intersection has been created by preprocessing_function.py
drug_syntox_df = pd.read_csv("data_processed/ddinter_syntox_known.csv")

# Assume that drug target, reactome, and STRING information has been preprocessed by preprocessing_function.py
drug_target_df = pd.read_csv("data_processed/drugbank_drug_targets.csv")
lowest_pw_df = pd.read_csv('data_processed/reactome_lowest_pathways_homo_sapiens.csv')
all_pw_df = pd.read_csv('data_processed/reactome_all_pathways_homo_sapiens.csv')
STRING_G = get_STRING_graph()

drug_target_df.head()


Original shape of STRING edge list, physical detailed: (1477610, 6)


Unnamed: 0,drug_name,target_name,target_DrugBank_ID,GenBank_Protein_ID,GenBank_Gene_ID,UniProtKB_ID,GenAtlas_ID,HGNC_ID,SMILES
0,lepirudin,Prothrombin,BE0000048,339641.0,M17262,P00734,F2,HGNC:3535,
1,cetuximab,Epidermal growth factor receptor,BE0000767,757924.0,X00588,P00533,EGFR,HGNC:3236,
2,cetuximab,Low affinity immunoglobulin gamma Fc region re...,BE0000901,31322.0,X16863,O75015,FCGR3B,HGNC:3620,
3,cetuximab,Complement C1q subcomponent subunit A,BE0002094,4894854.0,AF135157,P02745,C1QA,HGNC:1241,
4,cetuximab,Complement C1q subcomponent subunit B,BE0002095,573114.0,X03084,P02746,C1QB,HGNC:1242,


Intersection between DDInter, DrugComb, Drug Targets

In [3]:
known_syntox_drugs = set(drug_syntox_df['drug_row'].values).union(set(drug_syntox_df['drug_col'].values))
print("How many drugs are originally in the knownDDInter+DrugComb intersection? " + str(len(known_syntox_drugs)))

drugs_with_no_drug_target_info = []

# Do all these drugs in syntox have drug target information
for drug in known_syntox_drugs:
    drug_target_subset = drug_target_df[drug_target_df['drug_name'] == drug]
    if drug_target_subset.shape[0] == 0:
        drugs_with_no_drug_target_info.append(drug)

print("Missing drugs in knownDDInter+DrugComb+DrugBankTargets intersection: " + str(len(drugs_with_no_drug_target_info)))
print(drugs_with_no_drug_target_info)

How many drugs are originally in the knownDDInter+DrugComb intersection? 385
Missing drugs in knownDDInter+DrugComb+DrugBankTargets intersection: 9
['lumefantrine', 'phensuximide', 'artemether', 'metaxalone', 'trimethobenzamide', 'ivermectin', 'chlorphenesin', 'bendamustine', 'ixazomib']


FOUND SOME DRUG TARGETS MANUALLY EITHER THROUGH DRUGBANK DATABASE OR LITERATURE
- Phensuximide (no targets)
- Chlorphenesin (no targets)

In [4]:
# Create a list of dictionaries containing the manually added drug-target data
drug_target_data = [
    {
        'drug_name': 'trimethobenzamide', 
        'target_name': 'D(2) dopamine receptor',
        'target_DrugBank_ID': 'DB00662',
        'GenBank_Protein_ID': '181432',
        'GenBank_Gene_ID': 'M30625',
        'UniProtKB_ID': 'P14416',
        'GenAtlas_ID': 'DRD2',
        'HGNC_ID': 'HGNC:3023',
    },
    {
        'drug_name': 'ixazomib', 
        'target_name': 'Proteasome subunit beta type-5',
        'target_DrugBank_ID': 'BE0002348',
        'GenBank_Protein_ID': '558526',
        'GenBank_Gene_ID': 'D29011',
        'UniProtKB_ID': 'P28074',
        'GenAtlas_ID': 'PSMB5',
        'HGNC_ID': 'HGNC:9542',
    },
    {
        'drug_name': 'metaxalone', 
        'target_name': 'Amine oxidase [flavin-containing] A',
        'target_DrugBank_ID': 'BE0002198',
        'GenBank_Protein_ID': '187353',
        'GenBank_Gene_ID': 'M68840',
        'UniProtKB_ID': 'P21397',
        'GenAtlas_ID': 'MAOA',
        'HGNC_ID': 'HGNC:6833',
    },
    {
        'drug_name': 'bendamustine',
        'target_name': 'DNA',
        'target_DrugBank_ID': np.nan,
        'GenBank_Protein_ID': np.nan,
        'GenBank_Gene_ID': np.nan,
        'UniProtKB_ID': np.nan,
        'GenAtlas_ID': np.nan,
        'HGNC_ID': np.nan,
    },
    {
        'drug_name': 'ivermectin',
        'target_name': 'Glutamate-gated chloride channel subunit beta',
        'target_DrugBank_ID': 'BE0027480',
        'GenBank_Protein_ID': np.nan,
        'GenBank_Gene_ID': np.nan,
        'UniProtKB_ID': 'Q17328',
        'GenAtlas_ID': np.nan,
        'HGNC_ID': np.nan,
    },
    {
        'drug_name': 'ivermectin',
        'target_name': 'Glutamate-gated chloride channel alpha',
        'target_DrugBank_ID': 'BE0027481',
        'GenBank_Protein_ID': np.nan,
        'GenBank_Gene_ID': np.nan,
        'UniProtKB_ID': 'G5EBR3',
        'GenAtlas_ID': np.nan,
        'HGNC_ID': np.nan,
    },
    {
        'drug_name': 'artemether',
        'target_name': 'Sodium/potassium-transporting ATPase subunit alpha-1',
        'target_DrugBank_ID': 'BE0000732',
        'GenBank_Protein_ID': '219942',
        'GenBank_Gene_ID': 'D00099',
        'UniProtKB_ID': 'P05023',
        'GenAtlas_ID': 'ATP1A1',
        'HGNC_ID': 'HGNC:799',
    },
    {
        'drug_name': 'lumefantrine',
        'target_name': 'Sodium/potassium-transporting ATPase subunit alpha-1',
        'target_DrugBank_ID': 'BE0000732',
        'GenBank_Protein_ID': '219942',
        'GenBank_Gene_ID': 'D00099',
        'UniProtKB_ID': 'P05023',
        'GenAtlas_ID': 'ATP1A1',
        'HGNC_ID': 'HGNC:799',
    }
]

manually_added_drug_target_df = pd.DataFrame(drug_target_data)

# If you need to append this to an existing DataFrame, use concat:
drug_target_df = pd.concat([drug_target_df, manually_added_drug_target_df], ignore_index=True)

drugs_with_no_drug_target_info = []

# Do all these drugs in syntox have drug target information
for drug in known_syntox_drugs:
    drug_target_subset = drug_target_df[drug_target_df['drug_name'] == drug]
    if drug_target_subset.shape[0] == 0:
        drugs_with_no_drug_target_info.append(drug)

print("Num of drugs in syntox intersection WITHOUT target info: " + str(len(drugs_with_no_drug_target_info)))
print(drugs_with_no_drug_target_info) # Should be only 2 drugs now

# Also want to know how many of the intersection drugs with targets have UniProt IDs
drugs_with_no_uniprot_info = set()
for drug in known_syntox_drugs:
    drug_target_subset = drug_target_df[drug_target_df['drug_name'] == drug]
    if drug_target_subset['UniProtKB_ID'].isnull().all():
        drugs_with_no_uniprot_info.add(drug)

print("Num of drugs in syntox intersection WITHOUT UniProt info: " + str(len(drugs_with_no_uniprot_info)))
print(drugs_with_no_uniprot_info) # Should include the 2 drugs without target info

# Slim down the drug_target_df to only include drugs in the intersection with known toxicity
syntoxtarg_drugs = known_syntox_drugs - set(drugs_with_no_drug_target_info)
syntoxtarg_df = drug_target_df[drug_target_df['drug_name'].isin(syntoxtarg_drugs)]
syntoxtarg_df.to_csv('data_processed/ddinter_syntoxtarg.csv', index=False)

with open('data_processed/ddinter_syntoxtarg_UniprotIDs.txt', 'w') as f:
    for id in set(syntoxtarg_df['UniProtKB_ID'].values):
        f.write(str(id) + '\n')

Num of drugs in syntox intersection WITHOUT target info: 2
['phensuximide', 'chlorphenesin']
Num of drugs in syntox intersection WITHOUT UniProt info: 20
{'octreotide', 'chlorphenesin', 'methoxsalen', 'niclosamide', 'telithromycin', 'valganciclovir', 'tinidazole', 'trabectedin', 'melphalan', 'bendamustine', 'mechlorethamine', 'temozolomide', 'busulfan', 'thiotepa', 'altretamine', 'deferasirox', 'phensuximide', 'praziquantel', 'nalidixic acid', 'chlorambucil'}


Intersection between DDInter, DrugComb, Drug Targets, Reactome Lowest vs Reactome All Pathways

In [5]:
drugs_with_no_lowest_pathway_info = []
drugs_with_no_all_pathway_info = []

syntox_drugs_with_uniprot_targets = known_syntox_drugs - drugs_with_no_uniprot_info
print("Number of known DDInter-DrugComb-DrugTarget drugs with UniProt IDs: " + str(len(syntox_drugs_with_uniprot_targets)))

syntoxtarg_lowestpw_df = pd.DataFrame(columns=['drug_name', 'target_name', 'target_DrugBank_ID', 'UniProtKB_ID', 'Reactome_ID', 'Pathway_Name'])
syntoxtarg_allpw_df = pd.DataFrame(columns=['drug_name', 'target_name', 'target_DrugBank_ID', 'UniProtKB_ID', 'Reactome_ID', 'Pathway_Name'])

for drug in syntox_drugs_with_uniprot_targets:
    drug_target_subset = drug_target_df[drug_target_df['drug_name'] == drug]
    for iter, targ_row in drug_target_subset.iterrows():
        targ_uniprot = targ_row['UniProtKB_ID']
        # lowestpw
        lowest_pw_subset = lowest_pw_df[lowest_pw_df['UniProtKB_ID'] == targ_uniprot]
        for iter, lowpw_row in lowest_pw_subset.iterrows():
            syntoxtarg_lowestpw_df.loc[len(syntoxtarg_lowestpw_df)] = [targ_row['drug_name'], \
                targ_row['target_name'], targ_row['target_DrugBank_ID'], targ_uniprot, \
                lowpw_row['Reactome_ID'], lowpw_row['Pathway_Name']]
        # allpw
        all_pw_subset = all_pw_df[all_pw_df['UniProtKB_ID'] == targ_uniprot]
        for iter, allpw_row in all_pw_subset.iterrows():
            syntoxtarg_allpw_df.loc[len(syntoxtarg_allpw_df)] = [targ_row['drug_name'], \
                targ_row['target_name'], targ_row['target_DrugBank_ID'], targ_uniprot, \
                allpw_row['Reactome_ID'], allpw_row['Pathway_Name']]

# Now we have the syntox-target-pathway data for the intersection of drugs with known toxicity
syntoxtarg_lowestpw_df.to_csv('data_processed/ddinter_syntoxtarg_lowestpw.csv', index=False)
syntoxtarg_allpw_df.to_csv('data_processed/ddinter_syntoxtarg_allpw.csv', index=False)

# Filter the drug combinations based on the drugs in the syntoxtargpw intersection
syntoxtarg_allpw_drugs = set(syntoxtarg_allpw_df[syntoxtarg_allpw_df['Reactome_ID'].notna()]['drug_name'].values)
syntoxtarg_lowestpw_drugs = set(syntoxtarg_lowestpw_df[syntoxtarg_lowestpw_df['Reactome_ID'].notna()]['drug_name'].values)
print("Number of unique drugs in syntoxtargpw intersection in all pw: " + str(len(syntoxtarg_allpw_drugs)))
print("Number of unique drugs in syntoxtargpw intersection with lowest pw: " + str(len(syntoxtarg_lowestpw_drugs)))

# Is the set of allpw drugs equivalent to the lowestpw drugs?
if syntoxtarg_allpw_drugs == syntoxtarg_lowestpw_drugs:
    print("The drugs in the allpw and lowestpw intersections are the same")

# Now we can filter the drug combinations based on the drugs in the syntoxtargpw intersection
syntoxtargallpw_combos_df = drug_syntox_df[drug_syntox_df['drug_row'].isin(syntoxtarg_allpw_drugs) & drug_syntox_df['drug_col'].isin(syntoxtarg_allpw_drugs)]
print("Drug combination dataset for syntoxtargallpw: " + str(syntoxtargallpw_combos_df.shape))
#syntoxtargallpw_combos_df.to_csv('data_processed/ddinter_combos_syntoxtarg_allpw.csv', index=False)



Number of known DDInter-DrugComb-DrugTarget drugs with UniProt IDs: 365
Number of unique drugs in syntoxtargpw intersection in all pw: 333
Number of unique drugs in syntoxtargpw intersection with lowest pw: 333
The drugs in the allpw and lowestpw intersections are the same
Drug combination dataset for syntoxtargallpw: (29064, 11)


Intersection between DDInter, DrugComb, Drug Targets, All Reactome Pathways, and STRING

Used manual mapping tool online at: https://www.uniprot.org/id-mapping/dbe8449f1fd4505ca8d39bad70ff36493e2d4585/overview
- 825 UniProtIDs mapped, stored in data_processed/uniprot_id_to_string_mapping_2025_01_24.tsv
- 36 UniProtIDs not mapped, stored in data_processed/unmapped_uniprot_ids.txt


In [6]:
uniprot_to_string = {}
with open('data_processed/uniprot_id_to_string_mapping_2025_01_24.tsv', 'r') as f:
    f.readline()
    for lines in f:
        uniprot_id, string_id = lines.strip('\n').split('\t')
        uniprot_to_string[uniprot_id] = string_id

# update syntoxtargpw dataframes with string ids syntoxtarg_df
syntoxtarg_allpw_df['STRING_ID'] = syntoxtarg_allpw_df['UniProtKB_ID'].map(uniprot_to_string)
syntoxtarg_lowestpw_df['STRING_ID'] = syntoxtarg_lowestpw_df['UniProtKB_ID'].map(uniprot_to_string)
''' When tried to use the genebank IDs -> uniprot IDs they all overlapped with uniprot IDs that
were not present in the string database, so we will just stick with the ones where we do
have information '''
syntoxtarg_allpw_df.to_csv('data_processed/ddinter_syntoxtarg_allpw_string.csv', index=False)
syntoxtarg_lowestpw_df.to_csv('data_processed/ddinter_syntoxtarg_lowestpw_string.csv', index=False)
print("Shape of target info for syntoxtarg_allpw_string: " + str(syntoxtarg_allpw_df.shape))
print("Shape of target info for syntoxtarg_lowestpw_string: " + str(syntoxtarg_lowestpw_df.shape))

drugs_with_string_IDs = set(syntoxtarg_allpw_df[syntoxtarg_allpw_df['STRING_ID'].notnull()]['drug_name'].values)
print("Number of drugs in syntoxtargallpw intersection with STRING IDs: " + str(len(drugs_with_string_IDs)))

# Filter out drug combinations that do not have STRING information, just use all pathways
syntoxtargallpwstring_combos_df = syntoxtargallpw_combos_df[syntoxtargallpw_combos_df['drug_row'].isin(drugs_with_string_IDs) & syntoxtargallpw_combos_df['drug_col'].isin(drugs_with_string_IDs)]
print("Drug combination dataset for syntoxtargallpwstring: " + str(syntoxtargallpwstring_combos_df.shape))

not_printed_first = True

for index, row in syntoxtargallpwstring_combos_df.iterrows():
    drugA = row['drug_row']
    drugB = row['drug_col']
    A_targets = set(syntoxtarg_allpw_df[syntoxtarg_allpw_df['drug_name'] == drugA]['STRING_ID'].dropna().values)
    B_targets = set(syntoxtarg_allpw_df[syntoxtarg_allpw_df['drug_name'] == drugB]['STRING_ID'].dropna().values)
    avg_shortest_path_dist = None
    if len(A_targets) > 0 and len(B_targets) > 0:
        running_target_count = 0.0
        running_target_total = 0.0
        for targA in A_targets:
             for targB in B_targets:
                  if targA in STRING_G and targB in STRING_G and targA != targB:
                    splen = nx.shortest_path_length(STRING_G, source=targA, target=targB)
                    running_target_count += 1
                    running_target_total += splen
                    if not_printed_first:
                        print('Shortest path distance for ' + str(targA) + ' and ' + str(targB) + ' = ' + str(splen))
        if running_target_count != 0:
            avg_shortest_path_dist = running_target_total / running_target_count
            if not_printed_first:
                print("Running target is: " + str(running_target_total))
                print("running count is " + str(running_target_count))
    if not_printed_first:
        print(avg_shortest_path_dist)
        print(syntoxtargallpwstring_combos_df.loc[index])
        not_printed_first = False
    syntoxtargallpwstring_combos_df.at[index, 'avg_short_path_btwn_targets'] = avg_shortest_path_dist
   

# Save the dataframe now filtered with DDInter-DrugComb-DrugTarget-ReactomeAll-STRING and average target distance
syntoxtargallpwstring_combos_df.to_csv('data_processed/ddinter_processed_combos_syntoxtargallpw_string.csv', index=False)

Shape of target info for syntoxtarg_allpw_string: (31603, 7)
Shape of target info for syntoxtarg_lowestpw_string: (11018, 7)
Number of drugs in syntoxtargallpw intersection with STRING IDs: 333
Drug combination dataset for syntoxtargallpwstring: (29064, 11)
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000408695 = 2
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000417052 = 2
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000378974 = 3
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000363822 = 3
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000355904 = 3
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000262186 = 3
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000343925 = 2
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000405330 = 2
Running target is: 20.0
running count is 8.0
2.5
drug_row             mefloquine
drug_col              tamoxifen
cell_line_nam

Test average path distance worked

In [7]:
# Verify that the shortest path distance between the same target is 0
test_A = '9606.ENSP00000480012'
test_B = '9606.ENSP00000480012'
print(nx.shortest_path_length(STRING_G, source=test_A, target=test_B))

# What's the max shortest path distance in drug_combos_df?
print(syntoxtargallpwstring_combos_df['avg_short_path_btwn_targets'].max())

# What's the min shortest path distance in drug_combos_df?
print(syntoxtargallpwstring_combos_df['avg_short_path_btwn_targets'].min())

# How many values of "Major" in 'toxicity_category'?
print(len(syntoxtargallpwstring_combos_df[syntoxtargallpwstring_combos_df['toxicity_category'] == 'Major']))
print(len(syntoxtargallpwstring_combos_df[syntoxtargallpwstring_combos_df['toxicity_category'] == 'Moderate']))
print(len(syntoxtargallpwstring_combos_df[syntoxtargallpwstring_combos_df['toxicity_category'] == 'Minor']))

0
5.0
1.0
5776
22457
831


What are the stats on the final intersection between DDInter, DrugComb, DrugBank Targets, Reactome All Pathways, STRING?

In [8]:
# What's the max number of targets per drug?
print("Lowest pw max targets per drug: " + str(syntoxtarg_lowestpw_df.groupby('drug_name')['target_name'].nunique().max()))
print("All pw max targets per drug: " + str(syntoxtarg_allpw_df.groupby('drug_name')['target_name'].nunique().max()))

# What's the max number of pathways per drug?
print("Lowest pw max pathways per drug: " + str(syntoxtarg_lowestpw_df.groupby('drug_name')['Pathway_Name'].nunique().max()))
print("All pw max pathways per drug: " + str(syntoxtarg_allpw_df.groupby('drug_name')['Pathway_Name'].nunique().max()))

# For targets
print("Drugs with most targets (lowest pw):")
print(syntoxtarg_lowestpw_df.groupby('drug_name')['target_name'].nunique().nlargest(3))

print("\nDrugs with most targets (all pw):")
print(syntoxtarg_allpw_df.groupby('drug_name')['target_name'].nunique().nlargest(3))

# For pathways
print("\nDrugs with most pathways (lowest pw):")
print(syntoxtarg_lowestpw_df.groupby('drug_name')['Pathway_Name'].nunique().nlargest(3))

print("\nDrugs with most pathways (all pw):")
print(syntoxtarg_allpw_df.groupby('drug_name')['Pathway_Name'].nunique().nlargest(3))

# How many unique drugs exist in the final combination dataset?
print("Number of unique drugs in final combination dataset: " + str(len(set(syntoxtargallpwstring_combos_df['drug_row'].values).union(set(syntoxtargallpwstring_combos_df['drug_col'].values)))))

# How many unique drug combinations (drugA, drugB, cell_line_name C) exist in the final combination dataset?
unique_triplicates = set()
for index, row in syntoxtargallpwstring_combos_df.iterrows():
    first_way = (row['drug_row'], row['drug_col'], row['cell_line_name'])
    second_way = (row['drug_col'], row['drug_row'], row['cell_line_name'])
    if first_way not in unique_triplicates and second_way not in unique_triplicates:
        unique_triplicates.add(first_way)
print("Number of unique drug combinations in final combination dataset: " + str(len(unique_triplicates)))

# How many unique cell lines exist in the final combination dataset?
print("Number of unique cell lines in final combination dataset: " + str(len(set(syntoxtargallpwstring_combos_df['cell_line_name'].values))))

# Print the columns of the final combination dataset
print("Columns in final combination dataset: " + str(syntoxtargallpwstring_combos_df.columns))

Lowest pw max targets per drug: 216
All pw max targets per drug: 216
Lowest pw max pathways per drug: 700
All pw max pathways per drug: 1055
Drugs with most targets (lowest pw):
drug_name
fostamatinib    216
cannabidiol      41
aripiprazole     38
Name: target_name, dtype: int64

Drugs with most targets (all pw):
drug_name
fostamatinib    216
cannabidiol      41
aripiprazole     38
Name: target_name, dtype: int64

Drugs with most pathways (lowest pw):
drug_name
fostamatinib    700
dasatinib       188
ponatinib       181
Name: Pathway_Name, dtype: int64

Drugs with most pathways (all pw):
drug_name
fostamatinib    1055
dasatinib        349
ponatinib        297
Name: Pathway_Name, dtype: int64
Number of unique drugs in final combination dataset: 331
Number of unique drug combinations in final combination dataset: 23415
Number of unique cell lines in final combination dataset: 149
Columns in final combination dataset: Index(['drug_row', 'drug_col', 'cell_line_name', 'synergy_bliss',
     

Note: for DDInter, was able to find all SMILES/Morgan Fingerprint representations needed, so did not have to filter more