Parse STRING database to find shortest distance between drug targets

This notebook assumes that DrugBank has already been used to obtain the drug target information; the UniProt IDs of those targets are then converted to STRING IDs.

In [1]:
import pandas as pd
import networkx as nx
import numpy as np

In [2]:
# Load the DrugBank drug-target table and export its distinct UniProt IDs,
# one per line, so they can be pasted into an external ID-mapping tool.
syntoxtarg_df = pd.read_csv('data_processed/drugbank_syntoxtarg.csv')

# Drop missing IDs (they would otherwise be written as the literal string
# "nan") and sort so the exported file is identical from run to run.
unique_uniprot_ids = sorted(syntoxtarg_df['UniProtKB_ID'].dropna().astype(str).unique())

with open('data_processed/drugbank_syntoxtarg_UniprotIDs.txt', 'w') as f:
    f.writelines(f"{uniprot_id}\n" for uniprot_id in unique_uniprot_ids)

The UniProt IDs were mapped to STRING IDs manually with the online UniProt ID mapping tool at: https://www.uniprot.org/id-mapping/dbe8449f1fd4505ca8d39bad70ff36493e2d4585/overview
Note: the mapping tool is currently under maintenance.

Previously for DDInter-DrugComb intersect
- 825 UniProtIDs mapped, stored in data_processed/uniprot_id_to_string_mapping_2025_01_24.tsv
- 36 UniProtIDs not mapped, stored in data_processed/unmapped_uniprot_ids.txt

Found 243 proteins (ignoring NaN) that are in DrugBank but not in DDInter and that have UniProt IDs
- Use mapping tool on STRING database
- Output of DrugBank leftovers is saved in data_processed/drugbank_leftovers_uniprot_string_mapping_2025_02_14.tsv


In [20]:
def _read_id_mapping(path, key_col, value_col):
    """Read a tab-separated mapping file with one header row into a dict.

    Parameters
    ----------
    path : str
        Path of the TSV file to read.
    key_col, value_col : int
        Zero-based column positions of the source ID and the mapped ID.

    Returns
    -------
    dict
        Maps the text in ``key_col`` to the text in ``value_col``. Later rows
        overwrite earlier rows that share a key.

    Blank or truncated lines are skipped. Rows whose key is the literal text
    "nan" are skipped as well: such a row is an artifact of a missing ID having
    been written out as text, not a real identifier.
    """
    mapping = {}
    with open(path, 'r') as f:
        f.readline()  # skip header
        for line in f:
            fields = line.strip('\n').split('\t')
            if len(fields) <= max(key_col, value_col):
                continue
            key = fields[key_col]
            if key.strip().lower() == 'nan':
                continue
            mapping[key] = fields[value_col]
    return mapping


def _write_ids(path, ids):
    """Write one ID per line to ``path``, skipping missing (NaN/None) values."""
    with open(path, 'w') as f:
        f.writelines(f"{item}\n" for item in ids if pd.notna(item))


# First pass: UniProt -> STRING mapping from the 2025-01-24 mapping file.
uniprot_to_string = _read_id_mapping(
    'data_processed/uniprot_id_to_string_mapping_2025_01_24.tsv', key_col=0, value_col=1
)
syntoxtarg_df['STRING_ID'] = syntoxtarg_df['UniProtKB_ID'].map(uniprot_to_string)

# UniProt IDs still without a STRING ID after the first pass; exported so they
# can be mapped separately.
drugbank_unknown_uniprot = syntoxtarg_df[syntoxtarg_df['STRING_ID'].isna()]['UniProtKB_ID'].unique()
_write_ids('data_processed/drugbank_notcoveredbyddinter_unknown_uniprot.txt', drugbank_unknown_uniprot)

# Second pass: merge the mapping obtained for those leftovers. In this file the
# queried UniProt ID is read from column 1 and the STRING ID from column 2.
uniprot_to_string.update(
    _read_id_mapping(
        'data_processed/drugbank_leftovers_uniprot_string_mapping_2025_02_14.tsv', key_col=1, value_col=2
    )
)

# update syntoxtarg_df with string ids
syntoxtarg_df['STRING_ID'] = syntoxtarg_df['UniProtKB_ID'].map(uniprot_to_string)

# Which of the other ID types cover the most still-unmapped targets?
missing_string = syntoxtarg_df['STRING_ID'].isna()
genbank_protein_nostring_df = syntoxtarg_df[missing_string & syntoxtarg_df['GenBank_Protein_ID'].notna()]
genbank_gene_nostring_df = syntoxtarg_df[missing_string & syntoxtarg_df['GenBank_Gene_ID'].notna()]
genatlas_nostring_df = syntoxtarg_df[missing_string & syntoxtarg_df['GenAtlas_ID'].notna()]
hgnc_nostring_df = syntoxtarg_df[missing_string & syntoxtarg_df['HGNC_ID'].notna()]

print("GB Prot shape" + str(genbank_protein_nostring_df['GenBank_Protein_ID'].unique().shape))
print("GB Gene shape" + str(genbank_gene_nostring_df['GenBank_Gene_ID'].unique().shape))
print("GA shape" + str(genatlas_nostring_df['GenAtlas_ID'].unique().shape))
print("HGNC shape" + str(hgnc_nostring_df['HGNC_ID'].unique().shape))

# GenBank gene IDs cover the most unmapped targets, so start with those.
# Use unique IDs (like the other lists below) so the export has no duplicates.
genebank_gene_ids = genbank_gene_nostring_df['GenBank_Gene_ID'].unique()
_write_ids('data_processed/drugbank_genebank_leftovers.txt', genebank_gene_ids)

# Only one was found in STRING, just add manually
syntoxtarg_df.loc[syntoxtarg_df['GenBank_Gene_ID'] == 'AJ251501', 'STRING_ID'] = '9606.ENSP00000365437'

genebank_prot_ids = genbank_protein_nostring_df['GenBank_Protein_ID'].unique()
_write_ids('data_processed/drugbank_genebank_prot_leftovers.txt', genebank_prot_ids)

# None were found in STRING

genatlas_ids = genatlas_nostring_df['GenAtlas_ID'].unique()
_write_ids('data_processed/drugbank_genatlas_leftovers.txt', genatlas_ids)

# None were found in STRING

hgnc_ids = hgnc_nostring_df['HGNC_ID'].unique()
_write_ids('data_processed/drugbank_hgnc_leftovers.txt', hgnc_ids)

# None were found in STRING

# UniProt IDs that remain unmapped after every attempt above.
completely_unknown_uniprot = syntoxtarg_df[syntoxtarg_df['STRING_ID'].isna()]['UniProtKB_ID'].unique()
_write_ids('data_processed/drugbank_unknown_uniprot.txt', completely_unknown_uniprot)

# NOTE: this count is taken from the array itself, so a missing (NaN) UniProt ID
# counts as one entry here even though it is not written to the file.
print("Number of unknowns: " + str(len(completely_unknown_uniprot)))

# Save syntoxtarg_df with new STRING IDs
syntoxtarg_df.to_csv('data_processed/drugbank_syntoxtarg_with_string.csv', index=False)


GB Prot shape(53,)
GB Gene shape(58,)
GA shape(6,)
HGNC shape(5,)
Number of unknowns: 80


In [14]:
# Read the STRING physical-links table into an undirected graph. A later cell
# uses this graph to compute shortest-path distances between drug targets.

string_edge_list_df = pd.read_csv('data/STRING/9606.protein.physical.links.detailed.v12.0.txt', sep=' ')
print(string_edge_list_df.shape)

# Keep only links with a non-zero experimental score; the other score columns
# are not used. Reassign instead of mutating in place so the cell is safe to re-run.
string_edge_list_df = string_edge_list_df.drop(columns=['database', 'textmining', 'combined_score'])
string_edge_list_df = string_edge_list_df[string_edge_list_df['experimental'] != 0]
print(string_edge_list_df.shape)

# from_pandas_edgelist builds an undirected nx.Graph by default, so no extra
# to_undirected() call (which would deep-copy the whole graph) is needed.
# Rows that repeat a protein pair in the opposite order collapse to one edge.
STRING_G = nx.from_pandas_edgelist(string_edge_list_df, 'protein1', 'protein2')
STRING_G.number_of_edges()

(1477610, 6)
(1182858, 3)


591429

In [None]:
# Annotate every drug combination with the average and minimum shortest-path
# distance (in STRING_G) between the STRING targets of its two drugs, then save.
drug_combos_df = pd.read_csv('data_processed/drugbank_filtered_combos_syntox_known_targallpw.csv')


def _target_path_lengths(graph, a_targets, b_targets):
    """Return the shortest-path length of every usable (A target, B target) pair.

    A pair is used only when both targets are nodes of ``graph`` and the two
    targets differ. Pairs with no connecting path are skipped rather than
    aborting the whole run (``nx.shortest_path_length`` raises
    ``NetworkXNoPath`` for them).

    Parameters
    ----------
    graph : networkx.Graph
        Protein interaction graph.
    a_targets, b_targets : iterable of str
        STRING IDs of the targets of the two drugs.

    Returns
    -------
    list of int
        One path length per usable pair; empty if there is none.
    """
    lengths = []
    for targA in a_targets:
        if targA not in graph:
            continue
        for targB in b_targets:
            if targB == targA or targB not in graph:
                continue
            try:
                lengths.append(nx.shortest_path_length(graph, source=targA, target=targB))
            except nx.NetworkXNoPath:
                continue
    return lengths


# Build the drug -> {STRING IDs} lookup once instead of filtering
# syntoxtarg_df twice for every combination row.
targets_by_drug = (
    syntoxtarg_df.dropna(subset=['STRING_ID'])
    .groupby('drug_name')['STRING_ID']
    .apply(set)
    .to_dict()
)

# The same drug pair appears on many rows (one per cell line); compute its
# path lengths only once.
pair_lengths_cache = {}
avg_dists = []
min_dists = []

for position, (drugA, drugB) in enumerate(zip(drug_combos_df['drug_row'], drug_combos_df['drug_col'])):
    pair = (drugA, drugB)
    if pair not in pair_lengths_cache:
        pair_lengths_cache[pair] = _target_path_lengths(
            STRING_G, targets_by_drug.get(drugA, set()), targets_by_drug.get(drugB, set())
        )
    lengths = pair_lengths_cache[pair]

    # None means "no usable target pair"; it becomes NaN in the output columns.
    avg_shortest_path_dist = sum(lengths) / len(lengths) if lengths else None
    min_shortest_path_dist = min(lengths) if lengths else None

    # Print the intermediate values for the first row only, as a sanity check.
    if position == 0:
        if lengths:
            print("Running target is: " + str(float(sum(lengths))))
            print("running count is " + str(float(len(lengths))))
        print(avg_shortest_path_dist)
        print(min_shortest_path_dist)
        print(drug_combos_df.iloc[0])

    avg_dists.append(avg_shortest_path_dist)
    min_dists.append(min_shortest_path_dist)

# Assign both columns in one step with an explicit float dtype so the column
# type does not depend on whether the first row happened to have a value.
drug_combos_df['avg_short_path_btwn_targets'] = pd.Series(avg_dists, index=drug_combos_df.index, dtype='float64')
drug_combos_df['min_short_path_btwn_targets'] = pd.Series(min_dists, index=drug_combos_df.index, dtype='float64')

# Save the dataframe
drug_combos_df.to_csv('data_processed/drugbank_processed_combos_syntoxtargallpw_string.csv', index=False)


Running target is: 59.0
running count is 30.0
1.9666666666666666
1
drug_row              romidepsin
drug_col             carfilzomib
cell_line_name              EW-8
synergy_zip             3.397628
synergy_loewe           -1.79354
synergy_bliss            2.08705
synergy_hsa             1.989756
toxicity_category          Major
Name: 0, dtype: object


In [18]:
# Report, for each distance column, how many combinations received a value
# (i.e. the number of non-missing entries).
for distance_col in ('avg_short_path_btwn_targets', 'min_short_path_btwn_targets'):
    print(drug_combos_df[distance_col].count())

72103
72103


In [19]:
# Sanity check: a node's shortest-path distance to itself should be 0.
test_A = '9606.ENSP00000480012'
test_B = '9606.ENSP00000480012'
self_distance = nx.shortest_path_length(STRING_G, source=test_A, target=test_B)
print(self_distance)

# Range of the computed distances: both maxima first, then both minima,
# each time for the average column followed by the minimum column.
distance_cols = ('avg_short_path_btwn_targets', 'min_short_path_btwn_targets')
for stat_name in ('max', 'min'):
    for distance_col in distance_cols:
        print(getattr(drug_combos_df[distance_col], stat_name)())

# Number of combinations in each toxicity category.
toxicity = drug_combos_df['toxicity_category']
for category in ('Major', 'Moderate', 'Minor'):
    print((toxicity == category).sum())

0
5.0
5.0
1.0
1.0
44355
21553
7861
