Parse STRING database to find shortest distance between drug targets

This notebook assumes that DrugBank has already been used to obtain the drug target information; the UniProt IDs from that step are converted to STRING IDs here.

In [4]:
import pandas as pd
import networkx as nx
import numpy as np

In [5]:
# Load the drug -> target table (one row per drug/target pair).
syntoxtarg_df = pd.read_csv('data_processed/syntoxtarg.csv')

# Write the distinct UniProt IDs, one per line, for use with the ID-mapping tool.
# Rows with no UniProt ID are dropped so the literal string "nan" is never
# written as if it were an accession, and the IDs are sorted so the file is
# identical from run to run (set iteration order is not stable across runs).
unique_uniprot_ids = sorted(syntoxtarg_df['UniProtKB_ID'].dropna().astype(str).unique())
with open('data_processed/syntoxtarg_UniprotIDs.txt', 'w') as f:
    for uniprot_id in unique_uniprot_ids:
        f.write(uniprot_id + '\n')

The UniProt IDs were mapped to STRING IDs manually with the online UniProt ID-mapping tool: https://www.uniprot.org/id-mapping/dbe8449f1fd4505ca8d39bad70ff36493e2d4585/overview
- 825 UniProtIDs mapped, stored in data_processed/uniprot_id_to_string_mapping_2025_01_24.tsv
- 36 UniProtIDs not mapped, stored in data_processed/unmapped_uniprot_ids.txt


In [6]:
# Read the tab-separated UniProt -> STRING mapping (header row skipped) into a dict.
# Each data line must hold exactly two tab-separated fields; a later duplicate
# UniProt ID overwrites an earlier one.
with open('data_processed/uniprot_id_to_string_mapping_2025_01_24.tsv', 'r') as mapping_file:
    next(mapping_file, None)  # discard the header line
    uniprot_to_string = {
        uniprot_id: string_id
        for uniprot_id, string_id in (
            record.strip('\n').split('\t') for record in mapping_file
        )
    }

# Attach the STRING ID to every target row (NaN where no mapping exists).
syntoxtarg_df['STRING_ID'] = syntoxtarg_df['UniProtKB_ID'].map(uniprot_to_string)

# For the rows that did not get a STRING ID, see which alternative identifier
# columns are populated and could serve as a fallback.
missing_string_id = syntoxtarg_df['STRING_ID'].isna()
genbank_protein_nostring_df = syntoxtarg_df[missing_string_id & syntoxtarg_df['GenBank_Protein_ID'].notna()]
genbank_gene_nostring_df = syntoxtarg_df[missing_string_id & syntoxtarg_df['GenBank_Gene_ID'].notna()]
genatlas_nostring_df = syntoxtarg_df[missing_string_id & syntoxtarg_df['GenAtlas_ID'].notna()]
hgnc_nostring_df = syntoxtarg_df[missing_string_id & syntoxtarg_df['HGNC_ID'].notna()]

# Report how many unmapped rows each alternative identifier covers.
for shape_label, leftover_df in (
    ("GB Prot shape", genbank_protein_nostring_df),
    ("GB Gene shape", genbank_gene_nostring_df),
    ("GA shape", genatlas_nostring_df),
    ("HGNC shape", hgnc_nostring_df),
):
    print(f"{shape_label}{leftover_df.shape}")

# GenBank gene IDs cover the largest share of the unmapped rows, so export
# those first (one ID per line) for a second mapping attempt.
genebank_gene_ids = genbank_gene_nostring_df['GenBank_Gene_ID'].values
with open('data_processed/genebank_leftovers.txt', 'w') as leftovers_file:
    leftovers_file.writelines(str(gb_id) + '\n' for gb_id in genebank_gene_ids)


GB Prot shape(33, 9)
GB Gene shape(35, 9)
GA shape(11, 9)
HGNC shape(11, 9)


When the GenBank IDs were mapped to UniProt IDs, every resulting UniProt ID was one that is not present in the STRING database, so we proceed using only the targets that already have a STRING ID.

In [None]:
# STRING_G is queried further down with nx.shortest_path_length and `in`
# membership tests, so this presumably returns a networkx graph keyed by
# STRING protein IDs -- TODO confirm.
# NOTE(review): get_STRING_graph is neither defined nor imported in the cells
# visible here; unless it is defined elsewhere, this cell fails on a fresh kernel.
STRING_G = get_STRING_graph()

(1477610, 6)
(1182858, 3)


591429

In [8]:
# Display the target table to inspect the STRING_ID column mapped from UniProtKB_ID.
syntoxtarg_df

Unnamed: 0,drug_name,target_name,target_DrugBank_ID,GenBank_Protein_ID,GenBank_Gene_ID,UniProtKB_ID,GenAtlas_ID,HGNC_ID,STRING_ID
0,eptifibatide,Integrin beta-3,BE0001155,306786.0,J02703,P05106,ITGB3,HGNC:6156,9606.ENSP00000452786
1,eptifibatide,Voltage-dependent N-type calcium channel subun...,BE0009351,,,A0A024R8I1,,,
2,octreotide,,BE0010008,,,,,,
3,fluvoxamine,Sodium-dependent serotonin transporter,BE0000749,36433.0,X70697,P31645,SLC6A4,HGNC:11050,9606.ENSP00000261707
4,fluvoxamine,Potassium voltage-gated channel subfamily H me...,BE0000090,487738.0,U04270,Q12809,KCNH2,HGNC:6251,9606.ENSP00000262186
...,...,...,...,...,...,...,...,...,...
1859,bendamustine,DNA,,,,,,,
1860,ivermectin,Glutamate-gated chloride channel subunit beta,BE0027480,,,Q17328,,,6239.F25F8.2.1
1861,ivermectin,Glutamate-gated chloride channel alpha,BE0027481,,,G5EBR3,,,6239.F11A5.10.2
1862,artemether,Sodium/potassium-transporting ATPase subunit a...,BE0000732,219942.0,D00099,P05023,ATP1A1,HGNC:799,9606.ENSP00000445306


In [9]:
# get combos
drug_combos_df = pd.read_csv('data_processed/filtered_combos_syntox_known_targallpw.csv')


def _target_path_stats(graph, a_targets, b_targets, verbose=False):
    """Summarise shortest-path lengths between two collections of target nodes.

    Every (a, b) pair with ``a`` from ``a_targets`` and ``b`` from ``b_targets``
    is considered. A pair is skipped when either node is missing from ``graph``,
    when both nodes are the same, or when no path connects them.

    Args:
        graph: Graph passed to ``nx.shortest_path_length``.
        a_targets: Iterable of node IDs for the first drug.
        b_targets: Iterable of node IDs for the second drug.
        verbose: When True, print each pair's distance plus the running totals.

    Returns:
        ``(average, minimum)`` path length over the pairs that were used, or
        ``(None, None)`` when no pair qualified.
    """
    path_lengths = []
    for targ_a in a_targets:
        for targ_b in b_targets:
            # NOTE(review): identical targets are skipped, so two drugs that share
            # a target never contribute a distance of 0 -- confirm this is intended.
            if targ_a == targ_b or targ_a not in graph or targ_b not in graph:
                continue
            try:
                splen = nx.shortest_path_length(graph, source=targ_a, target=targ_b)
            except nx.NetworkXNoPath:
                # The two targets are not connected; skip the pair rather than
                # aborting the whole run.
                continue
            path_lengths.append(splen)
            if verbose:
                print('Shortest path distance for ' + str(targ_a) + ' and ' + str(targ_b) + ' = ' + str(splen))
    if not path_lengths:
        return None, None
    running_target_total = float(sum(path_lengths))
    running_target_count = float(len(path_lengths))
    if verbose:
        print("Running target is: " + str(running_target_total))
        print("running count is " + str(running_target_count))
    return running_target_total / running_target_count, min(path_lengths)


# STRING IDs of each drug's targets, collected once instead of re-filtering
# syntoxtarg_df for every combination row. Rows lacking a drug name or a
# STRING ID contribute nothing.
targets_by_drug = {}
for drug_name, string_id in zip(syntoxtarg_df['drug_name'], syntoxtarg_df['STRING_ID']):
    if pd.notna(drug_name) and pd.notna(string_id):
        targets_by_drug.setdefault(drug_name, set()).add(string_id)

# The distances depend only on the two drug names, so each ordered drug pair is
# computed once and reused for every other row (e.g. other cell lines) that
# contains the same pair.
pair_stats = {}
avg_dists = []
min_dists = []
for position, (index, row) in enumerate(drug_combos_df.iterrows()):
    drugA = row['drug_row']
    drugB = row['drug_col']
    is_first_row = position == 0  # the first row is printed in full as a spot check
    pair_key = (drugA, drugB)
    if is_first_row or pair_key not in pair_stats:
        pair_stats[pair_key] = _target_path_stats(
            STRING_G,
            targets_by_drug.get(drugA, ()),
            targets_by_drug.get(drugB, ()),
            verbose=is_first_row,
        )
    avg_shortest_path_dist, min_shortest_path_dist = pair_stats[pair_key]
    if is_first_row:
        print(avg_shortest_path_dist)
        print(min_shortest_path_dist)
        print(drug_combos_df.loc[index])
    avg_dists.append(avg_shortest_path_dist)
    min_dists.append(min_shortest_path_dist)

# Assign both columns in one step as float64 (None becomes NaN) so the dtype
# does not depend on whether the first row happened to have a value.
drug_combos_df['avg_short_path_btwn_targets'] = pd.Series(avg_dists, dtype='float64').to_numpy()
drug_combos_df['min_short_path_btwn_targets'] = pd.Series(min_dists, dtype='float64').to_numpy()

# Save the dataframe

drug_combos_df.to_csv('data_processed/processed_combos_syntoxtargallpw_string.csv', index=False)


Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000355904 = 3
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000363822 = 3
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000336528 = 3
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000262186 = 3
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000369816 = 2
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000408695 = 2
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000405330 = 2
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000343925 = 2
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000417052 = 2
Shortest path distance for 9606.ENSP00000480012 and 9606.ENSP00000378974 = 3
Running target is: 25.0
running count is 10.0
2.5
2
drug_row             mefloquine
drug_col              tamoxifen
cell_line_name            TC-32
synergy_zip           19.667559
synergy_loewe          2.263512
synergy_bliss     

In [10]:
# Number of combinations for which a path distance was obtained
# (i.e. non-null entries in each distance column).
for distance_col in ('avg_short_path_btwn_targets', 'min_short_path_btwn_targets'):
    print(drug_combos_df[distance_col].count())

31693
31693


In [16]:
# Sanity check: the shortest-path length from a node to itself should print 0.
test_A = test_B = '9606.ENSP00000480012'
print(nx.shortest_path_length(STRING_G, source=test_A, target=test_B))

distance_cols = ('avg_short_path_btwn_targets', 'min_short_path_btwn_targets')

# Largest value in each distance column
for distance_col in distance_cols:
    print(drug_combos_df[distance_col].max())

# Smallest value in each distance column
for distance_col in distance_cols:
    print(drug_combos_df[distance_col].min())

# Row counts per 'toxicity_category' level
toxicity = drug_combos_df['toxicity_category']
for category in ('Major', 'Moderate', 'Minor'):
    print((toxicity == category).sum())

0
5.0
5.0
1.0
1.0
6091
25092
876
