### Correlate Jaccard Similarity by DDI Severity ###

In [1]:
# Import everything needed
from toxicity_ranking import *
import numpy as np

In [2]:
# Get the DDInter data: one CSV export per download code, combined into a single frame.
ddinter_files = [
    'data/DDInter/ddinter_downloads_code_A.csv',
    'data/DDInter/ddinter_downloads_code_B.csv',
    'data/DDInter/ddinter_downloads_code_D.csv',
    'data/DDInter/ddinter_downloads_code_H.csv',
    'data/DDInter/ddinter_downloads_code_L.csv',
    'data/DDInter/ddinter_downloads_code_P.csv',
    'data/DDInter/ddinter_downloads_code_R.csv',
    'data/DDInter/ddinter_downloads_code_V.csv',
]

# Read every file first and concatenate once: growing a DataFrame with
# pd.concat inside a loop re-copies all previously loaded rows each iteration.
ddinter_df = pd.concat(
    [pd.read_csv(file, sep=',') for file in ddinter_files],
    ignore_index=True,
)

# Convert all values in Drug_A and Drug_B columns to lowercase so that the same
# drug spelled with different capitalisation compares equal.
ddinter_df['Drug_A'] = ddinter_df['Drug_A'].str.lower()
ddinter_df['Drug_B'] = ddinter_df['Drug_B'].str.lower()

# Remove rows that are exact duplicates across all columns (the same record can
# show up in more than one export). Reversed (Drug_B, Drug_A) rows are NOT
# treated as duplicates here.
ddinter_df = ddinter_df.drop_duplicates()

print('DDInter shape:', ddinter_df.shape)

DDInter shape: (160235, 5)


In [3]:
# Load the DrugComb table and the SIDER tables through the project helpers.
# NOTE(review): the bliss/loewe/hsa/zip flags presumably select which synergy
# scores the helper loads or filters on -- confirm in toxicity_ranking.
drugcomb_df = get_drug_comb_data(
    bliss=True,
    loewe=True,
    hsa=True,
    zip=True,
)
(
    sider_cid_to_drugs_df,
    sider_all_side_effects_df,
) = get_sider_data()

# Filter the DrugComb data against SIDER, then compute the Jaccard similarity
# for every unique drug pair that survives the filter.
(
    filtered_drug_comb_data,
    common_drugs,
    jaccard_unique_drug_pairs,
) = filter_drug_comb_data(drugcomb_df, sider_cid_to_drugs_df)
(
    drug_pair_to_jaccard,
    drug_pair_to_side_effects,
) = drug_pair_to_jaccard_similarity(
    jaccard_unique_drug_pairs,
    sider_cid_to_drugs_df,
    sider_all_side_effects_df,
)

  drugcomb_df = pd.read_csv('data/DrugComb/drugcomb_summary_v_1_5.csv', sep=',', index_col=False)


Original shape of drugcomb data:  (1432351, 26)
Final shape of filtered drugcomb data:  (123882, 26)
Original drugcomb data shape:  (123882, 26)
Number of drugs in common between drugcomb and sider [lowercase enforced]:  401
Filtered drugcomb data shape for both drugs being present in sider:  (20824, 26)
Number of unique drug pairs:  6552


In [4]:
# Find the drug pairs that (a) have a known interaction severity in DDInter and
# (b) are present in the Jaccard similarity data.

# Drop rows whose severity is 'Unknown' in one vectorised step instead of
# testing every row inside a Python-level iterrows() loop.
known_level_rows = ddinter_df[ddinter_df['Level'] != 'Unknown']

# DDInter may list a pair as (A, B) while the Jaccard data uses (B, A), so both
# orderings are registered to make the match order-insensitive.
known_ddinter_drug_pairs = set()
for drug_a, drug_b in zip(known_level_rows['Drug_A'], known_level_rows['Drug_B']):
    known_ddinter_drug_pairs.add((drug_a, drug_b))
    known_ddinter_drug_pairs.add((drug_b, drug_a))

intersection_drug_pairs = known_ddinter_drug_pairs.intersection(jaccard_unique_drug_pairs)

print("Number of drug pairs both in known DDInter and Jaccard Similarity data:", len(intersection_drug_pairs))


Number of drug pairs both in known DDInter and Jaccard Similarity data: 1047


In [7]:
# Numeric encoding of the DDInter severity levels used for correlation/plotting.
severity_dict = {
    'Major': 1,
    'Moderate': 0.5,
    'Minor': 0,
}

# Index DDInter once: ordered (Drug_A, Drug_B) -> every 'Level' reported for it.
# This replaces two full-frame boolean scans per drug pair.
levels_by_pair = {}
for drug_a, drug_b, level in zip(ddinter_df['Drug_A'], ddinter_df['Drug_B'], ddinter_df['Level']):
    levels_by_pair.setdefault((drug_a, drug_b), []).append(level)

# Sort out the data for plotting.
# severity_val and jaccard_sim are filled in the SAME loop so that entry i of
# both lists always refers to the same drug pair; a pair that is skipped is
# skipped in both lists.
severity_val = []
jaccard_sim = []
for drug_pair in intersection_drug_pairs:
    drug_a, drug_b = drug_pair
    # DDInter may store the pair in either order (or both), so look up both.
    levels = levels_by_pair.get((drug_a, drug_b), [])
    levels_swapped = levels_by_pair.get((drug_b, drug_a), [])
    reported_levels = levels + levels_swapped
    if not reported_levels:
        print("Could not find drug pair in DDInter data for pair:", drug_pair)
        continue

    # Keep only levels that have a numeric encoding ('Unknown' / missing values
    # cannot be mapped) and collapse repeats of the same level.
    known_levels = {level for level in reported_levels if level in severity_dict}
    if not known_levels:
        print("No usable severity level in DDInter data for pair:", drug_pair)
        continue
    if len(known_levels) > 1:
        # Conflicting annotations: there is no principled way to pick one.
        print("Found different levels for drug pair in DDInter data for pair:", drug_pair)
        print("Levels:", levels)
        print("Levels Swapped:", levels_swapped)
        continue

    severity_val.append(severity_dict[known_levels.pop()])
    jaccard_sim.append(drug_pair_to_jaccard[drug_pair])

# Paired samples are required downstream (correlation, per-severity grouping).
assert len(severity_val) == len(jaccard_sim), "severity and Jaccard lists must stay aligned"


Calculate the Pearson correlation between the Jaccard similarity of each drug pair and its DDInter severity level, then plot an xy graph of Jaccard similarity against severity.

In [8]:
import os

import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Correlation between Jaccard similarity and DDInter severity.
# Pearson correlation is computed on paired samples, so fail with a clear
# message if the two lists do not describe the same set of drug pairs.
if len(jaccard_sim) != len(severity_val):
    raise ValueError(
        "jaccard_sim and severity_val must have the same length "
        f"(got {len(jaccard_sim)} and {len(severity_val)})"
    )

ddinter_pearson_coef, ddinter_p_value = pearsonr(jaccard_sim, severity_val)
print("Pearson correlation coefficient for JS and DDInter Severity: ", ddinter_pearson_coef)
print("Pearson pval for JS and DDInter Severity", ddinter_p_value)

# Plot the data and write the figure to disk. The output directory is created
# on demand so the cell also works on a fresh checkout.
os.makedirs('results', exist_ok=True)
fig, ax = plt.subplots()
ax.scatter(jaccard_sim, severity_val)
ax.set_xlabel('Jaccard similarity')
ax.set_ylabel('DDInter Severity Value (0: Minor, 0.5: Moderate, 1: Major)')
ax.set_title('DDInter Severity vs. Jaccard Similarity')
fig.savefig('results/jaccard_vs_ddinter_severity.png')
# Close the figure explicitly: it is only needed as a file, not as inline output.
plt.close(fig)



Pearson correlation coefficient for JS and DDInter Severity:  0.01796171952338169
Pearson pval for JS and DDInter Severity 0.5615459742606256


In [10]:
# What's the average Jaccard similarity for each severity level?
severity_to_jaccard = {
    'Major': [],
    'Moderate': [],
    'Minor': [],
}
for index in range(len(jaccard_sim)):
    severity = severity_val[index]
    jaccard = jaccard_sim[index]
    if severity == 1:
        severity_to_jaccard['Major'].append(jaccard)
    elif severity == 0.5:
        severity_to_jaccard['Moderate'].append(jaccard)
    elif severity == 0:
        severity_to_jaccard['Minor'].append(jaccard)

print("Average Jaccard Similarity for Major Severity:", np.mean(severity_to_jaccard['Major']))
print("Average Jaccard Similarity for Moderate Severity:", np.mean(severity_to_jaccard['Moderate']))
print("Average Jaccard Similarity for Minor Severity:", np.mean(severity_to_jaccard['Minor']))

print("Median Jaccard Similarity for Major Severity:", np.median(severity_to_jaccard['Major']))
print("Median Jaccard Similarity for Moderate Severity:", np.median(severity_to_jaccard['Moderate']))
print("Median Jaccard Similarity for Minor Severity:", np.median(severity_to_jaccard['Minor']))

Average Jaccard Similarity for Major Severity: 0.17656238775029426
Average Jaccard Similarity for Moderate Severity: 0.17405226523665593
Average Jaccard Similarity for Minor Severity: 0.16992887939765447
Median Jaccard Similarity for Major Severity: 0.1685855263157895
Median Jaccard Similarity for Moderate Severity: 0.17027027027027028
Median Jaccard Similarity for Minor Severity: 0.18106893106893107


What fraction of these additive-or-better drug combinations have a known interaction severity in DDInter?

In [11]:
# Share of the unique drug pairs with a Jaccard similarity that also have a
# known DDInter severity. The variable holds a fraction in [0, 1]; it is
# formatted as a percentage for display so the printed value matches its label.
percentage_of_known_ddinter_pairs = len(intersection_drug_pairs) / len(jaccard_unique_drug_pairs)
print("Percentage of known DDInter pairs:", f"{percentage_of_known_ddinter_pairs:.2%}")

Percentage of known DDInter pairs: 0.15979853479853479
