# DrugBank STRING Distance Analysis #

File for examining whether the average distance between the targets in a drug combination is correlated with toxicity level (do closer targets mean higher toxicity?).

In [1]:
# Import everything needed
from matplotlib.patches import Patch
from scipy import stats
from sklearn.metrics import r2_score
from statsmodels.stats.multitest import multipletests
from preprocessing_functions import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scikit_posthocs as sp
import seaborn as sns

In [2]:
# Load the processed DrugBank drug-combination table.
drug_syntox_df = pd.read_csv('data_processed/drugbank_processed_combos_syntoxtargallpw_string.csv')
print("Original Drug Combination Dataframe Shape: ", drug_syntox_df.shape)

# Keep only the combinations that have an average target-target distance.
drug_syntox_df = drug_syntox_df.dropna(subset=['avg_short_path_btwn_targets'])
print("Drug Combination Dataframe Shape After Dropping NaNs: ", drug_syntox_df.shape)

# Every remaining step in this cell looks at the same column, so grab it once.
target_dist = drug_syntox_df['avg_short_path_btwn_targets']

# Normality check on the distance distribution (scipy.stats.normaltest).
normality_result = stats.normaltest(target_dist)
print("Is the avg_short_path_btwn_targets distribution normal? Normal test p-value: ", normality_result.pvalue)

# Histogram of the target-target distances, saved to disk rather than shown.
plt.hist(target_dist, bins=20)
plt.title('Histogram of Average Shortest Path Between Targets')
plt.xlabel('Average Shortest Path Between Targets')
plt.ylabel('Frequency')
plt.savefig('results/avg_targ_distance_analysis/avg_short_path_btwn_targets_hist_drugbank.png')
plt.close()

# Summary statistics (count, mean, std, quartiles, ...) written next to the plot.
avg_targ_dist_stats = target_dist.describe()
avg_targ_dist_stats.to_csv('results/avg_targ_distance_analysis/avg_short_path_btwn_targets_stats_drugbank.csv')

Original Drug Combination Dataframe Shape:  (62728, 12)
Drug Combination Dataframe Shape After Dropping NaNs:  (62660, 12)
Is the avg_short_path_btwn_targets distribution normal? Normal test p-value:  0.0


- Kruskal Wallis Test
- Dunn Posthoc Test with Bonferroni Correction
- Jonckheere-Terpstra Test
- ANOVA
- T Test with Bonferroni Correction

In [3]:
# Compare the average drug target distance across the Major, Moderate, and Minor
# toxicity categories with omnibus tests, pairwise post-hoc tests and a trend test,
# then write every result to one TSV file.

# Average target distance per toxicity category
major_target = drug_syntox_df[drug_syntox_df['toxicity_category'] == 'Major']['avg_short_path_btwn_targets'].dropna()
moderate_target = drug_syntox_df[drug_syntox_df['toxicity_category'] == 'Moderate']['avg_short_path_btwn_targets'].dropna()
minor_target = drug_syntox_df[drug_syntox_df['toxicity_category'] == 'Minor']['avg_short_path_btwn_targets'].dropna()
print(f'Major avg target distance: {len(major_target)}')
print(f'Moderate avg target distance: {len(moderate_target)}')
print(f'Minor avg target distance: {len(minor_target)}')

# Kruskal-Wallis test: do the three categories share the same distribution?
h_statistic_target_tox, p_value_target_tox = stats.kruskal(major_target, moderate_target, minor_target)
print(f'Kruskal-Wallis H statistic for avg target distance: {h_statistic_target_tox}')
print(f'P-value for avg target distance: {p_value_target_tox}')

# Dunn post-hoc test (Bonferroni-adjusted) on the rows that belong to one of the
# three toxicity categories and have a non-null average target distance
drug_syntox_df_tox_target = drug_syntox_df[drug_syntox_df['toxicity_category'].isin(['Major', 'Moderate', 'Minor'])]
drug_syntox_df_tox_target = drug_syntox_df_tox_target[~drug_syntox_df_tox_target['avg_short_path_btwn_targets'].isnull()]
dunn_target_tox = sp.posthoc_dunn(drug_syntox_df_tox_target, val_col='avg_short_path_btwn_targets', group_col='toxicity_category', p_adjust='bonferroni')
print('Dunn post-hoc test for avg target distance:' + str(dunn_target_tox))

# Look the pairwise p-values up by category label rather than by position, so the
# labels written to the TSV cannot drift from the values if the row/column order
# of the result matrix ever changes.
dunn_major_minor = dunn_target_tox.loc['Major', 'Minor']
dunn_major_moderate = dunn_target_tox.loc['Major', 'Moderate']
dunn_moderate_minor = dunn_target_tox.loc['Moderate', 'Minor']

# Jonckheere-Terpstra test -- is there a trend in the average target distance as
# toxicity increases from minor to major? The groups are passed in the order of
# the hypothesised trend, so the test is run once per direction.
# Element [0] of the result is written out as the test statistic, [1] as the p-value.
atdist_major_samples = major_target.values.tolist()
atdist_moderate_samples = moderate_target.values.tolist()
atdist_minor_samples = minor_target.values.tolist()
jt_incr_target = jonckheere_terpestra_test([atdist_minor_samples, atdist_moderate_samples, atdist_major_samples])
print(f"Increasing toxicity for avg target distance p value: {jt_incr_target[1]:0.3e}")
jt_decr_target = jonckheere_terpestra_test([atdist_major_samples, atdist_moderate_samples, atdist_minor_samples])
print(f"Decreasing toxicity for avg target distance p value: {jt_decr_target[1]:0.3e}")

# One-way ANOVA on the same three groups
f_statistic_anova_target_tox, p_value_anova_target_tox = stats.f_oneway(major_target, moderate_target, minor_target)
print(f'ANOVA F-statistic for avg target distance: {f_statistic_anova_target_tox}')
print(f'P-value for avg target distance: {p_value_anova_target_tox}')

# Pairwise independent-samples t-tests: Major/Minor, Major/Moderate, Moderate/Minor
major_minor_target = stats.ttest_ind(major_target, minor_target)
major_moderate_target = stats.ttest_ind(major_target, moderate_target)
moderate_minor_target = stats.ttest_ind(moderate_target, minor_target)

# Bonferroni correction on the three t-test p-values; element [1] of the
# multipletests result holds the corrected p-values in input order
ttest_p_values = [major_minor_target[1], major_moderate_target[1], moderate_minor_target[1]]
ttest_p_values_corrected = multipletests(ttest_p_values, method='bonferroni')
print(f'Major/Minor T-test: {major_minor_target[1]} Corrected: {ttest_p_values_corrected[1][0]}')
print(f'Major/Moderate T-test: {major_moderate_target[1]} Corrected: {ttest_p_values_corrected[1][1]}')
print(f'Moderate/Minor T-test: {moderate_minor_target[1]} Corrected: {ttest_p_values_corrected[1][2]}')

# Write all results to a single TSV. For the pairwise tests the "Test statistic"
# column carries the pair label and only the (adjusted) p-value is reported.
with open('results/avg_targ_distance_analysis/avg_targ_distance_tox_drugbank.tsv', 'w') as f:
    f.write('Level\tTest\tTest statistic\tP-value\n')
    f.write('Target\tKruskal-Wallis\t{:.4e}\t{:.4e}\n'.format(h_statistic_target_tox, p_value_target_tox))
    f.write('Target\tDunn post-hoc\tMajor/Minor\t{:.4e}\n'.format(dunn_major_minor))
    f.write('Target\tDunn post-hoc\tMajor/Moderate\t{:.4e}\n'.format(dunn_major_moderate))
    f.write('Target\tDunn post-hoc\tModerate/Minor\t{:.4e}\n'.format(dunn_moderate_minor))
    f.write('Target\tJonckheere-Terpstra Increasing toxicity\t{:.4e}\t{:.4e}\n'.format(jt_incr_target[0], jt_incr_target[1]))
    f.write('Target\tJonckheere-Terpstra Decreasing toxicity\t{:.4e}\t{:.4e}\n'.format(jt_decr_target[0], jt_decr_target[1]))
    f.write('Target\tANOVA\t{:.4e}\t{:.4e}\n'.format(f_statistic_anova_target_tox, p_value_anova_target_tox))
    f.write('Target\tT-test (bonf cor)\tMajor/Minor\t{:.4e}\n'.format(ttest_p_values_corrected[1][0]))
    f.write('Target\tT-test (bonf cor)\tMajor/Moderate\t{:.4e}\n'.format(ttest_p_values_corrected[1][1]))
    f.write('Target\tT-test (bonf cor)\tModerate/Minor\t{:.4e}\n'.format(ttest_p_values_corrected[1][2]))


Major avg target distance: 36802
Moderate avg target distance: 19069
Minor avg target distance: 6789
Kruskal-Wallis H statistic for avg target distance: 6672.431211006749
P-value for avg target distance: 0.0
Dunn post-hoc test for avg target distance:          Major          Minor       Moderate
Major       1.0   0.000000e+00   0.000000e+00
Minor       0.0   1.000000e+00  1.621376e-274
Moderate    0.0  1.621376e-274   1.000000e+00
Increasing toxicity for avg target distance p value: 1.000e+00
Decreasing toxicity for avg target distance p value: 0.000e+00
ANOVA F-statistic for avg target distance: 4502.224490269037
P-value for avg target distance: 0.0
Major/Minor T-test: 0.0 Corrected: 0.0
Major/Moderate T-test: 0.0 Corrected: 0.0
Moderate/Minor T-test: 1.1048210663780817e-239 Corrected: 3.314463199134245e-239


- Violin Plot

In [4]:
# Toxicity categories in plotting order (minor to major), one colour for each.
tox_order = ['Minor', 'Moderate', 'Major']
colors = ['#20965D', '#FFBC42', '#D81159']
color_dict = {category: shade for category, shade in zip(tox_order, colors)}

# One legend patch per category, coloured to match the palette.
legend_elements = [Patch(facecolor=color_dict[cat], label=cat) for cat in tox_order]

# Violin plot of the average target distance within each toxicity category.
ax = sns.violinplot(
    data=drug_syntox_df,
    x='toxicity_category',
    y='avg_short_path_btwn_targets',
    hue='toxicity_category',
    order=tox_order,
    palette=color_dict,
)

# The category names on the ticks already label the x axis, so leave its title blank.
ax.set_xlabel('')
ax.set_ylabel('Average Target Distance')
ax.yaxis.label.set_size(20)
plt.xticks(fontsize=20)

plt.tight_layout()
plt.savefig('results/avg_targ_distance_analysis/avgtargdist_v_toxcats_drugbank_violin.png', dpi=700)
plt.close()


- Strip Plot

In [5]:
# Strip plot of every drug combination's average target distance, one column per
# toxicity category in tox_order.
ax = sns.stripplot(data=drug_syntox_df, x='toxicity_category', y='avg_short_path_btwn_targets', palette=color_dict, hue='toxicity_category', order=tox_order)

# Overlay each category's mean as a horizontal line: draw a box plot on the same
# axes with the mean line switched on and every other element hidden.
sns.boxplot(
    showmeans=True,
    meanline=True,
    meanprops={'color': 'k', 'ls': '-', 'lw': 1},
    medianprops={'visible': False},
    whiskerprops={'visible': False},
    zorder=10,  # keep the mean line on top of the points
    x="toxicity_category",
    y="avg_short_path_btwn_targets",
    data=drug_syntox_df,
    # Same category order as the strip plot, so each mean line sits over its own
    # category instead of depending on the order the categories appear in the data.
    order=tox_order,
    showfliers=False,
    showbox=False,
    showcaps=False,
    ax=ax
)
ax.set(xlabel='', ylabel='Average Target Distance')
plt.xticks(fontsize=20)
ax.yaxis.label.set_size(20)
plt.tight_layout()
plt.savefig('results/avg_targ_distance_analysis/avgtargdist_v_toxcats_drugbank_strip.png', dpi=700)
plt.close()

- Correlation Scatter Plots (Average Target Distance v Synergy Scores)
- R^2 value
- Best fit line
- Pearson correlation coefficient
- Spearman correlation coefficient

In [6]:
def _plot_synergy_vs_avgtargdist(x, y, ylabel, out_path):
    """Scatter one synergy score against average target distance and fit a line.

    Draws the scatter plot, overlays the degree-1 least-squares fit (labelled
    with its R² value), saves the figure to ``out_path`` and closes it.

    Args:
        x: pandas Series of average target distances.
        y: pandas Series of synergy scores, aligned with ``x``.
        ylabel: y-axis label for the plot.
        out_path: path the figure is saved to.

    Returns:
        Tuple ``(pearson, spearman, r_squared)``: the Pearson and Spearman
        correlation coefficients between ``x`` and ``y``, and the R² of the
        fitted line.
    """
    plt.scatter(x, y)

    # Best fit line
    fit_line = np.poly1d(np.polyfit(x, y, 1))
    y_pred = fit_line(x)
    r_squared = r2_score(y, y_pred)
    plt.plot(x, y_pred, "r-", alpha=0.8, label=f'R² = {r_squared:.3f}')
    plt.xlabel('Average Target Distance', fontsize=20)
    plt.ylabel(ylabel, fontsize=20)
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_path, dpi=700)
    plt.close()

    # Pearson is the default method of Series.corr
    return x.corr(y), x.corr(y, method='spearman'), r_squared


# Average target distance and the synergy scores for the same rows (those with a
# non-null average target distance)
has_avgtargdist = drug_syntox_df['avg_short_path_btwn_targets'].notna()
x_avgtargdist = drug_syntox_df['avg_short_path_btwn_targets'].dropna()
y_bliss_true = drug_syntox_df.loc[has_avgtargdist, 'synergy_bliss']
y_loewe_true = drug_syntox_df.loc[has_avgtargdist, 'synergy_loewe']
y_hsa_true = drug_syntox_df.loc[has_avgtargdist, 'synergy_hsa']
y_zip_true = drug_syntox_df.loc[has_avgtargdist, 'synergy_zip']
y_smax_true = drug_syntox_df.loc[has_avgtargdist, 'S_max']
y_smean_true = drug_syntox_df.loc[has_avgtargdist, 'S_mean']
y_ssum_true = drug_syntox_df.loc[has_avgtargdist, 'S_sum']

# NOTE: the per-score fit intermediates (polynomial coefficients, fitted values)
# are local to the helper; only the correlations and R² values are kept here.

####### BLISS ########
avgtargdist_bliss_corr, avgtargdist_bliss_spearman_corr, r_squared_avgtargdist_bliss = _plot_synergy_vs_avgtargdist(
    x_avgtargdist, y_bliss_true, 'Bliss Synergy Score',
    'results/avg_targ_distance_analysis/bliss_v_avgtargdist_drugbank_scatter.png')

####### HSA ########
avgtargdist_hsa_corr, avgtargdist_hsa_spearman_corr, r_squared_avgtargdist_hsa = _plot_synergy_vs_avgtargdist(
    x_avgtargdist, y_hsa_true, 'HSA Synergy Score',
    'results/avg_targ_distance_analysis/hsa_v_avgtargdist_drugbank_scatter.png')

####### LOEWE ########
avgtargdist_loewe_corr, avgtargdist_loewe_spearman_corr, r_squared_avgtargdist_loewe = _plot_synergy_vs_avgtargdist(
    x_avgtargdist, y_loewe_true, 'Loewe Synergy Score',
    'results/avg_targ_distance_analysis/loewe_v_avgtargdist_drugbank_scatter.png')

####### ZIP ########
avgtargdist_zip_corr, avgtargdist_zip_spearman_corr, r_squared_avgtargdist_zip = _plot_synergy_vs_avgtargdist(
    x_avgtargdist, y_zip_true, 'ZIP Synergy Score',
    'results/avg_targ_distance_analysis/zip_v_avgtargdist_drugbank_scatter.png')

###### S_MAX ########
avgtargdist_smax_corr, avgtargdist_smax_spearman_corr, r_squared_avgtargdist_smax = _plot_synergy_vs_avgtargdist(
    x_avgtargdist, y_smax_true, 'S_max Synergy Score',
    'results/avg_targ_distance_analysis/smax_v_avgtargdist_drugbank_scatter.png')

###### S_MEAN ########
avgtargdist_smean_corr, avgtargdist_smean_spearman_corr, r_squared_avgtargdist_smean = _plot_synergy_vs_avgtargdist(
    x_avgtargdist, y_smean_true, 'S_mean Synergy Score',
    'results/avg_targ_distance_analysis/smean_v_avgtargdist_drugbank_scatter.png')

###### S_SUM ########
avgtargdist_ssum_corr, avgtargdist_ssum_spearman_corr, r_squared_avgtargdist_ssum = _plot_synergy_vs_avgtargdist(
    x_avgtargdist, y_ssum_true, 'S_sum Synergy Score',
    'results/avg_targ_distance_analysis/ssum_v_avgtargdist_drugbank_scatter.png')

# Write all the Pearson correlations, Spearman correlations and R² values to a file,
# one row per synergy score
correlation_rows = [
    ('Bliss', avgtargdist_bliss_corr, avgtargdist_bliss_spearman_corr, r_squared_avgtargdist_bliss),
    ('HSA', avgtargdist_hsa_corr, avgtargdist_hsa_spearman_corr, r_squared_avgtargdist_hsa),
    ('Loewe', avgtargdist_loewe_corr, avgtargdist_loewe_spearman_corr, r_squared_avgtargdist_loewe),
    ('ZIP', avgtargdist_zip_corr, avgtargdist_zip_spearman_corr, r_squared_avgtargdist_zip),
    ('S_max', avgtargdist_smax_corr, avgtargdist_smax_spearman_corr, r_squared_avgtargdist_smax),
    ('S_mean', avgtargdist_smean_corr, avgtargdist_smean_spearman_corr, r_squared_avgtargdist_smean),
    ('S_sum', avgtargdist_ssum_corr, avgtargdist_ssum_spearman_corr, r_squared_avgtargdist_ssum),
]
with open('results/avg_targ_distance_analysis/avgtargdist_v_synergy_correlations_drugbank.tsv', 'w') as f:
    f.write('Synergy score\tLevel\tPearson correlation coefficient\tSpearman correlation\tR squared value\n')
    for score_name, pearson_corr, spearman_corr, r_squared in correlation_rows:
        f.write('{}\tAverage Target Distance\t{:.4e}\t{:.4e}\t{:.4e}\n'.format(score_name, pearson_corr, spearman_corr, r_squared))