**Author:** Benoît BAILLIF

**Purpose:** Produce the figures showing the chemical-space and biological-space (quadrant) plots

**Input:**
- data/processed/
 - cmp_info_cmap.csv : info on used compounds
 - used_pert_id_target_matrix.csv : find targets for each used compound
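 - signatures/ : one `<cell_line>_used_signatures.csv` file per cell line, holding the signatures of the used compounds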
 
**Output:** 
- plots/Figure 6 and Supplementary Figure 2 : space plots

In [3]:
import pandas            as pd
import numpy             as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib
import os
import time

from rdkit           import Chem
from rdkit.Chem      import AllChem
from sklearn.metrics import pairwise_distances

# Input

In [1]:
# Root folder of the processed inputs and its signatures sub-folder
# (both keep a trailing slash because file names are appended directly).
processed_data_directory = 'data/processed/'
sigs_directory = f'{processed_data_directory}signatures/'

In [2]:
# CSV inputs located directly under the processed data folder.
cmp_info_cmap_path = f'{processed_data_directory}cmp_info_cmap.csv'
used_pert_id_target_matrix_path = f'{processed_data_directory}used_pert_id_target_matrix.csv'

# Output

In [4]:
# Folder receiving the generated figures (trailing slash: file names are appended directly).
plots_directory = 'plots/'

In [5]:
# Destination files of the two figures produced by this notebook.
figure6_path = f'{plots_directory}Figure 6.tif'
supplementary_figure_2_path = f'{plots_directory}Supplementary Figure 2.tif'

# Main

In [6]:
# Cell lines for which a '<cell_line>_used_signatures.csv' file is loaded below.
cell_lines = ['MCF7', 'HA1E', 'HT29', 'A549', 'HCC515', 'PC3', 'VCAP', 'A375']

## Data loading

In [7]:
# Compound annotation table, indexed by pert_id (name, SMILES, used_compound flag, ...).
cmp_info_cmap = pd.read_csv(cmp_info_cmap_path, index_col='pert_id')
print(cmp_info_cmap.shape)
# Last expression of the cell: shows the first rows in the notebook output.
cmp_info_cmap.head()

(21220, 6)


Unnamed: 0_level_0,pert_iname,is_touchstone,inchi_key,canonical_smiles,pubchem_cid,used_compound
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BRD-A00100033,nifurtimox,1.0,ARFHIAQFJWUCFH-UHFFFAOYSA-N,CC1CS(=O)(=O)CCN1N=Cc1ccc([N+](=O)[O-])o1,6842999.0,1
BRD-A00150179,5-hydroxytryptophan,0.0,QSHLMQDRPXXYEE-UHFFFAOYSA-N,NC(Cc1c[nH]c2cccc(O)c12)C(=O)O,589768.0,0
BRD-A00267231,hemado,1.0,KOCIMZNSNPOGOP-UHFFFAOYSA-N,CCCCC#Cc1nc(NC)c2ncn(C3OC(CO)C(O)C3O)c2n1,4043357.0,1
BRD-A00420644,SA-3676,0.0,ASCBUEVCEVGOFP-UHFFFAOYSA-N,CCN1c2ccccc2NC2N=C(OC)C(c3ccccc3)C21,2853908.0,1
BRD-A00474148,BRD-A00474148,0.0,RCGAUPRLRFZAMS-UHFFFAOYSA-N,O=C1Cc2cc([S+](=O)([O-])N3CCN(c4ccc(O)cc4)CC3)...,44825297.0,1


In [8]:
# Restrict the annotation table to the rows whose used_compound flag is set.
used_mask = cmp_info_cmap['used_compound'].eq(True)
used_cmp_info_cmap = cmp_info_cmap[used_mask]

In [9]:
# Compound x target matrix indexed by pert_id; the cells below treat a value of 1
# as "compound is active on this target" (the preview also shows 0.0 and missing values).
used_pert_id_target_matrix = pd.read_csv(used_pert_id_target_matrix_path, index_col='pert_id')
print(used_pert_id_target_matrix.shape)
# Last expression of the cell: shows the first rows in the notebook output.
used_pert_id_target_matrix.head()

(7825, 1358)


Unnamed: 0_level_0,AAK1,ABAT,ABCA1,ABCB1,ABCB11,ABCB6,ABCC1,ABCC2,ABCC3,ABCC5,...,WEE2,WNT3A,WRN,XDH,XIAP,XPO1,YES1,YWHAB,YWHAG,ZAP70
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BRD-A00100033,,,,,,,,,,,...,,,,,,,,,,
BRD-A00218260,,,,,,,,,,,...,,,,,,,,,,
BRD-A00267231,,,,,,,,,,,...,,,,,,,,,,
BRD-A00420644,,,,0.0,,0.0,0.0,,,,...,,,0.0,,,,,,0.0,
BRD-A00474148,,,,,,0.0,,,,,...,,,,,,,,,,


In [10]:
# Map each cell line name to its signature table (rows indexed by pert_id).
d_sigs = {}
for cell_line in cell_lines:
    signature_path = f'{sigs_directory}{cell_line}_used_signatures.csv'
    d_sigs[cell_line] = pd.read_csv(signature_path, index_col='pert_id')

# Compute quadrant plots for active compounds

In [11]:
def _morgan_bit_vector(mol):
    """Return the 1024-bit Morgan fingerprint (radius 3) of an RDKit molecule."""
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=1024)


# Parse every used compound's SMILES, fingerprint it, and collect the bits in a
# boolean table with one row per pert_id and one column per fingerprint bit.
smiles_column = used_cmp_info_cmap['canonical_smiles']
molecules = smiles_column.apply(Chem.MolFromSmiles)
fps = molecules.apply(_morgan_bit_vector)
binary_fps = np.stack(fps)
binary_fps_df = pd.DataFrame(binary_fps, index=used_cmp_info_cmap.index)
binary_fps_df = binary_fps_df.astype(bool)

In [22]:
def _long_pairwise_distances(features, metric, value_name):
    """Return every pairwise row distance of ``features`` in long format.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per compound; the index labels identify the compounds.
    metric : str
        Metric name forwarded to ``sklearn.metrics.pairwise_distances``.
    value_name : str
        Name of the column that receives the distances.

    Returns
    -------
    pandas.DataFrame
        Columns ``perturbagen_1``, ``perturbagen_2`` and ``value_name``, one
        row per ordered pair (self-pairs and both orientations included).
    """
    labels = features.index
    square = pd.DataFrame(
        pairwise_distances(features.values, metric=metric),
        # rename() returns new Index objects, so the two axes can carry
        # different names without touching features.index.
        index=labels.rename('perturbagen_1'),
        columns=labels.rename('perturbagen_2'),
    )
    return square.reset_index().melt(id_vars='perturbagen_1',
                                     var_name='perturbagen_2',
                                     value_name=value_name)


# For each (cell line, target) pair, compute the fingerprint Dice distance and
# the signature cosine distance between all compounds active on the target,
# then stack everything into distance_table.
start = time.perf_counter()
targets = ['NR3C1', 'TUBB', 'DRD1']

distance_columns = ['perturbagen_1', 'perturbagen_2', 'smiles_1', 'smiles_2', 'target',
                    'cell_line', 'ECFP6_dice_dist', 'signature_cosine_dist']

# pert_id -> canonical_smiles lookup; loop-invariant, so built once.
smiles_lookup = cmp_info_cmap['canonical_smiles'].reset_index()

# Per-group tables are collected and concatenated once at the end instead of
# growing distance_table inside the loop.
distance_frames = []

for cell_line in ['MCF7', 'A549', 'PC3']:
    print(cell_line)

    cell_line_sigs = d_sigs[cell_line]

    for target in targets:
        print(target)

        # Compounds flagged active (== 1) on the target and profiled in this cell line.
        actives = used_pert_id_target_matrix[used_pert_id_target_matrix[target] == 1].index
        actives = actives[actives.isin(cell_line_sigs.index)]
        if actives.empty:
            # pairwise_distances rejects an empty input; nothing to add for this group.
            continue

        active_fps = binary_fps_df.loc[actives]
        active_sigs = cell_line_sigs.loc[actives]

        fp_dists = _long_pairwise_distances(active_fps, 'dice', 'ECFP6_dice_dist')
        sig_dists = _long_pairwise_distances(active_sigs, 'cosine', 'signature_cosine_dist')

        all_dists = fp_dists.merge(sig_dists, on=['perturbagen_1', 'perturbagen_2'])

        # Attach the SMILES of each member of the pair (smiles_1, then smiles_2).
        for side in ('1', '2'):
            all_dists = all_dists.merge(smiles_lookup, how='inner',
                                        left_on=[f'perturbagen_{side}'], right_on=['pert_id'])
            all_dists = all_dists.drop('pert_id', axis=1)
            all_dists = all_dists.rename({'canonical_smiles': f'smiles_{side}'}, axis=1)

        all_dists['target'] = target
        all_dists['cell_line'] = cell_line
        distance_frames.append(all_dists)

if distance_frames:
    # Column selection restores the documented column order.
    distance_table = pd.concat(distance_frames, axis=0)[distance_columns]
else:
    distance_table = pd.DataFrame(columns=distance_columns)

end = time.perf_counter()
print(end - start)

MCF7
NR3C1
TUBB
DRD1
A549
NR3C1
TUBB
DRD1
PC3
NR3C1
TUBB
DRD1
0.5185329914093018


In [23]:
# Build an orientation-independent key for every compound pair: the two pert_ids
# are sorted before being joined, so (A, B) and (B, A) get the same key.
# The key is then qualified by cell line and target.
# (assumes pert_ids never contain '__', which holds for the BRD-... identifiers shown above)
sorted_pair_labels = [
    '_'.join(sorted(pair))
    for pair in zip(distance_table['perturbagen_1'], distance_table['perturbagen_2'])
]
unique_pairs = pd.Series(sorted_pair_labels, index=distance_table.index, dtype=object)
key_suffix = '__' + distance_table['cell_line'] + '__' + distance_table['target']
distance_table['unique_pair'] = unique_pairs + key_suffix

In [26]:
# Keep the first row of every unique_pair key (drops the mirrored orientation) ...
first_occurrence = ~distance_table.duplicated(subset=['unique_pair'])
distance_table = distance_table[first_occurrence]
# ... then discard the pairs made of a compound with itself.
distinct_members = distance_table['perturbagen_1'].ne(distance_table['perturbagen_2'])
distance_table = distance_table[distinct_members]

In [27]:
# Notebook display of the resulting pair table.
distance_table

Unnamed: 0,perturbagen_1,perturbagen_2,smiles_1,smiles_2,target,cell_line,ECFP6_dice_dist,signature_cosine_dist,unique_pair
1,BRD-K00824317,BRD-A34299591,CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...,CCCC1O[C@@H]2C[C@H]3[C@@H]4CCC5=CC(=O)C=C[C@]5...,NR3C1,MCF7,0.530864,0.758737,BRD-A34299591_BRD-K00824317__MCF7__NR3C1
2,BRD-K02407574,BRD-A34299591,CCCCc1ccc2[nH]c(NC(=O)OC)nc2c1,CCCC1O[C@@H]2C[C@H]3[C@@H]4CCC5=CC(=O)C=C[C@]5...,NR3C1,MCF7,0.824818,1.167193,BRD-A34299591_BRD-K02407574__MCF7__NR3C1
3,BRD-K03981224,BRD-A34299591,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(C...,CCCC1O[C@@H]2C[C@H]3[C@@H]4CCC5=CC(=O)C=C[C@]5...,NR3C1,MCF7,0.702128,0.911739,BRD-A34299591_BRD-K03981224__MCF7__NR3C1
4,BRD-K09295900,BRD-A34299591,C[C@]12C[C@H](O)[C@H]3[C@@H](CCC4=CC(=O)C=C[C@...,CCCC1O[C@@H]2C[C@H]3[C@@H]4CCC5=CC(=O)C=C[C@]5...,NR3C1,MCF7,0.430303,1.039730,BRD-A34299591_BRD-K09295900__MCF7__NR3C1
5,BRD-K10799896,BRD-A34299591,CCC(=O)O[C@]1(C(=O)CCl)[C@@H](C)C[C@H]2[C@@H]3...,CCCC1O[C@@H]2C[C@H]3[C@@H]4CCC5=CC(=O)C=C[C@]5...,NR3C1,MCF7,0.617284,0.834606,BRD-A34299591_BRD-K10799896__MCF7__NR3C1
...,...,...,...,...,...,...,...,...,...
8552,BRD-K97440753,BRD-K97158071,CC(C)[C@@]1(NC(=O)[C@@H]2C[C@@H]3c4cccc5[nH]cc...,O=C(CCCN1CC=C(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F...,DRD1,PC3,0.775401,0.838584,BRD-K97158071_BRD-K97440753__PC3__DRD1
8553,BRD-K99792991,BRD-K97158071,Oc1c(Cl)cc(Cl)c(Cl)c1Cc1c(O)c(Cl)cc(Cl)c1Cl,O=C(CCCN1CC=C(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F...,DRD1,PC3,0.918367,0.978273,BRD-K97158071_BRD-K99792991__PC3__DRD1
8646,BRD-K97440753,BRD-K97309399,CC(C)[C@@]1(NC(=O)[C@@H]2C[C@@H]3c4cccc5[nH]cc...,CN1CCN(CC/C=C2\c3ccccc3Sc3ccc(S(=O)(=O)N(C)C)c...,DRD1,PC3,0.805405,0.840809,BRD-K97309399_BRD-K97440753__PC3__DRD1
8647,BRD-K99792991,BRD-K97309399,Oc1c(Cl)cc(Cl)c(Cl)c1Cc1c(O)c(Cl)cc(Cl)c1Cl,CN1CCN(CC/C=C2\c3ccccc3Sc3ccc(S(=O)(=O)N(C)C)c...,DRD1,PC3,0.875000,0.700614,BRD-K97309399_BRD-K99792991__PC3__DRD1


In [None]:
# Figure 6: 3 x 3 grid of quadrant plots, one row per target and one column per
# cell line. Each panel scatters fingerprint Dice distance (x) against signature
# cosine distance (y) and is split at 0.5 on both axes.
# NOTE(review): quadrant_data and group_total are not defined in the visible part
# of this notebook; they must exist (with 'cell_line', 'activity', and the
# 'Qn' / 'Qn percent' columns read below) before this cell runs.
icol = 1  # not used in this cell; kept in case another cell reads it
fig, axs = plt.subplots(3, 3, dpi=300, sharex='col', sharey='row',
                        gridspec_kw={'hspace': 0.1, 'wspace': 0.1},
                        figsize=(18/2.54, 18/2.54))  # 18 cm x 18 cm
(ax1, ax2, ax3), (ax4, ax5, ax6), (ax7, ax8, ax9) = axs
lax = [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9]
figure_subletter = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']

row_targets = ['NR3C1', 'TUBB', 'DRD1']
column_cell_lines = ['A549', 'MCF7', 'PC3']

# (column prefix in group_total, x, y, horizontal alignment, vertical alignment)
quadrant_annotations = [
    ('Q1', 0, 1.8, 'left', 'top'),
    ('Q2', 1, 1.8, 'right', 'top'),
    ('Q3', 0, 0, 'left', 'bottom'),
    ('Q4', 1, 0, 'right', 'bottom'),
]

#fig.suptitle('Quadrant plot', fontsize=16, y=0.93)
for row, target in enumerate(row_targets):
    for col, cell_line in enumerate(column_cell_lines):
        i = row * len(column_cell_lines) + col  # panel index in row-major order

        cax = lax[i]
        sub_df = quadrant_data[(quadrant_data['cell_line'] == cell_line) & (quadrant_data['activity'] == target)]
        cax.scatter(sub_df['ECFP6_dice_dist'], sub_df['signature_cosine_dist'], s=1)
        cax.axhline(0.5, color='k', linestyle='--', linewidth=1)
        cax.axvline(0.5, color='k', linestyle='--', linewidth=1)
        cax.tick_params(labelsize=7)
        cax.set_xlim([-0.05, 1.05])
        cax.set_ylim([-0.05, 2.05])

        subgroup_total = group_total[(group_total['cell_line'] == cell_line) & (group_total['activity'] == target)]
        if subgroup_total.shape[0] > 0:
            cax.text(0, 2, figure_subletter[i], horizontalalignment='left',
                     verticalalignment='top', fontsize=10)
            for quadrant, x_pos, y_pos, h_align, v_align in quadrant_annotations:
                # NOTE(review): counts are halved before display, presumably because
                # group_total counts each unordered pair twice; confirm.
                count = subgroup_total[quadrant].values[0] / 2
                percent = round(subgroup_total[quadrant + ' percent'].values[0])
                cax.text(x_pos, y_pos, str(count) + ' (' + str(percent) + ' %)',
                         horizontalalignment=h_align, verticalalignment=v_align, fontsize=6)

# Create a big invisible subplot that only carries the shared axis labels
ax = fig.add_subplot(111, frameon=False)
# hide ticks and tick labels of the big axes; real booleans are required here,
# the strings 'off' are truthy and would switch the ticks on
ax.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)

ax.set_xlabel('Morgan fingerprint Dice distance', fontsize=12)
ax.set_ylabel('Signature cosine distance', fontsize=12, labelpad=20)

ax1.set_title('A549', fontsize=8)
ax2.set_title('MCF7', fontsize=8)
ax3.set_title('PC3', fontsize=8)

ax1.set_ylabel('NR3C1', fontsize=8)
ax4.set_ylabel('TUBB', fontsize=8)
ax7.set_ylabel('DRD1', fontsize=8)

# Write to the path declared in the Output section and make sure its folder exists.
os.makedirs(os.path.dirname(figure6_path) or '.', exist_ok=True)
# 'tiff_lzw' is the Pillow name of the LZW TIFF compression.
plt.savefig(figure6_path, bbox_inches='tight', pil_kwargs={'compression': 'tiff_lzw'})

In [46]:
# Scratch cell: recompute the fingerprint distance matrix for a single
# (cell line, target) pair. It also resets distance_table to an empty frame
# whose grouping column is named 'activity' instead of 'target'.
start = time.time()
targets = ['NR3C1', 'TUBB', 'DRD1']

distance_table = pd.DataFrame(columns=['perturbagen_1', 'perturbagen_2', 'smiles_1', 'smiles_2', 'activity',
                                         'cell_line', 'ECFP6_dice_dist', 'signature_cosine_dist'])

cell_line = 'MCF7'

cell_line_sigs = d_sigs[cell_line]
cell_line_fps = binary_fps_df.loc[cell_line_sigs.index, ]

target = 'NR3C1'

actives = used_pert_id_target_matrix[used_pert_id_target_matrix[target] == 1].index
# Keep only the actives profiled in this cell line, as the main loop does;
# otherwise the .loc lookups below fail on compounds missing from the cell line.
actives = actives[actives.isin(cell_line_sigs.index)]

if not actives.empty:

    active_fps = cell_line_fps.loc[actives, ]
    active_sigs = cell_line_sigs.loc[actives, ]

    fp_dists = pairwise_distances(active_fps.values, metric='dice')
    # index.copy() gives each axis its own Index object so they can be named independently
    fp_dists = pd.DataFrame(fp_dists, index=active_fps.index.copy(), columns=active_fps.index.copy())
    fp_dists.index.name = 'perturbagen_1'
    fp_dists.columns.name = 'perturbagen_2'
    # NOTE: fp_dists is left as a square matrix here; the long-format reshape
    # (reset_index + melt + rename of 'value' to 'ECFP6_dice_dist') is not applied in this cell.

end = time.time()
print(end - start)

0.04689764976501465




In [None]:
# Scratch cell: signature distances for the actives selected in the previous cell,
# merged with the fingerprint distances and the SMILES of both pair members.
sig_dists = pairwise_distances(active_sigs.values, metric='cosine')
# index.copy() gives each axis its own Index object; sharing one object would make
# the two name assignments below overwrite each other (and rename active_sigs.index).
sig_dists = pd.DataFrame(sig_dists, index=active_sigs.index.copy(), columns=active_sigs.index.copy())
sig_dists.index.name = 'perturbagen_1'
sig_dists.columns.name = 'perturbagen_2'
sig_dists = sig_dists.reset_index().melt('perturbagen_1')
sig_dists = sig_dists.rename({'value' : 'signature_cosine_dist'}, axis=1)

# fp_dists may still be the square matrix (index 'perturbagen_1', columns
# 'perturbagen_2'); bring it to long format before merging if needed.
if 'ECFP6_dice_dist' in fp_dists.columns:
    fp_dists_long = fp_dists
else:
    fp_dists_long = fp_dists.reset_index().melt('perturbagen_1')
    fp_dists_long = fp_dists_long.rename({'value' : 'ECFP6_dice_dist'}, axis=1)

all_dists = fp_dists_long.merge(sig_dists, left_on=['perturbagen_1', 'perturbagen_2'],
                                right_on=['perturbagen_1', 'perturbagen_2'])

# pert_id -> canonical_smiles lookup used for both members of the pair
smiles_lookup = pd.DataFrame(cmp_info_cmap['canonical_smiles']).reset_index()

all_dists = all_dists.merge(smiles_lookup, how='inner',
                            left_on=['perturbagen_1'], right_on=['pert_id'])
all_dists = all_dists.drop('pert_id', axis=1)
all_dists = all_dists.rename({'canonical_smiles' : 'smiles_1'}, axis=1)

all_dists = all_dists.merge(smiles_lookup, how='inner',
                            left_on=['perturbagen_2'], right_on=['pert_id'])
all_dists = all_dists.drop('pert_id', axis=1)
all_dists = all_dists.rename({'canonical_smiles' : 'smiles_2'}, axis=1)
# 'target' is the variable holding the studied target in this notebook ('moa' is never defined).
all_dists['activity'] = target
all_dists['cell_line'] = cell_line
distance_table = pd.concat([distance_table, all_dists], axis=0)