**Author:** Benoît BAILLIF

**Purpose:** Compute t-SNE embeddings of the Morgan fingerprints and of the signatures of the used compounds

**Input:**
- data/processed/
 - cmp_info_cmap.csv : to find the used compounds
 - signatures/ : to find the used signatures of each cell line
 
**Output:** 
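- data/processed/tsne/
 - used_compounds_mfp_tsne.csv : t-SNE of the Morgan fingerprints of the used compounds
 - {cell_line}_sigs_tsne.csv : one t-SNE per cell line
 - all_sigs_tsne.csv : one t-SNE for all signatures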

In [16]:
import numpy as np
import pandas as pd
import time
import os
from sklearn.manifold import TSNE
from rdkit            import Chem
from rdkit.Chem       import AllChem

# Input

In [32]:
# Root of the processed data; every input path below is derived from it.
processed_data_directory = 'data/processed/'

# Folder holding one '<cell line>_used_signatures.csv' file per cell line.
sig_directory = os.path.join(processed_data_directory, 'signatures/')

# Compound annotation table (used to select the compounds to embed).
cmp_info_cmap_file_name = 'cmp_info_cmap.csv'
cmp_info_cmap_path = os.path.join(processed_data_directory, cmp_info_cmap_file_name)

# Output

In [17]:
# Folder receiving every t-SNE table written by this notebook.
tsne_directory = 'data/processed/tsne/'

# exist_ok=True creates the folder (and missing parents) only when needed, without the
# check-then-create race of a separate os.path.exists() test, so the cell is safe to re-run.
os.makedirs(tsne_directory, exist_ok=True)

In [18]:
# Destination CSV for the Morgan-fingerprint t-SNE coordinates of the used compounds.
used_compounds_mfp_tsne_path = '{}{}'.format(tsne_directory, 'used_compounds_mfp_tsne.csv')

# Main

## Compute Morgan fingerprints t-SNE, representing the used chemical space

In [6]:
# Read the compound annotation table and key its rows by the 'pert_id' column.
cmp_info_cmap = pd.read_csv(cmp_info_cmap_path).set_index('pert_id')

# Quick sanity check: table dimensions, then a preview of the first rows.
print(cmp_info_cmap.shape)
cmp_info_cmap.head()

(21220, 6)


Unnamed: 0_level_0,pert_iname,is_touchstone,inchi_key,canonical_smiles,pubchem_cid,used_compound
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BRD-A00100033,nifurtimox,1.0,ARFHIAQFJWUCFH-UHFFFAOYSA-N,CC1CS(=O)(=O)CCN1N=Cc1ccc([N+](=O)[O-])o1,6842999.0,1
BRD-A00150179,5-hydroxytryptophan,0.0,QSHLMQDRPXXYEE-UHFFFAOYSA-N,NC(Cc1c[nH]c2cccc(O)c12)C(=O)O,589768.0,0
BRD-A00267231,hemado,1.0,KOCIMZNSNPOGOP-UHFFFAOYSA-N,CCCCC#Cc1nc(NC)c2ncn(C3OC(CO)C(O)C3O)c2n1,4043357.0,1
BRD-A00420644,SA-3676,0.0,ASCBUEVCEVGOFP-UHFFFAOYSA-N,CCN1c2ccccc2NC2N=C(OC)C(c3ccccc3)C21,2853908.0,1
BRD-A00474148,BRD-A00474148,0.0,RCGAUPRLRFZAMS-UHFFFAOYSA-N,O=C1Cc2cc([S+](=O)([O-])N3CCN(c4ccc(O)cc4)CC3)...,44825297.0,1


In [7]:
# Keep only the rows whose 'used_compound' flag compares equal to True.
is_used_compound = cmp_info_cmap['used_compound'] == True
used_cmp_info_cmap = cmp_info_cmap.loc[is_used_compound]

In [10]:
# Parse the canonical SMILES of every used compound into an RDKit molecule.
# The resulting Series keeps the index of used_cmp_info_cmap.
molecules = used_cmp_info_cmap['canonical_smiles'].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None for a SMILES it cannot parse; passing None to the
# fingerprint call below would fail with an opaque error. Report and skip such compounds,
# filtering the Series itself so that molecules.index stays aligned with binary_fps rows.
unparsed = molecules.isna()
if unparsed.any() :
    print('Skipping {} compound(s) with unparsable SMILES: {}'.format(
        unparsed.sum(), list(molecules.index[unparsed])))
    molecules = molecules[~unparsed]

# One row per molecule: the 1024-bit Morgan fingerprint (radius 3) expanded to a list of bits.
binary_fps = np.array([list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=1024)) for mol in molecules])

In [11]:
# 2-D t-SNE of the binary fingerprints using the Dice distance; fixed seed for reproducibility.
fp_tsne_model = TSNE(random_state=42, metric='dice', verbose=1)

time_start = time.time()
fp_tsne = fp_tsne_model.fit_transform(binary_fps)
elapsed_seconds = time.time() - time_start
print('t-SNE done! Time elapsed: {} seconds'.format(elapsed_seconds))

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 9035 samples in 1.784s...
[t-SNE] Computed neighbors for 9035 samples in 209.951s...
[t-SNE] Computed conditional probabilities for sample 1000 / 9035
[t-SNE] Computed conditional probabilities for sample 2000 / 9035
[t-SNE] Computed conditional probabilities for sample 3000 / 9035
[t-SNE] Computed conditional probabilities for sample 4000 / 9035
[t-SNE] Computed conditional probabilities for sample 5000 / 9035
[t-SNE] Computed conditional probabilities for sample 6000 / 9035
[t-SNE] Computed conditional probabilities for sample 7000 / 9035
[t-SNE] Computed conditional probabilities for sample 8000 / 9035
[t-SNE] Computed conditional probabilities for sample 9000 / 9035
[t-SNE] Computed conditional probabilities for sample 9035 / 9035
[t-SNE] Mean sigma: 0.242368
[t-SNE] KL divergence after 250 iterations with early exaggeration: 93.798813
[t-SNE] KL divergence after 1000 iterations: 1.663906
t-SNE done! Time elapsed: 240.385200

In [19]:
# Wrap the embedding in a labelled table (rows keyed like `molecules`) and write it to disk.
fp_tsne_columns = ['Dice_TSNE_1', 'Dice_TSNE_2']
fp_tsne = pd.DataFrame(data=fp_tsne, index=molecules.index, columns=fp_tsne_columns)
fp_tsne.to_csv(used_compounds_mfp_tsne_path)

## Compute the signatures t-SNE, representing the biological spaces

In [29]:
# Cell lines whose '<cell line>_used_signatures.csv' file is embedded below.
cell_lines = [
    'MCF7',
    'HA1E',
    'HT29',
    'A549',
    'HCC515',
    'PC3',
    'VCAP',
    'A375',
]

### One t-SNE per cell line, computed on its used signatures

In [35]:
# Signature tables of every cell line, each tagged with a 'cell_line' column.
# They are gathered in a list and concatenated once after the loop: growing a DataFrame
# with pd.concat on every iteration re-copies all previously accumulated rows each time.
cell_line_sig_frames = []

for cell_line in cell_lines :
    print('Computing ' + cell_line + ' signatures t-SNE')

    # Used signatures of this cell line, keyed by the 'pert_id' column.
    cell_line_df = pd.read_csv(sig_directory + cell_line + '_used_signatures.csv')
    cell_line_df = cell_line_df.set_index('pert_id')

    # 2-D t-SNE of the signatures using the cosine distance; fixed seed for reproducibility.
    time_start = time.time()
    cell_line_tsne = TSNE(random_state=42, metric='cosine', verbose=1).fit_transform(cell_line_df.values)
    print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))

    # Column names carry the metric and the cell line so tables can be merged without clashes.
    tsne_col_1 = 'Cosine_' + cell_line + '_sigs_TSNE_1'
    tsne_col_2 = 'Cosine_' + cell_line + '_sigs_TSNE_2'
    cell_line_tsne = pd.DataFrame(cell_line_tsne, index=cell_line_df.index, columns = [tsne_col_1, tsne_col_2])

    cell_line_tsne.to_csv(tsne_directory + cell_line + '_sigs_tsne.csv')

    # assign() returns a tagged copy, leaving the loaded signature table itself unmodified.
    cell_line_sig_frames.append(cell_line_df.assign(cell_line=cell_line))

# Stack all cell lines; fall back to an empty frame when no cell line was processed,
# because pd.concat raises on an empty list.
all_sigs = pd.concat(cell_line_sig_frames) if cell_line_sig_frames else pd.DataFrame()

Computing MCF7 signatures t-SNE
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 7546 samples in 0.005s...
[t-SNE] Computed neighbors for 7546 samples in 1.513s...
[t-SNE] Computed conditional probabilities for sample 1000 / 7546
[t-SNE] Computed conditional probabilities for sample 2000 / 7546
[t-SNE] Computed conditional probabilities for sample 3000 / 7546
[t-SNE] Computed conditional probabilities for sample 4000 / 7546
[t-SNE] Computed conditional probabilities for sample 5000 / 7546
[t-SNE] Computed conditional probabilities for sample 6000 / 7546
[t-SNE] Computed conditional probabilities for sample 7000 / 7546
[t-SNE] Computed conditional probabilities for sample 7546 / 7546
[t-SNE] Mean sigma: 0.156262
[t-SNE] KL divergence after 250 iterations with early exaggeration: 90.063797
[t-SNE] KL divergence after 1000 iterations: 2.546462
t-SNE done! Time elapsed: 25.304924726486206 seconds
Computing HA1E signatures t-SNE
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Ind

### One single t-SNE for all used signatures from all cell lines

In [36]:
# Embed the signatures of all cell lines together in a single t-SNE space.
# The 'cell_line' label is dropped by name rather than by position, so the embedding
# never silently includes the label (or loses a signature column) if the column order changes.
all_sig_values = all_sigs.drop(columns='cell_line').values

time_start = time.time()
all_sigs_tsne = TSNE(random_state=42, metric='cosine', verbose=1).fit_transform(all_sig_values)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))

# NOTE(review): these headers do not follow the 'Cosine_..._sigs_TSNE_n' convention used for
# the per-cell-line tables; they are kept unchanged in case other code reads this CSV by
# column name. Confirm before renaming.
all_sigs_tsne = pd.DataFrame(all_sigs_tsne, index=all_sigs.index, columns = ['sig_TSNE_1', 'sig_TSNE_2'])

# Rows are in the same order as all_sigs, so copy the labels positionally. The index is built
# by stacking several cell lines and may contain repeated labels, which makes index-aligned
# assignment fragile.
all_sigs_tsne['cell_line'] = all_sigs['cell_line'].values
all_sigs_tsne.to_csv(tsne_directory + 'all_sigs_tsne.csv')

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 39544 samples in 0.023s...
[t-SNE] Computed neighbors for 39544 samples in 50.939s...
[t-SNE] Computed conditional probabilities for sample 1000 / 39544
[t-SNE] Computed conditional probabilities for sample 2000 / 39544
[t-SNE] Computed conditional probabilities for sample 3000 / 39544
[t-SNE] Computed conditional probabilities for sample 4000 / 39544
[t-SNE] Computed conditional probabilities for sample 5000 / 39544
[t-SNE] Computed conditional probabilities for sample 6000 / 39544
[t-SNE] Computed conditional probabilities for sample 7000 / 39544
[t-SNE] Computed conditional probabilities for sample 8000 / 39544
[t-SNE] Computed conditional probabilities for sample 9000 / 39544
[t-SNE] Computed conditional probabilities for sample 10000 / 39544
[t-SNE] Computed conditional probabilities for sample 11000 / 39544
[t-SNE] Computed conditional probabilities for sample 12000 / 39544
[t-SNE] Computed conditional probabilities for sa

FileNotFoundError: [Errno 2] No such file or directory: 'signatures/sig_tsne.csv'