In [1]:
import pandas as pd
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold, rdScaffoldNetwork
from rdkit.Chem import rdFMCS 
from rdkit import DataStructs, Chem
from scipy.spatial.distance import squareform, cdist, pdist
from scipy.cluster.hierarchy import fcluster, linkage, dendrogram
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmilesFromSmiles
import tmap
from faerun import Faerun
from tqdm import tqdm
from glob import glob
import os

In [2]:

def _calc_ecfp4(smiles):
    ecfp4 = AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smiles), radius = 2)    
    return ecfp4

def pairwise_dist_tanimoto(smiles_list):    
    MorganFP_list = [_calc_ecfp4(i) for i in smiles_list]
    TanimotoDist =[]   
    for i, fp1 in enumerate(MorganFP_list):
        for fp2 in MorganFP_list[i+1:]:
            s = DataStructs.TanimotoSimilarity(fp1,fp2)
            #Available similarity metrics include Tanimoto, Dice, 
            # Cosine, Sokal, Russel, Kulczynski, McConnaughey, and Tversky
            d = 1. - s #distance
            TanimotoDist.append(d)
    dist_matrix = squareform(TanimotoDist)

    return dist_matrix

def get_core_fw(smi):
    mol = Chem.MolFromSmiles(smi)
    core = MurckoScaffold.GetScaffoldForMol(mol)
    fw = MurckoScaffold.MakeScaffoldGeneric(core)
    return Chem.MolToSmiles(core), Chem.MolToSmiles(fw)

def get_mcs(mols):
    scd = rdFMCS.FindMCS(mols, completeRingsOnly=True,)
    sc = Chem.MolFromSmarts(scd.smartsString)
    ##get matched
    # mol = mols[0]
    # match_aidx = mol.GetSubstructMatches(sc)
    # frag = AllChem.MolFragmentToSmarts(mol, match_aidx[0])
    return sc

def plot_tmap(df, task_name, save_dir):
    
    smiles_list = df.Smiles.to_list()

    dim = 2048
    data_save_folder = save_dir
    task_name = task_name

    mols = [Chem.MolFromSmiles(s) for s in smiles_list]
    ECFP4_fps = [AllChem.GetMorganFingerprintAsBitVect(x,2,dim) for x in tqdm(mols, ascii=True)]
    ecfps = [tmap.VectorUchar(list(fp)) for fp in ECFP4_fps]
    enc = tmap.Minhash(dim,seed = 42)
    lf = tmap.LSHForest(dim)
    lf.batch_add(enc.batch_from_binary_array(ecfps))
    lf.index()

    cfg = tmap.LayoutConfiguration()
    cfg.k = 50
    cfg.kc = 50
    cfg.sl_scaling_min = 1.0
    cfg.sl_scaling_max = 1.0
    cfg.sl_repeats = 1
    cfg.sl_extra_scaling_steps = 2
    cfg.placer = tmap.Placer.Barycenter
    cfg.merger = tmap.Merger.LocalBiconnected
    cfg.merger_factor = 2.0
    cfg.merger_adjustment = 0
    cfg.fme_iterations = 2000
    cfg.sl_scaling_type = tmap.ScalingType.RelativeToDesiredLength
    cfg.node_size = 1 / 2
    cfg.mmm_repeats = 1
    x, y, s, t, gp = tmap.layout_from_lsh_forest(lf, config = cfg)

    #======================================================
    c1 = df['pChEMBL Value'].round(3)
    c = [c1] #具体的属性数据
    series_title = ['pChEMBL Value'] ##具体的属性数据名称
    categorical = [False]
    cmap = ['jet']
    min_legend_label= [str(c1.min())]
    max_legend_label= [str(c1.max())]

    labels = (df.Smiles + "__" + df.index + "__" 
              + (df['Standard Value'].astype(str) + ' nM')).tolist() #显示smiles， 以及图片中的标签

    point_scale = 13
    legend_labels = [None]
    #========================================================


    faerun = Faerun(view="front", clear_color='#111111',coords=False) #'#ffffff'
    faerun.add_scatter(task_name, { "x": x, "y": y, 
                                  "c": c, "labels": labels},
                       legend_labels = legend_labels,
                       max_legend_label = max_legend_label,
                       min_legend_label = min_legend_label,
                       point_scale=point_scale,
                       colormap = cmap,
                       has_legend=True,
                       categorical = categorical,
                       series_title = series_title,
                       shader = 'smoothCircle') #"sphere", #


    faerun.add_tree(task_name + "_tree", {"from": s, "to": t}, point_helper=task_name,  color='#666666', ) #colors when no value

    # Choose the "smiles" template to display structure on hover
    faerun.plot(task_name, path = data_save_folder, template="smiles", notebook_height=750)

In [3]:
csvs = glob('./*.csv')
target_type_dict = {'mGluR2': 'GPCR', 
'USP7':  'Protease',
'RIP2':  'Kinase',
'PKCi':  'Kinase',
'PHGDH':  'Other Enzyme',
'RORg':  'Nuclear Receptor',
'IDO1':  'Other Enzyme',
'KLK5':  'Protease',
'Notum':  'Other Enzyme',
'EAAT3':  'Transporter',
'PLK1':  'Kinase',
'RXFP1':  'GPCR',
'UR2':  'GPCR',
'BRAF':  'Kinase'}

In [4]:
save_dir = './info'
names = {}
infos = []
for csv in csvs:
    df = pd.read_csv(csv, index_col=0)
    
    csv_name = os.path.basename(csv).replace('.csv', '')
    print(csv_name)
    
    name = csv_name.lower()
    display_name = csv_name
    url_name = os.path.basename(csv)
    col_list = df.columns.tolist()
    smiles_idx = col_list.index('Smiles')
    y_idx = col_list.index('Standard Value')
    
    # Format: name: [display_name, url_name, csv_name, smiles_idx, y_idx]
    res = {name: [display_name, url_name, csv_name, smiles_idx, y_idx]}
    names.update(res)
    #print(name, len(df), len(df1))
    #print(res)
    
    #dfs = df.Smiles.apply(get_core_fw).apply(pd.Series)
    #scaffolds = df.Smiles.apply(MurckoScaffoldSmilesFromSmiles).value_counts()

    dataset = name
    target = df['Target Name'].unique().tolist()[0]
    target_type = target_type_dict[csv_name]
    inhibitors = len(df)
    reference = '; '.join(df['Document ChEMBL ID'].unique().tolist())

    info = {'dataset': dataset, 'target':target, 
            'target_type':target_type, 'size':inhibitors,
            'reference':reference}
    infos.append(info)
    
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        
    plot_tmap(df, name, save_dir)
    
pd.DataFrame(names).to_json(os.path.join(save_dir, 'names.json'))
pd.DataFrame(infos).to_json(os.path.join(save_dir, 'infos.json'))

UR2


100%|################################################################################| 60/60 [00:00<00:00, 43774.26it/s]


PLK1


100%|################################################################################| 73/73 [00:00<00:00, 30091.81it/s]


RIP2


100%|################################################################################| 46/46 [00:00<00:00, 46784.19it/s]


RORg


100%|################################################################################| 68/68 [00:00<00:00, 27003.66it/s]


KLK5


100%|################################################################################| 65/65 [00:00<00:00, 50787.96it/s]


BRAF


100%|##############################################################################| 128/128 [00:00<00:00, 44143.31it/s]


USP7


100%|################################################################################| 45/45 [00:00<00:00, 31963.37it/s]


PHGDH


100%|################################################################################| 51/51 [00:00<00:00, 45697.39it/s]


Notum


100%|##############################################################################| 128/128 [00:00<00:00, 68574.65it/s]


RXFP1


100%|##############################################################################| 117/117 [00:00<00:00, 40356.38it/s]


mGluR2


100%|##############################################################################| 244/244 [00:00<00:00, 40315.55it/s]


PKCi


100%|################################################################################| 48/48 [00:00<00:00, 57736.33it/s]


EAAT3


100%|################################################################################| 59/59 [00:00<00:00, 60357.06it/s]


IDO1


100%|################################################################################| 78/78 [00:00<00:00, 58577.57it/s]


In [5]:
pd.DataFrame(infos).to_markdown()

'|    | dataset   | target                                      | target_type      |   size | reference                                                  |\n|---:|:----------|:--------------------------------------------|:-----------------|-------:|:-----------------------------------------------------------|\n|  0 | ur2       | Urotensin II receptor                       | GPCR             |     60 | CHEMBL1146083                                              |\n|  1 | plk1      | Serine/threonine-protein kinase PLK1        | Kinase           |     73 | CHEMBL4406868; CHEMBL4138231                               |\n|  2 | rip2      | Serine/threonine-protein kinase RIPK2       | Kinase           |     46 | CHEMBL4266012; CHEMBL4130524                               |\n|  3 | rorg      | Nuclear receptor ROR-gamma                  | Nuclear Receptor |     68 | CHEMBL4619752                                              |\n|  4 | klk5      | Kallikrein 5                                | Prot

In [7]:
names

{'ur2': ['UR2', 'UR2.csv', 'UR2', 6, 9],
 'plk1': ['PLK1', 'PLK1.csv', 'PLK1', 6, 9],
 'rip2': ['RIP2', 'RIP2.csv', 'RIP2', 6, 9],
 'rorg': ['RORg', 'RORg.csv', 'RORg', 6, 9],
 'klk5': ['KLK5', 'KLK5.csv', 'KLK5', 6, 9],
 'braf': ['BRAF', 'BRAF.csv', 'BRAF', 6, 9],
 'usp7': ['USP7', 'USP7.csv', 'USP7', 6, 9],
 'phgdh': ['PHGDH', 'PHGDH.csv', 'PHGDH', 6, 9],
 'notum': ['Notum', 'Notum.csv', 'Notum', 6, 9],
 'rxfp1': ['RXFP1', 'RXFP1.csv', 'RXFP1', 6, 9],
 'mglur2': ['mGluR2', 'mGluR2.csv', 'mGluR2', 6, 9],
 'pkci': ['PKCi', 'PKCi.csv', 'PKCi', 2, 5],
 'eaat3': ['EAAT3', 'EAAT3.csv', 'EAAT3', 6, 9],
 'ido1': ['IDO1', 'IDO1.csv', 'IDO1', 6, 9]}