In [64]:
import pandas as pd
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold, rdScaffoldNetwork
from rdkit.Chem import rdFMCS 
from rdkit import DataStructs, Chem
from scipy.spatial.distance import squareform, cdist, pdist
from scipy.cluster.hierarchy import fcluster, linkage, dendrogram
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmilesFromSmiles
import tmap
from faerun import Faerun
from tqdm import tqdm
from glob import glob
import os

In [68]:
from matplotlib.colors import ListedColormap
import seaborn as sns

color = sns.color_palette("jet_r", 100)
custom_cmap = ListedColormap(
    color[5:-5],
    name="custom",
)

In [71]:
def _calc_ecfp4(smiles):
    ecfp4 = AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smiles), radius = 2)    
    return ecfp4

def pairwise_dist_tanimoto(smiles_list):    
    MorganFP_list = [_calc_ecfp4(i) for i in smiles_list]
    TanimotoDist =[]   
    for i, fp1 in enumerate(MorganFP_list):
        for fp2 in MorganFP_list[i+1:]:
            s = DataStructs.TanimotoSimilarity(fp1,fp2)
            #Available similarity metrics include Tanimoto, Dice, 
            # Cosine, Sokal, Russel, Kulczynski, McConnaughey, and Tversky
            d = 1. - s #distance
            TanimotoDist.append(d)
    dist_matrix = squareform(TanimotoDist)

    return dist_matrix

def get_core_fw(smi):
    mol = Chem.MolFromSmiles(smi)
    core = MurckoScaffold.GetScaffoldForMol(mol)
    fw = MurckoScaffold.MakeScaffoldGeneric(core)
    return Chem.MolToSmiles(core), Chem.MolToSmiles(fw)

def get_mcs(mols):
    scd = rdFMCS.FindMCS(mols, completeRingsOnly=True,)
    sc = Chem.MolFromSmarts(scd.smartsString)
    ##get matched
    # mol = mols[0]
    # match_aidx = mol.GetSubstructMatches(sc)
    # frag = AllChem.MolFragmentToSmarts(mol, match_aidx[0])
    return sc

def plot_tmap(df, task_name, save_dir):
    
    smiles_list = df.Smiles.to_list()

    dim = 2048
    data_save_folder = save_dir
    task_name = task_name

    mols = [Chem.MolFromSmiles(s) for s in smiles_list]
    ECFP4_fps = [AllChem.GetMorganFingerprintAsBitVect(x,2,dim) for x in tqdm(mols, ascii=True)]
    ecfps = [tmap.VectorUchar(list(fp)) for fp in ECFP4_fps]
    enc = tmap.Minhash(dim,seed = 42)
    lf = tmap.LSHForest(dim)
    lf.batch_add(enc.batch_from_binary_array(ecfps))
    lf.index()

    cfg = tmap.LayoutConfiguration()
    cfg.k = 50
    cfg.kc = 50
    cfg.sl_scaling_min = 1.0
    cfg.sl_scaling_max = 1.0
    cfg.sl_repeats = 1
    cfg.sl_extra_scaling_steps = 2
    cfg.placer = tmap.Placer.Barycenter
    cfg.merger = tmap.Merger.LocalBiconnected
    cfg.merger_factor = 2.0
    cfg.merger_adjustment = 0
    cfg.fme_iterations = 100
    cfg.sl_scaling_type = tmap.ScalingType.RelativeToDesiredLength
    cfg.node_size = 1/2
    cfg.mmm_repeats = 10
    x, y, s, t, gp = tmap.layout_from_lsh_forest(lf, config = cfg)

    #======================================================
    c1 = df['pChEMBL Value'].round(2)
    c = [c1] #具体的属性数据
    series_title = ['pChEMBL Value'] ##具体的属性数据名称
    categorical = [False]
    cmap = [custom_cmap]
    min_legend_label= [str(c1.min())]
    max_legend_label= [str(c1.max())]

    labels = (df.Smiles + "__" + df.index + "__" 
              + (df['Standard Value'].astype(str) + ' nM')).tolist() #显示smiles， 以及图片中的标签

    point_scale = 20
    legend_labels = [None]
    #========================================================


    faerun = Faerun(view="front", clear_color='#111111',coords=False) #'#ffffff'
    faerun.add_scatter(task_name, { "x": x, "y": y, 
                                  "c": c, "labels": labels},
                       legend_labels = legend_labels,
                       max_legend_label = max_legend_label,
                       min_legend_label = min_legend_label,
                       point_scale=point_scale,
                       colormap = cmap,
                       has_legend=True,
                       categorical = categorical,
                       series_title = series_title,
                       shader = 'smoothCircle') #"sphere", #


    faerun.add_tree(task_name + "_tree", {"from": s, "to": t}, point_helper=task_name,  color='#666666', ) #colors when no value

    # Choose the "smiles" template to display structure on hover
    faerun.plot(task_name+'.js', path = data_save_folder, template="smiles", notebook_height=750)

In [72]:
save_dir = './info'
csvs = ['./BRAF.csv']

target_type_dict = {'mglur2': 'GPCR', 
'usp7':  'Protease',
'rip2':  'Kinase',
'pkci':  'Kinase',
'phgdh':  'Other Enzyme',
'rorg':  'Nuclear Receptor',
'ido1':  'Other Enzyme',
'klk5':  'Protease',
'notum':  'Other Enzyme',
'eaat3':  'Transporter',
'plk1':  'Kinase',
'rxfp1':  'GPCR',
'ur2':  'GPCR',
'braf':  'Kinase'}

names = {}
infos = []
alldf = []
for csv in csvs:
    df = pd.read_csv(csv, index_col=0)
    
    csv_name = os.path.basename(csv).replace('.csv', '')
    print(csv_name)
    
    name = csv_name.lower()
    display_name = csv_name
    url_name = os.path.basename(csv)
    col_list = df.columns.tolist()
    smiles_idx = col_list.index('Smiles') + 1
    y_idx = col_list.index('pChEMBL Value') + 1
    
    # Format: name: [display_name, url_name, csv_name, smiles_idx, y_idx]
    res = {name: [display_name, url_name, csv_name, smiles_idx, y_idx]}
    names.update(res)
    #print(name, len(df), len(df1))
    #print(res)
    
    #dfs = df.Smiles.apply(get_core_fw).apply(pd.Series)
    #scaffolds = df.Smiles.apply(MurckoScaffoldSmilesFromSmiles).value_counts()

    dataset = name
    target = df['Target Name'].unique().tolist()[0]
    target_type = target_type_dict[dataset]
    inhibitors = len(df)
    reference = '; '.join(df['Document ChEMBL ID'].unique().tolist())

    info = {'dataset': dataset, 'target':target, 
            'target_type':target_type, 'size':inhibitors,
            'reference':reference}
    infos.append(info)
    
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        
    plot_tmap(df, name, save_dir)
    
    df1 = df[['Standard Value', 'pChEMBL Value', 'Smiles']]
    df1['dataset'] = name
    alldf.append(df1)


BRAF


100%|##############################################################################| 128/128 [00:00<00:00, 38824.91it/s]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['dataset'] = name
