## **Chemical multiverse visualization: antidiabetic compounds from Sida genus**

In [37]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from rdkit.Chem import rdFingerprintGenerator
import plotly.express as px
import molplotly
import kaleido
from plotly import io as pio
from dash import Dash

In [5]:
data = pd.read_csv("dbs_curated_sidas_dianat.csv")
unpda = pd.read_excel("0_UNPD_A_curated.xlsx")

In [6]:
unpda["DATASET"] = 'UNPD-A'
unpda = unpda[['ID', 'DATASET', 'curated_SMILES']]
unpda.rename(columns={'curated_SMILES': 'curated_smiles'}, inplace=True)
data.columns
data = data[['ID', 'DATASET', 'curated_smiles']]

In [7]:
data_1 = pd.concat([data, unpda], ignore_index=True)

In [8]:
data_1.tail(5)

Unnamed: 0,ID,DATASET,curated_smiles
15601,UNPD_subset_A_14996,UNPD-A,C=C1[C@@H]2CC[C@@H]3[C@](C2)(C(=O)O[C@H]2C[C@H...
15602,UNPD_subset_A_14997,UNPD-A,CC1(C)[C@@H](O)CC[C@]2(C)[C@H]3C=CC4C(=O)C(=O)...
15603,UNPD_subset_A_14998,UNPD-A,C[C@]12CC[C@H](O[C@@H]3OC[C@H](O)[C@@H](O)[C@H...
15604,UNPD_subset_A_14999,UNPD-A,COc1ccc(C(=O)O[C@H]2CC[C@@]3(C)[C@@H]4CC[C@H]5...
15605,UNPD_subset_A_15000,UNPD-A,CO[C@@]12C[C@H](C)O[C@@H](O[C@@H]3C[C@@H]4C[C@...


In [9]:
data_2 = pd.concat([data_1,
                    pd.DataFrame([list(rdFingerprintGenerator.GetMorganGenerator(radius = 2, fpSize = 1024, includeChirality = True).GetFingerprint(Chem.MolFromSmiles(smiles)).ToBitString()) for smiles in data_1['curated_smiles']])],
                    axis = 1)

In [10]:
data_2

Unnamed: 0,ID,DATASET,curated_smiles,0,1,2,3,4,5,6,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,DiaNatDB-1,DiaNat-DB,O=C(O)C1=C[C@@H](O)[C@@H](O)[C@H](O)C1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,DiaNatDB-10,DiaNat-DB,C=C(C)[C@@H]1CC[C@]2(C)CC[C@]3(C)[C@H](CC[C@@H...,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
2,DiaNatDB-100,DiaNat-DB,C=C(CC[C@@H](C)[C@H]1CC[C@@]2(C)C3CC[C@H]4C(C)...,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,DiaNatDB-101,DiaNat-DB,Cc1ccc2c(c1)C(=O)CCO2,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,DiaNatDB-102,DiaNat-DB,CC(=O)c1ccccc1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15601,UNPD_subset_A_14996,UNPD-A,C=C1[C@@H]2CC[C@@H]3[C@](C2)(C(=O)O[C@H]2C[C@H...,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
15602,UNPD_subset_A_14997,UNPD-A,CC1(C)[C@@H](O)CC[C@]2(C)[C@H]3C=CC4C(=O)C(=O)...,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
15603,UNPD_subset_A_14998,UNPD-A,C[C@]12CC[C@H](O[C@@H]3OC[C@H](O)[C@@H](O)[C@H...,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
15604,UNPD_subset_A_14999,UNPD-A,COc1ccc(C(=O)O[C@H]2CC[C@@]3(C)[C@@H]4CC[C@H]5...,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0


In [11]:
# Execute the reduction of components
data_tsne = data_2.iloc[:, 3:]
data_tsne = StandardScaler().fit_transform(data_tsne)
tsne_results_30 = TSNE(n_components = 2, verbose = 1, perplexity = 30, n_iter = 1000).fit_transform(data_tsne)



[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 15606 samples in 0.027s...
[t-SNE] Computed neighbors for 15606 samples in 1.232s...
[t-SNE] Computed conditional probabilities for sample 1000 / 15606
[t-SNE] Computed conditional probabilities for sample 2000 / 15606
[t-SNE] Computed conditional probabilities for sample 3000 / 15606
[t-SNE] Computed conditional probabilities for sample 4000 / 15606
[t-SNE] Computed conditional probabilities for sample 5000 / 15606
[t-SNE] Computed conditional probabilities for sample 6000 / 15606
[t-SNE] Computed conditional probabilities for sample 7000 / 15606
[t-SNE] Computed conditional probabilities for sample 8000 / 15606
[t-SNE] Computed conditional probabilities for sample 9000 / 15606
[t-SNE] Computed conditional probabilities for sample 10000 / 15606
[t-SNE] Computed conditional probabilities for sample 11000 / 15606
[t-SNE] Computed conditional probabilities for sample 12000 / 15606
[t-SNE] Computed conditional probabilities for sam

In [12]:
# Concatenate the labels as numpy array
labels = data_2[['ID', 'DATASET', "curated_smiles"]]
labels = labels.to_numpy()
array = np.concatenate((labels, tsne_results_30), axis = 1)
tsne_df_30 = pd.DataFrame(array, columns = ['ID', 'DATASET', "curated_smiles", 'component1', 'component2'])
tsne_df_30

Unnamed: 0,ID,DATASET,curated_smiles,component1,component2
0,DiaNatDB-1,DiaNat-DB,O=C(O)C1=C[C@@H](O)[C@@H](O)[C@H](O)C1,45.147614,-34.357094
1,DiaNatDB-10,DiaNat-DB,C=C(C)[C@@H]1CC[C@]2(C)CC[C@]3(C)[C@H](CC[C@@H...,-73.985214,-15.05434
2,DiaNatDB-100,DiaNat-DB,C=C(CC[C@@H](C)[C@H]1CC[C@@]2(C)C3CC[C@H]4C(C)...,-44.283718,16.796438
3,DiaNatDB-101,DiaNat-DB,Cc1ccc2c(c1)C(=O)CCO2,15.916361,-21.432692
4,DiaNatDB-102,DiaNat-DB,CC(=O)c1ccccc1,13.332603,0.976818
...,...,...,...,...,...
15601,UNPD_subset_A_14996,UNPD-A,C=C1[C@@H]2CC[C@@H]3[C@](C2)(C(=O)O[C@H]2C[C@H...,-39.44939,-18.44998
15602,UNPD_subset_A_14997,UNPD-A,CC1(C)[C@@H](O)CC[C@]2(C)[C@H]3C=CC4C(=O)C(=O)...,-16.523472,8.018917
15603,UNPD_subset_A_14998,UNPD-A,C[C@]12CC[C@H](O[C@@H]3OC[C@H](O)[C@@H](O)[C@H...,-76.297455,11.901145
15604,UNPD_subset_A_14999,UNPD-A,COc1ccc(C(=O)O[C@H]2CC[C@@]3(C)[C@@H]4CC[C@H]5...,12.405819,24.546255


In [13]:
tsne_df_30["DATASET"] = pd.Categorical(tsne_df_30["DATASET"],
                                    categories = ["UNPD-A", "DiaNat-DB", "SG-accumulated", "SR-accumulated", "SH-accumulated"])
tsne_df_30 = tsne_df_30.sort_values('DATASET').reset_index(drop=True)
tsne_df_30

Unnamed: 0,ID,DATASET,curated_smiles,component1,component2
0,UNPD_subset_A_7193,UNPD-A,C=C[C@@](C)(CC/C=C(\C)CC/C=C(\C)[C@H](O)[C@@H]...,1.167016,39.584553
1,UNPD_subset_A_9789,UNPD-A,OC[C@H]1O[C@@H](O[C@H]2[C@H](O)[C@@H](CO)O[C@@...,20.769939,54.034527
2,UNPD_subset_A_9790,UNPD-A,OC[C@]1(O)COc2cc(O)ccc2-c2cc(O)c(O)cc2C1,4.527165,-7.494673
3,UNPD_subset_A_9791,UNPD-A,OC[C@@H]1CC[C@H]2CCCC[C@H]2C1,-16.451649,-15.195016
4,UNPD_subset_A_9792,UNPD-A,OC[C@@H]1[C@@H](O)[C@H](O)[C@H]2[C@H](O)[C@@H]...,-21.917358,11.985225
...,...,...,...,...,...
15601,14055737,SH-accumulated,C[C@@H]1CC[C@]2(C(=O)O)CC[C@]3(C)C(=CC[C@@H]4[...,19.165894,84.347931
15602,73207,SH-accumulated,COc1c(OC)c(O)c2c(=O)cc(-c3ccc(O)cc3)oc2c1OC,82.629059,20.619614
15603,638014,SH-accumulated,CC(=O)/C=C/C1=C(C)CCCC1(C)C,-44.366432,28.744669
15604,10603919,SH-accumulated,CC1(C)CC[C@]2(CO)CC[C@]3(C)C(=CC[C@@H]4[C@@]5(...,16.014708,78.472542


In [16]:
tsne_df_30.to_csv("tsne_df_30.csv", index = False)

In [42]:
# Create the tSNE plot for all datasets
tsne_30 = px.scatter(tsne_df_30,
                            x = 'component1',
                            y = 'component2',
                            # symbol = 'Minimum Degree',
                            color = 'DATASET',
                            color_discrete_sequence = ["#D9DDDC",
                                                    "#979745",
                                                    "#4ab82b",
                                                    "#304be3",
                                                    "#19a3a6",
                                                    ],  # color sequence for the datasets
                            #color code: UNPD-A: #D9DDDC, Dianat: #979745, Glabra #4ab82b, Rhombi #304be3,  Hyssopi #19a3a6, 

                            title = False,
                            labels = {'tSNE dimension 1': 'tSNE dimension 1',
                                    'tSNE dimension 2': 'tSNE dimension 2'},
                            width = 1000,
                            height = 800)

# Aumentar tamaño de los puntos
tsne_30.update_traces(marker = dict(size = 10))

tsne_30.update_layout(
    xaxis = dict(showgrid = False, showline = True, linewidth = 1, linecolor = 'black', mirror = True, tickfont = dict(size = 24, color = 'black')),
    yaxis = dict(showgrid = False, showline = True, linewidth = 1, linecolor = 'black', mirror = True, tickfont = dict(size = 24, color = 'black')),
    plot_bgcolor = 'white',
    paper_bgcolor = 'white',
    showlegend = False  # Esta línea elimina la leyenda
)
000000


# Modify x-axis label
# Modify x-axis label
tsne_30.update_xaxes(
    title_text = "tSNE dimension 1",
    range = [-90, 90],
    title_font = dict(size = 30, color =  "black")
)

tsne_30.update_yaxes(
    title_text = "tSNE dimension 2",
    range = [-90, 90],
    title_font = dict(size = 30, color = "black")
)

# app_marker = molplotly.add_molecules(fig = tsne_30,
#                                         df = tsne_df_30,
#                                         smiles_col = 'curated_smiles',
#                                         title_col = 'ID',
#                                         color_col = 'DATASET'
#                                         )
pio.write_image(tsne_30, 'tsne_reference.png', format = "png", width=1000, height=800, scale=3)

#tsne_30.show()
# app_marker.run_server(mode = 'inline', port = 8060, height = 1000)
# app_marker.run(port = 8060)

ValueError: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido
