In [2]:
### Ligand Database ###
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
from copy import deepcopy
from rdkit import Chem
from SBLMDCOVDOCK.SBLSettings import DOCKING_Settings
import numpy as np
settings = DOCKING_Settings()
import ast
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from sklearn.metrics import pairwise_distances

import plotly.express as px
from phate import PHATE

from sklearn.decomposition import PCA
import numpy as np


In [None]:
# Fetch the BLDB structure listing and collect every <table> element on the page.
url = "http://bldb.eu/S-BLDB.php"
# A timeout keeps the notebook from hanging forever on an unresponsive server.
response = requests.get(url, timeout=60)
# Fail loudly on HTTP errors instead of parsing an error page as if it were data.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# `table` is the list of all <table> tags; a later cell picks one of them by position.
table = soup.find_all('table')


In [None]:
from io import StringIO

# Parse the scraped tables; index 1 selects the second table found on the page.
# The HTML is wrapped in StringIO because passing literal HTML strings to
# pd.read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)))[1]
# drop the first row
df = df.drop(df.index[0])

# fill empty cells in Ligands with APO, then split each cell on single spaces
df['Ligands'] = df['Ligands'].fillna('APO').apply(lambda x: x.split(" "))

# drop Ambler classes B1, B2 and B3
# .copy() makes df an independent frame so that later column assignments
# do not raise SettingWithCopyWarning.
df = df[~df['Ambler class'].isin(['B1', 'B2', 'B3'])].copy()
df.head()



In [None]:
# List the distinct Ambler classes that remain in the table, one per line.
for ambler_class in df["Ambler class"].unique():
    print(ambler_class)

In [None]:
# Marker characters that tag ligand codes in the Ligands column.
# The previous filter was written as `("*" or "#" or "$") in ...`, which Python
# evaluates to just `"*" in ...`, so "#" and "$" were never actually tested.
LIGAND_MARKERS = "*#$"


def _first_marked(codes, marker):
    """Return the first code containing `marker` (marker stripped), or None."""
    for code in codes:
        if marker in code:
            return code.strip(marker)
    return None


def _has_marker(code):
    """True when `code` contains any of the LIGAND_MARKERS characters."""
    return any(marker in code for marker in LIGAND_MARKERS)


# First "*"-tagged and first "$"-tagged ligand of every entry (None when absent).
df["Covalent"] = df.Ligands.apply(lambda codes: _first_marked(codes, "*"))
df["Michaelis"] = df.Ligands.apply(lambda codes: _first_marked(codes, "$"))

# Entries without any tagged ligand are labelled APO.
df["APO"] = df.Ligands.apply(
    lambda codes: None if any(_has_marker(code) for code in codes) else "APO"
)
# Keep only the tagged ligand codes; entries with none become ["APO"].
df.Ligands = df.Ligands.apply(
    lambda codes: [code for code in codes if _has_marker(code)] or ["APO"]
)

# Entries with exactly one retained ligand. .copy() gives an independent frame,
# so the column assignment below does not raise SettingWithCopyWarning.
single_ligand = df[df.Ligands.apply(len) == 1].copy()

# convert ligands to string; markers are stripped here so that downstream
# file names / download URLs are built from the bare ligand code
single_ligand["Ligands"] = single_ligand.Ligands.apply(
    lambda codes: codes[0].strip(LIGAND_MARKERS)
)
# remove APO
single_ligand = single_ligand[single_ligand.Ligands != "APO"]
print(single_ligand.head())



In [None]:
# How many entries carry each covalent ligand code.
print(df["Covalent"].value_counts())

In [None]:
# Dump every single-ligand code, one per line.
for ligand_code in single_ligand["Ligands"]:
    print(ligand_code)

In [None]:

# Show the entries that retain more than one ligand code.
multi_ligand_entries = (entry for entry in df["Ligands"] if len(entry) > 1)
for entry in multi_ligand_entries:
    print(entry)





In [None]:
# Remove leading/trailing '*' characters from every ligand code.
single_ligand["Ligands"] = [code.strip('*') for code in single_ligand["Ligands"]]


In [None]:
# One row per ligand code; the first occurrence is the one that is kept.
unique_ligands = single_ligand.drop_duplicates(subset="Ligands")
unique_ligands["Ligands"]

In [None]:
def download_ligands(ligcode, settings=settings, timeout=30):
    """
    Download the ideal-geometry SDF of one ligand from files.rcsb.org.

    Parameters
    ----------
    ligcode : str
        Ligand code used in the download URL and in the output file name.
    settings : object
        Must expose `structures_input`, the directory the SDF is written to.
    timeout : float
        Seconds to wait for the server before `requests` gives up.

    Returns
    -------
    str
        Path of the written file, `<structures_input>/<ligcode>.sdf`.

    Raises
    ------
    requests.HTTPError
        When the server answers with an error status (e.g. unknown code).
    """
    url = f"https://files.rcsb.org/ligands/download/{ligcode}_ideal.sdf"
    # Make sure the target directory exists before trying to write into it.
    os.makedirs(settings.structures_input, exist_ok=True)
    sdf_file = os.path.join(settings.structures_input, f"{ligcode}.sdf")

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we got a valid response

    # Write the content of the response to a file
    with open(sdf_file, 'wb') as f:
        f.write(response.content)

    print(f"Ligand {ligcode} downloaded at {sdf_file}")
    return sdf_file

In [None]:
# Download the SDF of every ligand that is not on disk yet.
# The loop previously hit `break` before the download call, so nothing was ever
# fetched; checking for the file first keeps re-runs free of network traffic
# while still letting a fresh checkout obtain the structures.
for ligandcode in unique_ligands.Ligands:
    sdf_path = os.path.join(settings.structures_input, f"{ligandcode}.sdf")
    if os.path.exists(sdf_path):
        continue
    download_ligands(ligandcode)

In [None]:
def SDF_to_SMILES(path):
    """
    Return the SMILES string of the first readable molecule in an SDF file.

    Records that the supplier yields as None are skipped. The result is None
    when the file contains no readable molecule at all.
    """
    readable = (entry for entry in Chem.SDMolSupplier(path) if entry is not None)
    first_mol = next(readable, None)
    return None if first_mol is None else Chem.MolToSmiles(first_mol)

In [None]:
# Fill the SMILES column from each ligand's SDF file in settings.structures_input.
for code in unique_ligands["Ligands"]:
    sdf_path = os.path.join(settings.structures_input, f"{code}.sdf")
    code_smiles = SDF_to_SMILES(sdf_path)
    print(code, code_smiles)
    unique_ligands.loc[unique_ligands["Ligands"] == code, "SMILES"] = code_smiles
    
    

In [None]:
# Ligand codes that made it into the de-duplicated table.
print(unique_ligands["Ligands"])

In [None]:
# Reference scaffolds and warheads, each given as a SMILES string plus a label:
# DBO, β-Lactam, Cyclic boronate inhibitor, Non-β-lactam

# DBO scaffold and its warhead (Imidazolidone)
dbo_smiles, dbo_name = "C1CN2CCN1CC2", "DABCO"
dbo_warhead, dbo_warhead_name = "C1CNC(=O)N1", "DBO_warhead"

# β-Lactam scaffold and its warhead (beta-lactam ring)
blactam_smiles, blactam_name = "C1CNC2CC1C2", "bLAC"
blactam_warhead, blactam_warhead_name = "C1CNC1=O", "bLAC_warhead"

# Cyclic boronate inhibitor: https://pubs.acs.org/doi/10.1021/acs.jmedchem.9b00911
# Name source: https://drug-discovery.vm.uni-freiburg.de/covpdb/complex_card/pdb_ligand_id=1959
cbi_smiles, cbi_name = "OB1Oc2c(ccc(F)c2C(O)=O)[C@H]2C[C@@H]12", "QPX7728"
# Boronate warhead
cbi_warhead, cbi_warhead_name = "B(C)(O)O", "CBI_warhead"


In [None]:
# Add scaffold information to the ligand database

# Column-wise records: SMILES and label of each scaffold / warhead, same order.
scaffold_records = {
    "SMILES": [dbo_smiles, dbo_warhead, blactam_smiles, blactam_warhead, cbi_smiles, cbi_warhead],
    "Ligands": [dbo_name, dbo_warhead_name, blactam_name, blactam_warhead_name, cbi_name, cbi_warhead_name],
}
scaffolds = pd.DataFrame(scaffold_records)

# Remove scaffold rows left over from an earlier run of this cell, so that
# re-running it does not append the same scaffolds a second time.
is_scaffold_row = unique_ligands["Ligands"].isin(scaffolds["Ligands"])
unique_ligands = pd.concat(
    [unique_ligands[~is_scaffold_row], scaffolds], axis=0, ignore_index=True
)

print(unique_ligands)


In [4]:
# Persist the ligand table as CSV (row index omitted) inside settings.structures_input.
smiles_csv_path = os.path.join(settings.structures_input, "unique_ligands.csv")
unique_ligands.to_csv(path_or_buf=smiles_csv_path, index=False)

print(smiles_csv_path)

NameError: name 'unique_ligands' is not defined

In [19]:
# Reload the ligand table written to settings.structures_input.
smiles_csv_path = os.path.join(settings.structures_input, "unique_ligands.csv")
unique_ligands = pd.read_csv(filepath_or_buffer=smiles_csv_path)

In [20]:
from e3fp.pipeline import fprints_from_smiles

# Keyword arguments forwarded to e3fp's fingerprinting step.
fprint_params = {'bits': 4096, 'radius_multiplier': 1.5, 'rdkit_invariants': True}

# Keyword arguments forwarded to e3fp's conformer generation step.
confgen_params = {'max_energy_diff': 20.0, 'first': 1}

# Smoke test on a single molecule labelled "ritalin".
smiles = "COC(=O)C(C1CCCCN1)C2=CC=CC=C2"
fprints = fprints_from_smiles(smiles, "ritalin", confgen_params=confgen_params, fprint_params=fprint_params)
# Show the "on" bit positions of the first fingerprint. The previous bare
# print() emitted only an empty line, so the result was never displayed.
print(fprints[0].indices)

2023-08-11 00:14:42,555|INFO|Generating conformers for ritalin.
2023-08-11 00:14:43,740|INFO|Generated 1 conformers for ritalin.
2023-08-11 00:14:43,741|INFO|Generating fingerprints for ritalin.
2023-08-11 00:14:43,790|INFO|Generated 1 fingerprints for ritalin.


[ 188  206  224  356  401  489  561  562  763  914 1095 1346 1414 1455
 1513 1527 1528 1584 1738 1786 1802 2063 2287 2316 2318 2521 2700 2714
 2755 2793 2874 2978 3421 3693 4053]


In [33]:

# Fingerprint every ligand in table order; each list entry is whatever
# fprints_from_smiles returns for that row.
fprints = []
for name, smiles in zip(unique_ligands["Ligands"], unique_ligands["SMILES"]):
    print(name)
    print(smiles)
    fprints.append(
        fprints_from_smiles(smiles, name, confgen_params=confgen_params, fprint_params=fprint_params)
    )

unique_ligands['fprint'] = fprints


2023-08-11 00:26:25,396|INFO|Generating conformers for NXL.


NXL
NC(=O)[C@@H]1CC[C@@H](NOS(=O)(=O)O)CN1C=O


2023-08-11 00:26:26,318|INFO|Generated 1 conformers for NXL.
2023-08-11 00:26:26,318|INFO|Generating fingerprints for NXL.
2023-08-11 00:26:26,364|INFO|Generated 1 fingerprints for NXL.
2023-08-11 00:26:26,366|INFO|Generating conformers for IM2.


IM2
C[C@@H](O)[C@@H](C=O)[C@H]1CC(SCCNC=N)=C(C(=O)O)N1


2023-08-11 00:26:33,300|INFO|Generated 1 conformers for IM2.
2023-08-11 00:26:33,300|INFO|Generating fingerprints for IM2.
2023-08-11 00:26:33,362|INFO|Generated 1 fingerprints for IM2.
2023-08-11 00:26:33,364|INFO|Generating conformers for ID1.


ID1
C[C@@H](O)[C@@H](C=O)[C@H]1C[C@@H](SCC/N=C\N)C(C(=O)O)=N1


2023-08-11 00:26:40,162|INFO|Generated 1 conformers for ID1.
2023-08-11 00:26:40,163|INFO|Generating fingerprints for ID1.
2023-08-11 00:26:40,214|INFO|Generated 1 fingerprints for ID1.
2023-08-11 00:26:40,216|INFO|Generating conformers for 3P7.


3P7
C=C1CO[C@H]([C@@](C=O)(NC(=O)[C@H](C(=O)O)c2ccc(O)cc2)OC)N=C1C(=O)O


2023-08-11 00:26:52,725|INFO|Generated 1 conformers for 3P7.
2023-08-11 00:26:52,725|INFO|Generating fingerprints for 3P7.
2023-08-11 00:26:52,801|INFO|Generated 1 fingerprints for 3P7.
2023-08-11 00:26:52,803|INFO|Generating conformers for 1X6.


1X6
N/C=C/C(=O)OC[C@H](N)C(=O)O


2023-08-11 00:26:53,155|INFO|Generated 1 conformers for 1X6.
2023-08-11 00:26:53,156|INFO|Generating fingerprints for 1X6.
2023-08-11 00:26:53,184|INFO|Generated 1 fingerprints for 1X6.
2023-08-11 00:26:53,186|INFO|Generating conformers for ISS.


ISS
O=C/C=C/NCC(=O)CCO


2023-08-11 00:26:53,506|INFO|Generated 1 conformers for ISS.
2023-08-11 00:26:53,507|INFO|Generating fingerprints for ISS.
2023-08-11 00:26:53,539|INFO|Generated 1 fingerprints for ISS.
2023-08-11 00:26:53,540|INFO|Generating conformers for DWZ.


DWZ
C[C@@H]1[C@H]([C@H](C=O)[C@@H](C)O)N=C(C(=O)O)[C@H]1S[C@@H]1CN[C@H](C(=O)N(C)C)C1


2023-08-11 00:27:09,073|INFO|Generated 1 conformers for DWZ.
2023-08-11 00:27:09,074|INFO|Generating fingerprints for DWZ.
2023-08-11 00:27:09,152|INFO|Generated 1 fingerprints for DWZ.
2023-08-11 00:27:09,155|INFO|Generating conformers for DRW.


DRW
C[C@@H]1[C@H]([C@H](C=O)[C@@H](C)O)N=C(C(=O)O)[C@H]1S[C@@H]1CN[C@H](CNS(N)(=O)=O)C1


2023-08-11 00:27:36,422|INFO|Generated 1 conformers for DRW.
2023-08-11 00:27:36,423|INFO|Generating fingerprints for DRW.
2023-08-11 00:27:36,509|INFO|Generated 1 fingerprints for DRW.
2023-08-11 00:27:36,511|INFO|Generating conformers for 1RG.


1RG
C[C@@H](O)[C@@H](C=O)[C@@H]1NC(C(=O)O)=C(S[C@@H]2CN[C@H](C(=O)Nc3cccc(C(=O)O)c3)C2)[C@@H]1C


2023-08-11 00:27:59,651|INFO|Generated 1 conformers for 1RG.
2023-08-11 00:27:59,652|INFO|Generating fingerprints for 1RG.
2023-08-11 00:27:59,750|INFO|Generated 1 fingerprints for 1RG.
2023-08-11 00:27:59,752|INFO|Generating conformers for 2RG.


2RG
C[C@@H]1[C@H]([C@H](C=O)[C@@H](C)O)N=C(C(=O)O)[C@H]1S[C@@H]1CN[C@H](C(=O)Nc2cccc(C(=O)O)c2)C1


2023-08-11 00:28:23,272|INFO|Generated 1 conformers for 2RG.
2023-08-11 00:28:23,273|INFO|Generating fingerprints for 2RG.
2023-08-11 00:28:23,366|INFO|Generated 1 fingerprints for 2RG.
2023-08-11 00:28:23,368|INFO|Generating conformers for XD1.


XD1
CC1(C)S[C@H]([C@H](N)C=O)N[C@H]1C(=O)O


2023-08-11 00:28:24,056|INFO|Generated 1 conformers for XD1.
2023-08-11 00:28:24,056|INFO|Generating fingerprints for XD1.
2023-08-11 00:28:24,095|INFO|Generated 1 fingerprints for XD1.
2023-08-11 00:28:24,097|INFO|Generating conformers for AXL.


AXL
CC1(C)S[C@H]([C@@H](C=O)NC(=O)[C@H](N)c2ccc(O)cc2)N[C@H]1C(=O)O


2023-08-11 00:28:36,962|INFO|Generated 1 conformers for AXL.
2023-08-11 00:28:36,963|INFO|Generating fingerprints for AXL.
2023-08-11 00:28:37,026|INFO|Generated 1 fingerprints for AXL.
2023-08-11 00:28:37,028|INFO|Generating conformers for AIX.


AIX
CC1(C)S[C@H]([C@@H](C=O)NC(=O)[C@H](N)c2ccccc2)N[C@H]1C(=O)O


2023-08-11 00:28:48,678|INFO|Generated 1 conformers for AIX.
2023-08-11 00:28:48,678|INFO|Generating fingerprints for AIX.
2023-08-11 00:28:48,744|INFO|Generated 1 fingerprints for AIX.
2023-08-11 00:28:48,746|INFO|Generating conformers for CB9.


CB9
CC1(C)S[C@H]([C@@H](C=O)NC(=O)[C@@H](C(=O)O)c2ccccc2)N[C@H]1C(=O)O


2023-08-11 00:29:01,251|INFO|Generated 1 conformers for CB9.
2023-08-11 00:29:01,252|INFO|Generating fingerprints for CB9.
2023-08-11 00:29:01,325|INFO|Generated 1 fingerprints for CB9.
2023-08-11 00:29:01,327|INFO|Generating conformers for XD2.


XD2
Cn1nnnc1SCC1=C(C(=O)O)N[C@@H]([C@@H](C=O)NC(=O)[C@H](O)c2ccccc2)SC1


2023-08-11 00:29:15,991|INFO|Generated 1 conformers for XD2.
2023-08-11 00:29:15,992|INFO|Generating fingerprints for XD2.
2023-08-11 00:29:16,074|INFO|Generated 1 fingerprints for XD2.
2023-08-11 00:29:16,075|INFO|Generating conformers for DXF.


DXF
CO/N=C(\C(=O)N[C@H](C=O)[C@@H]1NC(C(=O)O)=C(COC(N)=O)CS1)c1ccco1


2023-08-11 00:29:28,551|INFO|Generated 1 conformers for DXF.
2023-08-11 00:29:28,552|INFO|Generating fingerprints for DXF.
2023-08-11 00:29:28,630|INFO|Generated 1 fingerprints for DXF.
2023-08-11 00:29:28,631|INFO|Generating conformers for DH4.


DH4
CC1(C)S[C@@H]([C@@H](C=O)/N=C/N2CCCCCC2)N[C@H]1C(=O)O


2023-08-11 00:29:31,802|INFO|Generated 1 conformers for DH4.
2023-08-11 00:29:31,803|INFO|Generating fingerprints for DH4.
2023-08-11 00:29:31,856|INFO|Generated 1 fingerprints for DH4.
2023-08-11 00:29:31,858|INFO|Generating conformers for NFF.


NFF
CCOc1ccc2ccccc2c1C(=O)N[C@H](C=O)[C@@H]1N[C@@H](C(=O)O)C(C)(C)S1


2023-08-11 00:29:49,192|INFO|Generated 1 conformers for NFF.
2023-08-11 00:29:49,193|INFO|Generating fingerprints for NFF.
2023-08-11 00:29:49,266|INFO|Generated 1 fingerprints for NFF.
2023-08-11 00:29:49,268|INFO|Generating conformers for 9EP.


9EP
CC(=O)OCC1=C(C(=O)O)N[C@@H]([C@@H](C=O)NC(=O)Cc2cccs2)SC1


2023-08-11 00:29:59,951|INFO|Generated 1 conformers for 9EP.
2023-08-11 00:29:59,952|INFO|Generating fingerprints for 9EP.
2023-08-11 00:30:00,024|INFO|Generated 1 fingerprints for 9EP.
2023-08-11 00:30:00,026|INFO|Generating conformers for 7EP.


7EP
COc1cccc(OC)c1C(=O)N[C@H](C=O)[C@@H]1N[C@@H](C(=O)O)C(C)(C)S1


2023-08-11 00:30:13,656|INFO|Generated 1 conformers for 7EP.
2023-08-11 00:30:13,657|INFO|Generating fingerprints for 7EP.
2023-08-11 00:30:13,729|INFO|Generated 1 fingerprints for 7EP.
2023-08-11 00:30:13,730|INFO|Generating conformers for CD8.


CD8
C=C1CS[C@H]([C@@](C=O)(NC(=O)c2ccccc2)OC)N=C1C(=O)O


2023-08-11 00:30:15,696|INFO|Generated 1 conformers for CD8.
2023-08-11 00:30:15,697|INFO|Generating fingerprints for CD8.
2023-08-11 00:30:15,759|INFO|Generated 1 fingerprints for CD8.
2023-08-11 00:30:15,760|INFO|Generating conformers for CD6.


CD6
C=C1CS[C@H]([C@@H](C=O)NC(=O)Cc2ccccc2)N=C1C(=O)O


2023-08-11 00:30:17,705|INFO|Generated 1 conformers for CD6.
2023-08-11 00:30:17,706|INFO|Generating fingerprints for CD6.
2023-08-11 00:30:17,768|INFO|Generated 1 fingerprints for CD6.
2023-08-11 00:30:17,769|INFO|Generating conformers for SFR.


SFR
C[C@@H](O)[C@@H](C(=O)O)[C@@H]1NC(C(=O)O)=C([C@H]2CCCO2)S1


2023-08-11 00:30:19,446|INFO|Generated 1 conformers for SFR.
2023-08-11 00:30:19,447|INFO|Generating fingerprints for SFR.
2023-08-11 00:30:19,497|INFO|Generated 1 fingerprints for SFR.
2023-08-11 00:30:19,498|INFO|Generating conformers for PCZ.


PCZ
C=C1CS[C@H]([C@@H](C=O)NC(=O)/C(=N\OC)c2csc(N)n2)N=C1C(=O)O


2023-08-11 00:30:28,701|INFO|Generated 1 conformers for PCZ.
2023-08-11 00:30:28,701|INFO|Generating fingerprints for PCZ.
2023-08-11 00:30:28,769|INFO|Generated 1 fingerprints for PCZ.
2023-08-11 00:30:28,770|INFO|Generating conformers for TEB.


TEB
C[C@@H](O)[C@@H](C=O)[C@@H]1NC(C(=O)O)=C(SC2CN(C3=NCCS3)C2)[C@@H]1C


2023-08-11 00:30:42,065|INFO|Generated 1 conformers for TEB.
2023-08-11 00:30:42,065|INFO|Generating fingerprints for TEB.
2023-08-11 00:30:42,136|INFO|Generated 1 fingerprints for TEB.
2023-08-11 00:30:42,138|INFO|Generating conformers for 33V.


33V
O=C(O)[C@@H]1CS[C@H]([C@H]2C[C@@H]3CCSCN3N2)[C@@H](CO)CN1


2023-08-11 00:30:44,684|INFO|Generated 1 conformers for 33V.
2023-08-11 00:30:44,684|INFO|Generating fingerprints for 33V.
2023-08-11 00:30:44,736|INFO|Generated 1 fingerprints for 33V.
2023-08-11 00:30:44,738|INFO|Generating conformers for 3Y6.


3Y6
CCn1ccn(C(=O)N[C@@H](C(=O)N[C@@H](Cc2cccc(C(=O)O)c2)B(O)O)c2ccc(O)cc2)c(=O)c1=O


ImportError: cannot import name 'calculate_fprint' from 'SBLMDCOVDOCK.SBLSettings' (/Users/alexi/Library/CloudStorage/OneDrive-Nexus365/Rotation_Projects/Rotation_2/Project/SBL_MD_CovDock/SBLMDCOVDOCK/SBLSettings.py)

In [None]:
# Path of the embedding table: the unique_ligands CSV path with its ".csv"
# replaced by the "_pvae_logp_196_115.csv" suffix.
base_csv_path = os.path.join(settings.structures_input, "unique_ligands.csv")
smiles_csv_path = base_csv_path.replace(".csv", "_pvae_logp_196_115.csv")

smiles_df = pd.read_csv(smiles_csv_path)
# vae_emb is stored as text in the CSV; parse each cell back into a Python literal.
smiles_df['vae_emb'] = [ast.literal_eval(cell) for cell in smiles_df['vae_emb']]

smiles_df.head()

In [None]:
# Default-metric 2-D embeddings (PCA, t-SNE, PHATE) of the VAE latent vectors.
#
# This cell used to start with a bare `break`. Outside a loop that is a
# SyntaxError, so the cell could never run and "Run All" stopped here.
# It is now an explicit opt-in. Leave it disabled when the cosine-based cell
# that follows is used: both write PC*/TSNE*/PHATE* columns into smiles_df.
RUN_EUCLIDEAN_EMBEDDINGS = False

if RUN_EUCLIDEAN_EMBEDDINGS:
    # Extract the vae_emb column as a numpy array
    vae_embeddings = np.array(smiles_df['vae_emb'].tolist())

    # Number of output dimensions shared by all three methods
    n_components = 2

    # Column prefix -> reducer; the stochastic methods are seeded for repeatability
    reducers = {
        'PC': PCA(n_components=n_components),
        'TSNE': TSNE(n_components=n_components, random_state=42),
        'PHATE': PHATE(n_components=n_components, random_state=42),
    }

    for prefix, reducer in reducers.items():
        coords = reducer.fit_transform(vae_embeddings)
        columns = [f'{prefix}{i+1}' for i in range(n_components)]
        coords_df = pd.DataFrame(data=coords, columns=columns, index=smiles_df.index)

        # Drop same-named columns first so re-running never creates duplicates
        smiles_df = pd.concat(
            [smiles_df.drop(columns=columns, errors='ignore'), coords_df], axis=1
        )

        # Print the first few rows of the result DataFrame
        print(smiles_df.head())



In [None]:
# Cosine-based 2-D embeddings (PCA, t-SNE, PHATE) of the VAE latent vectors.

# Extract the vae_emb column as a numpy array
vae_embeddings = np.array(smiles_df['vae_emb'].tolist())

# Number of output dimensions shared by all three methods
n_components = 2


def _with_embedding_columns(frame, coords, prefix):
    """Return (frame with columns `<prefix>1..k` set from coords, coords DataFrame).

    Same-named columns already present in `frame` are dropped first, so
    re-running the cell replaces them instead of appending duplicates.
    """
    columns = [f'{prefix}{i+1}' for i in range(coords.shape[1])]
    coords_df = pd.DataFrame(data=coords, columns=columns, index=frame.index)
    frame = pd.concat([frame.drop(columns=columns, errors='ignore'), coords_df], axis=1)
    return frame, coords_df


# PCA on the pairwise cosine-distance matrix: every ligand is described by its
# row of distances to all ligands.
# NOTE(review): PCA of a distance matrix is not classical MDS - confirm this is intended.
distance_matrix = pairwise_distances(vae_embeddings, metric='cosine')
pca = PCA(n_components=n_components, random_state=42)
pca_result = pca.fit_transform(distance_matrix)
smiles_df, pca_df = _with_embedding_columns(smiles_df, pca_result, 'PC')
pca_columns = list(pca_df.columns)

# t-SNE on the raw embeddings; the cosine metric is applied inside TSNE
tsne = TSNE(n_components=n_components, metric='cosine', random_state=42)
tsne_result = tsne.fit_transform(vae_embeddings)
smiles_df, tsne_df = _with_embedding_columns(smiles_df, tsne_result, 'TSNE')
tsne_columns = list(tsne_df.columns)

# Print the first few rows of the result DataFrame
print(smiles_df.head())

# PHATE on the raw embeddings with cosine kNN distance. It was previously fed
# the precomputed cosine-distance matrix while knn_dist was still "cosine",
# which applied the metric a second time to rows of distances.
phate_operator = PHATE(n_components=n_components, knn_dist="cosine", random_state=42)
phate_result = phate_operator.fit_transform(vae_embeddings)
smiles_df, phate_df = _with_embedding_columns(smiles_df, phate_result, 'PHATE')
phate_columns = list(phate_df.columns)

# Print the first few rows of the result DataFrame
print(smiles_df.head())


In [None]:
# Interactive PCA scatter: points coloured by the Covalent column,
# with PDB code and ligand code shown on hover.
layout_options = dict(
    title="PCA Visualization of VAE Embeddings",
    xaxis_title="PCA Component 1",
    yaxis_title="PCA Component 2",
)

fig = px.scatter(
    data_frame=smiles_df,
    x='PC1',
    y='PC2',
    color='Covalent',
    hover_data=['PDB code', 'Ligands'],
)
fig.update_layout(**layout_options)

fig.show()

In [None]:
import matplotlib.pyplot as plt

# Static PCA scatter; each point is coloured by its row position.
mpl_fig, ax = plt.subplots(figsize=(15, 15))
scatter = ax.scatter(smiles_df['PC1'], smiles_df['PC2'], c=range(len(smiles_df)), cmap='Set1')

ax.set_title("PCA Visualization of VAE Embeddings")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")

# Write each ligand code 10 points above its marker.
for label, x_pos, y_pos in zip(smiles_df['Ligands'], smiles_df['PC1'], smiles_df['PC2']):
    ax.annotate(label, (x_pos, y_pos), textcoords="offset points", xytext=(0,10), ha='center')

plt.show()


In [None]:
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw
from matplotlib.offsetbox import TextArea, DrawingArea, OffsetImage, AnnotationBbox

# PCA scatter on a 100x100 inch canvas; each point is coloured by its row position.
mpl_fig, ax = plt.subplots(figsize=(100, 100))
scatter = ax.scatter(smiles_df['PC1'], smiles_df['PC2'], c=range(len(smiles_df)), cmap='Set1')

ax.set_title("PCA Visualization of VAE Embeddings")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")

# Overlay each ligand's 2-D depiction at its PCA coordinates.
for smi, x_pos, y_pos in zip(smiles_df['smiles'], smiles_df['PC1'], smiles_df['PC2']):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue  # SMILES could not be parsed: keep only the scatter marker

    depiction = Draw.MolToImage(mol, size=(500, 500)).convert("RGBA")
    # Pure-white pixels get alpha 100 (of 255); all other pixels are kept as drawn.
    depiction.putdata([
        (255, 255, 255, 100) if pixel[:3] == (255, 255, 255) else pixel
        for pixel in depiction.getdata()
    ])

    structure_box = OffsetImage(depiction, zoom=0.5)
    ax.add_artist(AnnotationBbox(structure_box, (x_pos, y_pos), frameon=False, pad=0))

plt.show()


In [None]:
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw
from matplotlib.offsetbox import TextArea, DrawingArea, OffsetImage, AnnotationBbox

# PCA scatter on a 100x100 inch canvas; each point is coloured by its row position.
mpl_fig, ax = plt.subplots(figsize=(100, 100))
scatter = ax.scatter(smiles_df['PC1'], smiles_df['PC2'], c=range(len(smiles_df)), cmap='Set1')

ax.set_title("PCA Visualization of VAE Embeddings")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")

# Overlay each ligand's 2-D depiction at its PCA coordinates.
for smi, x_pos, y_pos in zip(smiles_df['smiles'], smiles_df['PC1'], smiles_df['PC2']):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue  # SMILES could not be parsed: keep only the scatter marker

    depiction = Draw.MolToImage(mol, size=(500, 500)).convert("RGBA")
    # Pure-white pixels get alpha 100 (of 255); all other pixels are kept as drawn.
    depiction.putdata([
        (255, 255, 255, 100) if pixel[:3] == (255, 255, 255) else pixel
        for pixel in depiction.getdata()
    ])

    structure_box = OffsetImage(depiction, zoom=0.5)
    ax.add_artist(AnnotationBbox(structure_box, (x_pos, y_pos), frameon=False, pad=0))

plt.show()


In [None]:
# Interactive t-SNE scatter: points coloured by the Covalent column,
# with PDB code and ligand code shown on hover.
layout_options = dict(
    title="t-SNE Visualization of VAE Embeddings",
    xaxis_title="t-SNE Component 1",
    yaxis_title="t-SNE Component 2",
)

fig = px.scatter(
    data_frame=smiles_df,
    x='TSNE1',
    y='TSNE2',
    color='Covalent',
    hover_data=['PDB code', 'Ligands'],
)
fig.update_layout(**layout_options)

fig.show()

In [None]:
# Interactive t-SNE scatter: points coloured by the Covalent column,
# with PDB code and ligand code shown on hover.
layout_options = dict(
    title="t-SNE Visualization of VAE Embeddings",
    xaxis_title="t-SNE Component 1",
    yaxis_title="t-SNE Component 2",
)

fig = px.scatter(
    data_frame=smiles_df,
    x='TSNE1',
    y='TSNE2',
    color='Covalent',
    hover_data=['PDB code', 'Ligands'],
)
fig.update_layout(**layout_options)

fig.show()

In [None]:
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw
from matplotlib.offsetbox import TextArea, DrawingArea, OffsetImage, AnnotationBbox

# t-SNE scatter on a 100x100 inch canvas; each point is coloured by its row position.
mpl_fig, ax = plt.subplots(figsize=(100, 100))
scatter = ax.scatter(smiles_df['TSNE1'], smiles_df['TSNE2'], c=range(len(smiles_df)), cmap='Set1')

ax.set_title("TSNE Visualization of VAE Embeddings")
ax.set_xlabel("TSNE Component 1")
ax.set_ylabel("TSNE Component 2")

# Overlay each ligand's 2-D depiction at its t-SNE coordinates.
for smi, x_pos, y_pos in zip(smiles_df['smiles'], smiles_df['TSNE1'], smiles_df['TSNE2']):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue  # SMILES could not be parsed: keep only the scatter marker

    depiction = Draw.MolToImage(mol, size=(500, 500)).convert("RGBA")
    # Pure-white pixels get alpha 100 (of 255); all other pixels are kept as drawn.
    depiction.putdata([
        (255, 255, 255, 100) if pixel[:3] == (255, 255, 255) else pixel
        for pixel in depiction.getdata()
    ])

    structure_box = OffsetImage(depiction, zoom=0.5)
    ax.add_artist(AnnotationBbox(structure_box, (x_pos, y_pos), frameon=False, pad=0))

plt.show()


In [None]:
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw
from matplotlib.offsetbox import TextArea, DrawingArea, OffsetImage, AnnotationBbox

# t-SNE scatter on a 100x100 inch canvas; each point is coloured by its row position.
mpl_fig, ax = plt.subplots(figsize=(100, 100))
scatter = ax.scatter(smiles_df['TSNE1'], smiles_df['TSNE2'], c=range(len(smiles_df)), cmap='Set1')

ax.set_title("TSNE Visualization of VAE Embeddings")
ax.set_xlabel("TSNE Component 1")
ax.set_ylabel("TSNE Component 2")

# Overlay each ligand's 2-D depiction at its t-SNE coordinates.
for smi, x_pos, y_pos in zip(smiles_df['smiles'], smiles_df['TSNE1'], smiles_df['TSNE2']):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue  # SMILES could not be parsed: keep only the scatter marker

    depiction = Draw.MolToImage(mol, size=(500, 500)).convert("RGBA")
    # Pure-white pixels get alpha 100 (of 255); all other pixels are kept as drawn.
    depiction.putdata([
        (255, 255, 255, 100) if pixel[:3] == (255, 255, 255) else pixel
        for pixel in depiction.getdata()
    ])

    structure_box = OffsetImage(depiction, zoom=0.5)
    ax.add_artist(AnnotationBbox(structure_box, (x_pos, y_pos), frameon=False, pad=0))

plt.show()


In [None]:
import matplotlib.pyplot as plt

# Static t-SNE scatter; each point is coloured by its row position.
mpl_fig, ax = plt.subplots(figsize=(15, 15))
scatter = ax.scatter(smiles_df['TSNE1'], smiles_df['TSNE2'], c=range(len(smiles_df)), cmap='Set1')

ax.set_title("TSNE Visualization of VAE Embeddings")
ax.set_xlabel("TSNE Component 1")
ax.set_ylabel("TSNE Component 2")

# Write each ligand code 10 points above its marker.
for label, x_pos, y_pos in zip(smiles_df['Ligands'], smiles_df['TSNE1'], smiles_df['TSNE2']):
    ax.annotate(label, (x_pos, y_pos), textcoords="offset points", xytext=(0,10), ha='center')

plt.show()


In [None]:
# Interactive PHATE scatter: points coloured by the Covalent column,
# with PDB code and ligand code shown on hover.
layout_options = dict(
    title="PHATE Visualization of VAE Embeddings",
    xaxis_title="PHATE Component 1",
    yaxis_title="PHATE Component 2",
)

fig = px.scatter(
    data_frame=smiles_df,
    x='PHATE1',
    y='PHATE2',
    color='Covalent',
    hover_data=['PDB code', 'Ligands'],
)
fig.update_layout(**layout_options)

fig.show()

In [None]:
# Interactive PHATE scatter: points coloured by the Covalent column,
# with PDB code and ligand code shown on hover.
layout_options = dict(
    title="PHATE Visualization of VAE Embeddings",
    xaxis_title="PHATE Component 1",
    yaxis_title="PHATE Component 2",
)

fig = px.scatter(
    data_frame=smiles_df,
    x='PHATE1',
    y='PHATE2',
    color='Covalent',
    hover_data=['PDB code', 'Ligands'],
)
fig.update_layout(**layout_options)

fig.show()

In [None]:
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw
from matplotlib.offsetbox import TextArea, DrawingArea, OffsetImage, AnnotationBbox

# PHATE scatter on a 100x100 inch canvas; each point is coloured by its row position.
mpl_fig, ax = plt.subplots(figsize=(100, 100))
scatter = ax.scatter(smiles_df['PHATE1'], smiles_df['PHATE2'], c=range(len(smiles_df)), cmap='Set1')

ax.set_title("PHATE Visualization of VAE Embeddings")
ax.set_xlabel("PHATE Component 1")
ax.set_ylabel("PHATE Component 2")

# Overlay each ligand's 2-D depiction at its PHATE coordinates.
for smi, x_pos, y_pos in zip(smiles_df['smiles'], smiles_df['PHATE1'], smiles_df['PHATE2']):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue  # SMILES could not be parsed: keep only the scatter marker

    depiction = Draw.MolToImage(mol, size=(500, 500)).convert("RGBA")
    # Pure-white pixels get alpha 100 (of 255); all other pixels are kept as drawn.
    depiction.putdata([
        (255, 255, 255, 100) if pixel[:3] == (255, 255, 255) else pixel
        for pixel in depiction.getdata()
    ])

    structure_box = OffsetImage(depiction, zoom=0.5)
    ax.add_artist(AnnotationBbox(structure_box, (x_pos, y_pos), frameon=False, pad=0))

plt.show()


In [None]:
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw
from matplotlib.offsetbox import TextArea, DrawingArea, OffsetImage, AnnotationBbox

# PHATE scatter on a 100x100 inch canvas; each point is coloured by its row position.
mpl_fig, ax = plt.subplots(figsize=(100, 100))
scatter = ax.scatter(smiles_df['PHATE1'], smiles_df['PHATE2'], c=range(len(smiles_df)), cmap='Set1')

ax.set_title("PHATE Visualization of VAE Embeddings")
ax.set_xlabel("PHATE Component 1")
ax.set_ylabel("PHATE Component 2")

# Overlay each ligand's 2-D depiction at its PHATE coordinates.
for smi, x_pos, y_pos in zip(smiles_df['smiles'], smiles_df['PHATE1'], smiles_df['PHATE2']):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue  # SMILES could not be parsed: keep only the scatter marker

    depiction = Draw.MolToImage(mol, size=(500, 500)).convert("RGBA")
    # Pure-white pixels get alpha 100 (of 255); all other pixels are kept as drawn.
    depiction.putdata([
        (255, 255, 255, 100) if pixel[:3] == (255, 255, 255) else pixel
        for pixel in depiction.getdata()
    ])

    structure_box = OffsetImage(depiction, zoom=0.5)
    ax.add_artist(AnnotationBbox(structure_box, (x_pos, y_pos), frameon=False, pad=0))

plt.show()


In [None]:
import matplotlib.pyplot as plt

# Static PHATE scatter; each point is coloured by its row position.
mpl_fig, ax = plt.subplots(figsize=(15, 15))
scatter = ax.scatter(smiles_df['PHATE1'], smiles_df['PHATE2'], c=range(len(smiles_df)), cmap='Set1')

ax.set_title("PHATE Visualization of VAE Embeddings")
ax.set_xlabel("PHATE Component 1")
ax.set_ylabel("PHATE Component 2")

# Write each ligand code 10 points above its marker.
for label, x_pos, y_pos in zip(smiles_df['Ligands'], smiles_df['PHATE1'], smiles_df['PHATE2']):
    ax.annotate(label, (x_pos, y_pos), textcoords="offset points", xytext=(0,10), ha='center')

plt.show()


In [None]:
# Interactive 3-D PHATE scatter.
# The embedding cells only ever produce two PHATE components, so the PHATE3
# column this plot needs did not exist; a dedicated 3-component embedding is
# computed here from the vae_emb vectors.
phate3d_columns = ['PHATE1', 'PHATE2', 'PHATE3']
phate3d_operator = PHATE(n_components=3, knn_dist="cosine", random_state=42)
phate3d_coords = phate3d_operator.fit_transform(np.array(smiles_df['vae_emb'].tolist()))

# Work on a separate frame so the 2-D PHATE1/PHATE2 columns of smiles_df that
# the 2-D plots read are left untouched.
phate3d_df = smiles_df.drop(columns=phate3d_columns, errors='ignore').assign(
    **{column: phate3d_coords[:, i] for i, column in enumerate(phate3d_columns)}
)

# Hover on 'PDB code', the column name every other plot in this notebook uses
# (this cell previously asked for 'PDB').
fig = px.scatter_3d(phate3d_df, x='PHATE1', y='PHATE2', z='PHATE3', color='Covalent',
                    hover_data=['PDB code', 'Ligands']
                    )

# Customize the plot layout
fig.update_layout(title="3D PHATE Visualization of VAE Embeddings",
                  scene=dict(xaxis_title="PHATE Component 1",
                             yaxis_title="PHATE Component 2",
                             zaxis_title="PHATE Component 3"))

# Show the plot
fig.show()
