In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
import numpy as np
from mordred import Calculator, descriptors
from rdkit.Chem import Crippen

# Read the monomer table once and pull every column the notebook needs from it.
monomer_table = pd.read_csv("aa_monomers.csv")

# SMILES strings, in file order; these drive descriptor/fingerprint generation.
monomers = monomer_table['SMILES'].tolist()

# Short per-monomer codes used as text labels in the 3D scatter plot.
# Falls back to None (unlabeled points) when the CSV has no 'code' column.
# NOTE(review): the 'code' column name is taken from an earlier draft of this
# cell that read it from "peptide/aa_monomers.csv" - confirm it is the same table.
symbols = monomer_table['code'].tolist() if 'code' in monomer_table.columns else None

# Module-level fingerprint accumulator kept for backward compatibility;
# descp() builds its own local list.
fps = []

def build_rdkit_mol(smiles):
    """Build an RDKit molecule with explicit hydrogens and one 3D conformer.

    The conformer is embedded with ETKDGv2 (random starting coordinates,
    fixed random seed so repeated runs give the same geometry) and then
    relaxed with the MMFF force field.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        The hydrogen-added RDKit molecule carrying the embedded conformer.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
        RuntimeError: if 3D embedding fails.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of an obscure error inside AddHs.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    mol = Chem.AddHs(mol)

    params = AllChem.ETKDGv2()
    params.useRandomCoords = True
    params.maxAttempts = 1000
    params.boxSizeMult = 2.0
    params.randomSeed = 3
    status = AllChem.EmbedMolecule(mol, params)

    if status != 0:
        raise RuntimeError(f"Failed to generate 3D coordinates for SMILES: {smiles!r}")

    # Relax the embedded geometry with MMFF. The return code is deliberately
    # not checked (best-effort optimisation), matching the original behaviour.
    AllChem.MMFFOptimizeMolecule(mol)

    return mol

# Shared mordred calculator; created lazily because registering the full
# descriptor set is costly and does not depend on the molecule.
_MORDRED_CALCULATOR = None


def _get_mordred_calculator():
    """Return the shared mordred Calculator, building it on first use."""
    global _MORDRED_CALCULATOR
    if _MORDRED_CALCULATOR is None:
        _MORDRED_CALCULATOR = Calculator(descriptors, ignore_3D=False)
    return _MORDRED_CALCULATOR


def get_descp(smiles):
    """Compute the mordred descriptor values (2D and 3D) for one molecule.

    Args:
        smiles: SMILES string; a 3D structure is built via build_rdkit_mol.

    Returns:
        List of descriptor values in the calculator's descriptor order.
        Entries are whatever mordred returns for each descriptor, so the
        list can contain non-numeric objects; callers must filter them.
    """
    mol = build_rdkit_mol(smiles)
    return list(_get_mordred_calculator()(mol).values())

def is_valid(vec):
    """Return True when ``vec`` is exactly an int, float, np.float64 or str.

    The check is on the exact type, so subclasses such as bool are rejected.
    """
    accepted_types = (int, float, np.float64, str)
    return type(vec) in accepted_types

def is_valid_list(List):
    """Apply is_valid to every element and return the list of boolean flags."""
    return list(map(is_valid, List))


def descp(SMILES_list):
    """Build a numeric feature matrix for a list of molecules.

    Each row is one molecule. The columns are the min-max-normalized mordred
    descriptors that are numeric, finite and non-constant across the input,
    followed by the bits of a radius-2 Morgan fingerprint.

    Args:
        SMILES_list: non-empty sequence of SMILES strings.

    Returns:
        2D float array of shape (len(SMILES_list), n_descriptors + n_bits).

    Raises:
        ValueError: if ``SMILES_list`` is empty.
    """
    if len(SMILES_list) == 0:
        raise ValueError("SMILES_list must contain at least one SMILES string")

    descp_array = np.array([get_descp(smiles) for smiles in SMILES_list], dtype=object)

    def is_col_numeric(col):
        # Note: bool is a subclass of int, so boolean descriptors pass too.
        return all(isinstance(x, (int, float, np.float64)) for x in col)

    # Keep only descriptors that evaluated to a plain number for every molecule.
    numeric_mask = np.array([is_col_numeric(col) for col in descp_array.T], dtype=bool)

    # Cast to float so downstream consumers get a real numeric matrix rather
    # than an object-dtype array.
    numeric_descp = descp_array[:, numeric_mask].astype(float)

    # NaN/inf values cannot be normalized and would make t-SNE reject the
    # whole matrix, so drop any column that contains one.
    finite_mask = np.isfinite(numeric_descp).all(axis=0)
    numeric_descp = numeric_descp[:, finite_mask]

    # Drop constant columns: they carry no information and would divide by zero.
    min_vals = numeric_descp.min(axis=0)
    max_vals = numeric_descp.max(axis=0)
    varying_mask = max_vals > min_vals

    # Min-max normalization for each remaining column.
    value_range = (max_vals - min_vals)[varying_mask]
    normalized_arr = (numeric_descp[:, varying_mask] - min_vals[varying_mask]) / value_range

    # The generator does not depend on the molecule, so build it once.
    fpgen = AllChem.GetMorganGenerator(radius=2)
    fps = []
    for monomer in SMILES_list:
        mol = Chem.MolFromSmiles(monomer)
        bit_string = fpgen.GetFingerprint(mol).ToBitString()
        # Turn the '0'/'1' characters into a 0/1 integer vector.
        fps.append(np.frombuffer(bit_string.encode(), 'u1') - ord('0'))
    fps = np.stack(fps)

    return np.concatenate((normalized_arr, fps), axis=1)

def get_logp_rdkit(smiles):
    """Return the Crippen logP estimate for ``smiles``, or None if it cannot be parsed."""
    parsed = Chem.MolFromSmiles(smiles)
    return Crippen.MolLogP(parsed) if parsed else None

# for monomer in monomers:
#     fpgen = AllChem.GetMorganGenerator(radius=2)
#     mol = Chem.MolFromSmiles(monomer)
#     fp = fpgen.GetFingerprint(mol)
#     fp = np.frombuffer(fp.ToBitString().encode(), 'u1') - ord('0')
#     fps.append(fp)

# fps = np.vstack(fps)


ModuleNotFoundError: No module named 'distutils'

In [2]:
# Feature matrix for all monomers: one row per SMILES, normalized mordred
# descriptor columns followed by Morgan fingerprint bits. This is the slow
# step (3D embedding + descriptor calculation per molecule).
data = descp(monomers)

In [3]:
import numpy as np
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# Number of monomers being embedded (derived from the data rather than hard-coded).
n_samples = len(monomers)
# vector_size = 1024

# Colour each point by the monomer's Crippen logP.
continuous_values = [get_logp_rdkit(smiles) for smiles in monomers]

# `symbols` (short per-monomer codes) is optional: when no earlier cell has
# defined it, plot unlabeled points instead of failing with a NameError.
try:
    point_labels = symbols
except NameError:
    point_labels = None

# Apply t-SNE to reduce the descriptor + fingerprint matrix to 3 dimensions
tsne = TSNE(n_components=3, random_state=42)
embedded_data = tsne.fit_transform(data)

# Perform clustering (K-Means with 5 clusters as an example).
# The labels are computed for inspection; they are not passed to the figure
# because the `symbol=` argument below is disabled.
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(embedded_data)

# Create a mapping of cluster labels to different marker symbols
marker_symbols = ['circle', 'square', 'diamond', 'cross', 'star']
marker_styles = [marker_symbols[label % len(marker_symbols)] for label in cluster_labels]

# Create an interactive 3D scatter plot using Plotly
fig = px.scatter_3d(
    x=embedded_data[:, 0],
    y=embedded_data[:, 1],
    z=embedded_data[:, 2],
    color=continuous_values,  # Use continuous values for color
    # symbol=cluster_labels,  # Use cluster labels for different marker shapes
    text=point_labels,  # Add labels to the visualization
    labels={'x': 'TSNE Component 1', 'y': 'TSNE Component 2', 'z': 'TSNE Component 3'},
    title='Interactive 3D t-SNE Visualization of Binary Vectors',
    color_continuous_scale='viridis'  # Adjust color scale as needed
)

fig.update_traces(marker=dict(size=5, opacity=0.8), textposition='top center')
fig.show(renderer="browser")


In [4]:
import numpy as np
import plotly.express as px
from sklearn.manifold import TSNE

# Synthetic demo: embed random binary vectors with t-SNE.
# A seeded generator makes the demo reproducible across kernel restarts.
n_samples = 300
vector_size = 1024
rng = np.random.default_rng(42)

# Stored under its own name so the monomer feature matrix `data` computed in
# an earlier cell is not overwritten by this demo.
random_data = rng.integers(0, 2, size=(n_samples, vector_size))

# Label roughly half of the points at random; the rest get an empty label.
labels = np.array([f'Point {i}' if rng.random() > 0.5 else '' for i in range(n_samples)])

# Apply t-SNE for dimensionality reduction
tsne = TSNE(n_components=3, random_state=42)
embedded_data = tsne.fit_transform(random_data)

# Create an interactive 3D scatter plot using Plotly
fig = px.scatter_3d(
    x=embedded_data[:, 0],
    y=embedded_data[:, 1],
    z=embedded_data[:, 2],
    color=np.arange(n_samples),
    text=labels,  # Add labels to the visualization
    labels={'x': 'TSNE Component 1', 'y': 'TSNE Component 2', 'z': 'TSNE Component 3'},
    title='Interactive 3D t-SNE Visualization of Binary Vectors'
)

fig.update_traces(marker=dict(size=5, opacity=0.8), textposition='top center')
fig.show(renderer="browser")