# tSNE Plot for Fragment Libraries 

The t-SNE plot requires two user-defined parameters: `perplexity` and `metric`. For the plot in the paper, a perplexity of 45 with the Euclidean metric was used.

In [None]:
import ast
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from openTSNE import TSNE
from rdkit import Chem
from sklearn.decomposition import PCA
from tqdm import tqdm

In [None]:
# t-SNE settings passed to compute_tSNE below; these are the values used for
# the plot in the paper (perplexity 45, Euclidean metric)
PERPLEXITY = 45
METRIC = "euclidean"

In [None]:
# Input files, keyed by a short dataset name
paths = {
    "GDB11": "input_datasets/GDB11_100k_sample.parquet",
    "Enamine": "input_datasets/Enamine_100k_sample.parquet",
    "FDA_232": "input_datasets/FDA_232_drugs.csv",
    "known_abx_559": "input_datasets/known_abx_559.csv",
    "SA_actives": "input_datasets/SA_actives_512.csv",
    "NG_actives": "input_datasets/NG_actives_1335.csv",
    "inactives": "input_datasets/combined_inactive_smiles.csv",
    "F2/F3": "input_datasets/F2_F3_smiles.csv",
}

# Read each file with the pandas reader matching its extension; a path with
# any other extension is left out of dataframes_all
readers = {".parquet": pd.read_parquet, ".csv": pd.read_csv}
dataframes_all = {}
for key, path in paths.items():
    for extension, reader in readers.items():
        if path.endswith(extension):
            dataframes_all[key] = reader(path)
            break

# Combine FDA_232 with known_abx_559, and SA_actives with NG_actives
known_abx = pd.concat(
    [dataframes_all[name] for name in ("FDA_232", "known_abx_559")]
)
actives = pd.concat([dataframes_all[name] for name in ("SA_actives", "NG_actives")])

# Datasets that get embedded, in insertion order; "F2/F3" is added last, so
# its rows end up at the end of the concatenated fingerprint table
dataframes = {name: dataframes_all[name] for name in ("GDB11", "Enamine")}
dataframes["known_abx"] = known_abx
dataframes["actives"] = actives
dataframes["inactives"] = dataframes_all["inactives"]
dataframes["F2/F3"] = dataframes_all["F2/F3"]

len(dataframes)  # GDB11, Enamine, known_abx, actives, inactives, F2/F3

In [None]:
def compute_rdkit_fps(smiles_list, label="", min_path=1, max_path=7, fp_size=2048):
    """
    Build a DataFrame of RDKit fingerprints for the given SMILES strings.

    Entries that are not strings, or that RDKit cannot parse into a molecule,
    are dropped. Every remaining row carries the same ``label``.

    :param smiles_list: Iterable of SMILES strings (other values are skipped)
    :param label: Label stored in the "label" column of every row
    :param min_path: Forwarded to ``Chem.RDKFingerprint`` as ``minPath``
    :param max_path: Forwarded to ``Chem.RDKFingerprint`` as ``maxPath``
    :param fp_size: Forwarded to ``Chem.RDKFingerprint`` as ``fpSize``
    :return: DataFrame with "fingerprint" (NumPy array per row) and "label"
        columns; empty (columns only) when nothing could be fingerprinted
    """
    bit_arrays = []

    for smiles in tqdm(smiles_list, desc="Computing RDKit Fingerprints"):
        # Missing values (e.g. NaN) are not strings and cannot be parsed
        if not isinstance(smiles, str):
            continue
        molecule = Chem.MolFromSmiles(smiles)
        # MolFromSmiles yields a falsy result for SMILES it cannot parse
        if not molecule:
            continue
        bit_vector = Chem.RDKFingerprint(
            molecule, minPath=min_path, maxPath=max_path, fpSize=fp_size
        )
        bit_arrays.append(np.array(bit_vector))

    if not bit_arrays:
        # Nothing valid: hand back an empty frame with the expected columns
        return pd.DataFrame(columns=["fingerprint", "label"])

    return pd.DataFrame(
        {"fingerprint": bit_arrays, "label": [label] * len(bit_arrays)}
    )


# One fingerprint DataFrame per dataset, in the order of `dataframes`, with
# every row labelled by the dataset's key
fp_dfs = [
    compute_rdkit_fps(frame["SMILES"], label=name)
    for name, frame in dataframes.items()
]

In [None]:
def compute_tSNE(
    fingerprint_dfs,
    perplexity=45,
    metric="euclidean",
    negative_gradient_method="auto",
    random_state=777,
):
    """
    Perform t-SNE on fingerprints using PCA for initialization.

    :param fingerprint_dfs: List, tuple or array of DataFrames (or a single
        DataFrame) with "fingerprint" and "label" columns
    :param perplexity: Perplexity for t-SNE (default: 45)
    :param metric: Metric for t-SNE (default: "euclidean")
    :param negative_gradient_method: Forwarded to openTSNE's ``TSNE``
        (default: "auto")
    :param random_state: Seed used for both the PCA initialization and the
        t-SNE optimisation (default: 777)
    :return: DataFrame with t-SNE embeddings ("X", "Y") and labels
    """
    # Concatenate all DataFrames
    if isinstance(fingerprint_dfs, (list, tuple, np.ndarray)):
        fingerprint_df = pd.concat(fingerprint_dfs, ignore_index=True)
    else:
        fingerprint_df = fingerprint_dfs  # Assume it's already a single DataFrame

    # Stack the per-row fingerprint arrays into one 2-D matrix
    fp_array = np.vstack(fingerprint_df["fingerprint"])
    labels = fingerprint_df["label"].tolist()

    # Perform PCA for initialization. The seed matters because scikit-learn's
    # automatic solver choice may use a randomized SVD on large inputs, which
    # would otherwise make the initialization differ between runs.
    pca = PCA(n_components=2, random_state=random_state)
    pca_result = pca.fit_transform(fp_array)

    # Perform t-SNE
    tsne = TSNE(
        n_components=2,
        initialization=pca_result,
        negative_gradient_method=negative_gradient_method,
        perplexity=perplexity,
        n_jobs=-1,
        metric=metric,
        verbose=True,
        random_state=random_state,
    )
    tsne_result = tsne.fit(fp_array)

    # Create a DataFrame with t-SNE results; row order matches fingerprint_df
    tsne_df = pd.DataFrame(tsne_result, columns=["X", "Y"])
    tsne_df["label"] = labels
    return tsne_df


tsne_df = compute_tSNE(fp_dfs, perplexity=PERPLEXITY, metric=METRIC)
# Derive the file name from the parameters actually used, so the CSV is not
# mislabelled when PERPLEXITY or METRIC are changed
tsne_df.to_csv(
    f"tsne_perp={PERPLEXITY}_metric={METRIC}_with_F2_and_F3_no_GDB17.csv"
)

In [None]:
# Reuse the embedding settings (PERPLEXITY / METRIC defined at the top of the
# notebook) so the figure file names always describe the embedding plotted
PERP = PERPLEXITY
RASTERIZED = True

# Ensure the 'label' column is parsed correctly: a label stored as the text
# of a list (starts with "[") is reduced to that list's first element
tsne_df["label"] = tsne_df["label"].apply(
    lambda x: ast.literal_eval(x)[0] if isinstance(x, str) and x.startswith("[") else x
)

# Update the last and second-to-last labels.
# NOTE(review): assumes the "F2/F3" dataset was embedded last and contributed
# exactly two rows, F2 followed by F3 — confirm against the input CSV.
label_col = tsne_df.columns.get_loc("label")
tsne_df.iloc[-1, label_col] = "F3"  # Last label
tsne_df.iloc[-2, label_col] = "F2"  # Second-to-last label

# Requires tsne_df with "X", "Y", and "label" columns
fig_path = "./output_jackie/"
colors = ["#808687", "#AE8EB4", "#89c0fa", "#CC3300", "#0096FF", "#52255C", "#CC402A"]
labels_to_plot = [
    "GDB11",
    "Enamine",
    "Inactives",
    "Known Antibiotics",
    "Actives",
    "F2",
    "F3",
]

# Assign colors to labels
palette = dict(zip(labels_to_plot, colors))

# Draw order, bottom to top: (label in tsne_df, legend label, marker style).
# Layers without edgecolor/linewidth fall back to seaborn's defaults.
# NOTE: F2 and F3 are drawn in fixed "orange"/"purple" rather than their
# palette entries, matching the published figure.
layers = [
    ("GDB11", "GDB11", {"color": palette["GDB11"], "alpha": 0.25, "s": 6}),
    ("Enamine", "Enamine", {"color": palette["Enamine"], "alpha": 0.25, "s": 6}),
    ("inactives", "Inactives", {"color": palette["Inactives"], "alpha": 0.8, "s": 6}),
    (
        "actives",
        "Actives",
        {
            "color": palette["Actives"],
            "alpha": 1,
            "edgecolor": "black",
            "linewidth": 0.5,
            "s": 6,
        },
    ),
    (
        "known_abx",
        "Known Antibiotics",
        {
            "color": palette["Known Antibiotics"],
            "alpha": 1,
            "edgecolor": "black",
            "linewidth": 0.5,
            "s": 6,
        },
    ),
    (
        "F2",
        "F2",
        {
            "color": "orange",
            "alpha": 1,
            "edgecolor": "black",
            "linewidth": 0.8,
            "s": 25,
        },
    ),
    (
        "F3",
        "F3",
        {
            "color": "purple",
            "alpha": 1,
            "edgecolor": "black",
            "linewidth": 0.8,
            "s": 25,
        },
    ),
]

plt.figure(figsize=(8, 5), dpi=300)

for data_label, legend_label, style in layers:
    subset = tsne_df[tsne_df["label"] == data_label]
    sns.scatterplot(
        x=subset["X"],
        y=subset["Y"],
        label=legend_label,
        rasterized=RASTERIZED,
        **style,
    )

plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()

# Create the output directory first so savefig does not fail on a fresh checkout
out_dir = Path(fig_path)
out_dir.mkdir(parents=True, exist_ok=True)
fig_stem = f"tsne_perp{PERP}_{METRIC}_with_F2_F3"
for extension in ("png", "svg"):
    plt.savefig(out_dir / f"{fig_stem}.{extension}")
plt.show()