In [18]:
import pandas as pd

In [45]:
df=pd.read_excel('Elastasa_curated_with_assay.xlsx')

In [46]:
df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,pIC50,label,NEW_SMILES,assay_chembl_id
0,CHEMBL10008,CC(C)(C)C(=O)Oc1ccccc1C(=O)OCc1cc(=O)c(OC(=O)C...,5.267606,0,CC(C)(C)C(=O)Oc1ccccc1C(=O)OCc1cc(=O)c(OC(=O)C...,CHEMBL675722
1,CHEMBL10034,CC(C)(C)C(=O)Oc1ccc(C(=O)c2ccc(Cl)cc2)cc1,6.853872,1,CC(C)(C)C(=O)Oc1ccc(C(=O)c2ccc(Cl)cc2)cc1,CHEMBL675723
2,CHEMBL10089,CC(C)(C)C(=O)Oc1ccc(C(=O)OCc2cc(=O)c(OC(=O)C(C...,6.69897,1,CC(C)(C)C(=O)Oc1ccc(C(=O)OCc2cc(=O)c(OC(=O)C(C...,CHEMBL675723
3,CHEMBL10148,CC(C)(C)C(=O)Oc1ccc(C(OC(=O)CCC(=O)[O-])c2ccc(...,5.522879,0,CC(C)(C)C(=O)Oc1ccc(C(OC(=O)CCC(=O)O)c2ccc(OC(...,CHEMBL675722
4,CHEMBL10157,CC(C)(C)C(=O)Oc1ccc(Oc2ccc(OC(=O)C(C)(C)C)cc2)cc1,6.346787,1,CC(C)(C)C(=O)Oc1ccc(Oc2ccc(OC(=O)C(C)(C)C)cc2)cc1,CHEMBL675723


In [47]:
#Manipulaci칩n de datos y gr치ficas
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#Quimioinform치tica
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
PandasTools.RenderImagesInAllDataFrames(images=True) #Para visualizar mol칠culas

In [48]:
#Nueva columna de objeto tipo Mol
PandasTools.AddMoleculeColumnToFrame(df, "NEW_SMILES", "ROMol")

In [49]:
#C치lculo del resto de los descriptores a analizar
df["MW"] = df["ROMol"].apply(Descriptors.ExactMolWt)
df["HBA"] = df["ROMol"].apply(Descriptors.NumHAcceptors)
df["HBD"] = df["ROMol"].apply(Descriptors.NumHDonors)
df["logP"] = df["ROMol"].apply(Descriptors.MolLogP)
df["TPSA"] = df["ROMol"].apply(Descriptors.TPSA)
df["CSP3"] = df["ROMol"].apply(Descriptors.FractionCSP3)
df["NumRings"] = df["ROMol"].apply(Descriptors.RingCount)
df["HetAtoms"] = df["ROMol"].apply(Descriptors.NumHeteroatoms)
df["RotBonds"] = df["ROMol"].apply(Descriptors.NumRotatableBonds)
#Visualizar columnas seleccionadas
df[['MW', 'logP', 'TPSA', 'HBA']]

Unnamed: 0,MW,logP,TPSA,HBA
0,430.162768,3.8998,109.11,8
1,316.086622,4.5225,43.37,3
2,430.162768,3.8998,109.11,8
3,484.209718,5.0870,116.20,7
4,370.178024,5.3819,61.83,5
...,...,...,...,...
1812,368.162374,4.1323,80.67,4
1813,482.230453,5.7721,95.97,7
1814,630.267627,5.8028,161.71,12
1815,354.146724,3.8863,80.67,4


In [50]:
# ================================
# 游댢 SETUP
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

sns.set(style="whitegrid", context="notebook", font_scale=1.2)

# Convert label values
df["label"] = df["label"].map({0: "Inactive", 1: "Active"})

descriptors = ["MW", "HBA", "HBD", "logP", "TPSA", "CSP3", "NumRings", "HetAtoms", "RotBonds"]

# Create output directory
import os
os.makedirs("EDA_PLOT", exist_ok=True)

# Function to save figures
def save(fig, name):
    fig.savefig(f"EDA_PLOT/{name}.png", dpi=300, bbox_inches="tight")
    plt.close(fig)

In [53]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as Patch

sns.set_theme(style="whitegrid", rc={
    "axes.edgecolor": "0.25",
    "axes.linewidth": 1.1,
    "grid.linewidth": 0.6,
    "grid.color": "0.88",
})

# Categorical label
df["label"] = pd.Categorical(df["label"], categories=["Inactive", "Active"], ordered=True)

# Custom palette
palette = sns.color_palette("husl", 2)
color_map = {"Inactive": palette[0], "Active": palette[1]}

for col in descriptors:

    # Skip low-variance columns
    valid_values = df[col].dropna()
    if valid_values.nunique() < 2:
        print(f"SKIPPED: {col} (not enough unique values)")
        continue

    fig, ax = plt.subplots(figsize=(8, 5), dpi=300)

    # ----- HISTOGRAM -----
    sns.histplot(
        data=df,
        x=col,
        hue="label",
        bins=30,
        stat="density",
        alpha=0.45,
        edgecolor="black",
        linewidth=0.8,
        palette=color_map,
        ax=ax,
        legend=False
    )

    # ----- KDE SEPARADA (robusto y m치s limpio) -----
    sns.kdeplot(
        data=df,
        x=col,
        hue="label",
        ax=ax,
        linewidth=1.7,
        alpha=0.9,
        palette=color_map,
        legend=False
    )

    # ----- TITLES / LABELS -----
    ax.set_title(f"Distribution of {col}", fontsize=18, fontweight="bold", pad=15)
    ax.set_xlabel(col, fontsize=15)
    ax.set_ylabel("Density", fontsize=15)

    # ----- LEGEND MANUAL (siempre funciona) -----
    handles = [
        Patch.Patch(color=color_map["Inactive"], label="Inactive"),
        Patch.Patch(color=color_map["Active"], label="Active")
    ]

    leg = ax.legend(
        handles=handles,
        title="Activity",
        fontsize=12,
        title_fontsize=13,
        loc="upper right",
        frameon=True
    )

    leg.get_frame().set_linewidth(0.8)
    leg.get_frame().set_edgecolor("0.4")
    leg.get_frame().set_alpha(0.85)

    # ----- STYLE -----
    sns.despine(offset=10, trim=True)
    plt.tight_layout()

    # ----- SAVE -----
    save(fig, f"hist_{col}")




In [54]:
import seaborn as sns
import matplotlib.pyplot as plt

# Global aesthetic theme
sns.set_theme(style="whitegrid", rc={
    "axes.edgecolor": "0.3",
    "axes.linewidth": 1.1,
    "grid.color": "0.9",
    "grid.linewidth": 0.8,
})

palette = {"Inactive": "#5DA5DA", "Active": "#F17CB0"}  # harmonious colors

for col in descriptors:

    # Skip low-variance columns
    if df[col].dropna().nunique() < 2:
        print(f"Skipping {col}: not enough variation")
        continue

    fig, ax = plt.subplots(figsize=(7, 5), dpi=300)

    # ----- Main Boxplot -----
    sns.boxplot(
        data=df,
        x="label",
        y=col,
        palette=palette,
        linewidth=1.4,
        fliersize=0,     # remove default outliers for cleaner look
        ax=ax
    )

    # ----- OPTIONAL: Overlay swarmplot for individual points -----
    sns.stripplot(
        data=df,
        x="label",
        y=col,
        hue=None,
        dodge=False,
        alpha=0.35,
        jitter=0.15,
        color="black",
        size=3,
        ax=ax
    )

    # ----- Titles and labels -----
    ax.set_title(f"{col}: Distribution by Activity", fontsize=18, fontweight="bold", pad=12)
    ax.set_xlabel("Activity Class", fontsize=14)
    ax.set_ylabel(col, fontsize=14)

    # Clean legend and spines
    sns.despine(offset=10, trim=True)

    plt.tight_layout()
    save(fig, f"box_{col}")

    print(f"Saved box_{col}")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(


Saved box_MW



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(


Saved box_HBA



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(


Saved box_HBD



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(


Saved box_logP



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(


Saved box_TPSA



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(


Saved box_CSP3



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(


Saved box_NumRings



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(


Saved box_HetAtoms



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(


Saved box_RotBonds


In [55]:
import seaborn as sns
import matplotlib.pyplot as plt

# Elegant global style
sns.set_theme(style="whitegrid", rc={
    "axes.edgecolor": "0.25",
    "axes.linewidth": 1.2,
    "grid.color": "0.9",
    "grid.linewidth": 0.8,
})

# Professional palette
palette = {"Inactive": "#5DA5DA", "Active": "#F17CB0"}  # blue / magenta

for col in descriptors:

    # Skip columns with insufficient variability
    if df[col].dropna().nunique() < 2:
        print(f"Skipping {col}: not enough variation")
        continue

    fig, ax = plt.subplots(figsize=(7, 5), dpi=300)

    # ---- Violin plot ----
    sns.violinplot(
        data=df,
        x="label",
        y=col,
        palette=palette,
        inner="quartile",        # draws quartile lines
        linewidth=1.3,
        cut=0,                   # no extension beyond data range
        scale="width",           # cleaner proportions
        ax=ax
    )

    # ---- OPTIONAL SWARM OVERLAY (Highly Recommended) ----
    sns.stripplot(
        data=df,
        x="label",
        y=col,
        color="black",
        size=3,
        jitter=0.15,
        alpha=0.32,
        ax=ax
    )

    # ---- Titles and labels ----
    ax.set_title(
        f"{col}: Violin Plot by Activity",
        fontsize=18,
        fontweight="bold",
        pad=12
    )
    ax.set_xlabel("Activity", fontsize=14)
    ax.set_ylabel(col, fontsize=14)

    sns.despine(offset=10, trim=True)

    plt.tight_layout()
    save(fig, f"violin_{col}")

    print(f"Saved violin_{col}")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(


Saved violin_MW



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(


Saved violin_HBA



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(


Saved violin_HBD



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(


Saved violin_logP



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(


Saved violin_TPSA



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(


Saved violin_CSP3



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(


Saved violin_NumRings



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(


Saved violin_HetAtoms



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(


Saved violin_RotBonds


In [56]:
import seaborn as sns
import matplotlib.pyplot as plt

# High-quality style
sns.set_theme(style="whitegrid", rc={
    "axes.edgecolor": "0.25",
    "axes.linewidth": 1.2,
    "grid.color": "0.90",
    "grid.linewidth": 0.8,
    "lines.linewidth": 2.0
})

palette = {"Inactive": "#4C72B0", "Active": "#DD8452"}  # Blue / Orange

for col in descriptors:

    # Skip if feature has too few unique values
    if df[col].dropna().nunique() < 2:
        print(f"Skipping {col}: not enough variation.")
        continue

    fig, ax = plt.subplots(figsize=(8, 5), dpi=300)

    # --- ECDF Plot ---
    sns.ecdfplot(
        data=df,
        x=col,
        hue="label",
        palette=palette,
        ax=ax,
        linewidth=2.2
    )

    # --- Optional Shading Behind Curve ---
    # This visually enhances the plot
    for lab in ["Inactive", "Active"]:
        subset = df[df["label"] == lab][col].dropna().sort_values()
        if len(subset) > 1:
            ax.fill_between(
                subset,
                np.linspace(0, 1, len(subset)),
                alpha=0.12,
                color=palette[lab]
            )

    # --- Titles & Labels ---
    ax.set_title(
        f"ECDF of {col}",
        fontsize=18,
        fontweight="bold",
        pad=12
    )
    ax.set_xlabel(col, fontsize=14)
    ax.set_ylabel("Cumulative Probability", fontsize=14)

    # --- Legend Styling ---
    leg = ax.legend(
        title="Activity",
        fontsize=12,
        title_fontsize=13,
        loc="lower right",
        frameon=True
    )
    leg.get_frame().set_linewidth(0.8)
    leg.get_frame().set_edgecolor("0.4")
    leg.get_frame().set_alpha(0.85)

    sns.despine(offset=10, trim=True)

    plt.tight_layout()
    save(fig, f"ecdf_{col}")

    print(f"Saved ecdf_{col}")


  leg = ax.legend(


Saved ecdf_MW


  leg = ax.legend(


Saved ecdf_HBA


  leg = ax.legend(


Saved ecdf_HBD


  leg = ax.legend(


Saved ecdf_logP


  leg = ax.legend(


Saved ecdf_TPSA


  leg = ax.legend(


Saved ecdf_CSP3


  leg = ax.legend(


Saved ecdf_NumRings


  leg = ax.legend(


Saved ecdf_HetAtoms


  leg = ax.legend(


Saved ecdf_RotBonds


In [60]:
import seaborn as sns
import matplotlib.pyplot as plt

# Professional global theme
sns.set_theme(
    style="white",
    rc={
        "axes.edgecolor": "0.25",
        "axes.linewidth": "1.0",
        "grid.color": "0.9",
        "grid.linewidth": 0.7,
        "xtick.labelsize": 9,
        "ytick.labelsize": 9,
        "font.size": 10,
    }
)

# High-quality palette
palette = {"Inactive": "#4C72B0", "Active": "#DD8452"}

# --- Create pairplot ---
g = sns.pairplot(
    df,
    vars=descriptors,
    hue="label",
    palette=palette,
    diag_kind="kde",
    corner=False,
    height=2.2,
    plot_kws={
        "alpha": 0.55,
        "s": 18,
        "edgecolor": "black",
        "linewidth": 0.4
    },
    diag_kws={
        "fill": True,       # FIX: replaces deprecated "shade"
        "alpha": 0.35,
        "linewidth": 1.2
    }
)

# Title
g.fig.suptitle(
    "Pairplot of Molecular Descriptors",
    fontsize=18,
    fontweight="bold",
    y=1.02
)

# Improve legend
if g._legend:
    g._legend.set_title("Activity Class")
    frame = g._legend.get_frame()
    frame.set_edgecolor("0.4")
    frame.set_linewidth(0.8)
    frame.set_alpha(0.8)

# Save
g.savefig("EDA_PLOT/pairplot.png", dpi=300, bbox_inches="tight")
plt.close()
print("Saved: EDA_PLOT/pairplot.png")



Saved: EDA_PLOT/pairplot.png
