In [None]:
"""
Code function:
1) This code only determines the number of elements in every Pathway, SuperClass and Class and plots them as pie charts.
2) It is intended for use after a database update: instead of classifying all the compounds again, it just
   works from a file that already has the compounds classified by their Pathway, SuperClass and Class.
    The code uses as input a file with the same structure as results_separated.xlsx, which is obtained with the code
    Database classification counts and plots.ipynb

1)This code classifies a natural product from its SMILES strings according to their Pathway, SuperClass and Class.
    Uses the NPClassifier free-access application reported in  https://pubs.acs.org/doi/10.1021/acs.jnatprod.1c00399 .
    It uses the API of NPClassifier, nonetheless, the SMILES can be individually submitted in the following web site: https://npclassifier.ucsd.edu/ .
2) After classifying all the compounds, the individual counts of all the labels obtained in Pathway, SuperClass and Class are determined
and this information is exported into an Excel file.
3)Pie charts are constructed to visualize the previous information.

Requirements/Notes: 
1) The column containing the IDs must have a header labeled 'ID'.
2) The column containing the SMILES must have a header labeled 'Smiles'.
3) Modify in the script the name of the input file.
4) The file xxx_counts_complete.xlsx contain the individual counts of all the compounds.
   The file xxx_counts_modified.xlsx contain the counts of just the first 9 labels with more compounds and the remaining are grouped as 'Others'.
In the blocks of code where the counts are set, modify the number in the following lines of class and superclass 'top_n_class = 9' if it is
intended to keep a different number of top entries and group the remaining as 'Others' in the file xxx_counts_modified.xlsx. 
5) The pie charts use xxx_counts_complete.xlsx as input; nonetheless, the number in the following line of code can be modified on every chart to
keep a different number of top entries and group the remaining ones as 'Others': 'top_n = 10'.
"""
import pandas as pd

# -----------------------------
# PATHWAY
# -----------------------------
# Load the already-classified compounds. `df` is reused by the SuperClass and
# Class cells further down, so it stays a notebook-level variable.
df = pd.read_excel("Name of the input file.xlsx") # HERE MODIFY THE NAME AND TYPE OF THE INPUT FILE

# Collect every column whose header begins with the "Pathway_" prefix.
pathway_cols = list(filter(lambda name: name.startswith("Pathway_"), df.columns))

if len(pathway_cols) == 0:
    raise ValueError("No columns starting with 'Pathway_' were found in the file.")

# Placeholder strings that must not be counted as labels: blank cells,
# stringified missing values and the classifier's error tags.
# Also reused by the SuperClass and Class cells.
invalid_values = {"", "nan", "NaN", "None", "not_classified", "try_again_error", "time_error"}

# Stack the Pathway_ columns into one long "Pathway" column and normalise each
# entry to a whitespace-trimmed string (missing cells become the text "nan").
df_melted = df.melt(value_vars=pathway_cols, value_name="Pathway").assign(
    Pathway=lambda frame: frame["Pathway"].astype(str).str.strip()
)

# Keep only the rows that carry a real label.
df_melted = df_melted.loc[lambda frame: ~frame["Pathway"].isin(invalid_values)]

# Tally the labels (most frequent first) into a two-column table.
counts = (
    df_melted["Pathway"]
    .value_counts()
    .rename_axis("Pathway")
    .reset_index(name="counts")
)

# Export the table; the pie-chart cell reads this file back.
counts.to_excel("Pathway_counts.xlsx", index=False)

print("✅ File 'Pathway_counts.xlsx' created successfully with", len(counts), "unique pathways.")
print(counts.head(10))  # Preview first 10 rows


In [None]:
# -----------------------------
# SUPERCLASS
# -----------------------------
# Relies on `df` and `invalid_values` created in the Pathway cell.

# Collect every column whose header begins with the "SuperClass_" prefix.
super_cols = list(filter(lambda name: name.startswith("SuperClass_"), df.columns))

if len(super_cols) == 0:
    raise ValueError("No columns starting with 'SuperClass_' were found in the file.")

# Stack the SuperClass_ columns into one long "SuperClass" column and normalise
# each entry to a whitespace-trimmed string (missing cells become "nan").
df_super = df.melt(value_vars=super_cols, value_name="SuperClass").assign(
    SuperClass=lambda frame: frame["SuperClass"].astype(str).str.strip()
)

# Keep only the rows that carry a real label.
df_super = df_super.loc[lambda frame: ~frame["SuperClass"].isin(invalid_values)]

# Tally the labels (most frequent first) into a two-column table.
super_counts = (
    df_super["SuperClass"]
    .value_counts()
    .rename_axis("SuperClass")
    .reset_index(name="counts")
)

# Export the table; the pie-chart cell reads this file back.
super_counts.to_excel("SuperClass_counts.xlsx", index=False)

print("✅ File 'SuperClass_counts.xlsx' created successfully with", len(super_counts), "unique superclasses.")
print(super_counts.head(10))  # Preview first 10 rows


In [None]:
# -----------------------------
# CLASS
# -----------------------------
# Relies on `df` and `invalid_values` created in the Pathway cell.

# Collect every column whose header begins with the "Class_" prefix
# ("SuperClass_..." headers do not match because the test is a prefix test).
class_cols = list(filter(lambda name: name.startswith("Class_"), df.columns))

if len(class_cols) == 0:
    raise ValueError("No columns starting with 'Class_' were found in the file.")

# Stack the Class_ columns into one long "Class" column and normalise each
# entry to a whitespace-trimmed string (missing cells become "nan").
df_class = df.melt(value_vars=class_cols, value_name="Class").assign(
    Class=lambda frame: frame["Class"].astype(str).str.strip()
)

# Keep only the rows that carry a real label.
df_class = df_class.loc[lambda frame: ~frame["Class"].isin(invalid_values)]

# Tally the labels (most frequent first) into a two-column table.
class_counts = (
    df_class["Class"]
    .value_counts()
    .rename_axis("Class")
    .reset_index(name="counts")
)

# Export the table; the pie-chart cell reads this file back.
class_counts.to_excel("Class_counts.xlsx", index=False)

print("✅ File 'Class_counts.xlsx' created successfully with", len(class_counts), "unique classes.")
print(class_counts.head(10))  # Preview first 10 rows


In [None]:
import plotly.express as px
import plotly
import kaleido
import os
import pandas as pd

# Pie chart of the Pathway counts: keeps the `top_n` most frequent labels,
# groups the rest as "Others", shows the figure and exports it as a PNG.

# Create the folder for images; exist_ok=True makes this a no-op when the
# folder is already there (no separate existence check needed).
os.makedirs("images", exist_ok=True)

# Read data from Excel (file written by the Pathway counting cell)
dfi = pd.read_excel("Pathway_counts.xlsx")

# ============================
# Parameter: number of top elements to keep
# Change this number (default = 10)
# ============================
top_n = 10   # <<< Change here (e.g., 15, 20...)

# Sort values by counts (descending)
dfi_sorted = dfi.sort_values(by="counts", ascending=False).reset_index(drop=True)

# Keep top_n and group the rest as "Others"
top_data = dfi_sorted.iloc[:top_n].copy()
others_sum = dfi_sorted.iloc[top_n:]["counts"].sum()

# Only add the "Others" slice when there is something left to group.
if others_sum > 0:
    top_data = pd.concat(
        [
            top_data,
            pd.DataFrame({"Pathway": ["Others"], "counts": [others_sum]}),
        ],
        ignore_index=True,  # keep a clean 0..n index instead of a duplicated label 0
    )

# Create pie chart (no hole to avoid white center)
fig = px.pie(
    top_data,
    names="Pathway",
    values="counts",
    title="Pathway"
)

# Update pie slices (percentages inside chart)
fig.update_traces(
    textinfo="percent",
    textfont_size=28   # <<< Change size of percentages here
)

# Update layout (title, legend, margins)
fig.update_layout(
    title=dict(
        text="Pathway",
        x=0.2,     # <<< Horizontal position of the title
        y=0.9,     # <<< Vertical position of the title
        xanchor="right",
        yanchor="top",
        font=dict(size=46)  # <<< Title font size
    ),
    font_size=20,
    legend_title_text="", 
    legend=dict(
        orientation="v",
        yanchor="middle",   # <<< Anchor point of legend relative to y
        y=0.5,              # <<< Vertical position of legend
        xanchor="left",     # <<< Anchor point of legend relative to x
        x=1,                # <<< Horizontal position of legend
        font=dict(size=34)  # <<< Legend label text size
    ),
    width=1400,
    height=800,
    margin=dict(t=150, b=100, l=80, r=300)  # <<< Adjust spacing around figure
)

# Show interactive chart
fig.show()

# ============================
# Export for publication
# ============================

# Save normal image (the width given here is used for the export instead of
# the 1400 px layout width set above)
fig.write_image(
    "images/1_pathway.png",
    format="png",
    width=1250,
    height=800,
    scale=1   # <<< Scale factor here
)


In [None]:
import plotly.express as px
import plotly
import kaleido
import os
import pandas as pd

# Pie chart of the SuperClass counts: keeps the `top_n` most frequent labels,
# groups the rest as "Others", shows the figure and exports it as a PNG.

# Create the folder for images; exist_ok=True makes this a no-op when the
# folder is already there (no separate existence check needed).
os.makedirs("images", exist_ok=True)

# Read data from Excel (file written by the SuperClass counting cell)
dfi = pd.read_excel("SuperClass_counts.xlsx")

# ============================
# Parameter: number of top elements to keep
# Change this number (default = 10)
# ============================
top_n = 10   # <<< Change here (e.g., 15, 20...)

# Sort values by counts (descending)
dfi_sorted = dfi.sort_values(by="counts", ascending=False).reset_index(drop=True)

# Keep top_n and group the rest as "Others"
top_data = dfi_sorted.iloc[:top_n].copy()
others_sum = dfi_sorted.iloc[top_n:]["counts"].sum()

# Only add the "Others" slice when there is something left to group.
if others_sum > 0:
    top_data = pd.concat(
        [
            top_data,
            pd.DataFrame({"SuperClass": ["Others"], "counts": [others_sum]}),
        ],
        ignore_index=True,  # keep a clean 0..n index instead of a duplicated label 0
    )

# Create pie chart (no hole to avoid white center)
fig = px.pie(
    top_data,
    names="SuperClass",
    values="counts",
    title="SuperClass"
)

# Update pie slices (percentages inside chart)
fig.update_traces(
    textinfo="percent",
    textfont_size=28   # <<< Change size of percentages here
)

# Update layout (title, legend, margins)
fig.update_layout(
    title=dict(
        text="SuperClass",
        x=0.2,     # <<< Horizontal position of the title
        y=0.9,     # <<< Vertical position of the title
        xanchor="right",
        yanchor="top",
        font=dict(size=46)  # <<< Title font size
    ),
    font_size=20,
    legend_title_text="", 
    legend=dict(
        orientation="v",
        yanchor="middle",   # <<< Anchor point of legend relative to y
        y=0.5,              # <<< Vertical position of legend
        xanchor="left",     # <<< Anchor point of legend relative to x
        x=1,                # <<< Horizontal position of legend
        font=dict(size=34)  # <<< Legend label text size
    ),
    width=1400,
    height=800,
    margin=dict(t=150, b=100, l=80, r=300)  # <<< Adjust spacing around figure
)

# Show interactive chart
fig.show()

# ============================
# Export for publication
# ============================

# Save normal image (the width given here is used for the export instead of
# the 1400 px layout width set above)
fig.write_image(
    "images/2_superclass.png",  # <<< File name
    format="png",
    width=1250,
    height=800,
    scale=1   # <<< Scale factor (1 = normal resolution, 2 = double, etc.)
)


In [None]:
import plotly.express as px
import plotly
import kaleido
import os
import pandas as pd

# Pie chart of the Class counts: keeps the `top_n` most frequent labels,
# groups the rest as "Others", shows the figure and exports it as a PNG.

# Create the folder for images; exist_ok=True makes this a no-op when the
# folder is already there (no separate existence check needed).
os.makedirs("images", exist_ok=True)

# Read data from Excel (file written by the Class counting cell)
dfi = pd.read_excel("Class_counts.xlsx")

# ============================
# Parameter: number of top elements to keep
# Change this number (default = 10)
# ============================
top_n = 10   # <<< Change here (e.g., 15, 20...)

# Sort values by counts (descending)
dfi_sorted = dfi.sort_values(by="counts", ascending=False).reset_index(drop=True)

# Keep top_n and group the rest as "Others"
top_data = dfi_sorted.iloc[:top_n].copy()
others_sum = dfi_sorted.iloc[top_n:]["counts"].sum()

# Only add the "Others" slice when there is something left to group.
if others_sum > 0:
    top_data = pd.concat(
        [
            top_data,
            pd.DataFrame({"Class": ["Others"], "counts": [others_sum]}),
        ],
        ignore_index=True,  # keep a clean 0..n index instead of a duplicated label 0
    )

# Create pie chart (no hole to avoid white center)
fig = px.pie(
    top_data,
    names="Class",
    values="counts",
    title="Class"
)

# Update pie slices (percentages inside chart)
fig.update_traces(
    textinfo="percent",
    textfont_size=28   # <<< Change size of percentages here
)

# Update layout (title, legend, margins)
fig.update_layout(
    title=dict(
        text="Class",
        x=0.2,     # <<< Horizontal position of the title
        y=0.9,     # <<< Vertical position of the title
        xanchor="right",
        yanchor="top",
        font=dict(size=46)  # <<< Title font size
    ),
    font_size=20,
    legend_title_text="", 
    legend=dict(
        orientation="v",
        yanchor="middle",   # <<< Anchor point of legend relative to y
        y=0.5,              # <<< Vertical position of legend
        xanchor="left",     # <<< Anchor point of legend relative to x
        x=1,                # <<< Horizontal position of legend
        font=dict(size=34)  # <<< Legend label text size
    ),
    width=1400,
    height=800,
    margin=dict(t=150, b=100, l=80, r=300)  # <<< Adjust spacing around figure
)

# Show interactive chart
fig.show()

# ============================
# Export for publication
# ============================

# Save normal image (the width given here is used for the export instead of
# the 1400 px layout width set above)
fig.write_image(
    "images/3_class.png",  # <<< File name changed
    format="png",
    width=1250,
    height=800,
    scale=1   # <<< Scale factor (1 = normal resolution, 2 = double, etc.)
)
