In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

# 1. Example data: a toy table of 18 medicinal plant species with simulated attributes

plants = [
    "Papaver somniferum",
    "Atropa belladonna",
    "Catharanthus roseus",
    "Digitalis purpurea",
    "Taxus brevifolia",
    "Erythroxylum coca",
    "Cinchona officinalis",
    "Salix alba",
    "Ginkgo biloba",
    "Curcuma longa",
    "Artemisia annua",
    "Panax ginseng",
    "Rauvolfia serpentina",
    "Ephedra sinica",
    "Camellia sinensis",
    "Hypericum perforatum",
    "Valeriana officinalis",
    "Zingiber officinale",
]

growth_forms = ["herb", "shrub", "tree"]
native_areas = ["Asia", "Europe", "South America", "Africa", "North America"]

# Seed NumPy's global generator so the simulated columns are identical on every run.
np.random.seed(42)

# Start from the plant names and attach one randomly drawn column at a time.
# The keyword arguments are evaluated left to right, so the random draws happen
# in column order: growth_form, native_area, traditional_use, known_bioactive.
df = pd.DataFrame({"plant": plants}).assign(
    growth_form=np.random.choice(growth_forms, len(plants)),
    native_area=np.random.choice(native_areas, len(plants)),
    traditional_use=np.random.choice([0, 1], len(plants)),
    known_bioactive=np.random.choice([0, 1], len(plants)),
)

# Label vocabulary 1: the 16 bioactive compounds a plant can be tagged with
compound_classes = [
    "vincristine", "morphine", "atropine", "quinine", "digitoxin", "artemisinin",
    "resveratrol", "curcumin", "caffeine", "ephedrine", "ginsenoside", "salicin",
    "taxol", "hypericin", "reserpine", "gingerol",
]

# Label vocabulary 2: the 18 pharmacological activities a plant can be tagged with
# (random_labels below draws 1 or 2 of them per plant with its default n_labels=2)
activities = [
    "anti-inflammatory", "antibacterial", "antiviral",
    "anticancer", "analgesic", "sedative",
    "stimulant", "antidepressant", "antimalarial",
    "antipyretic", "hypotensive", "immunomodulatory",
    "hepatoprotective", "cardiotonic", "neuroprotective",
    "diuretic", "antidiabetic", "anxiolytic",
]

# generate random multi-labels
def random_labels(label_list, n_labels=2):
    """Draw between 1 and ``n_labels`` distinct labels at random from ``label_list``.

    Parameters
    ----------
    label_list : sequence
        Candidate labels (compound names or pharmacological activities).
    n_labels : int, default 2
        Inclusive upper bound on how many labels are drawn.

    Returns
    -------
    list
        The drawn labels, with no label repeated.
    """
    # The count is drawn first, then the labels, both from NumPy's global generator.
    # randint's upper bound is exclusive, hence the +1 to make n_labels reachable.
    how_many = np.random.randint(1, n_labels + 1)
    drawn = np.random.choice(label_list, size=how_many, replace=False)
    return list(drawn)

# Give every plant a random list of compounds, then a random list of activities.
# All compound lists are drawn before any activity list, one draw per dataframe row.
n_rows = len(df)
df["bioactive_compounds"] = [random_labels(compound_classes) for _ in range(n_rows)]
df["pharma_activities"] = [random_labels(activities) for _ in range(n_rows)]

# 2. Encode the inputs and the two multi-label targets
#    (targets: bioactive compounds and pharmacological activities)

# Inputs: one-hot encode the categorical columns; numeric 0/1 columns pass through as-is.
X = pd.get_dummies(
    df.loc[:, ["growth_form", "native_area", "traditional_use", "known_bioactive"]]
)

# Targets: each MultiLabelBinarizer turns a column of label lists into a 0/1 matrix
# with one column per distinct label; a separate encoder is kept per target so the
# label names of each can be recovered later from its classes_ attribute.
mlb_comp = MultiLabelBinarizer()
Y_comp = mlb_comp.fit_transform(df["bioactive_compounds"])

mlb_pharma = MultiLabelBinarizer()
Y_pharma = mlb_pharma.fit_transform(df["pharma_activities"])

# Place the two target matrices side by side: compound columns first, then activities.
Y = np.hstack((Y_comp, Y_pharma))

# 3. Train/test split: hold out 20% of the rows, with a fixed seed for repeatability

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# 4. Train the multi-output model

# MultiOutputClassifier fits one copy of the wrapped estimator per target column,
# so each label gets its own random forest.
model = RandomForestClassifier(n_estimators=100, random_state=42)
multi_model = MultiOutputClassifier(estimator=model)
multi_model.fit(X_train, Y_train)

# 5. predict and Evaluate

Y_pred = multi_model.predict(X_test)

# Y was built as [compound columns | activity columns], so predictions and ground
# truth are split back at the same boundary: the number of compound columns.
n_comp = Y_comp.shape[1]
Y_pred_comp = Y_pred[:, :n_comp]  # first n_comp columns are for bioactive compounds
Y_pred_pharma = Y_pred[:, n_comp:]  # remaining columns are for pharmacological activities
Y_test_comp = Y_test[:, :n_comp]  # same split for the test set
Y_test_pharma = Y_test[:, n_comp:]

# zero_division=0: the held-out set is tiny (20% of 18 plants), so many labels have
# no true or no predicted positives. Their precision/recall are undefined; report
# them as 0 explicitly instead of emitting an UndefinedMetricWarning per label.
print("\n Predicted Bioactive Compounds:")
print(classification_report(Y_test_comp, Y_pred_comp, target_names=mlb_comp.classes_, zero_division=0))

print("\n Predicted Pharmacological Activities:")
print(classification_report(Y_test_pharma, Y_pred_pharma, target_names=mlb_pharma.classes_, zero_division=0))

In [None]:
# Wider dataset: same pipeline as above, but on 120 simulated plants

# 1. Dataset generation: category values and label vocabularies

growth_forms = ["herb", "shrub", "tree"]
native_areas = ["Asia", "Europe", "South America", "Africa", "North America"]

# 16 bioactive compounds
compound_classes = [
    "vincristine", "morphine", "atropine", "quinine",
    "digitoxin", "artemisinin", "resveratrol", "curcumin",
    "caffeine", "ephedrine", "ginsenoside", "salicin",
    "taxol", "hypericin", "reserpine", "gingerol",
]

# 18 pharmacological activities
pharma_activities = [
    "anti-inflammatory", "antibacterial", "antiviral",
    "anticancer", "analgesic", "sedative",
    "stimulant", "antidepressant", "antimalarial",
    "antipyretic", "hypotensive", "immunomodulatory",
    "hepatoprotective", "cardiotonic", "neuroprotective",
    "diuretic", "antidiabetic", "anxiolytic",
]

def random_labels(label_list, min_labels=1, max_labels=3):
    """Draw a random number of distinct labels from ``label_list``.

    The number of labels is chosen uniformly between ``min_labels`` and
    ``max_labels`` (both inclusive); the labels are then sampled without
    replacement. Both draws use NumPy's global random generator.

    Returns
    -------
    list
        The drawn labels, with no label repeated.
    """
    # randint excludes its upper bound, so +1 keeps max_labels reachable.
    label_count = np.random.randint(min_labels, max_labels + 1)
    chosen = np.random.choice(label_list, size=label_count, replace=False)
    return list(chosen)

n_samples = 120

# Seed NumPy's global generator so the simulated table is identical on every run.
np.random.seed(42)

# The keyword arguments below are evaluated top to bottom, so the random draws
# are consumed in column order; reordering the columns would change the data.
df = pd.DataFrame(
    dict(
        plant=[f"Plant_{number}" for number in range(1, n_samples + 1)],
        growth_form=np.random.choice(growth_forms, n_samples),
        native_area=np.random.choice(native_areas, n_samples),
        traditional_use=np.random.choice([0, 1], n_samples),
        known_bioactive=np.random.choice([0, 1], n_samples),
        # 1 to 3 compounds per plant
        bioactive_compounds=[
            random_labels(compound_classes, min_labels=1, max_labels=3)
            for _ in range(n_samples)
        ],
        # 2 to 4 activities per plant
        pharma_activities=[
            random_labels(pharma_activities, min_labels=2, max_labels=4)
            for _ in range(n_samples)
        ],
    )
)

# 2. Encode inputs and outputs

# Inputs: one-hot encode the categorical columns of the feature subset.
X = pd.get_dummies(
    df.loc[:, ["growth_form", "native_area", "traditional_use", "known_bioactive"]]
)

# Outputs: one binarizer per target, each producing a 0/1 matrix with one column
# per distinct label found in that target's lists.
mlb_comp = MultiLabelBinarizer()
Y_comp = mlb_comp.fit_transform(df["bioactive_compounds"])

mlb_pharma = MultiLabelBinarizer()
Y_pharma = mlb_pharma.fit_transform(df["pharma_activities"])

# Combine both outputs column-wise: compound columns first, then activity columns.
Y = np.hstack((Y_comp, Y_pharma))

# 3. Train/test split: hold out 20% of the rows, with a fixed seed for repeatability

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# 4. Train the multi-output random forest model

# class_weight='balanced' weights each class inversely to its frequency in the
# training data. Every label column is mostly zeros (a given compound or activity
# is absent from most plants), so without reweighting the forest would mainly
# learn to predict "absent" and miss the rare positive cases.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
multi_model = MultiOutputClassifier(estimator=model)
multi_model.fit(X_train, Y_train)

# 5. Predict and evaluate

Y_pred = multi_model.predict(X_test)

# Y holds the compound columns first and the activity columns after them, so both
# the ground truth and the predictions are cut at the number of compound columns.
n_comp = Y_comp.shape[1]
Y_test_comp, Y_test_pharma = Y_test[:, :n_comp], Y_test[:, n_comp:]
Y_pred_comp, Y_pred_pharma = Y_pred[:, :n_comp], Y_pred[:, n_comp:]

# One report per target; zero_division=0 reports undefined metrics as 0.
print("\n Predicted Bioactive Compounds:")
print(
    classification_report(
        Y_test_comp,
        Y_pred_comp,
        target_names=mlb_comp.classes_,
        zero_division=0,
    )
)

print("\n Predicted Pharmacological Activities:")
print(
    classification_report(
        Y_test_pharma,
        Y_pred_pharma,
        target_names=mlb_pharma.classes_,
        zero_division=0,
    )
)

precision: of all predicted positives, how many were correct? (low = many false positives)

recall: of all actual positives, how many did we catch? (low = many false negatives)

f1-score: harmonic mean of precision and recall (between 0 and 1, 0 bad - 1 great)

support: how many times the true label appeared in the test set


- most of the compounds have all metrics equal to 0: the model failed to identify them in the test set
- e.g. caffeine: precision is 14% (only 14% of the model's caffeine predictions were correct) and recall is 25% (the model caught only 25% of the actual caffeine cases), so the overall performance is low

## Why doesn't the model work well? Possible problems

- too little data
- the data needs to be more meaningful: the inputs X should contain information that is actually useful for predicting the outputs Y. Maybe we need chemical structures, plant families, molecular pathways, ...
- the compounds and activities need to be assigned to the plants logically (here they are assigned at random, so there is no real relationship for the model to learn)

In [1]:
# import dataset with real data
import pandas as pd
import csv  # NOTE(review): `csv` is not used in any cell visible here — confirm before removing

# Read the plant table into a dataframe. The path is relative, so the file must be
# in the kernel's current working directory when this cell runs.
df = pd.read_csv("pfaf_plants_merged.csv")

In [2]:
# delete non useful columns
# errors="ignore" keeps this cell re-runnable: on a second execution the columns
# are already gone, and without it the drop would raise a KeyError.
df = df.drop(
    columns=[
        "Common Name",
        "Medicinal Rating",
        "Common Names",
        "Scientific Name",
        "Summary",
        "Edibility Rating",
        "Image URLs",
    ],
    errors="ignore",
)

In [3]:
# Rename columns to one consistent, human-readable style (sentence case).
df = df.rename(
    {
        # columns whose original names are snake_case identifiers
        "use_keyword": "Use",
        "latin_name_search": "Scientific name",
        "common_name_search": "Common name",
        "edibility_rating_search": "Edibility rating",
        "medicinal_rating_search": "Medicinal rating",
        "plant_url": "Plant URL",
        # title-case headers lowered to sentence case
        "Care Requirements": "Care requirements",
        "Cultivation Details": "Cultivation details",
        "Edible Uses": "Edible uses",
        "Known Hazards": "Known hazards",
        "Medicinal Properties": "Medicinal properties",
        "Other Uses": "Other uses",
        "Other Uses Rating": "Other uses rating",
        "Special Uses": "Special uses",
        "Weed Potential": "Weed potential",
        "Propagation": "Propagation",  # identity mapping: the name is left as it is
        # headers given a more descriptive name
        "Family": "Plant family",
        "Native Range": "Native area",
        "Range": "Diffusion area",
        "USDA hardiness": "USDA hardiness zone",
    },
    axis="columns",
)

## Predictive model
Given a plant's record (name, family, uses, cultivation notes, ...), predict its medicinal properties.

In [None]:
def _split_properties(value):
    """Turn one "Medicinal properties" cell into a clean list of labels.

    A raw cell is a string of labels separated by ";". Each label is stripped of
    surrounding whitespace; empty labels and the literal text "nan" (compared
    after stripping, case-insensitively) are discarded.

    If the cell already holds a list (this notebook cell was executed before),
    its items are cleaned again rather than converting the list to a string,
    which would collapse it into a single bogus label such as "['a', 'b']".
    """
    parts = value if isinstance(value, (list, tuple)) else str(value).split(";")
    stripped = (str(part).strip() for part in parts)
    return [label for label in stripped if label and label.lower() != "nan"]


# drop rows where "Medicinal properties" has NaN
df = df.dropna(subset=["Medicinal properties"]).copy()

# split the text on ; and convert to a list (safe to run more than once)
df["Medicinal properties"] = df["Medicinal properties"].apply(_split_properties)

# Define the inputs (X) and the output (y)
feature_columns = [
    "Use",
    "Scientific name",
    "Common name",
    "Edibility rating",
    "Medicinal rating",
    "Plant URL",
    "Care requirements",
    "Cultivation details",
    "Edible uses",
    "Plant family",
    "Known hazards",
    "Native area",
    "Other uses",
    "Other uses rating",
    "Propagation",
    "Diffusion area",
    "Special uses",
    "USDA hardiness zone",
    "Weed potential",
]
X = df[feature_columns]
y = df["Medicinal properties"]  # one list of labels per plant

# Encode the label lists as a 0/1 matrix with one column per distinct label;
# the column order is available afterwards as mlb.classes_.
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(y)

# combine the fields into a single text column: missing values become empty
# strings, every value is cast to str, and each row is joined with spaces
# NOTE(review): the text includes columns such as "Medicinal rating" and
# "Plant URL" — confirm these are intended predictors.
combined_text = X.fillna("").astype(str).agg(" ".join, axis=1)

# train/test split — done on the raw text, BEFORE vectorizing, so that the held-out
# rows cannot influence the TF-IDF vocabulary or the IDF weights (no test-set leakage)
from sklearn.model_selection import train_test_split
text_train, text_test, y_train, y_test = train_test_split(
    combined_text, y_encoded, test_size=0.2, random_state=42
)

# vectorize the text using TfidfVectorizer: fit on the training rows only, then
# apply the same fitted vocabulary and weights to the held-out rows
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full matrix under its previous name, for any later cell that still refers to it.
# It mixes training and held-out rows, so do not use it for evaluation.
X_vectorized = vectorizer.transform(combined_text)

# Train the model: one random forest per label column, wrapped in a
# MultiOutputClassifier. class_weight="balanced" weights each class inversely
# to its frequency in the training data.
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

model = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
)
multi_model = MultiOutputClassifier(estimator=model)
multi_model.fit(X_train, y_train)

# Predict on the held-out rows and print per-label precision / recall / F1;
# zero_division=0 reports undefined metrics as 0.
from sklearn.metrics import classification_report
y_pred = multi_model.predict(X_test)
print(
    classification_report(
        y_test,
        y_pred,
        target_names=mlb.classes_,
        zero_division=0,
    )
)