In [2]:
from datasets import load_dataset
from datasets import DatasetDict
import numpy as np
import pandas as pd
from classes import *
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Docstrings generated from Anysphere. (2025). Cursor [Large language model]. https://cursor.com/en

In [3]:
# --------- CONFIGURATION ---------
# Dataset identifier and configuration name; both are passed to load_dataset()
# inside load_and_prepare_dataset().
DATASET_NAME = 'go_emotions'
DATASET_CONFIG = 'simplified'
# Shared text preprocessor used for training data (main) and single predictions
# (predict_rf). TextPreprocessor is presumably provided by `from classes import *`.
# NOTE(review): 'name' looks like it is added as an extra stopword to drop name
# placeholders from the corpus — confirm against TextPreprocessor.
PREPROCESSOR = TextPreprocessor(extra_stopwords={'name'})

In [4]:
def load_and_prepare_dataset():
    """Load the configured dataset and strip the 'neutral' class from every split.

    The dataset named by DATASET_NAME / DATASET_CONFIG is loaded, the id of the
    'neutral' class is removed from each example's label list, and examples
    left without any label are dropped. The same steps are applied to every
    split present in the loaded dataset.

    Remaining label ids are re-indexed so that id ``i`` always refers to
    ``clean_labels[i]``: ids greater than the neutral id are shifted down by
    one. When 'neutral' is the last class this shift changes nothing.
    Downstream code (MultiLabelBinarizer with ``range(len(clean_labels))`` and
    ``clean_labels[i]`` look-ups) relies on this alignment.

    Returns:
        tuple: (clean_dict, clean_labels)
            - clean_dict: DatasetDict whose 'labels' column holds ids that
              index into clean_labels (not into the original label names)
            - clean_labels: List of capitalized label names excluding 'neutral'

    Raises:
        ValueError: If the dataset's label names do not contain 'neutral'
            (raised by ``list.index``).
    """
    ds = load_dataset(DATASET_NAME, DATASET_CONFIG)
    labels = ds["train"].features["labels"].feature.names
    neutral_idx = labels.index("neutral")

    def remove_neutral(example):
        # Drop the neutral id, then close the gap it leaves so that the
        # surviving ids are contiguous and line up with clean_labels.
        example["labels"] = [
            l - 1 if l > neutral_idx else l
            for l in example["labels"]
            if l != neutral_idx
        ]
        return example

    def keep_if_not_empty(example):
        # Examples that were tagged only as neutral now have no labels at all.
        return len(example["labels"]) > 0

    clean_dict = DatasetDict()
    for split in ds:
        clean_dict[split] = ds[split].map(remove_neutral).filter(keep_if_not_empty)

    clean_labels = [l.capitalize() for l in labels if l != "neutral"]
    return clean_dict, clean_labels

In [33]:
# Notebook-level copy of the cleaned splits and label names. predict_rf() reads
# `clean_labels` from this namespace; main() calls load_and_prepare_dataset()
# again for its own local copy.
# NOTE(review): this cell's execution count (33) is out of sequence with the
# surrounding cells — re-run the notebook top to bottom before sharing.
clean_dict, clean_labels = load_and_prepare_dataset()

In [5]:
def preprocess_texts(dataset, preprocessor):
    """Run the preprocessor over every text and drop rows that end up blank.

    Each entry of ``dataset["text"]`` is passed through
    ``preprocessor.preprocess``. A row is kept only when the processed string
    still contains non-whitespace characters; its labels are kept with it, so
    the two returned lists stay aligned index by index.

    Args:
        dataset: Mapping-like object exposing 'text' and 'labels' columns of
            equal length (e.g. a HuggingFace dataset split)
        preprocessor: Object with a ``preprocess(str) -> str`` method

    Returns:
        tuple: (X_clean, y_clean)
            - X_clean: List of processed, non-blank text strings
            - y_clean: List of the label entries belonging to those texts
    """
    # Lazily pair each processed text with its labels, in dataset order.
    processed_rows = (
        (preprocessor.preprocess(raw_text), row_labels)
        for raw_text, row_labels in zip(dataset["text"], dataset["labels"])
    )
    # Whitespace-only results carry no features, so those rows are discarded.
    surviving = [(doc, row_labels) for doc, row_labels in processed_rows if doc.strip()]

    X_clean = [doc for doc, _ in surviving]
    y_clean = [row_labels for _, row_labels in surviving]
    return X_clean, y_clean

In [6]:
# --------- THRESHOLD TUNING ---------
# Code modified from OpenAI. (2025). ChatGPT [Large language model]. https://chat.openai.com/chat
def tune_thresholds(probs, targets, low=0.1, high=0.9, steps=81):
    """Pick, per class, the probability cut-off that gives the highest F1.

    For every class column an evenly spaced grid of cut-offs between ``low``
    and ``high`` is scored with the binary F1 of ``probs >= cutoff`` against
    the targets. The lowest cut-off that attains the best F1 is selected. A
    class keeps the fallback value 0.5 when no cut-off achieves an F1 above
    zero.

    Args:
        probs (numpy.ndarray): Predicted probabilities, shape (N, C)
        targets (numpy.ndarray): Binary ground-truth indicators, shape (N, C)
        low (float): Smallest cut-off tried (default: 0.1)
        high (float): Largest cut-off tried (default: 0.9)
        steps (int): Number of grid points between low and high, inclusive
                    (default: 81, i.e. a spacing of 0.01)

    Returns:
        numpy.ndarray: Selected cut-off for each class, shape (C,)
    """
    n_classes = probs.shape[1]
    candidates = np.linspace(low, high, steps)
    best_thresholds = np.full(n_classes, 0.5)

    for col in range(n_classes):
        truth = targets[:, col]
        scores = probs[:, col]
        f1_by_cutoff = [
            f1_score(truth, (scores >= cutoff).astype(int), zero_division=0)
            for cutoff in candidates
        ]
        if not f1_by_cutoff:
            # Empty grid: nothing to compare, keep the 0.5 fallback.
            continue
        # argmax returns the first maximum, i.e. the lowest cut-off among ties.
        winner = int(np.argmax(f1_by_cutoff))
        if f1_by_cutoff[winner] > 0.0:
            best_thresholds[col] = candidates[winner]

    return best_thresholds

In [9]:
# --------- MAIN PIPELINE ---------
def main():
    """Train, tune and evaluate the One-vs-Rest Random Forest emotion classifier.

    Steps, in order:
    1. Load the dataset with the neutral class removed
    2. Preprocess the train and validation texts with the global PREPROCESSOR
    3. Multi-hot encode the label lists with MultiLabelBinarizer
    4. Build a TF-IDF -> OneVsRest(RandomForest) pipeline
    5. Search hyperparameters with RandomizedSearchCV (20 draws, 3-fold CV,
       micro-F1 scoring)
    6. Tune one decision threshold per class on the validation split
    7. Report per-label F1 on that same validation split

    Side effects:
        - Writes the best fitted pipeline to 'rf_pipeline.pkl'
        - Writes per-label F1 scores to 'rf_metrics.csv'
        - Prints the per-label F1 table

    Configuration:
        Reads the module-level PREPROCESSOR directly; DATASET_NAME and
        DATASET_CONFIG are used through load_and_prepare_dataset().

    Returns:
        tuple: (rf, best_thresholds, mlb)
            - rf: Best fitted pipeline found by the search
            - best_thresholds: Per-class decision thresholds, shape (C,)
            - mlb: MultiLabelBinarizer fitted on the training labels
    """
    # Data: cleaned splits plus the label names that index the binarized columns
    splits, clean_labels = load_and_prepare_dataset()
    X_train, y_train = preprocess_texts(splits["train"], PREPROCESSOR)
    X_val, y_val = preprocess_texts(splits["validation"], PREPROCESSOR)

    # Fixing the class list keeps one column per label even if a label is
    # absent from a split.
    mlb = MultiLabelBinarizer(classes=list(range(len(clean_labels))))
    y_train_bin = mlb.fit_transform(y_train)
    y_val_bin = mlb.transform(y_val)

    # Model: one balanced Random Forest per label on top of TF-IDF features
    forest = RandomForestClassifier(class_weight="balanced", random_state=42)
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=5000)),
        ("clf", OneVsRestClassifier(forest)),
    ])

    # Search space for the per-label forests
    param_dist = {
        "clf__estimator__n_estimators": [100, 200, 300],
        "clf__estimator__max_depth": [10, 20, None],
        "clf__estimator__min_samples_split": [2, 5],
        "clf__estimator__min_samples_leaf": [1, 2],
        "clf__estimator__max_features": ["sqrt"],
    }
    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_dist,
        n_iter=20,
        scoring="f1_micro",
        cv=3,
        n_jobs=-1,
        verbose=2,
        random_state=42,
    )
    search.fit(X_train, y_train_bin)

    rf = search.best_estimator_
    joblib.dump(rf, "rf_pipeline.pkl")

    # Validation probabilities; a per-label list of (n, 2) arrays is collapsed
    # to an (n, C) matrix of positive-class probabilities.
    probs_val = rf.predict_proba(X_val)
    if isinstance(probs_val, list):
        probs_val = np.vstack([per_label[:, 1] for per_label in probs_val]).T

    # Thresholds are tuned and scored on the same validation split.
    best_thresholds = tune_thresholds(probs_val, y_val_bin)
    preds_val = (probs_val >= best_thresholds[None, :]).astype(int)

    # Per-label F1 report
    per_label_f1 = f1_score(y_val_bin, preds_val, average=None, zero_division=0)
    f1_df = pd.DataFrame({"Label": clean_labels, "F1-score": per_label_f1.round(2)})
    print(f1_df)
    f1_df.to_csv("rf_metrics.csv", index=False)

    return rf, best_thresholds, mlb

In [None]:
# Run the full pipeline and keep the fitted pipeline, the per-class thresholds
# and the label binarizer in the notebook namespace; predict_rf() reads all
# three as globals.
if __name__ == '__main__':
    rf, best_thresholds, mlb = main()

In [None]:
# NOTE(review): joblib is already imported in the first cell; this repeat is redundant.
import joblib

# Load the trained pipeline
# Reads the file written by main() and rebinds `rf`. `best_thresholds` and `mlb`
# are not restored here, so predict_rf() still needs main() to have been run in
# this kernel.
# joblib.load unpickles the file — only load artifacts from a trusted source.
rf = joblib.load("rf_pipeline.pkl")

# Demonstration

In [None]:
def predict_rf(text):
    """Predict emotion labels for a single text with the trained pipeline.

    Relies on notebook-level state: ``PREPROCESSOR``, ``rf``,
    ``best_thresholds``, ``mlb`` and ``clean_labels`` must already exist in
    the kernel. Prints the input text and the predicted labels.

    Args:
        text: A string, or a list of strings of which only the first element
            is scored. An empty list is treated as empty text.

    Returns:
        list: Predicted label names. Empty when no class reaches its
            threshold, or when nothing is left of the text after
            preprocessing.
    """
    if isinstance(text, list):
        # Only the first element is scored; guard against an empty list
        # instead of failing with IndexError.
        text = text[0] if text else ""

    # Apply the same preprocessing the training texts went through
    processed_text = PREPROCESSOR.preprocess(text)

    if not processed_text.strip():
        # preprocess_texts() drops texts that are blank after preprocessing,
        # so the model never saw such input; report no labels rather than
        # scoring an empty document.
        pred_labels = []
    else:
        # Pipeline handles TF-IDF + RF internally
        probs = rf.predict_proba([processed_text])
        if isinstance(probs, list):
            # Defensive: if probabilities arrive as one (n, 2) array per label,
            # keep the positive-class column and stack to shape (n, C).
            probs = np.vstack([p[:, 1] for p in probs]).T

        preds = (probs >= best_thresholds[None, :]).astype(int)

        # Map the active columns back to label ids, then to label names
        pred_indices = mlb.inverse_transform(preds)[0]
        pred_labels = [clean_labels[i] for i in pred_indices]

    print(f"Text: {text}")
    print(f"Labels: {pred_labels}")

    return pred_labels

In [None]:
# Interactive demo: read one line from the user and show its predicted labels
# as the cell output.
text = input('Enter:')

predict_rf(text)