In [None]:
"""
Machine Learning Pipeline for Chicken Shelf-Life Prediction
----------------------------------------------------------
This script:
1. Loads and preprocesses feature datasets.
2. Handles class imbalance using SMOTE.
3. Standardizes features.
4. Trains and tunes models using GridSearchCV.
5. Evaluates models on test and shelf-life datasets.
6. Saves trained models and evaluation results.
"""

# =============================
# Imports
# =============================
import os
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    precision_score, recall_score, f1_score, roc_curve, roc_auc_score
)
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE


# =============================
# Configurations
# =============================
# Excel workbook read by load_and_preprocess_data().
DATA_FILE = "data.xlsx"
# Excel workbook for the separate shelf-life evaluation set; in this cell it is
# only referenced by the commented-out load_shelf_life_data() helper.
SHELF_LIFE_FILE = "shelf_life_data_f.xlsx"
# Destination of the fitted StandardScaler (written with joblib.dump).
SCALER_FILE = "standard_scaler.joblib"
# CSV file that main() writes the per-model evaluation results to.
RESULTS_FILE = "model_evaluation_results_with_roc_conf_matrix.csv"

# Columns of DATA_FILE used as model inputs.
FEATURES = ["log-Hue", "Saturation", "b* lab"]
# Column of DATA_FILE holding the class label to predict.
TARGET = "Classification_label"


# =============================
# Data Loading & Preprocessing
# =============================
def load_and_preprocess_data():
    """Load the dataset, split it, balance the training set, and scale features.

    The train/test split is performed BEFORE oversampling. SMOTE creates
    synthetic samples by interpolating between existing ones, so resampling the
    full dataset first would leak information from test rows into the training
    set and would also put synthetic rows into the test set, inflating every
    reported metric. Here only the training portion is resampled; the test set
    contains real measurements only.

    Side effects:
        Writes the fitted scaler to ``SCALER_FILE`` and prints a confirmation.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)``
        where ``y_train`` is the SMOTE-resampled training target and
        ``scaler`` is the ``StandardScaler`` fitted on the resampled training
        features.
    """
    data = pd.read_excel(DATA_FILE)
    X = data[FEATURES]
    y = data[TARGET]

    # Stratify so that minority classes are represented in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # SMOTE needs at least k_neighbors + 1 samples in every class. After the
    # split the smallest class may hold fewer than the default 6, so cap
    # k_neighbors accordingly (never above the library default of 5).
    smallest_class = int(y_train.value_counts().min())
    smote = SMOTE(
        random_state=42,
        k_neighbors=max(1, min(5, smallest_class - 1)),
    )
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

    # Fit the scaler on training data only; reuse its statistics for the test set.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_balanced)
    X_test_scaled = scaler.transform(X_test)

    # Persist the scaler so the same transformation can be applied at inference.
    joblib.dump(scaler, SCALER_FILE)
    print(f"Scaler saved as {SCALER_FILE}")

    return X_train_scaled, X_test_scaled, y_train_balanced, y_test, scaler


# def load_shelf_life_data(scaler):
#     """Load and scale shelf-life dataset for separate evaluation."""
#     shelf_data = pd.read_excel(SHELF_LIFE_FILE)
#     X_shelf = shelf_data[FEATURES]
#     y_shelf = shelf_data["real_code"]
#     X_shelf_scaled = scaler.transform(X_shelf)
#     return X_shelf_scaled, y_shelf


# =============================
# Model Training & Evaluation
# =============================
def _plot_confusion_matrix(name, conf_matrix, labels):
    """Show an annotated heatmap of ``conf_matrix`` with ``labels`` on both axes."""
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False,
                xticklabels=labels, yticklabels=labels)
    plt.title(f"Confusion Matrix for {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()


def _plot_roc_curves(name, fitted_model, X_test, y_test):
    """Show one-vs-rest ROC curves for each class ``fitted_model`` was trained on."""
    y_true = np.asarray(y_test)
    y_score = fitted_model.predict_proba(X_test)

    plt.figure(figsize=(10, 7))
    # predict_proba columns follow fitted_model.classes_, so iterate over that
    # (not over the labels that happen to occur in y_test) to keep each curve
    # paired with the right probability column. This also works for two-class
    # problems, where label_binarize would return a single column.
    for column, class_label in enumerate(fitted_model.classes_):
        is_positive = y_true == class_label
        if is_positive.all() or not is_positive.any():
            # ROC/AUC are undefined when only one outcome is present.
            continue
        target = is_positive.astype(int)
        fpr, tpr, _ = roc_curve(target, y_score[:, column])
        auc_score = roc_auc_score(target, y_score[:, column])
        plt.plot(fpr, tpr, label=f"Class {class_label} (AUC = {auc_score:.4f})")

    plt.plot([0, 1], [0, 1], "k--")
    plt.title(f"ROC Curve per Class for {name}")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.show()


def evaluate_model(name, model, param_grid, X_train, y_train, X_test, y_test,
                   X_shelf=None, y_shelf=None):
    """Tune ``model`` with GridSearchCV, evaluate it, plot, save, and summarize.

    Args:
        name: Display name; also used to derive the saved model's file name.
        model: Unfitted estimator passed to ``GridSearchCV``.
        param_grid: Parameter grid searched with 5-fold CV on accuracy.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for the test metrics.
        X_shelf, y_shelf: Optional features and labels of a separate shelf-life
            set. When either is ``None`` the shelf-life evaluation is skipped
            and ``"Shelf Life Accuracy"`` is reported as ``None``.

    Side effects:
        Displays a confusion-matrix heatmap and, if the best estimator exposes
        ``predict_proba``, per-class ROC curves; prints progress messages; and
        writes the best estimator to ``<name>_model.joblib`` (name lower-cased,
        spaces replaced by underscores).

    Returns:
        dict: One row of results (best params, test metrics, 10-fold CV
        accuracy mean/std, shelf-life accuracy).
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring="accuracy",
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Predictions
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    # confusion_matrix orders rows/columns by the sorted union of true and
    # predicted labels; build the same list explicitly so the heatmap ticks
    # always match the matrix, even if a predicted class is absent from y_test.
    labels = np.union1d(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
    _plot_confusion_matrix(name, conf_matrix, labels)

    # Metrics
    precision_per_class = precision_score(y_test, y_pred, average=None).tolist()
    recall_per_class = recall_score(y_test, y_pred, average=None).tolist()
    f1_per_class = f1_score(y_test, y_pred, average=None).tolist()

    weighted_precision = precision_score(y_test, y_pred, average="weighted")
    weighted_recall = recall_score(y_test, y_pred, average="weighted")
    weighted_f1 = f1_score(y_test, y_pred, average="weighted")

    # Cross-validation
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=10, scoring="accuracy")
    mean_cv_accuracy, std_cv_accuracy = cv_scores.mean(), cv_scores.std()

    # Shelf-life evaluation (only when a shelf-life set was supplied)
    shelf_life_accuracy = None
    if X_shelf is not None and y_shelf is not None:
        y_shelf_pred = best_model.predict(X_shelf)
        shelf_life_accuracy = accuracy_score(y_shelf, y_shelf_pred)
        print(f"Shelf-life predictions for {name}: {y_shelf_pred}")

    # ROC Curve
    if hasattr(best_model, "predict_proba"):
        _plot_roc_curves(name, best_model, X_test, y_test)

    # Save model
    model_filename = f"{name.replace(' ', '_').lower()}_model.joblib"
    joblib.dump(best_model, model_filename)
    print(f"{name} saved as {model_filename}")

    return {
        "Model": name,
        "Best Params": grid_search.best_params_,
        "Test Accuracy": test_accuracy,
        "Confusion Matrix": conf_matrix.tolist(),
        "Precision (per class)": precision_per_class,
        "Recall (per class)": recall_per_class,
        "F1 Score (per class)": f1_per_class,
        "Weighted Precision": weighted_precision,
        "Weighted Recall": weighted_recall,
        "Weighted F1 Score": weighted_f1,
        "Mean CV Accuracy (10-fold)": mean_cv_accuracy,
        "Std CV Accuracy (10-fold)": std_cv_accuracy,
        "Shelf Life Accuracy": shelf_life_accuracy,
    }


# =============================
# Main Execution
# =============================
def _load_shelf_life_data(scaler):
    """Read ``SHELF_LIFE_FILE`` and return ``(scaled features, labels)``.

    The features are transformed with the already-fitted ``scaler`` so they are
    on the same scale as the training data.
    """
    shelf_data = pd.read_excel(SHELF_LIFE_FILE)
    X_shelf_scaled = scaler.transform(shelf_data[FEATURES])
    # NOTE(review): the label column name "real_code" comes from the loader that
    # was previously commented out in this file; confirm it matches the workbook.
    y_shelf = shelf_data["real_code"]
    return X_shelf_scaled, y_shelf


def main():
    """Run the classification pipeline and write the results to ``RESULTS_FILE``.

    Loads and preprocesses the training data, loads the shelf-life set, tunes
    and evaluates every configured classifier, and saves one CSV row per model.
    """
    # Load and preprocess
    X_train, X_test, y_train, y_test, scaler = load_and_preprocess_data()
    # evaluate_model() is called with a shelf-life set; previously these two
    # names were never assigned, so the call below raised NameError.
    X_shelf, y_shelf = _load_shelf_life_data(scaler)

    # Classifier configurations
    param_grids = {
        "Neural Network": {
            "model": MLPClassifier(max_iter=3000, random_state=42),
            "param_grid": {
                "hidden_layer_sizes": [(50, 50)],
                "activation": ["relu"],
                "solver": ["adam"],
                "alpha": [0.01],
                "learning_rate": ["constant"],
            },
        },
    }

    # Train and evaluate
    results = [
        evaluate_model(
            name, config["model"], config["param_grid"],
            X_train, y_train, X_test, y_test, X_shelf, y_shelf
        )
        for name, config in param_grids.items()
    ]

    # Save results
    results_df = pd.DataFrame(results)
    results_df.to_csv(RESULTS_FILE, index=False)
    print(f"Results saved to {RESULTS_FILE}")


if __name__ == "__main__":
    main()


In [None]:
# Export the trained model and the scaler to joblib files in the working directory.
# NOTE(review): `best_model` and `scaler` are read as notebook globals here, but
# in the visible cells they only exist as locals inside evaluate_model() and
# main(). This cell presumably relies on state left by an earlier notebook
# session — confirm before running it in a fresh kernel.
import joblib
joblib.dump(best_model, 'trained_model.joblib')
print("Model saved as 'trained_model.joblib'")

# Export the scaler to a joblib file
joblib.dump(scaler, 'scaler.joblib')
print("Scaler saved as 'scaler.joblib'")

Model saved as 'trained_model.joblib'
Scaler saved as 'scaler.joblib'


In [None]:
# regression model

In [None]:
"""
Regression Pipeline for Shelf-Life Prediction
--------------------------------------------
This script:
1. Loads and preprocesses data.
2. Monitors CPU usage to prevent overload.
3. Trains and tunes regression models using GridSearchCV.
4. Evaluates models with RMSEP, RMSECV, R², and shelf-life dataset.
5. Saves results to CSV for comparison.
"""

# =============================
# Imports
# =============================
import os
import time
import psutil
import threading
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score


# =============================
# Configurations
# =============================
# Excel workbook read by load_and_preprocess_data().
DATA_FILE = "data.xlsx"
# Only referenced by the commented-out load_shelf_life_data() helper.
# NOTE(review): this is the same file as DATA_FILE — confirm that is intended.
SHELF_LIFE_FILE = "data.xlsx"
# CSV file that main() writes the per-model regression results to.
RESULTS_FILE = "regression_model_results_with_tuning.csv"

# Columns of DATA_FILE used as model inputs.
FEATURES = ["log-Hue", "Saturation", "b* lab"]
# Column of DATA_FILE holding the regression target.
TARGET = "mg N/100g"

# CPU usage (percent, as reported by psutil.cpu_percent) above which
# monitor_cpu() aborts.
CPU_THRESHOLD = 85   # % usage limit
# Pause between consecutive CPU checks in monitor_cpu().
CHECK_INTERVAL = 5   # seconds


# =============================
# Data Handling
# =============================
def load_and_preprocess_data():
    """Read ``DATA_FILE``, hold out 20% for testing, and standardize the features.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)``;
        the scaler is fitted on the training features only.
    """
    frame = pd.read_excel(DATA_FILE)
    features, target = frame[FEATURES], frame[TARGET]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Learn mean/variance from the training split, then apply to both splits.
    scaler = StandardScaler().fit(X_train)
    scaled_train = scaler.transform(X_train)
    scaled_test = scaler.transform(X_test)

    return scaled_train, scaled_test, y_train, y_test, scaler


# def load_shelf_life_data(scaler):
#     """Load and scale shelf-life dataset for evaluation."""
#     shelf_data = pd.read_excel(SHELF_LIFE_FILE)
#     X_shelf = shelf_data[FEATURES]
#     y_shelf = shelf_data["Experimental (N mg/100g)"]
#     X_shelf_scaled = scaler.transform(X_shelf)
#     return X_shelf_scaled, y_shelf


# =============================
# CPU Monitoring
# =============================
def monitor_cpu():
    """Poll CPU usage and abort the program once it exceeds ``CPU_THRESHOLD``.

    Each iteration samples ``psutil.cpu_percent`` over one second and then
    sleeps ``CHECK_INTERVAL`` seconds.

    When the threshold is exceeded:
      * in the main thread, ``SystemExit`` is raised (as before);
      * in a background thread (how ``start_cpu_monitor`` runs it), the main
        thread is interrupted instead. Raising ``SystemExit`` inside a worker
        thread only ends that thread, so the pipeline previously kept running
        with monitoring silently switched off.

    Raises:
        SystemExit: If called from the main thread and the threshold is exceeded.
    """
    import _thread  # local import: only needed to signal the main thread

    while True:
        cpu_usage = psutil.cpu_percent(interval=1)
        if cpu_usage > CPU_THRESHOLD:
            print(f"CPU usage {cpu_usage}% exceeded threshold {CPU_THRESHOLD}%")
            if threading.current_thread() is threading.main_thread():
                raise SystemExit("Terminated due to high CPU usage.")
            print("Terminated due to high CPU usage.")
            # Simulates a KeyboardInterrupt arriving in the main thread.
            _thread.interrupt_main()
            return
        time.sleep(CHECK_INTERVAL)


def start_cpu_monitor():
    """Launch ``monitor_cpu`` on a daemon thread and return immediately."""
    # daemon=True: the monitor thread never keeps the interpreter alive.
    threading.Thread(target=monitor_cpu, daemon=True).start()


# =============================
# Model Training & Evaluation
# =============================
def evaluate_regressor(name, model, param_grid, X_train, y_train, X_test, y_test,
                       X_shelf=None, y_shelf=None):
    """Tune ``model`` with GridSearchCV and report test and cross-validated metrics.

    Args:
        name: Display name of the model.
        model: Unfitted regressor passed to ``GridSearchCV``.
        param_grid: Parameter grid searched with 5-fold CV on R².
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target used for RMSEP and test R².
        X_shelf, y_shelf: Optional features and target of a separate shelf-life
            set. When both are given, shelf-life R² and RMSEP are added to the
            result; otherwise the shelf-life evaluation is skipped.

    Returns:
        dict: One row of results. ``"RMSECV"`` is the square root of the mean
        per-fold MSE over 10 folds, and ``"RMSECV Std Dev"`` is the standard
        deviation of the per-fold RMSE values, so both are in the target's units.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=2  # Protect CPU
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"Best parameters for {name}: {best_params}")

    # Predictions
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmsep = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # RMSECV (cross-validation). The scorer returns negated MSE per fold.
    fold_mse = -cross_val_score(best_model, X_train, y_train, cv=10, scoring="neg_mean_squared_error")
    rmsecv = np.sqrt(fold_mse.mean())
    # Spread of the per-fold RMSE. Taking the std of the MSE values would give
    # squared units and could not be compared with RMSECV.
    rmsecv_std = np.sqrt(fold_mse).std()

    # Cross-validated R²
    r2_cv_scores = cross_val_score(best_model, X_train, y_train, cv=10, scoring="r2")
    r2_cv, r2_cv_std = r2_cv_scores.mean(), r2_cv_scores.std()

    result = {
        "Features": FEATURES,
        "Model": name,
        "Best Params": best_params,
        "RMSEP": rmsep,
        "RMSECV": rmsecv,
        "RMSECV Std Dev": rmsecv_std,
        "Test R²": r2,
        "Cross-validated R²": r2_cv,
        "R² CV Std Dev": r2_cv_std,
    }

    # Shelf-life dataset evaluation (only when a shelf-life set was supplied)
    if X_shelf is not None and y_shelf is not None:
        y_shelf_pred = best_model.predict(X_shelf)
        shelf_r2 = r2_score(y_shelf, y_shelf_pred)
        rmsep_shelf = np.sqrt(mean_squared_error(y_shelf, y_shelf_pred))
        print(f"Shelf-life predictions for {name}: {y_shelf_pred}")
        result["Shelf Life R²"] = shelf_r2
        result["Shelf RMSEP"] = rmsep_shelf

    return result


# =============================
# Main Execution
# =============================
def main():
    """Run the regression pipeline and write the results to ``RESULTS_FILE``.

    Starts the CPU monitor, loads and scales the data, tunes and evaluates
    every regressor that has a parameter grid, and saves one CSV row per model.
    """
    # Start CPU monitoring
    start_cpu_monitor()

    # Load datasets
    X_train, X_test, y_train, y_test, scaler = load_and_preprocess_data()

    # No shelf-life set is loaded in this pipeline (its loader is commented
    # out), so pass explicit placeholders instead of undefined names.
    X_shelf, y_shelf = None, None

    # Parameter grids
    param_grids = {
        "Neural Network": {
            "hidden_layer_sizes": [(50, 50)],
            "activation": ["relu"],
            "solver": ["adam"],
            "alpha": [0.01],
            "learning_rate": ["constant"],
            "max_iter": [3000],
        },
    }

    regressors = {
        "Neural Network": MLPRegressor(random_state=42, max_iter=3000),
    }

    results = []

    # Train & evaluate (regressors without a parameter grid are skipped)
    for name, model in regressors.items():
        if name not in param_grids:
            continue
        print(f"\n=== Training {name} ===")
        result = evaluate_regressor(
            name, model, param_grids[name],
            X_train, y_train, X_test, y_test,
            X_shelf, y_shelf
        )
        results.append(result)

    # Save results
    results_df = pd.DataFrame(results)
    results_df.to_csv(RESULTS_FILE, index=False)
    print(f"\nResults saved to {RESULTS_FILE}")


if __name__ == "__main__":
    main()
