In [30]:
import os
import glob

# Directory path
report_dir = "./AllSmaples-Report/"

# Get all CSV files that do NOT contain D1, D2, D3, D4, or D5
all_csv_files = glob.glob(os.path.join(report_dir, "*.csv"))
fruit_mapping = {}

for file_path in all_csv_files:
    filename = os.path.basename(file_path)
    # Check if filename does NOT contain D1, D2, D3, D4, or D5
    if not any(f"D{i}" in filename for i in range(1, 6)):
        # Remove .csv extension and use as key, filename as value
        fruit_name = filename.replace(".csv", "")
        fruit_mapping[fruit_name] = filename

# Display the mapping
print("Fruit Mapping for files without D1-D5:")
print("=" * 50)
for fruit_name, filename in sorted(fruit_mapping.items()):
    print(f"{fruit_name}: {filename}")

print(f"\nTotal files mapped: {len(fruit_mapping)}")

Fruit Mapping for files without D1-D5:
AppleBanana: AppleBanana.csv
AppleBananaMandarin: AppleBananaMandarin.csv
AppleBananaTomato: AppleBananaTomato.csv
AppleMandarin: AppleMandarin.csv
AppleTomato: AppleTomato.csv
BananaMandarin: BananaMandarin.csv
Mandarin: Mandarin.csv
TomatoBanana: TomatoBanana.csv
TomatoMandarin: TomatoMandarin.csv

Total files mapped: 9


In [31]:
import pandas as pd
import os

# Load all mapped CSV files and add label column
dataframes = []

for fruit_name, filename in fruit_mapping.items():
    file_path = os.path.join(report_dir, filename)
    print(f"Loading {filename}...")
    
    # Read CSV file
    df = pd.read_csv(file_path)
    
    # Add label column with the mapped fruit name
    df['label'] = fruit_name
    
    # Append to list
    dataframes.append(df)
    
    print(f"  - Loaded {len(df)} samples")

# Combine all dataframes into one full dataframe
full_dataframe = pd.concat(dataframes, ignore_index=True)

print("\n" + "=" * 50)
print(f"Full DataFrame created:")
print(f"  - Total samples: {len(full_dataframe)}")
print(f"  - Total columns: {len(full_dataframe.columns)}")
print(f"  - Columns: {list(full_dataframe.columns)}")
print(f"\nLabel distribution:")
print(full_dataframe['label'].value_counts().sort_index())

Loading Mandarin.csv...
  - Loaded 14963 samples
Loading AppleBananaTomato.csv...
  - Loaded 17651 samples
Loading AppleMandarin.csv...
  - Loaded 20911 samples
Loading BananaMandarin.csv...
  - Loaded 21499 samples
Loading TomatoMandarin.csv...
  - Loaded 21541 samples
Loading TomatoBanana.csv...
  - Loaded 23158 samples
Loading AppleBanana.csv...
  - Loaded 21501 samples
Loading AppleBananaMandarin.csv...
  - Loaded 18952 samples
Loading AppleTomato.csv...
  - Loaded 21695 samples

Full DataFrame created:
  - Total samples: 181871
  - Total columns: 11
  - Columns: ['Ticks', 'MQ2', 'MQ3', 'MQ4', 'MQ5', 'MQ6', 'MQ7', 'MQ8', 'MQ9', 'MQ135', 'label']

Label distribution:
label
AppleBanana            21501
AppleBananaMandarin    18952
AppleBananaTomato      17651
AppleMandarin          20911
AppleTomato            21695
BananaMandarin         21499
Mandarin               14963
TomatoBanana           23158
TomatoMandarin         21541
Name: count, dtype: int64


In [32]:
# Display preview of the full dataframe
print("Preview of Full DataFrame:")
print("=" * 50)
print(full_dataframe.head(10))
print(f"\nDataFrame Info:")
print(full_dataframe.info())

Preview of Full DataFrame:
           Ticks  MQ2  MQ3  MQ4  MQ5  MQ6  MQ7  MQ8  MQ9  MQ135     label
0  1650986958099   42  180   64   27   63   22   93   46    786  Mandarin
1  1650986958113   44  167   65   25   59   30   82   54    764  Mandarin
2  1650986958121   41  166   63   18   60   31   82   52    769  Mandarin
3  1650986958129   45  170   66   25   59   25   80   54    766  Mandarin
4  1650986958136   45  166   64   21   61   28   85   58    769  Mandarin
5  1650986958144   40  165   63   19   59   30   85   53    766  Mandarin
6  1650986958152   41  169   65   30   59   26   85   54    767  Mandarin
7  1650986958161   41  170   65   25   61   32   84   54    767  Mandarin
8  1650986958170   47  175   56   34   71   24   78   66    762  Mandarin
9  1650986958180   38  187   55   38   47   43   92   47    777  Mandarin

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181871 entries, 0 to 181870
Data columns (total 11 columns):
 #   Column  Non-Null Count   D

In [6]:
import numpy as np

# Randomly shuffle all rows in the full dataframe
# Using sample(frac=1.0) to randomly reorder all rows without changing the samples themselves
# reset_index(drop=True) to reset the index after shuffling
full_dataframe = full_dataframe.sample(frac=1.0, random_state=None).reset_index(drop=True)

print("DataFrame rows have been randomly shuffled:")
print("=" * 50)
print(f"Total samples: {len(full_dataframe)}")
print(f"\nFirst 10 rows after shuffling:")
print(full_dataframe.head(10))
print(f"\nLabel distribution (unchanged - only order changed):")
print(full_dataframe['label'].value_counts().sort_index())

DataFrame rows have been randomly shuffled:
Total samples: 181871

First 10 rows after shuffling:
           Ticks  MQ2  MQ3  MQ4  MQ5  MQ6  MQ7  MQ8  MQ9  MQ135  \
0  1650984907823   75  239   79   82  107   58  196   81   1262   
1  1650986236703   55  349   75   83   95   62  222   83   1166   
2  1650985588586   61  200   67   58   96   53  156   71   1102   
3  1650985406803   63  122   80   40   96   54  117   74    942   
4  1650984824843   60  241   75   78  106   53  186   75   1243   
5  1650987052423   48  290   71   69  105   66  215   73   1265   
6  1650985280734   61  116   71   37   88   49  117   74    903   
7  1650986148978   48  368   73   90   98   63  249   76   1109   
8  1650984318675   82  308   87  114  127   60  281   94   1480   
9  1650984917190   68  225   83   74  106   55  179   75   1254   

               label  
0       TomatoBanana  
1      AppleMandarin  
2  AppleBananaTomato  
3        AppleTomato  
4       TomatoBanana  
5           Mandarin  
6  

In [33]:
full_dataframe.head()
# Create a mapping from fruit names to multiclass integer labels
label_map = {name: idx for idx, name in enumerate(sorted(full_dataframe['label'].unique()))}
full_dataframe['label'] = full_dataframe['label'].map(label_map)

# Display the first few rows of the updated dataframe
print("\nFirst few rows of the updated dataframe:")
print(full_dataframe.head())
# Create a mapping from fruit names to multiclass integer labels
label_map = {name: idx for idx, name in enumerate(sorted(full_dataframe['label'].unique()))}


First few rows of the updated dataframe:
           Ticks  MQ2  MQ3  MQ4  MQ5  MQ6  MQ7  MQ8  MQ9  MQ135  label
0  1650986958099   42  180   64   27   63   22   93   46    786      6
1  1650986958113   44  167   65   25   59   30   82   54    764      6
2  1650986958121   41  166   63   18   60   31   82   52    769      6
3  1650986958129   45  170   66   25   59   25   80   54    766      6
4  1650986958136   45  166   64   21   61   28   85   58    769      6


In [34]:
# ============================================================================
# COMPLETE PIPELINE: Feature Preprocessing, Multiple Algorithms, CV, Test Set
# ============================================================================

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Create reverse label mapping for class names
# Recover fruit names from fruit_mapping (the original names)
try:
    # Try to get original fruit names from fruit_mapping
    fruit_names_sorted = sorted(fruit_mapping.keys())
    # Create mapping: integer label -> fruit name
    # We need to match the order used when labels were encoded
    unique_labels = sorted(full_dataframe['label'].unique())
    if len(unique_labels) == len(fruit_names_sorted):
        label_to_name = {int(unique_labels[i]): fruit_names_sorted[i] for i in range(len(unique_labels))}
    else:
        # Fallback: create from label_map if it exists
        if 'label_map' in locals():
            label_to_name = {v: k for k, v in label_map.items()}
        else:
            label_to_name = {i: f'Class_{i}' for i in unique_labels}
except:
    # Fallback: use generic class names
    unique_labels = sorted(full_dataframe['label'].unique())
    label_to_name = {i: f'Class_{i}' for i in unique_labels}

# Prepare features and labels
# Exclude 'Ticks' if it's a timestamp and not useful for classification
feature_columns = ['MQ2', 'MQ3', 'MQ4', 'MQ5', 'MQ6', 'MQ7', 'MQ8', 'MQ9', 'MQ135']
X = full_dataframe[feature_columns].values
y = full_dataframe['label'].values

# Split into train+validation and test sets (80-20 split)
# Stratified split to maintain class distribution
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Data Split:")
print("=" * 60)
print(f"Full dataset: {len(X)} samples")
print(f"Train+Validation: {len(X_temp)} samples ({len(X_temp)/len(X)*100:.1f}%)")
print(f"Test set (held out): {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")
print(f"\nNumber of features: {X.shape[1]}")
print(f"Number of classes: {len(np.unique(y))}")
print(f"\nClass distribution in train+validation:")
unique, counts = np.unique(y_temp, return_counts=True)
for u, c in zip(unique, counts):
    print(f"  Class {u}: {c} samples ({c/len(y_temp)*100:.2f}%)")

Data Split:
Full dataset: 181871 samples
Train+Validation: 145496 samples (80.0%)
Test set (held out): 36375 samples (20.0%)

Number of features: 9
Number of classes: 9

Class distribution in train+validation:
  Class 0: 17201 samples (11.82%)
  Class 1: 15161 samples (10.42%)
  Class 2: 14121 samples (9.71%)
  Class 3: 16729 samples (11.50%)
  Class 4: 17356 samples (11.93%)
  Class 5: 17199 samples (11.82%)
  Class 6: 11970 samples (8.23%)
  Class 7: 18526 samples (12.73%)
  Class 8: 17233 samples (11.84%)


In [23]:
# ============================================================================
# Define Models with Preprocessing Pipelines
# ============================================================================

# StandardScaler for feature normalization (important for SVM, ANN, KNN)
scaler = StandardScaler()

# Define all models with preprocessing
models = {
    'ANN (MLP)': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', MLPClassifier(
            hidden_layer_sizes=(100, 50),
            max_iter=500,
            random_state=42,
            early_stopping=True,
            validation_fraction=0.1
        ))
    ]),
    'KNN': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', KNeighborsClassifier(n_neighbors=5))
    ]),
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42))
    ]),
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000, random_state=42))
    ]),
    'XGBoost': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', XGBClassifier(
            random_state=42,
            eval_metric='mlogloss',
            use_label_encoder=False
        ))
    ])
}

print("Models defined with preprocessing pipelines:")
print("=" * 60)
for name in models.keys():
    print(f"  ✓ {name}")

Models defined with preprocessing pipelines:
  ✓ ANN (MLP)
  ✓ KNN
  ✓ SVM
  ✓ Logistic Regression
  ✓ XGBoost


In [24]:
# ============================================================================
# K-Fold Cross-Validation (5-Fold Stratified)
# ============================================================================

# Setup K-Fold Cross-Validation (5 folds, stratified)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store CV results
cv_results = {}

print("Performing 5-Fold Stratified Cross-Validation...")
print("=" * 60)

for name, model in models.items():
    print(f"\n{name}:")
    print("-" * 40)
    
    # Perform cross-validation with accuracy scoring
    cv_scores = cross_val_score(model, X_temp, y_temp, cv=kfold, scoring='accuracy', n_jobs=-1)
    
    # Also get F1 scores (macro-averaged)
    cv_f1_scores = cross_val_score(model, X_temp, y_temp, cv=kfold, scoring='f1_macro', n_jobs=-1)
    
    cv_results[name] = {
        'accuracy_mean': cv_scores.mean(),
        'accuracy_std': cv_scores.std(),
        'accuracy_scores': cv_scores,
        'f1_mean': cv_f1_scores.mean(),
        'f1_std': cv_f1_scores.std(),
        'f1_scores': cv_f1_scores
    }
    
    print(f"  Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"  F1-Score: {cv_f1_scores.mean():.4f} (+/- {cv_f1_scores.std() * 2:.4f})")
    print(f"  Fold accuracies: {[f'{s:.4f}' for s in cv_scores]}")

print("\n" + "=" * 60)
print("Cross-Validation Summary:")
print("=" * 60)
print(f"{'Model':<25} {'CV Accuracy':<20} {'CV F1-Score':<20}")
print("-" * 60)
for name in sorted(cv_results.keys(), key=lambda x: cv_results[x]['accuracy_mean'], reverse=True):
    acc = cv_results[name]['accuracy_mean']
    f1 = cv_results[name]['f1_mean']
    print(f"{name:<25} {acc:.4f} ± {cv_results[name]['accuracy_std']:.4f}   {f1:.4f} ± {cv_results[name]['f1_std']:.4f}")

1400.01s - Error patching args (debugger not attached to subprocess).
Traceback (most recent call last):
  File "/home/abel/Project/2026/EduSaf-Feb2026/DL_Course/MQ-ML/.venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/_pydev_bundle/pydev_monkey.py", line 541, in patch_args
    new_args.append(_get_python_c_args(host, port, code, unquoted_args, SetupHolder.setup))
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/abel/Project/2026/EduSaf-Feb2026/DL_Course/MQ-ML/.venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/_pydev_bundle/pydev_monkey.py", line 193, in _get_python_c_args
    if "__future__" in code:
       ^^^^^^^^^^^^^^^^^^^^
TypeError: a bytes-like object is required, not 'str'


Performing 5-Fold Stratified Cross-Validation...

ANN (MLP):
----------------------------------------


0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to di

  Accuracy: 0.9979 (+/- 0.0010)
  F1-Score: 0.9979 (+/- 0.0010)
  Fold accuracies: ['0.9973', '0.9980', '0.9982', '0.9975', '0.9987']

KNN:
----------------------------------------


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


  Accuracy: 0.9950 (+/- 0.0005)
  F1-Score: 0.9948 (+/- 0.0005)
  Fold accuracies: ['0.9947', '0.9950', '0.9952', '0.9948', '0.9953']

SVM:
----------------------------------------
  Accuracy: 0.9968 (+/- 0.0006)
  F1-Score: 0.9967 (+/- 0.0006)
  Fold accuracies: ['0.9965', '0.9967', '0.9970', '0.9965', '0.9974']

Logistic Regression:
----------------------------------------
  Accuracy: 0.9538 (+/- 0.0031)
  F1-Score: 0.9500 (+/- 0.0033)
  Fold accuracies: ['0.9546', '0.9519', '0.9535', '0.9526', '0.9563']

XGBoost:
----------------------------------------
  Accuracy: 0.9982 (+/- 0.0002)
  F1-Score: 0.9981 (+/- 0.0002)
  Fold accuracies: ['0.9984', '0.9981', '0.9981', '0.9980', '0.9982']

Cross-Validation Summary:
Model                     CV Accuracy          CV F1-Score         
------------------------------------------------------------
XGBoost                   0.9982 ± 0.0001   0.9981 ± 0.0001
ANN (MLP)                 0.9979 ± 0.0005   0.9979 ± 0.0005
SVM                       0

In [25]:
# ============================================================================
# Train Final Models and Evaluate on Held-Out Test Set
# ============================================================================

print("Training final models on full train+validation set...")
print("Evaluating on held-out test set...")
print("=" * 60)

test_results = {}
trained_models = {}

for name, model in models.items():
    print(f"\n{name}:")
    print("-" * 40)
    
    # Train on full train+validation set
    model.fit(X_temp, y_temp)
    trained_models[name] = model
    
    # Predict on test set
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    
    test_results[name] = {
        'accuracy': accuracy,
        'f1_score': f1,
        'y_pred': y_pred,
        'y_test': y_test
    }
    
    print(f"  Test Accuracy: {accuracy:.4f}")
    print(f"  Test F1-Score: {f1:.4f}")

print("\n" + "=" * 60)
print("Final Test Set Results (Held-Out):")
print("=" * 60)
print(f"{'Model':<25} {'Test Accuracy':<20} {'Test F1-Score':<20}")
print("-" * 60)
for name in sorted(test_results.keys(), key=lambda x: test_results[x]['accuracy'], reverse=True):
    acc = test_results[name]['accuracy']
    f1 = test_results[name]['f1_score']
    print(f"{name:<25} {acc:.4f}              {f1:.4f}")

Training final models on full train+validation set...
Evaluating on held-out test set...

ANN (MLP):
----------------------------------------
  Test Accuracy: 0.9978
  Test F1-Score: 0.9978

KNN:
----------------------------------------
  Test Accuracy: 0.9955
  Test F1-Score: 0.9953

SVM:
----------------------------------------
  Test Accuracy: 0.9967
  Test F1-Score: 0.9966

Logistic Regression:
----------------------------------------
  Test Accuracy: 0.9536
  Test F1-Score: 0.9497

XGBoost:
----------------------------------------
  Test Accuracy: 0.9985
  Test F1-Score: 0.9985

Final Test Set Results (Held-Out):
Model                     Test Accuracy        Test F1-Score       
------------------------------------------------------------
XGBoost                   0.9985              0.9985
ANN (MLP)                 0.9978              0.9978
SVM                       0.9967              0.9966
KNN                       0.9955              0.9953
Logistic Regression       0.9536 

In [26]:
# ============================================================================
# Detailed Classification Reports for Each Model
# ============================================================================

# Get class names for better readability
class_names = [label_to_name[i] for i in sorted(label_to_name.keys())]

print("Detailed Classification Reports on Test Set:")
print("=" * 60)

for name in models.keys():
    print(f"\n{'='*60}")
    print(f"{name.upper()}")
    print(f"{'='*60}")
    
    y_test_vals = test_results[name]['y_test']
    y_pred_vals = test_results[name]['y_pred']
    
    print("\nClassification Report:")
    print(classification_report(y_test_vals, y_pred_vals, target_names=class_names, digits=4))
    
    print(f"Confusion Matrix:")
    cm = confusion_matrix(y_test_vals, y_pred_vals)
    print(cm)
    print()

Detailed Classification Reports on Test Set:

ANN (MLP)

Classification Report:
                     precision    recall  f1-score   support

        AppleBanana     1.0000    1.0000    1.0000      4300
AppleBananaMandarin     0.9963    0.9918    0.9941      3791
  AppleBananaTomato     0.9966    0.9960    0.9963      3530
      AppleMandarin     0.9990    0.9998    0.9994      4182
        AppleTomato     1.0000    1.0000    1.0000      4339
     BananaMandarin     0.9995    0.9993    0.9994      4300
           Mandarin     0.9987    0.9983    0.9985      2993
       TomatoBanana     0.9965    0.9970    0.9968      4632
     TomatoMandarin     0.9938    0.9974    0.9956      4308

           accuracy                         0.9978     36375
          macro avg     0.9978    0.9977    0.9978     36375
       weighted avg     0.9978    0.9978    0.9978     36375

Confusion Matrix:
[[4300    0    0    0    0    0    0    0    0]
 [   0 3760    0    1    0    2    1    1   26]
 [   0    

In [27]:
# ============================================================================
# Summary Comparison: CV vs Test Performance
# ============================================================================

import pandas as pd

# Create summary DataFrame
summary_data = []
for name in models.keys():
    summary_data.append({
        'Model': name,
        'CV Accuracy': f"{cv_results[name]['accuracy_mean']:.4f} ± {cv_results[name]['accuracy_std']:.4f}",
        'Test Accuracy': f"{test_results[name]['accuracy']:.4f}",
        'CV F1-Score': f"{cv_results[name]['f1_mean']:.4f} ± {cv_results[name]['f1_std']:.4f}",
        'Test F1-Score': f"{test_results[name]['f1_score']:.4f}"
    })

summary_df = pd.DataFrame(summary_data)
summary_df = summary_df.sort_values('Test Accuracy', ascending=False, key=lambda x: x.str.split().str[0].astype(float))

print("Complete Performance Summary:")
print("=" * 80)
print(summary_df.to_string(index=False))
print("\n" + "=" * 80)
print("Best Model (by Test Accuracy):", summary_df.iloc[0]['Model'])
print("Best Test Accuracy:", summary_df.iloc[0]['Test Accuracy'])
print("=" * 80)

Complete Performance Summary:
              Model     CV Accuracy Test Accuracy     CV F1-Score Test F1-Score
            XGBoost 0.9982 ± 0.0001        0.9985 0.9981 ± 0.0001        0.9985
          ANN (MLP) 0.9979 ± 0.0005        0.9978 0.9979 ± 0.0005        0.9978
                SVM 0.9968 ± 0.0003        0.9967 0.9967 ± 0.0003        0.9966
                KNN 0.9950 ± 0.0002        0.9955 0.9948 ± 0.0002        0.9953
Logistic Regression 0.9538 ± 0.0016        0.9536 0.9500 ± 0.0017        0.9497

Best Model (by Test Accuracy): XGBoost
Best Test Accuracy: 0.9985


In [29]:
# ============================================================================
# Save Models and Metadata for Production Use
# ============================================================================

import joblib
import os
from datetime import datetime

# Create models directory if it doesn't exist
models_dir = 'saved_models'
os.makedirs(models_dir, exist_ok=True)

# Save each trained model
print("Saving models for production...")
print("=" * 60)

for name, model in trained_models.items():
    # Clean model name for filename (remove special characters)
    model_filename = name.replace(' ', '_').replace('(', '').replace(')', '').lower()
    model_path = os.path.join(models_dir, f'{model_filename}.joblib')
    joblib.dump(model, model_path)
    print(f"  ✓ Saved: {model_path}")

# Save metadata (label mapping, feature columns, etc.)
metadata = {
    'label_to_name': label_to_name,
    'name_to_label': {v: k for k, v in label_to_name.items()},
    'feature_columns': feature_columns,
    'num_classes': len(np.unique(y)),
    'test_results': {k: {'accuracy': v['accuracy'], 'f1_score': v['f1_score']} 
                     for k, v in test_results.items()},
    'cv_results': {k: {'accuracy_mean': v['accuracy_mean'], 'accuracy_std': v['accuracy_std'],
                       'f1_mean': v['f1_mean'], 'f1_std': v['f1_std']}
                   for k, v in cv_results.items()},
    'saved_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'best_model': summary_df.iloc[0]['Model']
}

metadata_path = os.path.join(models_dir, 'metadata.joblib')
joblib.dump(metadata, metadata_path)
print(f"  ✓ Saved metadata: {metadata_path}")

print("\n" + "=" * 60)
print("All models and metadata saved successfully!")
print(f"Models directory: {os.path.abspath(models_dir)}")
print("=" * 60)

Saving models for production...
  ✓ Saved: saved_models/ann_mlp.joblib
  ✓ Saved: saved_models/knn.joblib
  ✓ Saved: saved_models/svm.joblib
  ✓ Saved: saved_models/logistic_regression.joblib
  ✓ Saved: saved_models/xgboost.joblib
  ✓ Saved metadata: saved_models/metadata.joblib

All models and metadata saved successfully!
Models directory: /home/abel/Project/2026/EduSaf-Feb2026/DL_Course/MQ-ML/saved_models
