Importing All Packages

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import (
    auc,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
    roc_curve,
)
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, label_binarize
from xgboost import XGBClassifier

Loading Our Processed Dataset

In [4]:
# Relative path of the processed dataset CSV that the next cell reads with pd.read_csv.
file_path = "final_dataset.csv"

In [5]:
# Read the processed dataset into memory.
#   on_bad_lines='skip' -> malformed rows are dropped without raising or warning,
#                          so the row count below can be smaller than the file's.
#   low_memory=False    -> parse the file in one pass instead of in chunks.
filtered_df = pd.read_csv(
    file_path,
    encoding='utf-8',
    on_bad_lines='skip',
    low_memory=False,
)

# (rows, columns) of the loaded frame
filtered_df.shape

(87588, 8)

Separating features and label

In [6]:

# Column holding the class label to predict
target = 'nutriscore_grade'

# Per-100g numeric columns used as model inputs
num_features = [
    'energy_100g',
    'saturated_fat_100g',
    'sugars_100g',
    'fiber_100g',
    'proteins_100g',
    'salt_100g',
    'fruits_veg_nuts_100g',
]

# Feature matrix (DataFrame) and label vector (Series) taken from the loaded frame
X = filtered_df.loc[:, num_features]
y = filtered_df[target]

Encoding the labels

In [7]:
# Map each grade label to an ordinal code.
# OrdinalEncoder expects 2-D input, so the label vector is reshaped to a single
# column first and the encoded result is flattened back to 1-D afterwards.
# The codes come back as floats (see the displayed set below).
encoder = OrdinalEncoder()
y_column = y.values.reshape(-1, 1)
y_encoded = encoder.fit_transform(y_column).ravel()

# Distinct codes produced by the encoder
set(y_encoded)

{0.0, 1.0, 2.0, 3.0}

Splitting into train and test sets in an 80/20 ratio and applying standardization

In [8]:
# Hold out 20% of the rows for testing; stratifying on the encoded label keeps
# the class proportions the same in both partitions, and the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# Standardize the features. The scaler's statistics are learned from the
# training partition only and then applied unchanged to the test partition.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Defining the three models that performed best individually.

In [9]:
# Base learners, all with 100 estimators and a fixed seed
models = dict(
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    XGBoost=XGBClassifier(n_estimators=100, random_state=42),
    LightGBM=LGBMClassifier(n_estimators=100, random_state=42),
)

# (short alias, key into `models`, calibration method) for each wrapped learner.
# The order here fixes the iteration order of `calibrated_models`.
calibration_plan = [
    ('rf', 'RandomForest', 'isotonic'),
    ('xgb', 'XGBoost', 'sigmoid'),
    ('lgbm', 'LightGBM', 'isotonic'),
]

# Wrap every base learner in a 5-fold probability calibrator
calibrated_models = {
    alias: CalibratedClassifierCV(models[model_key], method=calibration_method, cv=5)
    for alias, model_key, calibration_method in calibration_plan
}

Cross-Validation Loop for the Individual Calibrated Models

In [10]:
# Cross-validation for individual models
#
# For every calibrated model, collect out-of-fold class probabilities on the
# training data, turn them into hard predictions and report the metrics.

# cross_val_predict orders the predict_proba columns by sorted class label, so
# an argmax over the columns is a *position*, not a label. Look the label up
# explicitly instead of relying on the labels happening to be 0..n-1.
class_labels = np.unique(y_train)

print("\n=== Individual Model Cross-Validation ===")
for name, model in calibrated_models.items():
    print(f"\n{name.upper()} Cross-Validation:")
    # cv=5 spells out the 5-fold split used by default in current scikit-learn
    # releases, so the result does not depend on the installed version's default.
    oof_proba = cross_val_predict(
        model, X_train_scaled, y_train, cv=5, n_jobs=-1, method='predict_proba'
    )
    y_pred = class_labels[oof_proba.argmax(axis=1)]
    print(f"Balanced Accuracy: {balanced_accuracy_score(y_train, y_pred):.4f}")
    # NOTE(review): target_names assumes the encoder's sorted categories are
    # B, C, D, E in that order - confirm against encoder.categories_.
    print(classification_report(y_train, y_pred, target_names=['B', 'C', 'D', 'E'], digits=4))


=== Individual Model Cross-Validation ===

RF Cross-Validation:
Balanced Accuracy: 0.9822
              precision    recall  f1-score   support

           B     0.9930    0.9933    0.9932     22485
           C     0.9767    0.9809    0.9788     16253
           D     0.9687    0.9626    0.9656      9670
           E     0.9929    0.9922    0.9926     21662

    accuracy                         0.9858     70070
   macro avg     0.9828    0.9822    0.9825     70070
weighted avg     0.9858    0.9858    0.9858     70070


XGB Cross-Validation:
Balanced Accuracy: 0.9853
              precision    recall  f1-score   support

           B     0.9955    0.9947    0.9951     22485
           C     0.9814    0.9855    0.9834     16253
           D     0.9719    0.9677    0.9698      9670
           E     0.9936    0.9931    0.9934     21662

    accuracy                         0.9884     70070
   macro avg     0.9856    0.9853    0.9854     70070
weighted avg     0.9884    0.9884    0.9884  

Combining all models with Ensemble Soft Voting

In [None]:
# Soft-voting ensemble over the calibrated models.
# Each (alias, estimator) pair from `calibrated_models` becomes one voter.
voters = [(alias, estimator) for alias, estimator in calibrated_models.items()]
ensemble = VotingClassifier(estimators=voters, voting='soft', n_jobs=-1)
ensemble.fit(X_train_scaled, y_train)

# Hard class predictions on the held-out test partition
y_pred = ensemble.predict(X_test_scaled)

# Report the hold-out metrics
ensemble_balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
ensemble_report = classification_report(
    y_test, y_pred, target_names=['B', 'C', 'D', 'E'], digits=4
)
print("\n=== Final Ensemble Evaluation ===")
print("Balanced Accuracy:", ensemble_balanced_accuracy)
print(ensemble_report)

Confusion Matrix

In [None]:
# Heatmap of the ensemble's confusion matrix on the test partition
# (rows = true labels, columns = predicted labels).
grade_names = ['B', 'C', 'D', 'E']
ensemble_cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    ensemble_cm,
    annot=True,   # write the count inside every cell
    fmt="d",      # counts are integers
    xticklabels=grade_names,
    yticklabels=grade_names,
    ax=ax,
)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix of Ensemble")
plt.show()

ROC-AUC

In [None]:
# One-vs-rest ROC curves for the soft-voting ensemble on the test partition.
# (roc_curve, auc and label_binarize are imported in the notebook's import cell.)

# Get predicted probabilities; column i belongs to ensemble.classes_[i]
y_score = ensemble.predict_proba(X_test_scaled)

# Binarize labels (since it's multi-class).
# The class list is taken from the fitted ensemble rather than from the labels
# present in y_test, so indicator column i always lines up with probability
# column i even if some class is absent from the test partition.
y_test_bin = label_binarize(y_test, classes=ensemble.classes_)

# Plot ROC curves
fig, ax = plt.subplots(figsize=(8, 6))
for i in range(y_test_bin.shape[1]):
    # Thresholds are not needed; use a named throwaway instead of `_`, which
    # IPython reserves for the previous cell output.
    fpr, tpr, _thresholds = roc_curve(y_test_bin[:, i], y_score[:, i])
    ax.plot(fpr, tpr, label=f'Class {i} (AUC={auc(fpr, tpr):.2f})')

# Plot random guessing line
ax.plot([0, 1], [0, 1], 'k--')

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Multi-Class ROC Curve')
ax.legend()
plt.show()
