In [1]:
# Cell 1: Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Import our custom modules
from hyperparameter_tuning import get_param_grids, tune_hyperparameters
from model_visualization import plot_model_comparison



In [2]:
# Cell 2: Load and Prepare Data
# Load data
data = pd.read_csv('augmented_bakery_data.csv')

# Encode Product Type
le_product = LabelEncoder()
data['Product_Type_encoded'] = le_product.fit_transform(data['Product_Type'])

# One-hot encode Storage Conditions
storage_dummies = pd.get_dummies(data['Storage_Condition'], prefix='storage')
data = pd.concat([data, storage_dummies], axis=1)



In [3]:

# Cell 3: Run Classification Without Product Type
# Get features (gains and phases only)
feature_cols = [col for col in data.columns if col.startswith(('gain_', 'phase_'))]
X_without_product = data[feature_cols]
y_storage = data['Storage_Condition']

# Get parameter grids
param_grids = get_param_grids()

# Dictionary to store results
results_without_product = {}

# Run classification for each model
for model_name, param_grid in param_grids.items():
    print(f"\nTraining {model_name} without product type...")
    results_without_product[model_name] = tune_hyperparameters(X_without_product, y_storage, 
                                                             model_name, param_grid)




Training SVM without product type...
Fitting 10 folds for each of 32 candidates, totalling 320 fits

Training Random Forest without product type...
Fitting 10 folds for each of 108 candidates, totalling 1080 fits

Training KNN without product type...
Fitting 10 folds for each of 16 candidates, totalling 160 fits

Training Neural Network without product type...
Fitting 10 folds for each of 48 candidates, totalling 480 fits

Training Logistic Regression without product type...
Fitting 10 folds for each of 36 candidates, totalling 360 fits


In [4]:

# Cell 4: Run Classification With Product Type
# Add product type feature
X_with_product = pd.concat([X_without_product, 
                           pd.DataFrame(data['Product_Type_encoded'], 
                                      columns=['product_type'])], axis=1)

# Dictionary to store results
results_with_product = {}

# Run classification for each model
for model_name, param_grid in param_grids.items():
    print(f"\nTraining {model_name} with product type...")
    results_with_product[model_name] = tune_hyperparameters(X_with_product, y_storage, 
                                                          model_name, param_grid)



Training SVM with product type...
Fitting 10 folds for each of 32 candidates, totalling 320 fits


ValueError: 
All the 320 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\yohan\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\yohan\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\yohan\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\svm\_base.py", line 190, in fit
    X, y = self._validate_data(
  File "c:\Users\yohan\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\base.py", line 622, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\Users\yohan\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\utils\validation.py", line 1146, in check_X_y
    X = check_array(
  File "c:\Users\yohan\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\utils\validation.py", line 957, in check_array
    _assert_all_finite(
  File "c:\Users\yohan\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\utils\validation.py", line 122, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "c:\Users\yohan\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\utils\validation.py", line 171, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


In [None]:

# Cell 5: Visualize Results
# Combine results
storage_results = {
    'Without Product Type': results_without_product,
    'With Product Type': results_with_product
}

# Create visualization
plot_model_comparison(
    storage_results,
    classification_type='storage',
    save_path='storage_condition_comparison.png'
)