In [14]:
# Cell 1: Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import json
from hyperparameter_tuning import get_param_grids, run_all_models_comparison
# Cell 1 (continued): shared settings
# Keep these two values identical at the top of both notebooks.
CV_SPLITS = 10      # presumably the number of cross-validation folds - TODO confirm against hyperparameter_tuning
RANDOM_STATE = 42   # presumably the seed for randomised steps - TODO confirm where it is consumed
# Cell 2: Load and Prepare Data
print("Loading and preparing data...")
data = pd.read_csv('augmented_bakery_data.csv')

# Integer-encode the two categorical columns, one fitted encoder per column.
le_product, le_storage = LabelEncoder(), LabelEncoder()
data['Product_Type_encoded'] = le_product.fit_transform(data['Product_Type'])
data['Storage_Condition_encoded'] = le_storage.fit_transform(data['Storage_Condition'])

# Spectral features are every column whose name starts with gain_ or phase_.
spectral_cols = [name for name in data.columns if name.startswith(('gain_', 'phase_'))]
X_spectral = data[spectral_cols]

# Standardise the spectral block only.
scaler = StandardScaler()
X_spectral_scaled = scaler.fit_transform(X_spectral)

# Encoded product type as an (n, 1) column so it can be appended as a feature.
product_type = data['Product_Type_encoded'].values[:, np.newaxis]

# Two feature matrices: spectra alone, and spectra plus the unscaled product code.
X_without_product = X_spectral_scaled
X_with_product = np.hstack((X_spectral_scaled, product_type))

# Target: encoded storage condition.
y_storage = data['Storage_Condition_encoded'].values

print(f"Dataset shapes:")
print(f"X without product type: {X_without_product.shape}")
print(f"X with product type: {X_with_product.shape}")
print(f"Number of storage conditions: {len(np.unique(y_storage))}")

# Cell 3: Run Tuning
# Results are keyed by model name, then by 'with_product_type' / 'without_product_type'
# (as read back by the save step below).
print("Running hyperparameter tuning...")
all_results = run_all_models_comparison(X_with_product, X_without_product, y_storage)

# Save Results
# Re-key the tuning output as configuration -> model name -> best parameters.
storage_condition_parameters_summary = {
    config: {
        name: outcome[config]['best_params']
        for name, outcome in all_results.items()
    }
    for config in ('with_product_type', 'without_product_type')
}

# Persist the best parameters as indented JSON.
with open('best_parameters.json', 'w') as params_file:
    json.dump(storage_condition_parameters_summary, params_file, indent=4)

Product type tuning


In [None]:
# Cell 1: Imports and Setup
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import json
from hyperparameter_tuning import get_param_grids, run_all_models_comparison

# Shared settings (same values as in the storage-condition section above)
CV_SPLITS = 10      # presumably the number of cross-validation folds - TODO confirm against hyperparameter_tuning
RANDOM_STATE = 42   # presumably the seed for randomised steps - TODO confirm where it is consumed

# Cell 2: Load and Prepare Data
print("Loading and preparing data...")
data = pd.read_csv('augmented_bakery_data.csv')

# Integer-encode the two categorical columns, one fitted encoder per column.
le_product, le_storage = LabelEncoder(), LabelEncoder()
data['Product_Type_encoded'] = le_product.fit_transform(data['Product_Type'])
data['Storage_Condition_encoded'] = le_storage.fit_transform(data['Storage_Condition'])

# Spectral features are every column whose name starts with gain_ or phase_.
spectral_cols = [name for name in data.columns if name.startswith(('gain_', 'phase_'))]
X_spectral = data[spectral_cols]

# Standardise the spectral block only.
scaler = StandardScaler()
X_spectral_scaled = scaler.fit_transform(X_spectral)

# Encoded storage condition as an (n, 1) column; it is appended unscaled.
storage_condition = data['Storage_Condition_encoded'].values[:, np.newaxis]

# Two feature matrices: spectra alone, and spectra plus the storage code.
X_without_storage = X_spectral_scaled
X_with_storage = np.hstack((X_spectral_scaled, storage_condition))

# Target for this section: encoded product type.
y_product = data['Product_Type_encoded'].values

print(f"Dataset shapes:")
print(f"X without storage condition: {X_without_storage.shape}")
print(f"X with storage condition: {X_with_storage.shape}")
print(f"Number of product types: {len(np.unique(y_product))}")
print(f"Product types: {dict(zip(le_product.classes_, le_product.transform(le_product.classes_)))}")

# Cell 3: Run Tuning
# The first matrix carries the extra (storage) feature, the second does not.
print("\nRunning hyperparameter tuning for product type classification...")
all_results = run_all_models_comparison(X_with_storage, X_without_storage, y_product)

# Cell 4: Save Results
# The tuning output is keyed 'with_product_type' / 'without_product_type' even
# though the optional extra feature in this section is the storage condition,
# so the keys are remapped to names that describe this experiment.
RESULT_KEY_BY_CONFIG = {
    'with_storage_condition': 'with_product_type',
    'without_storage_condition': 'without_product_type',
}

Product_type_parameters_summary = {config: {} for config in RESULT_KEY_BY_CONFIG}

# Store best score AND best parameters per model. Each entry is a plain dict:
# a trailing comma after the assignment would silently wrap it in a 1-tuple,
# which breaks `entry['best_score']` lookups and serialises as a JSON list.
for model_name, results in all_results.items():
    for config, result_key in RESULT_KEY_BY_CONFIG.items():
        Product_type_parameters_summary[config][model_name] = {
            # float() turns numpy scalars into JSON-serialisable Python floats.
            'best_score': float(results[result_key]['best_score']),
            'best_params': results[result_key]['best_params'],
        }

# Save the results
with open('product_type_tuning_results.json', 'w') as f:
    json.dump(Product_type_parameters_summary, f, indent=4)


In [None]:

# Cell 5: Print Summary
# Scores and parameters are read straight from the tuning output (all_results),
# which holds both 'best_score' and 'best_params' for every model and
# configuration. The tuning output keys are 'with_product_type' /
# 'without_product_type'; in this section the extra feature is the storage
# condition, hence the headings below.
print("\nSummary of Best Results:")
print("=" * 50)
for heading, result_key in (
    ("With Storage Condition", 'with_product_type'),
    ("Without Storage Condition", 'without_product_type'),
):
    print(f"\n{heading}:")
    for model, results in all_results.items():
        entry = results[result_key]
        print(f"\n{model}:")
        print(f"Best Score: {entry['best_score']:.4f}")
        print(f"Best Parameters: {entry['best_params']}")

In [None]:

# Cell 5: Print Summary
# NOTE(review): this cell repeats the summary cell directly above it; one of
# the two can be deleted.
# Scores and parameters are read straight from the tuning output (all_results),
# which holds both 'best_score' and 'best_params' for every model and
# configuration. The tuning output keys are 'with_product_type' /
# 'without_product_type'; in this section the extra feature is the storage
# condition, hence the headings below.
print("\nSummary of Best Results:")
print("=" * 50)
for heading, result_key in (
    ("With Storage Condition", 'with_product_type'),
    ("Without Storage Condition", 'without_product_type'),
):
    print(f"\n{heading}:")
    for model, results in all_results.items():
        entry = results[result_key]
        print(f"\n{model}:")
        print(f"Best Score: {entry['best_score']:.4f}")
        print(f"Best Parameters: {entry['best_params']}")

In [None]:

# Cell 2: Load and Prepare Data
print("Loading and preparing data...")
data = pd.read_csv('augmented_bakery_data.csv')

# One fitted encoder per categorical column; the integer codes are stored in
# new *_encoded columns next to the original text columns.
le_product, le_storage = LabelEncoder(), LabelEncoder()
data['Product_Type_encoded'] = le_product.fit_transform(data['Product_Type'])
data['Storage_Condition_encoded'] = le_storage.fit_transform(data['Storage_Condition'])

In [11]:

# Cell 3: Prepare Features
# Base features are the gain_* and phase_* columns only, standardised.
feature_cols = [name for name in data.columns if name.startswith(('gain_', 'phase_'))]
X_base = data[feature_cols]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_base)

# Encoded product type as an (n, 1) column, appended unscaled for the "with" variant.
product_column = data['Product_Type_encoded'].values[:, np.newaxis]

# Feature matrices with / without the product code, and the storage target
# (kept as a pandas Series here).
X_without_product = X_scaled
X_with_product = np.hstack((X_scaled, product_column))
y_storage = data['Storage_Condition_encoded']

In [None]:

# Cell 4: Run Comprehensive Model Comparison
# Tunes every model on both feature matrices for the storage-condition target.
# NOTE: this rebinds `all_results`, replacing whatever an earlier cell stored
# under that name (e.g. the product-type tuning output).
print("Running comprehensive model comparison...")
all_results = run_all_models_comparison(X_with_product, X_without_product, y_storage)

In [7]:

# Cell 5: Save Results
# Create a more detailed results summary
def create_results_summary(all_results):
    """Condense tuning output into a JSON-friendly comparison table.

    Parameters
    ----------
    all_results : dict
        Maps model name -> {'with_product_type': {...}, 'without_product_type': {...}},
        where each inner dict has at least 'best_score' and 'best_params'.

    Returns
    -------
    dict
        {'model_comparisons': {model name: {...}}}. Each model entry holds the
        best score (as a Python float) and best parameters for both
        configurations, plus 'performance_difference' = with-score minus
        without-score.

    Raises
    ------
    KeyError
        If a model entry lacks one of the expected keys.
    """
    def _snapshot(config_results):
        # Keep only the two fields that go into the report.
        return {
            'best_score': float(config_results['best_score']),
            'best_params': config_results['best_params'],
        }

    comparisons = {
        name: {
            'with_product_type': _snapshot(outcome['with_product_type']),
            'without_product_type': _snapshot(outcome['without_product_type']),
            # Positive means the product-type feature helped.
            'performance_difference': float(
                outcome['with_product_type']['best_score']
                - outcome['without_product_type']['best_score']
            ),
        }
        for name, outcome in all_results.items()
    }
    return {'model_comparisons': comparisons}

# Build the comparison table from the tuning output and write it to disk as
# indented JSON.
results_summary = create_results_summary(all_results)
with open('hyperparameter_tuning_results.json', 'w') as results_file:
    json.dump(results_summary, results_file, indent=4)



In [None]:

# Cell 6: Find Best Overall Model
def find_best_model(all_results):
    """Return the single best (model, configuration) pair by tuning score.

    Parameters
    ----------
    all_results : dict
        Maps model name -> {'with_product_type': {...}, 'without_product_type': {...}},
        where each inner dict has a 'best_score' entry.

    Returns
    -------
    tuple
        (best_model, best_config, best_score). On ties the first pair
        encountered wins ('with_product_type' is checked before
        'without_product_type' for each model). When no score qualifies
        (empty input, or every score is NaN) the result is (None, None, -1).

    Raises
    ------
    KeyError
        If a model entry lacks a configuration or its 'best_score'.
    """
    # Start from -inf rather than -1 so that scorers which produce values
    # at or below -1 (e.g. negated losses) can still yield a winner. NaN
    # scores never compare greater and are therefore skipped.
    best_score = float('-inf')
    best_model = None
    best_config = None

    for model_name, results in all_results.items():
        for config in ('with_product_type', 'without_product_type'):
            score = results[config]['best_score']
            if score > best_score:
                best_score, best_model, best_config = score, model_name, config

    if best_model is None:
        # Legacy sentinel kept for callers that relied on it.
        return None, None, -1
    return best_model, best_config, best_score

# Pick the top-scoring (model, configuration) pair from the current tuning
# output and report it, looking its parameters up again in all_results.
best_model, best_config, best_score = find_best_model(all_results)
print("\nBest Overall Model:")
print(f"Model: {best_model}")
print(f"Configuration: {best_config}")
print(f"Score: {best_score:.4f}")
# Raises KeyError if find_best_model found nothing (best_model is None).
print(f"Parameters: {all_results[best_model][best_config]['best_params']}")

In [None]:

# Add this new cell
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

def plot_cv_score_distribution(X_with, X_without, y, model_name, cv=None):
    """Plot and summarise cross-validation score distributions for one tuned model.

    Looks up the tuned estimators stored under ``all_results[model_name]``
    (a notebook-level variable), scores each one on its own feature matrix
    with ``cross_val_score``, shows the two score sets side by side as box
    plots and prints the mean and standard deviation of each.

    Parameters
    ----------
    X_with : array-like
        Feature matrix scored with the 'with_product_type' estimator.
    X_without : array-like
        Feature matrix scored with the 'without_product_type' estimator.
    y : array-like
        Target labels shared by both evaluations.
    model_name : str
        Key into ``all_results`` selecting the model to evaluate.
    cv : optional
        Forwarded to ``cross_val_score``. When omitted, the notebook-level
        ``CV_SPLITS`` constant is used so the fold count is set in one place.

    Raises
    ------
    KeyError
        If ``model_name`` or an expected key is missing from ``all_results``.
    """
    if cv is None:
        cv = CV_SPLITS

    # Tuned estimators for both configurations of the requested model.
    tuned = all_results[model_name]
    model_with = tuned['with_product_type']['best_model']
    model_without = tuned['without_product_type']['best_model']

    # One score per fold for each configuration.
    scores_with = cross_val_score(model_with, X_with, y, cv=cv)
    scores_without = cross_val_score(model_without, X_without, y, cv=cv)

    # Tick labels are set explicitly rather than through boxplot's `labels`
    # keyword, which newer matplotlib releases deprecate (renamed tick_labels).
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot([scores_with, scores_without])
    ax.set_xticks([1, 2])  # boxplot draws its boxes at positions 1..n
    ax.set_xticklabels(['With Product Type', 'Without Product Type'])
    ax.set_title(f'{model_name} Cross-validation Score Distribution')
    ax.set_ylabel('Accuracy')
    ax.grid(True)
    plt.show()

    # Print statistical summary
    print(f"\nCross-validation scores summary for {model_name}:")
    for heading, scores in (
        ("With Product Type", scores_with),
        ("Without Product Type", scores_without),
    ):
        print(f"\n{heading}:")
        print(f"Mean: {scores.mean():.4f}")
        print(f"Std: {scores.std():.4f}")

# Plot CV score distribution for Random Forest
# 'Random Forest' must be a key of all_results - presumably the model name
# used by run_all_models_comparison; TODO confirm.
plot_cv_score_distribution(X_with_product, X_without_product, y_storage, 'Random Forest')