Common Imports and Constants

In [7]:
# Cell 1: Common Imports and Constants
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler, LabelEncoder

from hyperparameter_tuning import get_param_grids, run_all_models_comparison

# Constants
# NOTE(review): neither constant is referenced in the notebook cells visible in
# this review (plot_cv_score_distribution, for example, passes a literal cv=10
# to cross_val_score) -- confirm where they are meant to be used.
RANDOM_STATE = 42  # presumably the seed for stochastic estimators/searches -- TODO confirm
CV_SPLITS = 10  # presumably the number of cross-validation folds -- TODO confirm




Check Class Imbalance

In [None]:
def check_class_balance(data, target_column):
    """Print the class distribution of one column and return its imbalance ratio.

    Args:
        data: DataFrame that contains ``target_column``.
        target_column: Name of the column whose distinct values are counted.

    Returns:
        The count of the rarest class divided by the count of the most
        frequent class (1.0 means every class has the same number of rows).
    """
    target = data[target_column]
    class_counts = target.value_counts()
    class_shares = target.value_counts(normalize=True) * 100

    print(f"Class Distribution for {target_column}:")
    for label, n_samples, share in zip(class_counts.index, class_counts, class_shares):
        print(f"{label}: {n_samples} samples ({share:.2f}%)")

    # Size of the smallest class relative to the largest one.
    imbalance_ratio = class_counts.min() / class_counts.max()
    print(f"\nImbalance ratio: {imbalance_ratio:.4f}")

    # Pick the status line first, then emit it once.
    if imbalance_ratio > 0.8:
        status = "Status: Well balanced (ratio > 0.8)"
    elif imbalance_ratio > 0.5:
        status = "Status: Moderately imbalanced (0.5 < ratio < 0.8)"
    else:
        status = "Status: Highly imbalanced (ratio < 0.5)"
    print(status)

    return imbalance_ratio

# Load the dataset and print the class balance of both label columns.
# The final bare call is the cell's last expression, so its returned ratio is
# what the notebook displays as the cell output.
data = pd.read_csv('../augmented_bakery_data.csv')
print("Checking class balance...")
check_class_balance(data, 'Storage_Condition')
check_class_balance(data, 'Product_Type')

Data Preparation


In [8]:
def prepare_base_data(data_path='../augmented_bakery_data.csv'):
    """Load the dataset, label-encode both target columns and scale the spectra.

    Args:
        data_path: CSV file to read. Defaults to the augmented bakery dataset
            path used by this notebook, so existing zero-argument calls keep
            working.

    Returns:
        A tuple ``(data, X_spectral_scaled, le_product, le_storage)``:
        the loaded DataFrame with ``Product_Type_encoded`` and
        ``Storage_Condition_encoded`` columns added, the standardized
        ``gain_*``/``phase_*`` columns as an array, and the two fitted
        ``LabelEncoder`` instances.

    Raises:
        ValueError: If the file has no column starting with ``gain_`` or
            ``phase_``.
    """
    print("Loading and preparing data...")
    data = pd.read_csv(data_path)

    # Encode labels
    le_product = LabelEncoder()
    le_storage = LabelEncoder()
    data['Product_Type_encoded'] = le_product.fit_transform(data['Product_Type'])
    data['Storage_Condition_encoded'] = le_storage.fit_transform(data['Storage_Condition'])

    # Spectral features are identified purely by their column-name prefix.
    spectral_cols = [col for col in data.columns if col.startswith(('gain_', 'phase_'))]
    if not spectral_cols:
        # Fail with a clear message instead of the scaler's generic
        # "0 feature(s)" error.
        raise ValueError(
            f"No spectral columns (prefix 'gain_' or 'phase_') found in '{data_path}'"
        )
    X_spectral = data[spectral_cols]

    # NOTE(review): the scaler is fit on every row before any cross-validation
    # split happens downstream, so validation folds influence the scaling
    # statistics -- consider scaling inside the CV pipeline instead.
    scaler = StandardScaler()
    X_spectral_scaled = scaler.fit_transform(X_spectral)

    return data, X_spectral_scaled, le_product, le_storage

def _json_default(value):
    """Convert NumPy scalars/arrays into plain Python objects for ``json.dump``."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def save_results(results, filename, with_label, without_label):
    """Print each model's best parameters and save them to a JSON file.

    Args:
        results: Mapping of model name to a dict that has
            ``'with_product_type'`` and ``'without_product_type'`` entries,
            each containing a ``'best_params'`` dict.
        filename: Path of the JSON file to write.
        with_label: Key/heading used in the output for the
            ``'with_product_type'`` variant.
        without_label: Key/heading used in the output for the
            ``'without_product_type'`` variant.

    The written JSON has the shape
    ``{with_label: {model: params}, without_label: {model: params}}``.
    """
    # Pair every output label with the results key it reports on. The previous
    # substring test ('with' in label) was also true for 'without_...' labels,
    # so both sections ended up holding the with-variant parameters.
    label_to_key = (
        (with_label, 'with_product_type'),
        (without_label, 'without_product_type'),
    )
    results_summary = {
        with_label: {},
        without_label: {}
    }

    print("\nModel Performance Summary:")
    print("=" * 80)

    for dataset_type, key in label_to_key:
        print(f"\n{dataset_type.replace('_', ' ').title()}:")
        print("-" * 40)

        for model_name, model_results in results.items():
            # Store the parameter dict itself, without a 'best_params' wrapper.
            params = model_results[key]['best_params']
            results_summary[dataset_type][model_name] = params

            print(f"\n{model_name}:")
            print(f"Best Parameters: {params}")

    with open(filename, 'w') as f:
        # Sampled hyperparameters can be NumPy scalars, which json cannot
        # encode without a converter.
        json.dump(results_summary, f, indent=4, default=_json_default)
    print(f"\nResults have been saved to '{filename}'")

In [9]:
from scipy import stats

def compare_models_statistically(all_results):
    """Print a paired t-test comparison of each model's two feature variants.

    Args:
        all_results: Mapping of model name to a dict with
            ``'with_product_type'`` and ``'without_product_type'`` entries.
            Each entry must provide ``['cv_results']['mean_test_score']``
            (the sequences fed to the t-test) and ``['best_score']``.

    For every model the function prints both best scores, their difference,
    the p-value of ``scipy.stats.ttest_rel`` and whether it is below 0.05.
    """
    print("\nStatistical Comparison of Models:")
    print("=" * 50)

    for model_name, results in all_results.items():
        with_variant = results['with_product_type']
        without_variant = results['without_product_type']

        # Paired test over the two 'mean_test_score' sequences.
        # NOTE(review): if these are per-candidate means from a parameter
        # search (rather than per-fold scores), the pairing is between
        # hyperparameter candidates -- confirm that is intended.
        _, p_value = stats.ttest_rel(
            with_variant['cv_results']['mean_test_score'],
            without_variant['cv_results']['mean_test_score'],
        )

        print(f"\n{model_name}:")
        best_with = with_variant['best_score']
        print(f"Mean with product type: {best_with:.4f}")
        best_without = without_variant['best_score']
        print(f"Mean without product type: {best_without:.4f}")
        print(f"Difference: {best_with - best_without:.4f}")
        print(f"p-value: {p_value:.4f}")
        significant = 'Yes' if p_value < 0.05 else 'No'
        print(f"Statistically significant difference: {significant}")


Storage Condition Hyperparameter Tuning


In [10]:


# Cell 3: Storage Condition Classification
print("Running Storage Condition Classification...")
data, X_spectral_scaled, le_product, le_storage = prepare_base_data()

# Target: the encoded storage condition.
y_storage = data['Storage_Condition_encoded'].values

# Two feature matrices: scaled spectra only, and the same spectra with the
# encoded product type appended as one extra column.
X_without_product = X_spectral_scaled
product_type = data['Product_Type_encoded'].values.reshape(-1, 1)
X_with_product = np.hstack((X_spectral_scaled, product_type))

print("Dataset shapes:")
print(f"X without product type: {X_without_product.shape}")
print(f"X with product type: {X_with_product.shape}")
print(f"Storage conditions: {dict(zip(le_storage.classes_, le_storage.transform(le_storage.classes_)))}")

# Tune every model on both feature sets, persist the best parameters, then
# compare the two variants statistically.
all_results = run_all_models_comparison(X_with_product, X_without_product, y_storage)
save_results(all_results, 'storage_condition_results.json', 'with_product_type', 'without_product_type')
compare_models_statistically(all_results)


Running Storage Condition Classification...
Loading and preparing data...
Dataset shapes:
X without product type: (180, 202)
X with product type: (180, 203)
Storage conditions: {'Humid': 0, 'Open': 1, 'Wrapped': 2}

######################################################################
Running comparison for SVM
######################################################################

Tuning SVM with product_type...
Using RandomizedSearchCV with 30 iterations for SVM
Fitting 10 folds for each of 30 candidates, totalling 300 fits

Best parameters for SVM:
Mean CV accuracy: 0.9333
Parameters: {'C': 89.06204386161681, 'gamma': 'scale', 'kernel': 'rbf'}
Train-Test gap: 0.0617

Tuning SVM without product_type...
Using RandomizedSearchCV with 30 iterations for SVM
Fitting 10 folds for each of 30 candidates, totalling 300 fits

Best parameters for SVM:
Mean CV accuracy: 0.9167
Parameters: {'C': 89.06204386161681, 'gamma': 'scale', 'kernel': 'rbf'}
Train-Test gap: 0.0796

Comparison Results for 

Product Type Hyperparameter Tuning

In [None]:
# Cell 4: Product Type Classification
print("\nRunning Product Type Classification...")
data, X_spectral_scaled, le_product, le_storage = prepare_base_data()

# Target: the encoded product type.
y_product = data['Product_Type_encoded'].values

# Two feature matrices: scaled spectra only, and the same spectra with the
# encoded storage condition appended as one extra column.
X_without_storage = X_spectral_scaled
storage_condition = data['Storage_Condition_encoded'].values.reshape(-1, 1)
X_with_storage = np.hstack((X_spectral_scaled, storage_condition))

print("Dataset shapes:")
print(f"X without storage condition: {X_without_storage.shape}")
print(f"X with storage condition: {X_with_storage.shape}")
print(f"Product types: {dict(zip(le_product.classes_, le_product.transform(le_product.classes_)))}")

# Tune every model on both feature sets, persist the best parameters, then
# compare the two variants statistically.
all_results = run_all_models_comparison(X_with_storage, X_without_storage, y_product)
save_results(all_results, 'product_type_results.json', 'with_storage_condition', 'without_storage_condition')
compare_models_statistically(all_results)


Storage Condition Hyperparameter Tuning with LDA

In [None]:
# Cell 5: Storage Condition Classification with LDA
print("\nRunning Storage Condition Classification with LDA...")
data, X_spectral_scaled, le_product, le_storage = prepare_base_data()

# Project the scaled spectra onto at most (n_classes - 1) discriminant axes.
# NOTE(review): LDA is fit here on all samples and their labels, before the
# cross-validation performed by run_all_models_comparison, so CV scores on
# X_lda are likely optimistic -- consider fitting LDA inside each fold.
y_storage = data['Storage_Condition_encoded'].values
lda = LDA(n_components=min(np.unique(y_storage).size - 1, X_spectral_scaled.shape[1]))
X_lda = lda.fit_transform(X_spectral_scaled, y_storage)

# Two feature matrices: LDA components only, and the same components with the
# encoded product type appended as one extra column.
X_without_product = X_lda
product_type = data['Product_Type_encoded'].values.reshape(-1, 1)
X_with_product = np.hstack((X_lda, product_type))

print("Dataset shapes (LDA):")
print(f"X without product type: {X_without_product.shape}")
print(f"X with product type: {X_with_product.shape}")

# Tune every model on both feature sets, persist the best parameters, then
# compare the two variants statistically.
all_results = run_all_models_comparison(X_with_product, X_without_product, y_storage)
save_results(all_results, 'LDA_storage_condition_results.json', 'with_product_type', 'without_product_type')
compare_models_statistically(all_results)


Product Type Hyperparameter Tuning with LDA


In [None]:
# Cell 6: Product Type Classification with LDA
print("\nRunning Product Type Classification with LDA...")
data, X_spectral_scaled, le_product, le_storage = prepare_base_data()

# Project the scaled spectra onto at most (n_classes - 1) discriminant axes.
# NOTE(review): LDA is fit here on all samples and their labels, before the
# cross-validation performed by run_all_models_comparison, so CV scores on
# X_lda are likely optimistic -- consider fitting LDA inside each fold.
y_product = data['Product_Type_encoded'].values
lda = LDA(n_components=min(np.unique(y_product).size - 1, X_spectral_scaled.shape[1]))
X_lda = lda.fit_transform(X_spectral_scaled, y_product)

# Two feature matrices: LDA components only, and the same components with the
# encoded storage condition appended as one extra column.
X_without_storage = X_lda
storage_condition = data['Storage_Condition_encoded'].values.reshape(-1, 1)
X_with_storage = np.hstack((X_lda, storage_condition))

print("Dataset shapes (LDA):")
print(f"X without storage condition: {X_without_storage.shape}")
print(f"X with storage condition: {X_with_storage.shape}")

# Tune every model on both feature sets, persist the best parameters, then
# compare the two variants statistically.
all_results = run_all_models_comparison(X_with_storage, X_without_storage, y_product)
save_results(all_results, 'LDA_product_type_results.json', 'with_storage_condition', 'without_storage_condition')
compare_models_statistically(all_results)


In [None]:
from scipy import stats

def compare_models_statistically(all_results):
    """Print a paired t-test comparison of each model's two feature variants.

    Note: this cell re-defines the function of the same name from an earlier
    cell; once this cell runs, this later definition is the one in effect.

    Args:
        all_results: Mapping of model name to a dict with
            ``'with_product_type'`` and ``'without_product_type'`` entries.
            Each entry must provide ``['cv_results']['mean_test_score']``
            (the sequences fed to the t-test) and ``['best_score']``.

    For every model the function prints both best scores, their difference,
    the p-value of ``scipy.stats.ttest_rel`` and whether it is below 0.05.
    """
    print("\nStatistical Comparison of Models:")
    print("=" * 50)

    for model_name, results in all_results.items():
        with_variant = results['with_product_type']
        without_variant = results['without_product_type']

        # Paired test over the two 'mean_test_score' sequences.
        # NOTE(review): if these are per-candidate means from a parameter
        # search (rather than per-fold scores), the pairing is between
        # hyperparameter candidates -- confirm that is intended.
        _, p_value = stats.ttest_rel(
            with_variant['cv_results']['mean_test_score'],
            without_variant['cv_results']['mean_test_score'],
        )

        print(f"\n{model_name}:")
        best_with = with_variant['best_score']
        print(f"Mean with product type: {best_with:.4f}")
        best_without = without_variant['best_score']
        print(f"Mean without product type: {best_without:.4f}")
        print(f"Difference: {best_with - best_without:.4f}")
        print(f"p-value: {p_value:.4f}")
        significant = 'Yes' if p_value < 0.05 else 'No'
        print(f"Statistically significant difference: {significant}")

# Run the comparison on `all_results`, i.e. on the output of whichever tuning
# cell assigned it most recently.
# NOTE(review): every tuning cell above already calls this function right after
# tuning, so this repeats the latest comparison -- confirm it is still needed.
compare_models_statistically(all_results)

In [None]:
def plot_feature_importance(model, feature_names, top_n=20, save_path='feature_importance.png'):
    """Plot the largest feature importances of a fitted model and return them all.

    Args:
        model: Fitted estimator. Only models exposing ``feature_importances_``
            (e.g. a Random Forest) can be plotted.
        feature_names: One name per model feature, in the model's column order.
        top_n: Number of highest-ranked features shown in the bar chart.
        save_path: Where the figure is written. Pass ``None`` to skip saving.
            Defaults to the file name this function has always used.

    Returns:
        A DataFrame with ``Feature`` and ``Importance`` columns sorted by
        descending importance, or ``None`` when the model has no
        ``feature_importances_`` attribute.

    Raises:
        ValueError: If the number of names differs from the number of
            importances.
    """
    if not hasattr(model, 'feature_importances_'):
        print("Model doesn't have feature_importances_ attribute")
        return None

    importances = np.asarray(model.feature_importances_)
    feature_names = list(feature_names)
    if len(feature_names) != len(importances):
        # Catch a names/model mismatch up front with an actionable message.
        raise ValueError(
            f"Got {len(feature_names)} feature names for a model with "
            f"{len(importances)} feature importances"
        )

    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    # Plot top N features
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance.head(top_n))
    plt.title('Top Feature Importances')
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()

    return feature_importance

# Plot importances for the tuned Random Forest of the most recent tuning run.
spectral_cols = [col for col in data.columns if col.startswith(('gain_', 'phase_'))]
rf_model = all_results['Random Forest']['with_product_type']['best_model']
feature_names = spectral_cols + ['Product_Type']

# `all_results` is overwritten by every tuning cell, so the model held here may
# have been trained on a different feature matrix (e.g. LDA components plus the
# storage condition) than the "spectra + product type" layout assumed above.
# When the counts disagree, fall back to positional names rather than failing
# or mislabelling the bars.
n_importances = len(getattr(rf_model, 'feature_importances_', feature_names))
if n_importances != len(feature_names):
    print(f"Feature-name mismatch: model has {n_importances} features, "
          f"expected {len(feature_names)}; using positional names instead.")
    feature_names = [f'feature_{i}' for i in range(n_importances)]

plot_feature_importance(rf_model, feature_names)

In [None]:
# Cell: Model Performance Analysis
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_model_performance(all_results):
    """Plot train/validation scores per model and print an overfitting report.

    Args:
        all_results: Mapping of model name to a dict with
            ``'with_product_type'`` and ``'without_product_type'`` entries.
            Each entry must provide ``'train_score'``, ``'validation_score'``,
            ``'best_score'`` and ``'overfitting_score'``.

    Shows a grouped bar chart (training and validation score for both
    variants of every model) and then prints, per model and variant, the
    stored ``'overfitting_score'`` with a verdict based on a +/-0.05 band.
    """
    variant_keys = ['with_product_type', 'without_product_type']

    # One record per (model, variant) pair, in iteration order.
    records = []
    for model_name, results in all_results.items():
        for variant in variant_keys:
            res = results[variant]
            records.append({
                'Model': model_name,
                'Variant': variant,
                'Training Score': res['train_score'],
                'Validation Score': res['validation_score'],
                'CV Score': res['best_score'],
            })
    df = pd.DataFrame(
        records,
        columns=['Model', 'Variant', 'Training Score', 'Validation Score', 'CV Score'],
    )

    # Grouped bars: four series per model, each shifted by a fixed offset.
    plt.figure(figsize=(15, 8))

    model_labels = df['Model'].unique()
    x = np.arange(len(model_labels))
    width = 0.15

    bar_series = [
        (-width, 'with_product_type', 'Training Score', 'With Features (Train)'),
        (0, 'with_product_type', 'Validation Score', 'With Features (Val)'),
        (width, 'without_product_type', 'Training Score', 'Without Features (Train)'),
        (2 * width, 'without_product_type', 'Validation Score', 'Without Features (Val)'),
    ]
    for offset, variant, score_column, legend_label in bar_series:
        heights = df.loc[df['Variant'] == variant, score_column]
        plt.bar(x + offset, heights, width, label=legend_label, alpha=0.7)

    plt.xlabel('Models')
    plt.ylabel('Score')
    plt.title('Model Performance Comparison')
    plt.xticks(x, model_labels, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Text report based on the precomputed 'overfitting_score' values.
    # NOTE(review): presumably train score minus validation score -- confirm
    # against the code that produces it.
    print("\nOverfitting Analysis:")
    print("=" * 50)
    for model_name, results in all_results.items():
        print(f"\n{model_name}:")
        for variant in variant_keys:
            res = results[variant]
            print(f"\n{variant}:")
            gap = res['overfitting_score']
            print(f"Train-Val difference: {gap:.4f}")
            if gap > 0.05:
                verdict = "WARNING: Potential overfitting detected!"
            elif gap < -0.05:
                verdict = "WARNING: Potential underfitting detected!"
            else:
                verdict = "Good fit!"
            print(verdict)

# Run the performance/overfitting analysis on `all_results`, i.e. on the output
# of whichever tuning cell assigned it most recently.
analyze_model_performance(all_results)


# Add this new cell
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

def plot_cv_score_distribution(X_with, X_without, y, model_name):
    """Box-plot and summarize 10-fold CV accuracy for both variants of a model.

    Args:
        X_with: Feature matrix that includes the additional feature column.
        X_without: Feature matrix without the additional feature column.
        y: Target labels shared by both feature matrices.
        model_name: Key into the notebook-level ``all_results`` dict; the
            ``'best_model'`` of both variants is read from there.

    Each best model is re-scored with ``cross_val_score(..., cv=10)``; the two
    score arrays are shown as box plots and their mean/std are printed.
    """
    # Reads the global `all_results` rather than taking it as an argument.
    tuned = all_results[model_name]
    estimator_with = tuned['with_product_type']['best_model']
    estimator_without = tuned['without_product_type']['best_model']

    # Fresh 10-fold scores for each variant.
    scores_with = cross_val_score(estimator_with, X_with, y, cv=10)
    scores_without = cross_val_score(estimator_without, X_without, y, cv=10)

    # Side-by-side box plots of the fold scores.
    # NOTE(review): newer Matplotlib releases rename boxplot's `labels`
    # argument to `tick_labels` -- check against the installed version.
    plt.figure(figsize=(10, 6))
    plt.boxplot([scores_with, scores_without], labels=['With Product Type', 'Without Product Type'])
    plt.title(f'{model_name} Cross-validation Score Distribution')
    plt.ylabel('Accuracy')
    plt.grid(True)
    plt.show()

    # Mean and standard deviation per variant.
    print(f"\nCross-validation scores summary for {model_name}:")
    for heading, fold_scores in (
        ("With Product Type", scores_with),
        ("Without Product Type", scores_without),
    ):
        print(f"\n{heading}:")
        print(f"Mean: {fold_scores.mean():.4f}")
        print(f"Std: {fold_scores.std():.4f}")

# Plot CV score distribution for Random Forest
# NOTE(review): when the notebook is run top to bottom, X_with_product,
# X_without_product and y_storage were last assigned by the "Storage Condition
# with LDA" cell, while `all_results` (read inside the function) was last
# assigned by the "Product Type with LDA" cell -- confirm that the model and
# the data passed here are meant to come from the same experiment.
plot_cv_score_distribution(X_with_product, X_without_product, y_storage, 'Random Forest')