In [85]:
# Cell 1: Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, learning_curve
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import json
import os
from itertools import cycle

# Import models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression



In [None]:
# Exploratory view: read the augmented dataset and plot the gain_0 column
# against gain_100, colouring each point by its Product_Type value.
all_data = pd.read_csv('augmented_bakery_data.csv')
sns.scatterplot(
    data=all_data,
    x='gain_0',
    y='gain_100',
    hue='Product_Type',
)


In [None]:
# Cell 2: Load and Prepare Data
print("Loading and preparing data...")

# Read the dataset into its own frame; the encoded columns are appended to it below.
data = pd.read_csv('augmented_bakery_data.csv')

# Integer-encode Product_Type; the fitted encoder keeps the label -> code mapping.
le_product = LabelEncoder()
data['Product_Type_encoded'] = le_product.fit_transform(data['Product_Type'])

# One-hot encode Storage_Condition into storage_* indicator columns and
# attach them to the frame column-wise.
storage_dummies = pd.get_dummies(data['Storage_Condition'], prefix='storage')
data = pd.concat([data, storage_dummies], axis=1)

# Report the mappings so later outputs can be read back to the original labels.
print("\nProduct Type Encoding:")
for code, product_name in enumerate(le_product.classes_):
    print(f"{product_name}: {code}")

print("\nStorage Condition Columns:")
print(list(storage_dummies.columns))


In [None]:
# Cell 3: Prepare Features and Target
print("\nPreparing features...")

# Base features: every gain_* / phase_* column of the frame.
feature_cols = [col for col in data.columns if col.startswith(('gain_', 'phase_'))]
X_without_storage = data[feature_cols].copy()
y_product = data['Product_Type_encoded']

# Same base features plus the one-hot storage_* indicator columns.
storage_cols = [col for col in data.columns if col.startswith('storage_')]
X_with_storage = pd.concat([X_without_storage, data[storage_cols]], axis=1)

# Scale each feature set with its OWN scaler. Refitting a single shared
# StandardScaler on the second matrix would overwrite the statistics learned
# for the first one, so the first scaling could never be re-applied or inverted.
scaler_without_storage = StandardScaler()
X_without_storage_scaled = scaler_without_storage.fit_transform(X_without_storage)

scaler_with_storage = StandardScaler()
X_with_storage_scaled = scaler_with_storage.fit_transform(X_with_storage)

# Backward-compatible alias: `scaler` used to end up fitted on the
# with-storage feature set, so keep that meaning for any later cell using it.
scaler = scaler_with_storage



In [None]:
# LDA projection of the gain/phase features, supervised by Storage_Condition.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


feature_cols = [col for col in all_data.columns if col.startswith(('gain_', 'phase_'))]
X = all_data[feature_cols]
y = all_data['Storage_Condition']

# LDA for Storage Condition with n_components=2.
# NOTE: scikit-learn only allows n_components <= min(n_classes - 1, n_features),
# so this cell needs at least three distinct storage conditions.
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X, y)

# Plot LDA results
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_lda[:, 0], y=X_lda[:, 1], hue=y, palette='Set2')
plt.xlabel('LDA Component 1')
plt.ylabel('LDA Component 2')
# The projection is computed directly from the gain/phase columns; no PCA step
# is involved, so the title must not claim otherwise.
plt.title('LDA of Gain/Phase Features for Storage Condition')
plt.legend()
plt.show()




In [90]:

# Constants
RANDOM_STATE = 42  # seed handed to the shuffled StratifiedKFold splitter in evaluate_models
CV_SPLITS = 10  # default fold count: the `cv` default of evaluate_models and plot_learning_curves

# Cell 2: Load Best Parameters and Create Models
def load_best_parameters(path='hyperparameter_tuning/LDA_storage_condition_results.json'):
    """Load tuned hyperparameters from a JSON file.

    Parameters
    ----------
    path : str or os.PathLike, optional
        Location of the JSON document. Defaults to the tuning results file
        this notebook has always read, so existing zero-argument calls keep
        working.

    Returns
    -------
    object
        The decoded JSON document. ``create_models`` indexes it as
        ``best_params[scenario][model_name]`` to obtain keyword arguments.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Output folder for the figures written by the plotting functions
# (they save to os.path.join(RESULTS_DIR, <filename>)).
RESULTS_DIR = 'results/LDA4Storage_Condition_Classification'
# Create it up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(RESULTS_DIR, exist_ok=True)
        
def create_models(best_params, scenario='with_product_type'):
    """Instantiate the classifiers for one scenario from tuned parameters.

    Parameters
    ----------
    best_params : dict
        Mapping ``scenario -> model name -> constructor keyword arguments``
        (the structure returned by ``load_best_parameters``).
    scenario : str, optional
        Key of ``best_params`` to use.

    Returns
    -------
    dict
        ``model name -> unfitted estimator``. Only the names 'SVM',
        'Random Forest', 'KNN', 'Neural Network' and 'Logistic Regression'
        are recognised; any other name is reported and skipped.

    Raises
    ------
    KeyError
        If ``scenario`` is not a key of ``best_params``.
    """
    # Estimator class plus the keyword arguments this notebook always enforces.
    # The seed comes from the shared RANDOM_STATE constant instead of a
    # repeated literal so that all seeded components stay in sync.
    factories = {
        'SVM': (SVC, {'random_state': RANDOM_STATE, 'probability': True}),
        'Random Forest': (RandomForestClassifier, {'random_state': RANDOM_STATE}),
        'KNN': (KNeighborsClassifier, {}),
        'Neural Network': (MLPClassifier, {'random_state': RANDOM_STATE}),
        'Logistic Regression': (LogisticRegression, {'random_state': RANDOM_STATE}),
    }

    models = {}
    for model_name, model_params in best_params[scenario].items():
        if model_name not in factories:
            # Make a typo in the JSON visible instead of silently losing a model.
            print(f"Skipping unrecognized model '{model_name}' in scenario '{scenario}'")
            continue

        estimator_cls, enforced_kwargs = factories[model_name]
        # Merge into one dict so a tuned-parameter file that also carries
        # e.g. 'random_state' cannot trigger a duplicate-keyword TypeError;
        # the enforced values win.
        models[model_name] = estimator_cls(**{**model_params, **enforced_kwargs})

    return models


# Cell 3: Visualization Functions

In [91]:

# Cell 3: Visualization Functions
def plot_model_comparison(results_without, results_with):
    """Draw a grouped bar chart of mean accuracy per model for both scenarios.

    Both arguments map ``model name -> result dict`` and must provide the
    keys 'mean_accuracy' and 'std_accuracy' for every model present in
    ``results_without``. The standard deviations are drawn as error bars.
    The figure is saved as 'storage_condition_comparison.png' inside
    RESULTS_DIR and then shown.
    """
    model_names = list(results_without)

    def _collect(results, key):
        # One value per model, in the order of `model_names`.
        return [results[name][key] for name in model_names]

    # Gather every series before any figure is created.
    mean_without = _collect(results_without, 'mean_accuracy')
    std_without = _collect(results_without, 'std_accuracy')
    mean_with = _collect(results_with, 'mean_accuracy')
    std_with = _collect(results_with, 'std_accuracy')

    positions = np.arange(len(model_names))
    bar_width = 0.35
    half_width = bar_width / 2

    fig, ax = plt.subplots(figsize=(12, 6))
    bars_without = ax.bar(
        positions - half_width, mean_without, bar_width,
        yerr=std_without, label='Without Product Type', capsize=5,
    )
    bars_with = ax.bar(
        positions + half_width, mean_with, bar_width,
        yerr=std_with, label='With Product Type', capsize=5,
    )

    ax.set_ylabel('Mean Accuracy')
    ax.set_title('Model Comparison for Storage Condition Classification')
    ax.set_xticks(positions)
    ax.set_xticklabels(model_names, rotation=45, ha='right')
    ax.legend()

    # Write each bar's height just above its top edge.
    for bar_group in (bars_without, bars_with):
        for bar in bar_group:
            bar_height = bar.get_height()
            ax.annotate(
                f'{bar_height:.3f}',
                xy=(bar.get_x() + bar.get_width() / 2, bar_height),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center',
                va='bottom',
            )

    plt.tight_layout()
    plt.savefig(os.path.join(RESULTS_DIR, 'storage_condition_comparison.png'))
    plt.show()

In [92]:

def plot_confusion_matrices(results_without, results_with, class_labels):
    """Save side-by-side confusion-matrix heatmaps for every model.

    Parameters
    ----------
    results_without, results_with : dict
        ``model name -> result dict``; each result dict must hold an integer
        matrix under 'confusion_matrix'. Models are taken from
        ``results_without`` and must also exist in ``results_with``.
    class_labels : sequence
        Class names in the row/column order of the confusion matrices. They
        are used as the tick labels of both axes so the figure can be read
        without a separate code-to-label lookup.

    One PNG per model ('confusion_matrix_<model>.png') is written to
    RESULTS_DIR; figures are closed after saving and not displayed.
    """
    tick_labels = [str(label) for label in class_labels]

    for model_name in results_without:
        fig, (ax_without, ax_with) = plt.subplots(1, 2, figsize=(16, 6))

        panels = (
            (ax_without, results_without, 'Without Product Type'),
            (ax_with, results_with, 'With Product Type'),
        )
        for ax, results, variant in panels:
            sns.heatmap(
                results[model_name]['confusion_matrix'],
                annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels,
                yticklabels=tick_labels,
                ax=ax,
            )
            ax.set_title(f'{model_name} - {variant}')
            ax.set_ylabel('Actual')
            ax.set_xlabel('Predicted')

        fig.tight_layout()
        filename = f'confusion_matrix_{model_name.lower().replace(" ", "_")}.png'
        fig.savefig(os.path.join(RESULTS_DIR, filename))
        # Close this specific figure so a long model list does not pile up open figures.
        plt.close(fig)
        


In [93]:
        
def plot_learning_curves(X, y, model, model_name, cv=CV_SPLITS):
    """Compute and save the learning curve of ``model`` on ``(X, y)``.

    Accuracy is measured with scikit-learn's ``learning_curve`` at ten
    training-set fractions from 10% to 100%, using ``cv`` as the
    cross-validation argument. Mean scores are drawn as lines with a
    +/- one standard deviation band; the figure is written to RESULTS_DIR
    as 'learning_curve_<model>.png' and closed without being shown.

    Returns a dict with 'train_sizes' and, under 'train_scores' and
    'val_scores', the per-size 'mean' and 'std' arrays.
    """
    requested_fractions = np.linspace(0.1, 1.0, 10)
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y,
        train_sizes=requested_fractions,
        cv=cv,
        n_jobs=-1,
        scoring='accuracy',
    )

    # Aggregate across folds (axis 1) for every training-set size.
    train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
    val_mean, val_std = np.mean(val_scores, axis=1), np.std(val_scores, axis=1)

    fig, ax = plt.subplots(figsize=(10, 6))

    def _draw_curve(mean, std, label, color):
        # Mean line first, then its shaded one-sigma band.
        ax.plot(train_sizes, mean, label=label, color=color, marker='o')
        ax.fill_between(train_sizes, mean - std, mean + std, alpha=0.15, color=color)

    _draw_curve(train_mean, train_std, 'Training Score', 'blue')
    _draw_curve(val_mean, val_std, 'Cross-validation Score', 'red')

    ax.set_title(f'Learning Curves - {model_name}')
    ax.set_xlabel('Training Examples')
    ax.set_ylabel('Accuracy')
    ax.grid(True)
    ax.legend(loc='lower right')

    # Print the scores at the largest training size inside the axes.
    final_train = f"Final training score: {train_mean[-1]:.4f} ± {train_std[-1]:.4f}"
    final_val = f"Final validation score: {val_mean[-1]:.4f} ± {val_std[-1]:.4f}"
    ax.annotate(final_train, xy=(0.6, 0.2), xycoords='axes fraction')
    ax.annotate(final_val, xy=(0.6, 0.15), xycoords='axes fraction')

    fig.tight_layout()
    filename = f'learning_curve_{model_name.lower().replace(" ", "_")}.png'
    fig.savefig(os.path.join(RESULTS_DIR, filename))
    plt.close(fig)

    return {
        'train_sizes': train_sizes,
        'train_scores': {'mean': train_mean, 'std': train_std},
        'val_scores': {'mean': val_mean, 'std': val_std},
    }

In [94]:

def perform_statistical_analysis(results_without, results_with):
    """Compare per-fold accuracies with and without product type.

    For every model in ``results_without`` the 'fold_accuracies' of both
    scenarios are compared with a paired t-test (``scipy.stats.ttest_rel``).
    The improvement is the difference of the mean accuracies expressed in
    percentage points, and a model counts as significant when p < 0.05.

    All values are stored as formatted strings. The table is printed,
    written to 'statistical_analysis_results.csv' in the working directory
    and returned as a DataFrame.
    """
    print("\nStatistical Analysis Results:")
    print("="*50)

    rows = []
    for model, without_entry in results_without.items():
        acc_without = without_entry['fold_accuracies']
        acc_with = results_with[model]['fold_accuracies']

        # Only the p-value of the paired test is reported.
        _, p_value = stats.ttest_rel(acc_with, acc_without)

        mean_without, mean_with = np.mean(acc_without), np.mean(acc_with)
        improvement = (mean_with - mean_without) * 100

        row = {
            'Model': model,
            'Accuracy without Product': f"{mean_without:.4f} ± {np.std(acc_without):.4f}",
            'Accuracy with Product': f"{mean_with:.4f} ± {np.std(acc_with):.4f}",
            'Improvement (%)': f"{improvement:.2f}%",
            'p-value': f"{p_value:.4f}",
            'Significant': "Yes" if p_value < 0.05 else "No",
        }
        rows.append(row)

        print(f"\n{model}:")
        print(f"- Without Product Type: {row['Accuracy without Product']}")
        print(f"- With Product Type: {row['Accuracy with Product']}")
        print(f"- Improvement: {row['Improvement (%)']}")
        print(f"- Statistical Significance (p < 0.05): {row['Significant']} (p = {row['p-value']})")

    results_df = pd.DataFrame(rows)
    results_df.to_csv('statistical_analysis_results.csv', index=False)
    return results_df
    


# Cell 4: Data Preparation

In [95]:

# Cell 4: Data Preparation
def prepare_data():
    """Read 'processed_bakery_data.csv' and build the modelling matrices.

    Returns
    -------
    tuple
        ``(X_with_product, X_without_product, y_storage, storage_classes)``:
        the standardised gain_*/phase_* features with the integer product
        code appended as a last column, the standardised features alone,
        the integer-encoded storage condition, and the storage-condition
        labels in encoder order.
    """
    print("Loading data and preparing features...")
    frame = pd.read_csv('processed_bakery_data.csv')

    # Integer codes for both categorical columns.
    product_encoder = LabelEncoder()
    storage_encoder = LabelEncoder()
    product_codes = product_encoder.fit_transform(frame['Product_Type'])
    y_storage = storage_encoder.fit_transform(frame['Storage_Condition'])

    # Spectral features are every gain_* / phase_* column.
    spectral_cols = [name for name in frame.columns if name.startswith(('gain_', 'phase_'))]

    # NOTE(review): the scaler is fitted on all rows here, i.e. before any
    # cross-validation split is made downstream.
    X_without_product = StandardScaler().fit_transform(frame[spectral_cols])

    # The product code is appended unscaled as one extra column.
    X_with_product = np.column_stack([X_without_product, product_codes.reshape(-1, 1)])

    return X_with_product, X_without_product, y_storage, storage_encoder.classes_


# Cell 5: Model Evaluation

In [96]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# cross_val_predict is part of sklearn.model_selection's public API;
# sklearn.calibration only happens to import it internally.
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline

def evaluate_models(X, y, models, cv=CV_SPLITS):
    """Cross-validate every model behind an LDA dimensionality-reduction step.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Class labels.
    models : dict
        ``model name -> unfitted estimator``.
    cv : int, optional
        Number of stratified, shuffled folds (seeded with RANDOM_STATE).

    Returns
    -------
    dict
        ``model name -> dict`` with the keys 'fold_accuracies' (array, one
        accuracy per fold), 'mean_accuracy', 'std_accuracy',
        'confusion_matrix' and 'classification_report'. The last two are
        computed from the out-of-fold predictions over the whole dataset.
    """
    results = {}
    y = np.asarray(y)

    # Materialise the splits once so that fold accuracies, confusion matrix
    # and report all come from exactly the same fitted pipelines.
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    folds = list(skf.split(X, y))

    # LDA accepts at most min(n_classes - 1, n_features) components.
    num_classes = len(np.unique(y))
    n_components = min(num_classes - 1, np.shape(X)[1])

    for model_name, model in models.items():
        print(f"\nEvaluating {model_name}...")

        # LDA is fitted inside each training fold, then the classifier on its output.
        pipeline = Pipeline([
            ('lda', LDA(n_components=n_components)),
            (model_name, model),
        ])

        # One cross-validation pass yields the out-of-fold prediction of every
        # sample; scoring them per test fold gives the fold accuracies without
        # refitting each pipeline a second time.
        y_pred = cross_val_predict(pipeline, X, y, cv=folds, n_jobs=-1)
        cv_scores = np.array([
            accuracy_score(y[test_idx], y_pred[test_idx]) for _, test_idx in folds
        ])

        # Calculate metrics
        mean_accuracy = cv_scores.mean()
        std_accuracy = cv_scores.std()
        conf_matrix = confusion_matrix(y, y_pred)
        clf_report = classification_report(y, y_pred)

        # Store results
        results[model_name] = {
            'fold_accuracies': cv_scores,
            'mean_accuracy': mean_accuracy,
            'std_accuracy': std_accuracy,
            'confusion_matrix': conf_matrix,
            'classification_report': clf_report
        }

        # Print results
        print(f"\n{model_name} Final Results:")
        print(f"Mean accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")
        print("\nClassification Report:")
        print(clf_report)

    return results

In [97]:
def save_analysis_summary(results_without, results_with, stats_results):
    """Write a plain-text summary of the comparison to 'analysis_summary.txt'.

    Parameters
    ----------
    results_without, results_with : dict
        ``model name -> result dict`` with 'mean_accuracy' and 'std_accuracy'.
        Models are taken from ``results_without`` and must also exist in
        ``results_with``.
    stats_results : pandas.DataFrame
        One row per model with the columns 'Model' and 'p-value'. If a
        'Significant' column ('Yes'/'No') is present it is used as the
        significance verdict; otherwise p < 0.05 is applied to 'p-value'.

    The report has three sections: the best model per scenario, a per-model
    table (accuracies, relative improvement in percent, p-value,
    significance) and a short insights/recommendation block.

    Raises
    ------
    KeyError
        If a model is missing from ``results_with``.
    IndexError
        If a model has no row in ``stats_results``.
    ValueError
        If ``results_without`` or ``results_with`` is empty.
    """
    # Build every row before touching the file so an error cannot leave a
    # half-written report behind.
    table_rows = []
    improvements = []      # unrounded relative improvement per row
    significant_flags = []  # significance verdict per row
    for model_name, without_entry in results_without.items():
        with_entry = results_with[model_name]
        acc_without = without_entry['mean_accuracy']
        std_without = without_entry['std_accuracy']
        acc_with = with_entry['mean_accuracy']
        std_with = with_entry['std_accuracy']

        if acc_without:
            improvement = ((acc_with - acc_without) / acc_without) * 100
        else:
            # A zero baseline has no finite relative improvement.
            improvement = float('inf') if acc_with > 0 else 0.0

        stats_row = stats_results[stats_results['Model'] == model_name].iloc[0]
        # str() lets the value be either the formatted string or a plain number.
        p_value = float(str(stats_row['p-value']).strip())

        # Prefer the verdict computed from the unrounded p-value; the stored
        # p-value is typically rounded to four decimals, so re-thresholding it
        # can flip borderline cases (e.g. 0.04996 is stored as "0.0500").
        if 'Significant' in stats_row.index:
            is_significant = str(stats_row['Significant']).strip() == 'Yes'
        else:
            is_significant = p_value < 0.05

        table_rows.append({
            'Model': model_name,
            'Accuracy without Product': f"{acc_without:.4f} ± {std_without:.4f}",
            'Accuracy with Product': f"{acc_with:.4f} ± {std_with:.4f}",
            'Improvement (%)': f"{improvement:.2f}%",
            'p-value': f"{p_value:.4f}",
            'Significant': 'Yes' if is_significant else 'No'
        })
        improvements.append(improvement)
        significant_flags.append(is_significant)

    # Best model per scenario by mean accuracy.
    best_without = max(results_without.items(),
                       key=lambda item: item[1]['mean_accuracy'])
    best_with = max(results_with.items(),
                    key=lambda item: item[1]['mean_accuracy'])

    # Use the raw numbers, not the two-decimal strings, for the decisions.
    significant_improvements = sum(
        1 for gain, flag in zip(improvements, significant_flags) if gain > 0 and flag
    )
    best_index = max(range(len(table_rows)), key=lambda idx: improvements[idx])
    best_improvement = table_rows[best_index]

    # The report contains "±"; an explicit encoding keeps the write
    # independent of the platform's default codec.
    with open('analysis_summary.txt', 'w', encoding='utf-8') as f:
        # Header
        f.write("Storage Condition Classification Analysis Summary\n")
        f.write("==================================================\n\n")

        # 1. Best Performing Models section
        f.write("1. Best Performing Models:\n\n")

        f.write("Without Product Type:\n")
        f.write(f"- Best Model: {best_without[0]}\n")
        f.write(f"- Accuracy: {best_without[1]['mean_accuracy']:.4f} ± "
                f"{best_without[1]['std_accuracy']:.4f}\n\n")

        f.write("With Product Type:\n")
        f.write(f"- Best Model: {best_with[0]}\n")
        f.write(f"- Accuracy: {best_with[1]['mean_accuracy']:.4f} ± "
                f"{best_with[1]['std_accuracy']:.4f}\n\n")

        # 2. Impact of Product Type section
        f.write("2. Impact of Product Type:\n")
        f.write(pd.DataFrame(table_rows).to_string(index=False))

        # 3. Additional Analysis
        f.write("\n\n3. Additional Insights:\n")
        f.write("-----------------------\n")
        f.write(f"\nNumber of models with significant improvement: {significant_improvements}\n")
        f.write(f"Model with best improvement: {best_improvement['Model']} "
                f"({best_improvement['Improvement (%)']})\n")

        # Overall recommendation
        f.write("\nOverall Recommendation:\n")
        if significant_improvements > 0:
            f.write("Including product type information appears beneficial for classification performance.\n")
        else:
            f.write("Product type information does not significantly improve classification performance.\n")

# Cell 6: Main Execution

In [None]:
# Cell 6: Main Execution
# Runs the whole analysis: data preparation, model creation, cross-validated
# evaluation for both scenarios, plots, statistics, text summary and
# learning curves.
if __name__ == "__main__":
    # 1. Prepare Data
    X_with_product, X_without_product, y_storage, class_labels = prepare_data()
    
    # 2. Load Models
    print("Loading best parameters and creating models...")
    best_params = load_best_parameters()
    models_with_product = create_models(best_params, 'with_product_type')
    models_without_product = create_models(best_params, 'without_product_type')
    
    # 3. Evaluate Models
    print("\nEvaluating models with product type...")
    results_with_product = evaluate_models(X_with_product, y_storage, models_with_product)
    
    print("\nEvaluating models without product type...")
    results_without_product = evaluate_models(X_without_product, y_storage, models_without_product)
    
    # 4. Generate Visualizations and Analysis
    print("\nGenerating visualizations and analysis...")
    
    # Model comparison plot
    plot_model_comparison(results_without_product, results_with_product)
    
    # Confusion matrices
    plot_confusion_matrices(results_without_product, results_with_product, class_labels)
    
    # Statistical analysis
    stats_results = perform_statistical_analysis(results_without_product, results_with_product)
    
    # Save analysis summary
    save_analysis_summary(results_without_product, results_with_product, stats_results)

    # Learning curves.
    # evaluate_models scores every classifier behind an LDA step, so the
    # learning curves must use the same LDA -> classifier pipeline; curves of
    # the bare classifier on the raw features would describe a different model
    # than the one whose accuracy is reported above. Pipeline and LDA are
    # imported in the model-evaluation cell.
    n_lda_components = min(len(class_labels) - 1, X_with_product.shape[1])
    for model_name, model in models_with_product.items():
        print(f"\nGenerating learning curve for {model_name}...")
        lda_pipeline = Pipeline([
            ('lda', LDA(n_components=n_lda_components)),
            (model_name, model),
        ])
        plot_learning_curves(X_with_product, y_storage, lda_pipeline, model_name)
    
    print("\nAnalysis complete!")