In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import json
import os

# Constants
RANDOM_STATE = 42
CV_SPLITS = 10

def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(title)
    plt.show()

def load_best_parameters():
    """Load best parameters from JSON file"""
    with open('best_parameters.json', 'r') as f:
        return json.load(f)

def prepare_data():
    """Load and prepare data consistently"""
    print("Loading data and preparing features...")
    data = pd.read_csv('augmented_bakery_data.csv')
    
    # Encode labels
    le_product = LabelEncoder()
    le_storage = LabelEncoder()
    data['Product_Type_encoded'] = le_product.fit_transform(data['Product_Type'])
    data['Storage_Condition_encoded'] = le_storage.fit_transform(data['Storage_Condition'])
    
    # Prepare features
    feature_cols = [col for col in data.columns if col.startswith(('gain_', 'phase_'))]
    X_base = data[feature_cols]
    
    # Scale features on entire dataset
    scaler = StandardScaler()
    X_base_scaled = scaler.fit_transform(X_base)
    
    # Prepare datasets
    X_without_product = X_base_scaled
    X_with_product = np.column_stack([X_base_scaled, data['Product_Type_encoded'].values.reshape(-1, 1)])
    y_storage = data['Storage_Condition_encoded'].values
    
    return X_with_product, X_without_product, y_storage, le_storage.classes_

def classify_model(model, features, target, model_name, return_accuracy=False):
    """Performs classification with the given model"""
    # Use the same CV strategy as in hyperparameter tuning
    cv = StratifiedKFold(n_splits=CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    
    # Expected accuracies from hyperparameter tuning
    expected_accuracies = {
        'svm': 0.950 if features.shape[1] > X_without_product.shape[1] else 0.917,
        'rf': 0.967,
        'knn': 0.906,
        'mlp': 0.867 if features.shape[1] > X_without_product.shape[1] else 0.922,
        'lr': 0.889
    }
    
    # Perform cross-validation
    cv_scores = cross_val_score(model, features, target, cv=cv, scoring='accuracy', n_jobs=-1)
    mean_accuracy = cv_scores.mean()
    std_accuracy = cv_scores.std()
    
    print(f"\n{model_name.upper()} Classification Results:")
    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Mean CV Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")
    print(f"Expected Accuracy: {expected_accuracies.get(model_name.lower(), 'N/A')}")
    
    # Predict using cross-validation
    y_pred = cross_val_predict(model, features, target, cv=cv, n_jobs=-1)
    plot_confusion_matrix(target, y_pred, title=f"{model_name.upper()} Confusion Matrix")
    print(classification_report(target, y_pred))
    
    if return_accuracy:
        return mean_accuracy

def run_classification(features, target, params, scenario='with_product_type'):
    """Run classification for all models"""
    results = {}
    
    # Create and run models
    models = {
        'SVM': SVC(**params[scenario]['SVM'], random_state=RANDOM_STATE),
        'Random Forest': RandomForestClassifier(**params[scenario]['Random Forest'], random_state=RANDOM_STATE),
        'KNN': KNeighborsClassifier(**params[scenario]['KNN']),
        'Neural Network': MLPClassifier(**params[scenario]['Neural Network'], random_state=RANDOM_STATE),
        'Logistic Regression': LogisticRegression(**params[scenario]['Logistic Regression'], random_state=RANDOM_STATE)
    }
    
    for name, model in models.items():
        print(f"\nRunning {name}...")
        acc = classify_model(model, features, target, name.lower().replace(' ', '_'), return_accuracy=True)
        results[name] = acc
    
    return results

def plot_accuracy_comparisons(results_without, results_with):
    models = list(results_without.keys())
    accuracies_without = list(results_without.values())
    accuracies_with = list(results_with.values())
    
    x = np.arange(len(models))
    width = 0.35

    fig, ax = plt.subplots(figsize=(12, 6))
    bars1 = ax.bar(x - width/2, accuracies_without, width, label='Without')
    bars2 = ax.bar(x + width/2, accuracies_with, width, label='With')

    ax.set_ylabel('Mean Accuracy')
    ax.set_title('Model Comparison for Storage Condition Classification')
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=45, ha='right')
    ax.legend()

    def autolabel(bars):
        for bar in bars:
            height = bar.get_height()
            ax.annotate(f'{height:.3f}',
                       xy=(bar.get_x() + bar.get_width()/2, height),
                       xytext=(0, 3),
                       textcoords="offset points",
                       ha='center', va='bottom')

    autolabel(bars1)
    autolabel(bars2)
    
    plt.tight_layout()
    plt.show()

# Main execution
if __name__ == "__main__":
    # Load data and best parameters
    X_with_product, X_without_product, y_storage, class_labels = prepare_data()
    best_params = load_best_parameters()
    
    # Run classifications
    print("\nRunning classification without Product Type...")
    results_without_product = run_classification(X_without_product, y_storage, best_params, 'without_product_type')
    
    print("\nRunning classification with Product Type...")
    results_with_product = run_classification(X_with_product, y_storage, best_params, 'with_product_type')
    
    # Plot results
    plot_accuracy_comparisons(results_without_product, results_with_product)
    
    # Save results
    results = {
        'without_product_type': results_without_product,
        'with_product_type': results_with_product
    }
    
    with open('classification_results.json', 'w') as f:
        json.dump(results, f, indent=4)

In [None]:
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(title)
    plt.show()


In [7]:

def load_best_parameters():
    """Load best parameters from JSON file"""
    with open('best_parameters.json', 'r') as f:
        return json.load(f)

def prepare_data():
    """Load and prepare data consistently"""
    print("Loading data and preparing features...")
    data = pd.read_csv('augmented_bakery_data.csv')
    
    # Encode labels
    le_product = LabelEncoder()
    le_storage = LabelEncoder()
    data['Product_Type_encoded'] = le_product.fit_transform(data['Product_Type'])
    data['Storage_Condition_encoded'] = le_storage.fit_transform(data['Storage_Condition'])
    
    # Prepare features
    feature_cols = [col for col in data.columns if col.startswith(('gain_', 'phase_'))]
    X_base = data[feature_cols]
    
    # Scale features on entire dataset
    scaler = StandardScaler()
    X_base_scaled = scaler.fit_transform(X_base)
    
    # Prepare datasets
    X_without_product = X_base_scaled
    X_with_product = np.column_stack([X_base_scaled, data['Product_Type_encoded'].values.reshape(-1, 1)])
    y_storage = data['Storage_Condition_encoded'].values
    
    return X_with_product, X_without_product, y_storage, le_storage.classes_


In [8]:

def classify_model(model, features, target, model_name, return_accuracy=False):
    """Performs classification with the given model"""
    # Use the same CV strategy as in hyperparameter tuning
    cv = StratifiedKFold(n_splits=CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    
    # Expected accuracies from hyperparameter tuning
    expected_accuracies = {
        'svm': 0.950 if features.shape[1] > X_without_product.shape[1] else 0.917,
        'rf': 0.967,
        'knn': 0.906,
        'mlp': 0.867 if features.shape[1] > X_without_product.shape[1] else 0.922,
        'lr': 0.889
    }
    
    # Perform cross-validation
    cv_scores = cross_val_score(model, features, target, cv=cv, scoring='accuracy', n_jobs=-1)
    mean_accuracy = cv_scores.mean()
    std_accuracy = cv_scores.std()
    
    print(f"\n{model_name.upper()} Classification Results:")
    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Mean CV Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")
    print(f"Expected Accuracy: {expected_accuracies.get(model_name.lower(), 'N/A')}")
    
    # Predict using cross-validation
    y_pred = cross_val_predict(model, features, target, cv=cv, n_jobs=-1)
    plot_confusion_matrix(target, y_pred, title=f"{model_name.upper()} Confusion Matrix")
    print(classification_report(target, y_pred))
    
    if return_accuracy:
        return mean_accuracy

def run_classification(features, target, params, scenario='with_product_type'):
    """Run classification for all models"""
    results = {}
    
    # Create and run models
    models = {
        'SVM': SVC(**params[scenario]['SVM'], random_state=RANDOM_STATE),
        'Random Forest': RandomForestClassifier(**params[scenario]['Random Forest'], random_state=RANDOM_STATE),
        'KNN': KNeighborsClassifier(**params[scenario]['KNN']),
        'Neural Network': MLPClassifier(**params[scenario]['Neural Network'], random_state=RANDOM_STATE),
        'Logistic Regression': LogisticRegression(**params[scenario]['Logistic Regression'], random_state=RANDOM_STATE)
    }
    
    for name, model in models.items():
        print(f"\nRunning {name}...")
        acc = classify_model(model, features, target, name.lower().replace(' ', '_'), return_accuracy=True)
        results[name] = acc
    
    return results

def plot_accuracy_comparisons(results_without, results_with):
    models = list(results_without.keys())
    accuracies_without = list(results_without.values())
    accuracies_with = list(results_with.values())
    
    x = np.arange(len(models))
    width = 0.35

    fig, ax = plt.subplots(figsize=(12, 6))
    bars1 = ax.bar(x - width/2, accuracies_without, width, label='Without')
    bars2 = ax.bar(x + width/2, accuracies_with, width, label='With')

    ax.set_ylabel('Mean Accuracy')
    ax.set_title('Model Comparison for Storage Condition Classification')
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=45, ha='right')
    ax.legend()

    def autolabel(bars):
        for bar in bars:
            height = bar.get_height()
            ax.annotate(f'{height:.3f}',
                       xy=(bar.get_x() + bar.get_width()/2, height),
                       xytext=(0, 3),
                       textcoords="offset points",
                       ha='center', va='bottom')

    autolabel(bars1)
    autolabel(bars2)
    
    plt.tight_layout()
    plt.show()

# Main execution
if __name__ == "__main__":
    # Load data and best parameters
    X_with_product, X_without_product, y_storage, class_labels = prepare_data()
    best_params = load_best_parameters()
    
    # Run classifications
    print("\nRunning classification without Product Type...")
    results_without_product = run_classification(X_without_product, y_storage, best_params, 'without_product_type')
    
    print("\nRunning classification with Product Type...")
    results_with_product = run_classification(X_with_product, y_storage, best_params, 'with_product_type')
    
    # Plot results
    plot_accuracy_comparisons(results_without_product, results_with_product)
    
    # Save results
    results = {
        'without_product_type': results_without_product,
        'with_product_type': results_with_product
    }
    
    with open('classification_results.json', 'w') as f:
        json.dump(results, f, indent=4)

In [9]:

def run_classification(features, target, params, scenario='with_product_type'):
    """Run classification for all models"""
    results = {}
    
    # Create and run models
    models = {
        'SVM': SVC(**params[scenario]['SVM'], random_state=RANDOM_STATE),
        'Random Forest': RandomForestClassifier(**params[scenario]['Random Forest'], random_state=RANDOM_STATE),
        'KNN': KNeighborsClassifier(**params[scenario]['KNN']),
        'Neural Network': MLPClassifier(**params[scenario]['Neural Network'], random_state=RANDOM_STATE),
        'Logistic Regression': LogisticRegression(**params[scenario]['Logistic Regression'], random_state=RANDOM_STATE)
    }
    
    for name, model in models.items():
        print(f"\nRunning {name}...")
        acc = classify_model(model, features, target, name.lower().replace(' ', '_'), return_accuracy=True)
        results[name] = acc
    
    return results

def plot_accuracy_comparisons(results_without, results_with):
    models = list(results_without.keys())
    accuracies_without = list(results_without.values())
    accuracies_with = list(results_with.values())
    
    x = np.arange(len(models))
    width = 0.35

    fig, ax = plt.subplots(figsize=(12, 6))
    bars1 = ax.bar(x - width/2, accuracies_without, width, label='Without')
    bars2 = ax.bar(x + width/2, accuracies_with, width, label='With')

    ax.set_ylabel('Mean Accuracy')
    ax.set_title('Model Comparison for Storage Condition Classification')
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=45, ha='right')
    ax.legend()

    def autolabel(bars):
        for bar in bars:
            height = bar.get_height()
            ax.annotate(f'{height:.3f}',
                       xy=(bar.get_x() + bar.get_width()/2, height),
                       xytext=(0, 3),
                       textcoords="offset points",
                       ha='center', va='bottom')

    autolabel(bars1)
    autolabel(bars2)
    
    plt.tight_layout()
    plt.show()

# Main execution
if __name__ == "__main__":
    # Load data and best parameters
    X_with_product, X_without_product, y_storage, class_labels = prepare_data()
    best_params = load_best_parameters()
    
    # Run classifications
    print("\nRunning classification without Product Type...")
    results_without_product = run_classification(X_without_product, y_storage, best_params, 'without_product_type')
    
    print("\nRunning classification with Product Type...")
    results_with_product = run_classification(X_with_product, y_storage, best_params, 'with_product_type')
    
    # Plot results
    plot_accuracy_comparisons(results_without_product, results_with_product)
    
    # Save results
    results = {
        'without_product_type': results_without_product,
        'with_product_type': results_with_product
    }
    
    with open('classification_results.json', 'w') as f:
        json.dump(results, f, indent=4)

In [None]:

def plot_accuracy_comparisons(results_without, results_with):
    models = list(results_without.keys())
    accuracies_without = list(results_without.values())
    accuracies_with = list(results_with.values())
    
    x = np.arange(len(models))
    width = 0.35

    fig, ax = plt.subplots(figsize=(12, 6))
    bars1 = ax.bar(x - width/2, accuracies_without, width, label='Without')
    bars2 = ax.bar(x + width/2, accuracies_with, width, label='With')

    ax.set_ylabel('Mean Accuracy')
    ax.set_title('Model Comparison for Storage Condition Classification')
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=45, ha='right')
    ax.legend()

    def autolabel(bars):
        for bar in bars:
            height = bar.get_height()
            ax.annotate(f'{height:.3f}',
                       xy=(bar.get_x() + bar.get_width()/2, height),
                       xytext=(0, 3),
                       textcoords="offset points",
                       ha='center', va='bottom')

    autolabel(bars1)
    autolabel(bars2)
    
    plt.tight_layout()
    plt.show()

# Main execution
if __name__ == "__main__":
    # Load data and best parameters
    X_with_product, X_without_product, y_storage, class_labels = prepare_data()
    best_params = load_best_parameters()
    
    # Run classifications
    print("\nRunning classification without Product Type...")
    results_without_product = run_classification(X_without_product, y_storage, best_params, 'without_product_type')
    
    print("\nRunning classification with Product Type...")
    results_with_product = run_classification(X_with_product, y_storage, best_params, 'with_product_type')
    
    # Plot results
    plot_accuracy_comparisons(results_without_product, results_with_product)
    
    # Save results
    results = {
        'without_product_type': results_without_product,
        'with_product_type': results_with_product
    }
    
    with open('classification_results.json', 'w') as f:
        json.dump(results, f, indent=4)

In [None]:
# Cell 1: Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, learning_curve
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import json
from itertools import cycle

# Import models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

def plot_learning_curves(model, X, y, title="Learning Curves"):
    """Plot learning curves for a given model"""
    train_sizes = np.linspace(0.1, 1.0, 10)
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    plt.figure(figsize=(10, 6))
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y, train_sizes=train_sizes, cv=cv, n_jobs=-1,
        scoring='accuracy'
    )
    
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    
    plt.plot(train_sizes, train_mean, label='Training Score', color='blue', marker='o')
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.15, color='blue')
    
    plt.plot(train_sizes, val_mean, label='Cross-validation Score', color='red', marker='o')
    plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.15, color='red')
    
    plt.xlabel('Training Examples')
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.grid(True)
    
    # Add final scores to plot
    plt.text(0.02, 0.02, f'Final training score: {train_mean[-1]:.4f} ± {train_std[-1]:.4f}',
             transform=plt.gca().transAxes)
    plt.text(0.02, 0.06, f'Final validation score: {val_mean[-1]:.4f} ± {val_std[-1]:.4f}',
             transform=plt.gca().transAxes)
    
    plt.show()

def analyze_model_performance(model, X_with, X_without, y, model_name, class_labels):
    """Detailed analysis of a single model's performance"""
    print(f"\n{'='*50}")
    print(f"Detailed Analysis for {model_name}")
    print('='*50)
    
    # 1. Cross-validation scores
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    # Without product type
    scores_without = cross_val_score(model, X_without, y, cv=cv)
    y_pred_without = cross_val_predict(model, X_without, y, cv=cv)
    
    # With product type
    scores_with = cross_val_score(model, X_with, y, cv=cv)
    y_pred_with = cross_val_predict(model, X_with, y, cv=cv)
    
    # Statistical comparison
    t_stat, p_value = stats.ttest_ind(scores_with, scores_without)
    
    print("\nPerformance Metrics:")
    print(f"Without Product Type: {scores_without.mean():.4f} ± {scores_without.std():.4f}")
    print(f"With Product Type: {scores_with.mean():.4f} ± {scores_with.std():.4f}")
    print(f"P-value: {p_value:.4f}")
    print(f"Significant difference: {p_value < 0.05}")
    
    # 2. Confusion Matrices
    plt.figure(figsize=(12, 5))
    
    plt.subplot(121)
    cm_without = confusion_matrix(y, y_pred_without)
    sns.heatmap(cm_without, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - Without Product Type')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    
    plt.subplot(122)
    cm_with = confusion_matrix(y, y_pred_with)
    sns.heatmap(cm_with, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - With Product Type')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    
    plt.tight_layout()
    plt.show()
    
    # 3. Classification Reports

    # 4. Model-specific analysis
    if model_name == "Random Forest":
        model.fit(X_with, y)
        # Create feature names
        feature_names = [f'Feature_{i+1}' for i in range(X_with.shape[1])]
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        plt.figure(figsize=(10, 6))
        sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
        plt.title('Top 10 Most Important Features')
        plt.show()
    
    elif model_name == "Neural Network":
        plot_learning_curves(model, X_with, y, f"Learning Curves - {model_name}")
# Main execution
if __name__ == "__main__":
    # Load your data and prepare features
    X_with_product, X_without_product, y_storage, class_labels = prepare_data()
    
    # Load best parameters and create models
    best_params = load_best_parameters()
    models = {
        'SVM': SVC(**best_params['with_product_type']['SVM']),
        'Random Forest': RandomForestClassifier(**best_params['with_product_type']['Random Forest']),
        'KNN': KNeighborsClassifier(**best_params['with_product_type']['KNN']),
        'Neural Network': MLPClassifier(**best_params['with_product_type']['Neural Network']),
        'Logistic Regression': LogisticRegression(**best_params['with_product_type']['Logistic Regression'])
    }
    
    # Perform detailed analysis for each model
    for model_name, model in models.items():
        analyze_model_performance(
            model,
            X_with_product,
            X_without_product,
            y_storage,
            model_name,
            class_labels
        )

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
import glob
import os

# Classification models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

class BakerySpectraClassifier:
    def __init__(self):
        self.models = {
            'SVM': Pipeline([
                ('scaler', StandardScaler()),
                ('classifier', SVC(kernel='rbf', probability=True))
            ]),
            'Random Forest': Pipeline([
                ('scaler', StandardScaler()),
                ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
            ]),
            'KNN': Pipeline([
                ('scaler', StandardScaler()),
                ('classifier', KNeighborsClassifier(n_neighbors=5))
            ]),
            'Neural Network': Pipeline([
                ('scaler', StandardScaler()),
                ('classifier', MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42))
            ]),
            'LDA': Pipeline([
                ('scaler', StandardScaler()),
                ('classifier', LinearDiscriminantAnalysis())
            ])
        }
        
    def load_s1p_file(self, file_path):
        """Load data from s1p file"""
        data = np.loadtxt(file_path, skiprows=9)
        freq = data[:, 0]  # 300-900 MHz range
        gain = data[:, 1]
        return freq, gain

    def extract_features(self, gain):
        """Extract features from the gain data"""
        features = []
        # Statistical features
        features.extend([
            np.mean(gain),
            np.std(gain),
            np.max(gain),
            np.min(gain),
            np.median(gain),
            np.percentile(gain, 25),
            np.percentile(gain, 75)
        ])
        # Frequency domain features
        fft_features = np.abs(np.fft.fft(gain))[:10]  # First 10 FFT coefficients
        features.extend(fft_features)
        
        return features

    def load_dataset(self):
        """Load and prepare the dataset"""
        X_data = []
        y_product = []  # Product type labels (A=bread, B=cookies)
        y_storage = []  # Storage condition labels (1=Open, 2=Wrapped, 3=Humid)
        
        files = glob.glob('./RawData/Bakery/[A-B]_*_*.s1p')
        frequencies = None
        
        for file_path in files:
            # Extract metadata from filename
            filename = os.path.basename(file_path)
            product_type = filename[0]  # A or B
            storage_condition = int(filename.split('_')[1])  # 1, 2, or 3
            
            # Load and process data
            freq, gain = self.load_s1p_file(file_path)
            if frequencies is None:
                frequencies = freq
            
            # Extract features
            features = self.extract_features(gain)
            
            X_data.append(features)
            y_product.append(product_type)
            y_storage.append(storage_condition)
        
        return np.array(X_data), np.array(y_product), np.array(y_storage), frequencies

    def train_and_evaluate(self, X, y, task_name):
        """Train and evaluate all models"""
        results = {}
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        
        for name, model in self.models.items():
            # Train model
            model.fit(X_train, y_train)
            
            # Evaluate
            train_score = model.score(X_train, y_train)
            test_score = model.score(X_test, y_test)
            cv_scores = cross_val_score(model, X, y, cv=5)
            
            y_pred = model.predict(X_test)
            
            results[name] = {
                'train_score': train_score,
                'test_score': test_score,
                'cv_scores': cv_scores,
                'confusion_matrix': confusion_matrix(y_test, y_pred),
                'classification_report': classification_report(y_test, y_pred)
            }
            
            # Plot confusion matrix
            plt.figure(figsize=(8, 6))
            sns.heatmap(results[name]['confusion_matrix'], annot=True, fmt='d', cmap='Blues')
            plt.title(f'Confusion Matrix - {name} ({task_name})')
            plt.ylabel('True Label')
            plt.xlabel('Predicted Label')
            plt.savefig(f'confusion_matrix_{task_name}_{name.replace(" ", "_")}.png')
            plt.close()
            
        return results

    def plot_learning_curves(self, X, y, task_name):
        """Plot learning curves for all models"""
        train_sizes = np.linspace(0.1, 1.0, 5)
        
        plt.figure(figsize=(15, 10))
        for idx, (name, model) in enumerate(self.models.items(), 1):
            train_sizes_abs, train_scores, test_scores = learning_curve(
                model, X, y, train_sizes=train_sizes, cv=5, n_jobs=-1
            )
            
            train_mean = np.mean(train_scores, axis=1)
            train_std = np.std(train_scores, axis=1)
            test_mean = np.mean(test_scores, axis=1)
            test_std = np.std(test_scores, axis=1)
            
            plt.subplot(2, 3, idx)
            plt.plot(train_sizes_abs, train_mean, label='Training score')
            plt.plot(train_sizes_abs, test_mean, label='Cross-validation score')
            plt.fill_between(train_sizes_abs, train_mean - train_std, train_mean + train_std, alpha=0.1)
            plt.fill_between(train_sizes_abs, test_mean - test_std, test_mean + test_std, alpha=0.1)
            plt.title(f'Learning Curve - {name}')
            plt.xlabel('Training Examples')
            plt.ylabel('Score')
            plt.legend(loc='best')
            plt.grid(True)
        
        plt.tight_layout()
        plt.savefig(f'learning_curves_{task_name}.png')
        plt.close()

def main():
    # Initialize classifier
    classifier = BakerySpectraClassifier()
    
    # Load dataset
    print("Loading dataset...")
    X, y_product, y_storage, frequencies = classifier.load_dataset()
    
    # Product type classification
    print("\nTraining models for product type classification...")
    product_results = classifier.train_and_evaluate(X, y_product, "Product_Type")
    classifier.plot_learning_curves(X, y_product, "Product_Type")
    
    # Storage condition classification
    print("\nTraining models for storage condition classification...")
    storage_results = classifier.train_and_evaluate(X, y_storage, "Storage_Condition")
    classifier.plot_learning_curves(X, y_storage, "Storage_Condition")
    
    # Print results
    print("\nResults for Product Type Classification:")
    for name, result in product_results.items():
        print(f"\n{name}:")
        print(f"Train Score: {result['train_score']:.3f}")
        print(f"Test Score: {result['test_score']:.3f}")
        print(f"CV Scores Mean ± Std: {np.mean(result['cv_scores']):.3f} ± {np.std(result['cv_scores']):.3f}")
        print("\nClassification Report:")
        print(result['classification_report'])
    
    print("\nResults for Storage Condition Classification:")
    for name, result in storage_results.items():
        print(f"\n{name}:")
        print(f"Train Score: {result['train_score']:.3f}")
        print(f"Test Score: {result['test_score']:.3f}")
        print(f"CV Scores Mean ± Std: {np.mean(result['cv_scores']):.3f} ± {np.std(result['cv_scores']):.3f}")
        print("\nClassification Report:")
        print(result['classification_report'])

if __name__ == "__main__":
    main()

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, f_classif
from scipy import stats, signal
import glob
import os
import json

# Classification models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

# Constants
RANDOM_STATE = 42
CV_SPLITS = 10

class BakerySpectraClassifier:
    def __init__(self):
        self.models = {
            'SVM': Pipeline([
                ('scaler', StandardScaler()),
                ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
                ('classifier', SVC(probability=True, random_state=RANDOM_STATE))
            ]),
            'Random Forest': Pipeline([
                ('scaler', StandardScaler()),
                ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
                ('classifier', RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE))
            ]),
            'KNN': Pipeline([
                ('scaler', StandardScaler()),
                ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
                ('classifier', KNeighborsClassifier(n_neighbors=5))
            ]),
            'Neural Network': Pipeline([
                ('scaler', StandardScaler()),
                ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
                ('classifier', MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=RANDOM_STATE))
            ]),
            'Logistic Regression': Pipeline([
                ('scaler', StandardScaler()),
                ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
                ('classifier', LogisticRegression(random_state=RANDOM_STATE))
            ])
        }

    def prepare_data(self):
        """Load and prepare data from augmented dataset"""
        print("Loading data and preparing features...")
        data = pd.read_csv('augmented_bakery_data.csv')
        
        # Encode labels
        le_product = LabelEncoder()
        le_storage = LabelEncoder()
        data['Product_Type_encoded'] = le_product.fit_transform(data['Product_Type'])
        data['Storage_Condition_encoded'] = le_storage.fit_transform(data['Storage_Condition'])
        
        # Prepare features
        feature_cols = [col for col in data.columns if col.startswith(('gain_', 'phase_'))]
        X_base = data[feature_cols]
        
        # Scale features on entire dataset
        scaler = StandardScaler()
        X_base_scaled = scaler.fit_transform(X_base)
        
        # Prepare datasets
        X_without_product = X_base_scaled
        X_with_product = np.column_stack([X_base_scaled, data['Product_Type_encoded'].values.reshape(-1, 1)])
        y_storage = data['Storage_Condition_encoded'].values
        y_product = data['Product_Type_encoded'].values
        
        return X_with_product, X_without_product, y_product, y_storage, le_storage.classes_, feature_cols

    def analyze_feature_importance(self, X, y, feature_names, task_name):
        """Analyze and visualize feature importance"""
        rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
        rf.fit(X, y)
        
        importance = rf.feature_importances_
        indices = np.argsort(importance)[::-1]
        
        plt.figure(figsize=(12, 6))
        plt.title(f"Feature Importances for {task_name}")
        plt.bar(range(X.shape[1]), importance[indices])
        plt.xticks(range(X.shape[1]), [feature_names[i] for i in indices], rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(f'feature_importance_{task_name.lower().replace(" ", "_")}.png')
        plt.close()
        
        print(f"\nTop 10 most important features for {task_name}:")
        for i in range(min(10, len(feature_names))):
            print(f"{feature_names[indices[i]]}: {importance[indices[i]]:.4f}")

    def plot_confusion_matrix(self, y_true, y_pred, title):
        """Plot confusion matrix"""
        cm = confusion_matrix(y_true, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(title)
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig(f'{title.lower().replace(" ", "_")}.png')
        plt.close()

    def evaluate_model(self, model, X, y, model_name, task):
        """Evaluate a single model"""
        cv = StratifiedKFold(n_splits=CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        
        # Cross-validation scores
        cv_scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)
        y_pred = model.fit(X, y).predict(X)
        
        print(f"\n{model_name} - {task} Classification Results:")
        print(f"CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        print("\nClassification Report:")
        print(classification_report(y, y_pred))
        
        # Plot confusion matrix
        self.plot_confusion_matrix(y, y_pred, f"{model_name} - {task}")
        
        return {
            'cv_scores': cv_scores,
            'confusion_matrix': confusion_matrix(y, y_pred),
            'classification_report': classification_report(y, y_pred)
        }

    def plot_learning_curves(self, model, X, y, title):
        """Plot learning curves for a model"""
        train_sizes = np.linspace(0.1, 1.0, 10)
        cv = StratifiedKFold(n_splits=CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        
        plt.figure(figsize=(10, 6))
        train_sizes, train_scores, val_scores = learning_curve(
            model, X, y, train_sizes=train_sizes, cv=cv, n_jobs=-1,
            scoring='accuracy'
        )
        
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        val_mean = np.mean(val_scores, axis=1)
        val_std = np.std(val_scores, axis=1)
        
        plt.plot(train_sizes, train_mean, label='Training Score', color='blue', marker='o')
        plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.15, color='blue')
        
        plt.plot(train_sizes, val_mean, label='Cross-validation Score', color='red', marker='o')
        plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.15, color='red')
        
        plt.xlabel('Training Examples')
        plt.ylabel('Accuracy')
        plt.title(title)
        plt.legend(loc='lower right')
        plt.grid(True)
        
        # Add final scores to plot
        plt.text(0.02, 0.02, f'Final training score: {train_mean[-1]:.4f} ± {train_std[-1]:.4f}',
                transform=plt.gca().transAxes)
        plt.text(0.02, 0.06, f'Final validation score: {val_mean[-1]:.4f} ± {val_std[-1]:.4f}',
                transform=plt.gca().transAxes)
        
        plt.savefig(f'{title.lower().replace(" ", "_")}.png')
        plt.close()

def main():
    # Initialize classifier
    classifier = BakerySpectraClassifier()
    
    # Load and prepare data
    X_with_product, X_without_product, y_product, y_storage, class_labels, feature_names = classifier.prepare_data()
    
    # Analyze feature importance for both tasks
    print("\nAnalyzing feature importance...")
    classifier.analyze_feature_importance(X_without_product, y_storage, feature_names, "Storage Classification")
    classifier.analyze_feature_importance(X_without_product, y_product, feature_names, "Product Classification")
    
    # Storage condition classification without product type
    print("\nPerforming Storage Condition Classification (without Product Type)...")
    storage_results_without = {}
    for name, model in classifier.models.items():
        storage_results_without[name] = classifier.evaluate_model(
            model, X_without_product, y_storage, f"{name} (without Product Type)", "Storage"
        )
        classifier.plot_learning_curves(
            model, X_without_product, y_storage,
            f"Learning Curves - {name} (Storage, without Product Type)"
        )
    
    # Storage condition classification with product type
    print("\nPerforming Storage Condition Classification (with Product Type)...")
    storage_results_with = {}
    for name, model in classifier.models.items():
        storage_results_with[name] = classifier.evaluate_model(
            model, X_with_product, y_storage, f"{name} (with Product Type)", "Storage"
        )
        classifier.plot_learning_curves(
            model, X_with_product, y_storage,
            f"Learning Curves - {name} (Storage, with Product Type)"
        )
    
    # Product type classification
    print("\nPerforming Product Type Classification...")
    product_results = {}
    for name, model in classifier.models.items():
        product_results[name] = classifier.evaluate_model(
            model, X_without_product, y_product, name, "Product Type"
        )
        classifier.plot_learning_curves(
            model, X_without_product, y_product,
            f"Learning Curves - {name} (Product Type)"
        )
    
    # Save results
    results = {
        'storage_without_product': {name: {
            'cv_scores': list(res['cv_scores']),
            'confusion_matrix': res['confusion_matrix'].tolist(),
            'classification_report': res['classification_report']
        } for name, res in storage_results_without.items()},
        'storage_with_product': {name: {
            'cv_scores': list(res['cv_scores']),
            'confusion_matrix': res['confusion_matrix'].tolist(),
            'classification_report': res['classification_report']
        } for name, res in storage_results_with.items()},
        'product_type': {name: {
            'cv_scores': list(res['cv_scores']),
            'confusion_matrix': res['confusion_matrix'].tolist(),
            'classification_report': res['classification_report']
        } for name, res in product_results.items()}
    }
    
    with open('classification_results.json', 'w') as f:
        json.dump(results, f, indent=4)

if __name__ == "__main__":
    main()

Loading data and preparing features...

Analyzing feature importance...

Top 10 most important features for Storage Classification:
phase_28: 0.0232
phase_0: 0.0216
phase_64: 0.0165
phase_61: 0.0158
phase_2: 0.0153
phase_91: 0.0152
phase_63: 0.0151
phase_36: 0.0141
phase_32: 0.0130
phase_62: 0.0128

Top 10 most important features for Product Classification:
phase_100: 0.0809
gain_99: 0.0565
gain_100: 0.0501
phase_99: 0.0483
phase_98: 0.0334
phase_30: 0.0307
gain_24: 0.0259
phase_95: 0.0252
gain_17: 0.0246
gain_96: 0.0231

Performing Storage Condition Classification (without Product Type)...

SVM (without Product Type) - Storage Classification Results:
CV Accuracy: 0.5611 ± 0.1124

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.30      0.43        60
           1       0.52      0.70      0.60        60
           2       0.56      0.70      0.62        60

    accuracy                           0.57       180
   macro avg   