In [4]:
# Wine Quality Classification - Comprehensive Analysis
# This notebook provides detailed evaluation and analysis of machine learning models

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from src.complete_workflow import *

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Configure matplotlib for better plots
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 11
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['legend.fontsize'] = 10

print("🍷 Wine Quality Classification - Analysis Notebook")
print("=" * 60)


# =============================================================================
# 1. LOAD RESULTS
# =============================================================================

def load_latest_results(results_dir="results", experiment_type="combined_comparison"):
    """Load the most recent experiment results"""
    if not os.path.exists(results_dir):
        print(f"❌ Results directory '{results_dir}' not found!")
        return None

    # Find the latest file
    files = [f for f in os.listdir(results_dir) if experiment_type in f and f.endswith('.pkl')]
    if not files:
        print(f"❌ No {experiment_type} results found in {results_dir}")
        return None

    latest_file = max(files, key=lambda x: os.path.getctime(os.path.join(results_dir, x)))
    file_path = os.path.join(results_dir, latest_file)

    print(f"📂 Loading results from: {latest_file}")

    with open(file_path, 'rb') as f:
        data = pickle.load(f)

    return data

# Load experiment results
results_data = load_latest_results()

if results_data is None:
    print("⚠️  No results found. Please run main.py first to generate results.")
    print("   Then restart this notebook.")
else:
    print("✅ Results loaded successfully!")

    # Extract data
    if 'results' in results_data and 'baseline' in results_data['results']:
        # Combined results format
        baseline_results = results_data['results']['baseline']
        smote_results = results_data['results']['smote']
        metadata = results_data['results'].get('metadata', {})
    else:
        # Single experiment format - try to load both baseline and smote
        baseline_data = load_latest_results(experiment_type="baseline")
        smote_data = load_latest_results(experiment_type="smote_oversampling")

        if baseline_data and smote_data:
            baseline_results = baseline_data['results']
            smote_results = smote_data['results']
            metadata = {}
        else:
            print("❌ Could not find both baseline and SMOTE results")
            baseline_results = smote_results = metadata = {}



🍷 Wine Quality Classification - Analysis Notebook
❌ Results directory 'results' not found!
⚠️  No results found. Please run main.py first to generate results.
   Then restart this notebook.


In [5]:
# =============================================================================
# 2. DATA OVERVIEW AND PREPROCESSING ANALYSIS
# =============================================================================

def analyze_data_overview():
    """Analyze the original dataset and preprocessing steps"""
    print("\n" + "="*60)
    print("📊 DATA OVERVIEW & PREPROCESSING ANALYSIS")
    print("="*60)

    # Load original data to analyze
    try:
        red_path = os.path.join("data", "winequality-red.csv")
        white_path = os.path.join("data", "winequality-white.csv")

        red_wine = pd.read_csv(red_path, sep=';')
        white_wine = pd.read_csv(white_path, sep=';')

        print(f"🔴 Red wine samples: {len(red_wine)}")
        print(f"⚪ White wine samples: {len(white_wine)}")
        print(f"📈 Total samples: {len(red_wine) + len(white_wine)}")
        print(f"🔢 Features: {len(red_wine.columns) - 1}")  # -1 for quality column

        # Combine data
        red_wine['wine_type'] = 'red'
        white_wine['wine_type'] = 'white'
        combined_data = pd.concat([red_wine, white_wine], ignore_index=True)

        # Create binary target
        combined_data['quality_binary'] = (combined_data['quality'] >= 6).astype(int)

        # Class distribution analysis
        fig, axes = plt.subplots(1, 3, figsize=(18, 5))

        # Original quality distribution
        combined_data['quality'].value_counts().sort_index().plot(kind='bar', ax=axes[0], color='skyblue')
        axes[0].set_title('Original Quality Distribution (3-9 scale)')
        axes[0].set_xlabel('Quality Score')
        axes[0].set_ylabel('Count')
        axes[0].tick_params(axis='x', rotation=0)

        # Binary quality distribution
        binary_counts = combined_data['quality_binary'].value_counts()
        axes[1].pie(binary_counts.values, labels=['Low Quality (< 6)', 'High Quality (≥ 6)'],
                   autopct='%1.1f%%', startangle=90, colors=['lightcoral', 'lightgreen'])
        axes[1].set_title('Binary Classification Target')

        # Wine type distribution
        type_counts = combined_data['wine_type'].value_counts()
        axes[2].pie(type_counts.values, labels=type_counts.index.str.title(),
                   autopct='%1.1f%%', startangle=90, colors=['darkred', 'gold'])
        axes[2].set_title('Wine Type Distribution')

        plt.tight_layout()
        plt.show()

        # Class imbalance analysis
        class_distribution = combined_data['quality_binary'].value_counts()
        imbalance_ratio = class_distribution.max() / class_distribution.min()

        print(f"\n📊 Class Distribution:")
        print(f"   Low Quality (0): {class_distribution[0]:,} samples ({class_distribution[0]/len(combined_data)*100:.1f}%)")
        print(f"   High Quality (1): {class_distribution[1]:,} samples ({class_distribution[1]/len(combined_data)*100:.1f}%)")
        print(f"   Imbalance Ratio: {imbalance_ratio:.2f}:1")

        if imbalance_ratio > 1.5:
            print(f"   ⚠️  Dataset is imbalanced (ratio > 1.5:1)")
            print(f"   💡 SMOTE oversampling should help address this")

        return combined_data

    except FileNotFoundError:
        print("❌ Data files not found. Please ensure winequality-red.csv and winequality-white.csv are in the data/ directory.")
        return None

# Run data analysis
original_data = analyze_data_overview()




📊 DATA OVERVIEW & PREPROCESSING ANALYSIS
❌ Data files not found. Please ensure winequality-red.csv and winequality-white.csv are in the data/ directory.


In [6]:
# =============================================================================
# 3. MODEL PERFORMANCE COMPARISON
# =============================================================================

def create_performance_comparison():
    """Create comprehensive performance comparison visualizations"""
    if not baseline_results or not smote_results:
        print("❌ Results not available for comparison")
        return

    print("\n" + "="*60)
    print("🎯 MODEL PERFORMANCE COMPARISON")
    print("="*60)

    # Model name mapping for better display
    model_names = {
        'lr_scratch': 'Logistic Regression\n(From Scratch)',
        'lr_sklearn': 'Logistic Regression\n(Scikit-learn)',
        'svm_scratch': 'Linear SVM\n(From Scratch)',
        'svm_sklearn': 'Linear SVM\n(Scikit-learn)',
        'klr_scratch': 'Kernel Logistic\nRegression',
        'ksvm_scratch': 'Kernel SVM\n(Pegasos)',
        'rbf_svm_sklearn': 'RBF SVM\n(Scikit-learn)',
        'poly_svm_sklearn': 'Polynomial SVM\n(Scikit-learn)'
    }

    # Prepare data for plotting
    metrics = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1']
    metric_labels = ['Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'F1-Score']

    # Find common models between baseline and SMOTE
    common_models = set(baseline_results.keys()) & set(smote_results.keys())

    if not common_models:
        print("❌ No common models found between baseline and SMOTE results")
        return

    # Create comparison plots
    fig, axes = plt.subplots(2, 3, figsize=(20, 12))
    axes = axes.flatten()

    for i, (metric, label) in enumerate(zip(metrics, metric_labels)):
        ax = axes[i]

        models = []
        baseline_scores = []
        smote_scores = []

        for model_key in sorted(common_models):
            if model_key in baseline_results and model_key in smote_results:
                models.append(model_names.get(model_key, model_key))
                baseline_scores.append(baseline_results[model_key][metric])
                smote_scores.append(smote_results[model_key][metric])

        x = np.arange(len(models))
        width = 0.35

        bars1 = ax.bar(x - width/2, baseline_scores, width, label='Baseline',
                      color='lightcoral', alpha=0.8)
        bars2 = ax.bar(x + width/2, smote_scores, width, label='SMOTE',
                      color='lightgreen', alpha=0.8)

        ax.set_xlabel('Models')
        ax.set_ylabel(label)
        ax.set_title(f'{label} Comparison')
        ax.set_xticks(x)
        ax.set_xticklabels(models, rotation=45, ha='right')
        ax.legend()
        ax.grid(True, alpha=0.3)

        # Add value labels on bars
        for bar in bars1:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                   f'{height:.3f}', ha='center', va='bottom', fontsize=8)

        for bar in bars2:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                   f'{height:.3f}', ha='center', va='bottom', fontsize=8)

    # Remove empty subplot
    axes[5].remove()

    plt.tight_layout()
    plt.show()

    # Performance improvement analysis
    print("\n📈 PERFORMANCE IMPROVEMENTS (SMOTE vs Baseline):")
    print("-" * 60)

    improvements = {}
    for model_key in common_models:
        model_name = model_names.get(model_key, model_key).replace('\n', ' ')
        improvements[model_name] = {}

        for metric in metrics:
            baseline_score = baseline_results[model_key][metric]
            smote_score = smote_results[model_key][metric]
            improvement = smote_score - baseline_score
            improvements[model_name][metric] = improvement

        # Print F1 improvements (most important for imbalanced data)
        f1_improvement = improvements[model_name]['f1']
        status = "📈" if f1_improvement > 0.01 else "📉" if f1_improvement < -0.01 else "➖"
        print(f"{status} {model_name:<25}: F1 {f1_improvement:+.4f}")

    return improvements

# Create performance comparison
performance_improvements = create_performance_comparison()



NameError: name 'baseline_results' is not defined

In [None]:
# =============================================================================
# 4. DETAILED METRICS ANALYSIS
# =============================================================================

def analyze_detailed_metrics():
    """Analyze metrics in detail with radar charts and heatmaps"""
    if not baseline_results or not smote_results:
        return

    print("\n" + "="*60)
    print("🔍 DETAILED METRICS ANALYSIS")
    print("="*60)

    # Create radar chart for top performing models
    from math import pi

    def create_radar_chart(results, title, ax):
        """Create radar chart for model performance"""
        metrics = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1']
        metric_labels = ['Accuracy', 'Balanced\nAccuracy', 'Precision', 'Recall', 'F1-Score']

        # Find top 4 models by F1 score
        model_f1_scores = {model: results[model]['f1'] for model in results.keys()}
        top_models = sorted(model_f1_scores.items(), key=lambda x: x[1], reverse=True)[:4]

        # Number of variables
        categories = metric_labels
        N = len(categories)

        # Compute angle for each axis
        angles = [n / float(N) * 2 * pi for n in range(N)]
        angles += angles[:1]  # Complete the circle

        colors = ['red', 'blue', 'green', 'orange']

        for i, (model_key, _) in enumerate(top_models):
            values = [results[model_key][metric] for metric in metrics]
            values += values[:1]  # Complete the circle

            model_name = model_key.replace('_', ' ').title()
            ax.plot(angles, values, 'o-', linewidth=2, label=model_name, color=colors[i])
            ax.fill(angles, values, alpha=0.1, color=colors[i])

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(categories)
        ax.set_ylim(0, 1)
        ax.set_title(title, size=14, fontweight='bold')
        ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
        ax.grid(True)

    # Create radar charts
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8), subplot_kw=dict(projection='polar'))

    create_radar_chart(baseline_results, 'Baseline Performance', ax1)
    create_radar_chart(smote_results, 'SMOTE Performance', ax2)

    plt.tight_layout()
    plt.show()

    # Metrics correlation heatmap
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    def create_metrics_heatmap(results, title, ax):
        """Create correlation heatmap of metrics across models"""
        metrics = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1']

        # Create dataframe with metrics for each model
        data = []
        for model in results.keys():
            row = [results[model][metric] for metric in metrics]
            data.append(row)

        df = pd.DataFrame(data,
                         columns=['Accuracy', 'Bal. Accuracy', 'Precision', 'Recall', 'F1-Score'],
                         index=list(results.keys()))

        # Compute correlation matrix
        corr_matrix = df.corr()

        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                   square=True, ax=ax, cbar_kws={'shrink': 0.8})
        ax.set_title(f'{title} - Metrics Correlation')

    create_metrics_heatmap(baseline_results, 'Baseline', ax1)
    create_metrics_heatmap(smote_results, 'SMOTE', ax2)

    plt.tight_layout()
    plt.show()

# Run detailed metrics analysis
analyze_detailed_metrics()



In [None]:
# =============================================================================
# 5. MODEL ARCHITECTURE ANALYSIS
# =============================================================================

def analyze_model_architectures():
    """Analyze different model architectures and their characteristics"""
    print("\n" + "="*60)
    print("🏗️ MODEL ARCHITECTURE ANALYSIS")
    print("="*60)

    # Group models by type
    model_groups = {
        'Linear Models': ['lr_scratch', 'lr_sklearn', 'svm_scratch', 'svm_sklearn'],
        'Kernel Methods': ['klr_scratch', 'ksvm_scratch', 'rbf_svm_sklearn', 'poly_svm_sklearn']
    }

    if not baseline_results:
        return

    # Analyze performance by model type
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    for i, (group_name, models) in enumerate(model_groups.items()):
        row = i

        # Filter models that exist in results
        existing_models = [m for m in models if m in baseline_results]
        if not existing_models:
            continue

        # F1 scores comparison
        ax1 = axes[row, 0]
        baseline_f1 = [baseline_results[m]['f1'] for m in existing_models if m in baseline_results]
        smote_f1 = [smote_results[m]['f1'] for m in existing_models if m in smote_results]

        x = np.arange(len(existing_models))
        width = 0.35

        ax1.bar(x - width/2, baseline_f1, width, label='Baseline', color='lightcoral', alpha=0.8)
        ax1.bar(x + width/2, smote_f1, width, label='SMOTE', color='lightgreen', alpha=0.8)

        ax1.set_title(f'{group_name} - F1 Score Comparison')
        ax1.set_xlabel('Models')
        ax1.set_ylabel('F1 Score')
        ax1.set_xticks(x)
        ax1.set_xticklabels([m.replace('_', '\n') for m in existing_models], rotation=45)
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Balanced accuracy comparison
        ax2 = axes[row, 1]
        baseline_bal_acc = [baseline_results[m]['balanced_accuracy'] for m in existing_models if m in baseline_results]
        smote_bal_acc = [smote_results[m]['balanced_accuracy'] for m in existing_models if m in smote_results]

        ax2.bar(x - width/2, baseline_bal_acc, width, label='Baseline', color='lightcoral', alpha=0.8)
        ax2.bar(x + width/2, smote_bal_acc, width, label='SMOTE', color='lightgreen', alpha=0.8)

        ax2.set_title(f'{group_name} - Balanced Accuracy Comparison')
        ax2.set_xlabel('Accuracy')
        ax2.set_ylabel('Balanced Accuracy')
        ax2.set_title('SMOTE: Accuracy vs Balanced Accuracy')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        # Add model labels
        for i, model in enumerate(smote_models):
            ax2.annotate(model.replace('_', '\n'),
                        (smote_accuracy_scores[i], smote_balanced_accuracy_scores[i]),
                        xytext=(5, 5), textcoords='offset points', fontsize=8)

    # 2. Precision vs Recall analysis (indicates class bias issues)
    ax3 = axes[1, 0]
    precision_scores = [baseline_results[m]['precision'] for m in models]
    recall_scores = [baseline_results[m]['recall'] for m in models]

    ax3.scatter(precision_scores, recall_scores, alpha=0.7, s=100)
    ax3.set_xlabel('Precision')
    ax3.set_ylabel('Recall')
    ax3.set_title('Baseline: Precision vs Recall Trade-off')
    ax3.grid(True, alpha=0.3)

    # Add model labels
    for i, model in enumerate(models):
        ax3.annotate(model.replace('_', '\n'),
                    (precision_scores[i], recall_scores[i]),
                    xytext=(5, 5), textcoords='offset points', fontsize=8)

    # SMOTE Precision vs Recall
    if smote_results:
        ax4 = axes[1, 1]
        smote_precision_scores = [smote_results[m]['precision'] for m in smote_models]
        smote_recall_scores = [smote_results[m]['recall'] for m in smote_models]

        ax4.scatter(smote_precision_scores, smote_recall_scores, alpha=0.7, s=100, color='green')
        ax4.set_xlabel('Precision')
        ax4.set_ylabel('Recall')
        ax4.set_title('SMOTE: Precision vs Recall Trade-off')
        ax4.grid(True, alpha=0.3)

        # Add model labels
        for i, model in enumerate(smote_models):
            ax4.annotate(model.replace('_', '\n'),
                        (smote_precision_scores[i], smote_recall_scores[i]),
                        xytext=(5, 5), textcoords='offset points', fontsize=8)

    plt.tight_layout()
    plt.show()

    # Fitting behavior analysis
    print("🔍 FITTING BEHAVIOR ANALYSIS:")
    print("-" * 40)

    # Identify potential overfitting/underfitting patterns
    for model in models:
        if model not in baseline_results:
            continue

        accuracy = baseline_results[model]['accuracy']
        balanced_accuracy = baseline_results[model]['balanced_accuracy']
        precision = baseline_results[model]['precision']
        recall = baseline_results[model]['recall']
        f1 = baseline_results[model]['f1']

        # Analysis criteria
        acc_bal_diff = accuracy - balanced_accuracy
        precision_recall_diff = abs(precision - recall)

        print(f"\n📊 {model.replace('_', ' ').title()}:")

        # Overfitting indicators
        if acc_bal_diff > 0.05:
            print(f"   ⚠️  Potential overfitting: Accuracy ({accuracy:.3f}) >> Balanced Accuracy ({balanced_accuracy:.3f})")
            print(f"      → Model may be biased toward majority class")

        # Class imbalance handling
        if precision_recall_diff > 0.1:
            if precision > recall:
                print(f"   📈 High precision ({precision:.3f}), lower recall ({recall:.3f})")
                print(f"      → Conservative predictions (few false positives)")
            else:
                print(f"   📈 High recall ({recall:.3f}), lower precision ({precision:.3f})")
                print(f"      → Liberal predictions (few false negatives)")

        # Overall performance assessment
        if f1 < 0.7:
            print(f"   📉 Low F1-score ({f1:.3f}) suggests underfitting")
            print(f"      → Consider more complex model or better features")
        elif f1 > 0.85:
            print(f"   🎯 Excellent F1-score ({f1:.3f}) - well-fitted model")

        # SMOTE improvement analysis
        if model in smote_results:
            smote_f1 = smote_results[model]['f1']
            smote_bal_acc = smote_results[model]['balanced_accuracy']

            f1_improvement = smote_f1 - f1
            bal_acc_improvement = smote_bal_acc - balanced_accuracy

            if f1_improvement > 0.02:
                print(f"   ✅ SMOTE significantly improved F1 (+{f1_improvement:.3f})")
                print(f"      → Original model suffered from class imbalance")
            elif f1_improvement < -0.02:
                print(f"   ❌ SMOTE hurt performance ({f1_improvement:.3f})")
                print(f"      → Model was already handling imbalance well")



In [None]:
# =============================================================================
# 7. MISCLASSIFICATION ANALYSIS
# =============================================================================

def analyze_misclassifications():
    """Analyze patterns in misclassified examples"""
    print("\n" + "="*60)
    print("🔍 MISCLASSIFICATION ANALYSIS")
    print("="*60)

    # Note: This is a conceptual analysis since we don't have access to individual predictions
    # In a real scenario, you would retrain models and capture prediction details

    print("📋 MISCLASSIFICATION PATTERNS (Conceptual Analysis):")
    print("-" * 50)

    if not baseline_results:
        return

    # Analyze performance patterns to infer misclassification issues
    models_analysis = {}

    for model in baseline_results.keys():
        accuracy = baseline_results[model]['accuracy']
        balanced_accuracy = baseline_results[model]['balanced_accuracy']
        precision = baseline_results[model]['precision']
        recall = baseline_results[model]['recall']

        analysis = {
            'accuracy': accuracy,
            'balanced_accuracy': balanced_accuracy,
            'precision': precision,
            'recall': recall,
            'accuracy_gap': accuracy - balanced_accuracy,
            'precision_recall_ratio': precision / recall if recall > 0 else 0
        }

        models_analysis[model] = analysis

    # Create visualization of misclassification patterns
    fig, axes = plt.subplots(2, 2, figsize=(16, 10))

    # 1. Accuracy vs Balanced Accuracy Gap (indicates majority class bias)
    ax1 = axes[0, 0]
    models = list(models_analysis.keys())
    accuracy_gaps = [models_analysis[m]['accuracy_gap'] for m in models]

    bars = ax1.bar(range(len(models)), accuracy_gaps, color=['red' if gap > 0.03 else 'orange' if gap > 0.01 else 'green' for gap in accuracy_gaps])
    ax1.set_xlabel('Models')
    ax1.set_ylabel('Accuracy - Balanced Accuracy')
    ax1.set_title('Majority Class Bias Indicator')
    ax1.set_xticks(range(len(models)))
    ax1.set_xticklabels([m.replace('_', '\n') for m in models], rotation=45)
    ax1.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    ax1.axhline(y=0.03, color='red', linestyle='--', alpha=0.5, label='High Bias Threshold')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Precision/Recall Ratio (indicates prediction tendency)
    ax2 = axes[0, 1]
    pr_ratios = [models_analysis[m]['precision_recall_ratio'] for m in models]

    bars = ax2.bar(range(len(models)), pr_ratios,
                   color=['blue' if ratio > 1.2 else 'purple' if ratio < 0.8 else 'green' for ratio in pr_ratios])
    ax2.set_xlabel('Models')
    ax2.set_ylabel('Precision / Recall Ratio')
    ax2.set_title('Prediction Tendency Analysis')
    ax2.set_xticks(range(len(models)))
    ax2.set_xticklabels([m.replace('_', '\n') for m in models], rotation=45)
    ax2.axhline(y=1, color='black', linestyle='-', alpha=0.3, label='Perfect Balance')
    ax2.axhline(y=1.2, color='blue', linestyle='--', alpha=0.5, label='Conservative Threshold')
    ax2.axhline(y=0.8, color='purple', linestyle='--', alpha=0.5, label='Liberal Threshold')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Model Complexity vs Performance (overfitting indicator)
    ax3 = axes[1, 0]

    # Assign complexity scores (conceptual)
    complexity_scores = {
        'lr_scratch': 1, 'lr_sklearn': 1,
        'svm_scratch': 2, 'svm_sklearn': 2,
        'klr_scratch': 4, 'ksvm_scratch': 4,
        'rbf_svm_sklearn': 3, 'poly_svm_sklearn': 3
    }

    complexities = [complexity_scores.get(m, 2) for m in models]
    f1_scores = [baseline_results[m]['f1'] for m in models]

    colors = ['red', 'orange', 'yellow', 'green', 'blue']
    ax3.scatter(complexities, f1_scores, c=[colors[min(c-1, 4)] for c in complexities], s=100, alpha=0.7)

    for i, model in enumerate(models):
        ax3.annotate(model.replace('_', '\n'),
                    (complexities[i], f1_scores[i]),
                    xytext=(5, 5), textcoords='offset points', fontsize=8)

    ax3.set_xlabel('Model Complexity (1=Linear, 4=Kernel)')
    ax3.set_ylabel('F1 Score')
    ax3.set_title('Complexity vs Performance Trade-off')
    ax3.grid(True, alpha=0.3)

    # 4. SMOTE Impact Analysis
    ax4 = axes[1, 1]

    if smote_results:
        baseline_f1 = [baseline_results[m]['f1'] for m in models if m in smote_results]
        smote_f1 = [smote_results[m]['f1'] for m in models if m in smote_results]
        improvements = [s - b for b, s in zip(baseline_f1, smote_f1)]

        bars = ax4.bar(range(len(improvements)), improvements,
                       color=['green' if imp > 0.01 else 'red' if imp < -0.01 else 'yellow' for imp in improvements])

        ax4.set_xlabel('Models')
        ax4.set_ylabel('F1 Score Improvement (SMOTE - Baseline)')
        ax4.set_title('SMOTE Impact on Performance')
        ax4.set_xticks(range(len(improvements)))
        ax4.set_xticklabels([m.replace('_', '\n') for m in models if m in smote_results], rotation=45)
        ax4.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Detailed misclassification insights
    print("\n🔍 MISCLASSIFICATION INSIGHTS:")
    print("-" * 40)

    for model in models:
        print(f"\n📊 {model.replace('_', ' ').title()}:")

        gap = models_analysis[model]['accuracy_gap']
        pr_ratio = models_analysis[model]['precision_recall_ratio']

        # Bias analysis
        if gap > 0.05:
            print(f"   🔴 HIGH BIAS: Likely many false negatives (missing high-quality wines)")
            print(f"      → Model over-predicts majority class (low quality)")
        elif gap > 0.02:
            print(f"   🟡 MODERATE BIAS: Some tendency toward majority class")
        else:
            print(f"   🟢 LOW BIAS: Well-balanced predictions")

        # Prediction tendency analysis
        if pr_ratio > 1.3:
            print(f"   🔵 CONSERVATIVE: High precision, low recall")
            print(f"      → Few false positives, but misses many true positives")
            print(f"      → Misclassifies many high-quality wines as low-quality")
        elif pr_ratio < 0.7:
            print(f"   🟣 LIBERAL: Low precision, high recall")
            print(f"      → Few false negatives, but many false positives")
            print(f"      → Misclassifies many low-quality wines as high-quality")
        else:
            print(f"   🟢 BALANCED: Good precision-recall trade-off")

        # SMOTE impact
        if model in smote_results:
            smote_f1 = smote_results[model]['f1']
            baseline_f1 = baseline_results[model]['f1']
            improvement = smote_f1 - baseline_f1

            if improvement > 0.02:
                print(f"   ✅ SMOTE HELPED: Reduced class imbalance misclassifications")
            elif improvement < -0.02:
                print(f"   ❌ SMOTE HURT: May have introduced noise or overfitting")
            else:
                print(f"   ➖ SMOTE NEUTRAL: Minimal impact on misclassifications")

# Run misclassification analysis
analyze_misclassifications()



In [None]:
# =============================================================================
# 8. FEATURE IMPORTANCE AND MODEL INSIGHTS
# =============================================================================

def analyze_feature_insights():
    """Analyze feature importance and model behavior insights"""
    print("\n" + "="*60)
    print("🔬 FEATURE IMPORTANCE & MODEL INSIGHTS")
    print("="*60)

    # Load and analyze original data for feature insights
    try:
        if original_data is not None:
            # Correlation analysis
            features = original_data.select_dtypes(include=[np.number]).columns
            features = [f for f in features if f not in ['quality', 'quality_binary']]

            correlation_matrix = original_data[features + ['quality_binary']].corr()
            target_correlations = correlation_matrix['quality_binary'].drop('quality_binary').abs().sort_values(ascending=False)

            # Visualize feature correlations with target
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))

            # Feature correlation with target
            ax1 = axes[0, 0]
            target_correlations.plot(kind='bar', ax=ax1, color='skyblue')
            ax1.set_title('Feature Correlation with Wine Quality (Binary)')
            ax1.set_xlabel('Features')
            ax1.set_ylabel('Absolute Correlation')
            ax1.tick_params(axis='x', rotation=45)
            ax1.grid(True, alpha=0.3)

            # Feature correlation heatmap
            ax2 = axes[0, 1]
            top_features = target_correlations.head(8).index.tolist()
            subset_corr = original_data[top_features + ['quality_binary']].corr()

            sns.heatmap(subset_corr, annot=True, cmap='coolwarm', center=0,
                       square=True, ax=ax2, cbar_kws={'shrink': 0.8})
            ax2.set_title('Top Features Correlation Matrix')

            # Distribution of top features by quality
            ax3 = axes[1, 0]
            top_feature = target_correlations.index[0]

            original_data.boxplot(column=top_feature, by='quality_binary', ax=ax3)
            ax3.set_title(f'{top_feature} Distribution by Quality')
            ax3.set_xlabel('Quality (0=Low, 1=High)')
            ax3.set_ylabel(top_feature)

            # Feature importance ranking
            ax4 = axes[1, 1]
            importance_data = target_correlations.head(10)

            bars = ax4.barh(range(len(importance_data)), importance_data.values,
                           color='lightgreen', alpha=0.7)
            ax4.set_yticks(range(len(importance_data)))
            ax4.set_yticklabels(importance_data.index)
            ax4.set_xlabel('Correlation with Quality')
            ax4.set_title('Top 10 Most Important Features')
            ax4.grid(True, alpha=0.3)

            plt.tight_layout()
            plt.show()

            # Print feature insights
            print(f"🔍 TOP PREDICTIVE FEATURES:")
            print("-" * 30)
            for i, (feature, corr) in enumerate(target_correlations.head(5).items(), 1):
                print(f"{i}. {feature}: {corr:.3f} correlation")

            print(f"\n💡 FEATURE INSIGHTS:")
            print("-" * 20)

            # Analyze top features
            top_3_features = target_correlations.head(3).index.tolist()
            for feature in top_3_features:
                corr = target_correlations[feature]

                high_quality_mean = original_data[original_data['quality_binary'] == 1][feature].mean()
                low_quality_mean = original_data[original_data['quality_binary'] == 0][feature].mean()

                if corr > 0:
                    print(f"📈 {feature}: Higher values → Higher quality")
                    print(f"   High quality avg: {high_quality_mean:.3f}")
                    print(f"   Low quality avg: {low_quality_mean:.3f}")
                else:
                    print(f"📉 {feature}: Higher values → Lower quality")
                    print(f"   High quality avg: {high_quality_mean:.3f}")
                    print(f"   Low quality avg: {low_quality_mean:.3f}")

        else:
            print("❌ Original data not available for feature analysis")

    except Exception as e:
        print(f"❌ Error in feature analysis: {e}")

# Run feature insights analysis
analyze_feature_insights()



In [None]:
# =============================================================================
# 9. FINAL RECOMMENDATIONS AND CONCLUSIONS
# =============================================================================

def generate_final_recommendations():
    """Generate final recommendations based on all analyses"""
    print("\n" + "="*60)
    print("📋 FINAL RECOMMENDATIONS & CONCLUSIONS")
    print("="*60)

    if not baseline_results or not smote_results:
        print("❌ Cannot generate recommendations without complete results")
        return

    # Find best performing models
    baseline_best = max(baseline_results.items(), key=lambda x: x[1]['f1'])
    smote_best = max(smote_results.items(), key=lambda x: x[1]['f1'])

    print(f"🏆 BEST PERFORMING MODELS:")
    print("-" * 30)
    print(f"Baseline: {baseline_best[0]} (F1: {baseline_best[1]['f1']:.4f})")
    print(f"SMOTE: {smote_best[0]} (F1: {smote_best[1]['f1']:.4f})")

    # Overall recommendations
    print(f"\n💡 KEY RECOMMENDATIONS:")
    print("-" * 25)

    # 1. SMOTE effectiveness
    smote_improvements = []
    for model in baseline_results.keys():
        if model in smote_results:
            improvement = smote_results[model]['f1'] - baseline_results[model]['f1']
            smote_improvements.append(improvement)

    avg_smote_improvement = np.mean(smote_improvements) if smote_improvements else 0

    if avg_smote_improvement > 0.01:
        print(f"✅ 1. USE SMOTE: Average F1 improvement of {avg_smote_improvement:.3f}")
        print(f"      SMOTE effectively addresses class imbalance in this dataset")
    else:
        print(f"⚠️  1. SMOTE IMPACT: Minimal average improvement ({avg_smote_improvement:.3f})")
        print(f"      Consider other rebalancing techniques or feature engineering")

    # 2. Model architecture recommendations
    kernel_models = [k for k in baseline_results.keys() if 'kernel' in k or 'rbf' in k or 'poly' in k]
    linear_models = [k for k in baseline_results.keys() if k not in kernel_models]

    if kernel_models and linear_models:
        kernel_avg_f1 = np.mean([baseline_results[m]['f1'] for m in kernel_models])
        linear_avg_f1 = np.mean([baseline_results[m]['f1'] for m in linear_models])

        if kernel_avg_f1 > linear_avg_f1 + 0.02:
            print(f"✅ 2. USE KERNEL METHODS: {kernel_avg_f1:.3f} vs {linear_avg_f1:.3f} avg F1")
            print(f"      Non-linear relationships are important in this dataset")
        else:
            print(f"✅ 2. LINEAR MODELS SUFFICIENT: Comparable performance to kernel methods")
            print(f"      Consider linear models for faster training and better interpretability")

    # 3. Implementation recommendations
    scratch_models = [k for k in baseline_results.keys() if 'scratch' in k]
    sklearn_models = [k for k in baseline_results.keys() if 'sklearn' in k]

    if scratch_models and sklearn_models:
        scratch_avg_f1 = np.mean([baseline_results[m]['f1'] for m in scratch_models])
        sklearn_avg_f1 = np.mean([baseline_results[m]['f1'] for m in sklearn_models])

        if abs(scratch_avg_f1 - sklearn_avg_f1) < 0.02:
            print(f"✅ 3. IMPLEMENTATION QUALITY: Custom implementations work well")
            print(f"      From-scratch: {scratch_avg_f1:.3f}, Scikit-learn: {sklearn_avg_f1:.3f}")
        else:
            print(f"⚠️  3. IMPLEMENTATION GAP: {abs(scratch_avg_f1 - sklearn_avg_f1):.3f} difference")
            print(f"      Consider optimizing custom implementations")

    # 4. Overfitting analysis
    high_variance_models = []
    for model in baseline_results.keys():
        acc_gap = baseline_results[model]['accuracy'] - baseline_results[model]['balanced_accuracy']
        if acc_gap > 0.03:
            high_variance_models.append(model)

    if high_variance_models:
        print(f"⚠️  4. OVERFITTING DETECTED in: {', '.join(high_variance_models)}")
        print(f"      Consider regularization or cross-validation tuning")
    else:
        print(f"✅ 4. NO MAJOR OVERFITTING: Models show good generalization")

    # 5. Production recommendations
    print(f"\n🚀 PRODUCTION RECOMMENDATIONS:")
    print("-" * 35)

    # Balance performance vs complexity
    simple_models = ['lr_scratch', 'lr_sklearn', 'svm_scratch', 'svm_sklearn']
    complex_models = [k for k in baseline_results.keys() if k not in simple_models]

    best_simple = max([(k, v) for k, v in smote_results.items() if k in simple_models],
                     key=lambda x: x[1]['f1'], default=(None, None))
    best_complex = max([(k, v) for k, v in smote_results.items() if k in complex_models],
                      key=lambda x: x[1]['f1'], default=(None, None))

    if best_simple[0] and best_complex[0]:
        simple_f1 = best_simple[1]['f1']
        complex_f1 = best_complex[1]['f1']

        if complex_f1 > simple_f1 + 0.03:
            print(f"🎯 RECOMMENDED: {best_complex[0]} (F1: {complex_f1:.4f})")
            print(f"   Performance gain ({complex_f1 - simple_f1:.3f}) justifies complexity")
        else:
            print(f"🎯 RECOMMENDED: {best_simple[0]} (F1: {simple_f1:.4f})")
            print(f"   Simpler model with comparable performance ({simple_f1:.4f} vs {complex_f1:.4f})")

    # Model deployment considerations
    print(f"\n⚙️  DEPLOYMENT CONSIDERATIONS:")
    print("-" * 30)
    print(f"• Data preprocessing: Standardization + log transformation essential")
    print(f"• Class imbalance: Monitor precision/recall in production")
    print(f"• Feature monitoring: Track {original_data.select_dtypes(include=[np.number]).columns.tolist()[:3] if original_data is not None else 'key features'}")
    print(f"• Model retraining: Consider when class distribution shifts")

    return {
        'best_baseline': baseline_best,
        'best_smote': smote_best,
        'smote_improvement': avg_smote_improvement,
        'recommendations': 'Generated successfully'
    }

# Generate final recommendations
final_recommendations = generate_final_recommendations()

print(f"\n🎉 ANALYSIS COMPLETE!")
print("="*60)
print("This comprehensive analysis covers:")
print("✅ Data overview and preprocessing analysis")
print("✅ Model performance comparison")
print("✅ Detailed metrics analysis with visualizations")
print("✅ Model architecture comparison")
print("✅ Overfitting/underfitting analysis")
print("✅ Misclassification pattern analysis")
print("✅ Feature importance insights")
print("✅ Final recommendations for production")
print(f"\nAll visualizations and metrics provide insights for model selection and deployment decisions.")('Models')
        ax2.set_ylabel('Balanced Accuracy')
        ax2.set_xticks(x)
        ax2.set_xticklabels([m.replace('_', '\n') for m in existing_models], rotation=45)
        ax2.legend()
        ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Architecture insights
    print("🔍 ARCHITECTURE INSIGHTS:")
    print("-" * 40)

    # Compare scratch vs sklearn implementations
    scratch_models = [k for k in baseline_results.keys() if 'scratch' in k]
    sklearn_models = [k for k in baseline_results.keys() if 'sklearn' in k]

    if scratch_models and sklearn_models:
        print("\n📊 Implementation Comparison (From Scratch vs. Scikit-learn):")

        scratch_avg_f1 = np.mean([baseline_results[m]['f1'] for m in scratch_models])
        sklearn_avg_f1 = np.mean([baseline_results[m]['f1'] for m in sklearn_models])

        print(f"   From Scratch Average F1: {scratch_avg_f1:.4f}")
        print(f"   Scikit-learn Average F1: {sklearn_avg_f1:.4f}")
        print(f"   Difference: {sklearn_avg_f1 - scratch_avg_f1:+.4f}")

        if abs(sklearn_avg_f1 - scratch_avg_f1) < 0.02:
            print("   ✅ Custom implementations perform comparably to scikit-learn")
        elif scratch_avg_f1 > sklearn_avg_f1:
            print("   🚀 Custom implementations outperform scikit-learn!")
        else:
            print("   📚 Scikit-learn implementations are more optimized")

# Run architecture analysis
analyze_model_architectures()



In [None]:
# =============================================================================
# 6. OVERFITTING/UNDERFITTING ANALYSIS
# =============================================================================

def analyze_fitting_behavior():
    """Analyze overfitting and underfitting patterns"""
    print("\n" + "="*60)
    print("📈 OVERFITTING/UNDERFITTING ANALYSIS")
    print("="*60)

    if not baseline_results:
        return

    # Analyze performance patterns that indicate overfitting/underfitting

    # 1. Compare accuracy vs balanced accuracy (balanced accuracy drops more with overfitting)
    fig, axes = plt.subplots(2, 2, figsize=(16, 10))

    # Baseline analysis
    models = list(baseline_results.keys())
    accuracy_scores = [baseline_results[m]['accuracy'] for m in models]
    balanced_accuracy_scores = [baseline_results[m]['balanced_accuracy'] for m in models]

    ax1 = axes[0, 0]
    ax1.scatter(accuracy_scores, balanced_accuracy_scores, alpha=0.7, s=100)

    # Add diagonal line (perfect balance)
    min_score = min(min(accuracy_scores), min(balanced_accuracy_scores))
    max_score = max(max(accuracy_scores), max(balanced_accuracy_scores))
    ax1.plot([min_score, max_score], [min_score, max_score], 'r--', alpha=0.5, label='Perfect Balance')

    ax1.set_xlabel('Accuracy')
    ax1.set_ylabel('Balanced Accuracy')
    ax1.set_title('Baseline: Accuracy vs Balanced Accuracy')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Add model labels
    for i, model in enumerate(models):
        ax1.annotate(model.replace('_', '\n'),
                    (accuracy_scores[i], balanced_accuracy_scores[i]),
                    xytext=(5, 5), textcoords='offset points', fontsize=8)

    # SMOTE analysis
    if smote_results:
        smote_accuracy_scores = [smote_results[m]['accuracy'] for m in models if m in smote_results]
        smote_balanced_accuracy_scores = [smote_results[m]['balanced_accuracy'] for m in models if m in smote_results]
        smote_models = [m for m in models if m in smote_results]

        ax2 = axes[0, 1]
        ax2.scatter(smote_accuracy_scores, smote_balanced_accuracy_scores, alpha=0.7, s=100, color='green')

        min_score = min(min(smote_accuracy_scores), min(smote_balanced_accuracy_scores))
        max_score = max(max(smote_accuracy_scores), max(smote_balanced_accuracy_scores))
        ax2.plot([min_score, max_score], [min_score, max_score], 'r--', alpha=0.5, label='Perfect Balance')

        ax2.set_xlabel

# Run fitting behavior analysis
analyze_fitting_behavior()