In [18]:
# CELL 1: Complete Visualization Analyzer for Jupyter Notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os
import warnings
# NOTE(review): with no category or module argument this hides every warning
# raised after this point, including deprecation notices from the plotting
# and data libraries used below - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Global plotting defaults applied to every figure created after this cell runs.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a
# Matplotlib release that ships them - confirm against the installed version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

class KellerVisualizationAnalyzer:
    """Analyzer for Keller graph statistics with comprehensive visualizations."""
    
    def __init__(self):
        self.data = {}
        self.comparative_stats = {}
        
    def load_all_data(self, dimensions=[3, 4, 5, 6, 7]):
        """Load CSV data for all specified dimensions."""
        print("Loading Keller graph statistics data...")
        print("=" * 60)
        
        for dim in dimensions:
            filename = f'keller_{dim}_statistics.csv'
            if os.path.exists(filename):
                df = pd.read_csv(filename)
                
                # Convert string columns
                df['contains_0_5'] = df['contains_0_5'].map({'True': True, 'False': False})
                df['success'] = df['success'].map({'True': True, 'False': False})
                
                # Convert clique_indices from string to list
                df['clique_indices_list'] = df['clique_indices'].apply(
                    lambda x: eval(x) if isinstance(x, str) else x
                )
                
                self.data[dim] = df
                
                # Calculate basic statistics
                vertices = 4**dim
                success_rate = df['success'].mean() * 100
                avg_time = df['time'].mean() * 1000
                avg_ops = df['total_operations'].mean()
                
                print(f"K({dim}): {len(df):3d} runs | Vertices: {vertices:6,} | "
                      f"Success: {success_rate:5.1f}% | Time: {avg_time:6.2f} ms | "
                      f"Ops: {avg_ops:10,.0f}")
                
                # Store comparative statistics
                self.comparative_stats[dim] = {
                    'dimension': dim,
                    'vertices': vertices,
                    'target_size': df['target_size'].iloc[0],
                    'runs': len(df),
                    'success_rate': success_rate,
                    'avg_time_ms': avg_time,
                    'avg_time_std': df['time'].std() * 1000,
                    'avg_ops': avg_ops,
                    'avg_ops_std': df['total_operations'].std(),
                    'avg_size': df['size'].mean(),
                    'size_std': df['size'].std(),
                    'avg_low_indices': df['low_indices_0_9'].mean(),
                    'contains_0_5_rate': df['contains_0_5'].mean() * 100
                }
            else:
                print(f"Warning: File '{filename}' not found.")
        
        print(f"\nLoaded data for {len(self.data)} dimensions.")
        return self.data
    
    def create_output_directory(self):
        """Create directory for saving visualizations."""
        os.makedirs('keller_analysis_plots', exist_ok=True)
        return 'keller_analysis_plots'
    
    # =========================================================================
    # 1. RUNTIME ANALYSIS VISUALIZATIONS
    # =========================================================================
    
    def plot_runtime_distributions(self, output_dir='keller_analysis_plots', show_plot=True):
        """Plot runtime distributions for each dimension."""
        print("\nCreating runtime distribution plots...")
        
        n_dims = len(self.data)
        n_cols = min(3, n_dims)
        n_rows = (n_dims + n_cols - 1) // n_cols
        
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(6*n_cols, 4*n_rows))
        if n_dims == 1:
            axes = np.array([axes])
        axes = axes.flatten()
        
        for idx, (dim, df) in enumerate(self.data.items()):
            ax = axes[idx]
            
            # Convert time to milliseconds
            times_ms = df['time'] * 1000
            
            # Create histogram
            n, bins, patches = ax.hist(times_ms, bins=30, edgecolor='black', 
                                      alpha=0.7, density=True)
            
            # Add KDE curve
            from scipy.stats import gaussian_kde
            kde = gaussian_kde(times_ms)
            x = np.linspace(times_ms.min(), times_ms.max(), 1000)
            ax.plot(x, kde(x), 'r-', linewidth=2, label='KDE')
            
            # Add vertical line for mean
            mean_time = times_ms.mean()
            ax.axvline(mean_time, color='green', linestyle='--', 
                      linewidth=2, label=f'Mean: {mean_time:.2f} ms')
            
            # Add text with statistics
            stats_text = f"K({dim})\n"
            stats_text += f"Mean: {mean_time:.2f} ms\n"
            stats_text += f"Std: {times_ms.std():.2f} ms\n"
            stats_text += f"Min: {times_ms.min():.2f} ms\n"
            stats_text += f"Max: {times_ms.max():.2f} ms"
            
            ax.text(0.95, 0.95, stats_text, transform=ax.transAxes,
                   verticalalignment='top', horizontalalignment='right',
                   bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
            
            ax.set_xlabel('Runtime (ms)', fontsize=10)
            ax.set_ylabel('Density', fontsize=10)
            ax.set_title(f'K({dim}) - Runtime Distribution', fontsize=12, fontweight='bold')
            ax.legend(fontsize=9)
            ax.grid(True, alpha=0.3)
        
        # Remove empty subplots
        for idx in range(len(self.data), len(axes)):
            axes[idx].set_visible(False)
        
        plt.suptitle('Runtime Distributions for Keller Graphs', fontsize=16, fontweight='bold')
        plt.tight_layout()
        
        # Save plot
        os.makedirs(output_dir, exist_ok=True)
        plt.savefig(f'{output_dir}/runtime_distributions.png', dpi=300, bbox_inches='tight')
        plt.savefig(f'{output_dir}/runtime_distributions.pdf', bbox_inches='tight')
        
        if show_plot:
            plt.show()
        else:
            plt.close()
        
        print(f"  Saved: {output_dir}/runtime_distributions.png")
    
    def plot_runtime_boxplots(self, output_dir='keller_analysis_plots', show_plot=True):
        """Plot comparative boxplots of runtime across dimensions."""
        print("Creating runtime boxplots...")
        
        plt.figure(figsize=(12, 8))
        
        # Prepare data for boxplot
        box_data = []
        labels = []
        
        for dim, df in enumerate(self.data.values(), start=3):
            if dim in self.data:
                df = self.data[dim]
                box_data.append(df['time'] * 1000)  # Convert to ms
                labels.append(f'K({dim})\nn={4**dim:,}')
        
        # Create boxplot
        bp = plt.boxplot(box_data, labels=labels, patch_artist=True, 
                        showmeans=True, meanline=True)
        
        # Customize boxes
        colors = plt.cm.Set3(np.linspace(0, 1, len(box_data)))
        for patch, color in zip(bp['boxes'], colors):
            patch.set_facecolor(color)
            patch.set_alpha(0.7)
        
        # Customize other elements
        plt.setp(bp['whiskers'], color='black', linewidth=1.5)
        plt.setp(bp['caps'], color='black', linewidth=1.5)
        plt.setp(bp['medians'], color='red', linewidth=2)
        plt.setp(bp['means'], color='blue', linewidth=2, linestyle='--')
        
        # Add individual data points
        for i, data in enumerate(box_data):
            y = data
            x = np.random.normal(i + 1, 0.04, size=len(y))
            plt.plot(x, y, 'r.', alpha=0.4, markersize=3)
        
        plt.xlabel('Keller Graph Dimension', fontsize=12)
        plt.ylabel('Runtime (ms)', fontsize=12)
        plt.title('Comparative Runtime Analysis Across Keller Graphs', 
                 fontsize=14, fontweight='bold')
        plt.grid(True, alpha=0.3, axis='y')
        
        # Add statistics table
        stats_text = "Summary Statistics (ms):\n"
        stats_text += "Dim | Mean  | Median | Std\n"
        stats_text += "-" * 30 + "\n"
        
        for i, dim in enumerate(self.data.keys()):
            if dim in self.data:
                df = self.data[dim]
                times_ms = df['time'] * 1000
                stats_text += f"K({dim}) | {times_ms.mean():5.2f} | {times_ms.median():6.2f} | {times_ms.std():5.2f}\n"
        
        plt.figtext(0.02, 0.02, stats_text, fontsize=9, 
                   bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
        
        plt.tight_layout()
        
        # Save plot
        os.makedirs(output_dir, exist_ok=True)
        plt.savefig(f'{output_dir}/runtime_boxplots.png', dpi=300, bbox_inches='tight')
        plt.savefig(f'{output_dir}/runtime_boxplots.pdf', bbox_inches='tight')
        
        if show_plot:
            plt.show()
        else:
            plt.close()
        
        print(f"  Saved: {output_dir}/runtime_boxplots.png")
    
    # =========================================================================
    # 2. CLIQUE SIZE ANALYSIS VISUALIZATIONS
    # =========================================================================
    
    def plot_clique_size_distributions(self, output_dir='keller_analysis_plots', show_plot=True):
        """Plot clique size distributions for each dimension."""
        print("\nCreating clique size distribution plots...")
        
        n_dims = len(self.data)
        n_cols = min(3, n_dims)
        n_rows = (n_dims + n_cols - 1) // n_cols
        
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(6*n_cols, 4*n_rows))
        if n_dims == 1:
            axes = np.array([axes])
        axes = axes.flatten()
        
        for idx, (dim, df) in enumerate(self.data.items()):
            ax = axes[idx]
            target_size = df['target_size'].iloc[0]
            
            # Create histogram
            sizes = df['size']
            unique_sizes = sorted(sizes.unique())
            
            bars = ax.hist(sizes, bins=np.arange(min(sizes)-0.5, max(sizes)+1.5), 
                          edgecolor='black', alpha=0.7)
            
            # Highlight target size
            for i, (x, height) in enumerate(zip(bars[1][:-1], bars[0])):
                if abs(x - target_size) < 0.5:
                    bars[2][i].set_facecolor('green')
                    bars[2][i].set_alpha(0.8)
            
            # Add success rate
            success_rate = df['success'].mean() * 100
            ax.axvline(target_size, color='red', linestyle='--', 
                      linewidth=2, label=f'Target: {target_size}')
            
            # Add statistics
            stats_text = f"K({dim})\n"
            stats_text += f"Target: {target_size}\n"
            stats_text += f"Success: {success_rate:.1f}%\n"
            stats_text += f"Mean: {sizes.mean():.2f}\n"
            stats_text += f"Std: {sizes.std():.2f}"
            
            ax.text(0.95, 0.95, stats_text, transform=ax.transAxes,
                   verticalalignment='top', horizontalalignment='right',
                   bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
            
            ax.set_xlabel('Clique Size', fontsize=10)
            ax.set_ylabel('Frequency', fontsize=10)
            ax.set_title(f'K({dim}) - Clique Size Distribution', fontsize=12, fontweight='bold')
            ax.legend(fontsize=9)
            ax.grid(True, alpha=0.3)
        
        # Remove empty subplots
        for idx in range(len(self.data), len(axes)):
            axes[idx].set_visible(False)
        
        plt.suptitle('Clique Size Distributions for Keller Graphs', fontsize=16, fontweight='bold')
        plt.tight_layout()
        
        # Save plot
        os.makedirs(output_dir, exist_ok=True)
        plt.savefig(f'{output_dir}/clique_size_distributions.png', dpi=300, bbox_inches='tight')
        plt.savefig(f'{output_dir}/clique_size_distributions.pdf', bbox_inches='tight')
        
        if show_plot:
            plt.show()
        else:
            plt.close()
        
        print(f"  Saved: {output_dir}/clique_size_distributions.png")
    
    def plot_success_rate_analysis(self, output_dir='keller_analysis_plots', show_plot=True):
        """Plot success rate analysis across dimensions."""
        print("Creating success rate analysis plot...")
        
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
        
        # Prepare data
        dimensions = []
        success_rates = []
        vertices = []
        
        for dim, stats in self.comparative_stats.items():
            dimensions.append(dim)
            success_rates.append(stats['success_rate'])
            vertices.append(stats['vertices'])
        
        # Plot 1: Success rate vs dimension
        bars1 = ax1.bar([f'K({d})' for d in dimensions], success_rates, 
                       color=plt.cm.viridis(np.linspace(0, 1, len(dimensions))))
        
        # Add value labels on bars
        for bar, rate in zip(bars1, success_rates):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2, height + 1,
                    f'{rate:.1f}%', ha='center', va='bottom', fontsize=10)
        
        ax1.set_xlabel('Keller Graph Dimension', fontsize=12)
        ax1.set_ylabel('Success Rate (%)', fontsize=12)
        ax1.set_title('Success Rate by Dimension', fontsize=14, fontweight='bold')
        ax1.set_ylim(0, 105)
        ax1.grid(True, alpha=0.3, axis='y')
        
        # Plot 2: Success rate vs graph size (log scale)
        ax2.scatter(vertices, success_rates, s=150, alpha=0.7, 
                   c=dimensions, cmap='viridis', edgecolors='black')
        
        # Add labels
        for dim, v, sr in zip(dimensions, vertices, success_rates):
            ax2.text(v, sr, f' K({dim})', fontsize=9, va='center')
        
        # Add trend line
        if len(vertices) > 1:
            z = np.polyfit(np.log10(vertices), success_rates, 1)
            p = np.poly1d(z)
            x_log = np.linspace(min(np.log10(vertices)), max(np.log10(vertices)), 100)
            ax2.plot(10**x_log, p(x_log), 'r--', alpha=0.8, label=f'Trend: y={z[0]:.2f}log(x)+{z[1]:.2f}')
        
        ax2.set_xscale('log')
        ax2.set_xlabel('Number of Vertices (log scale)', fontsize=12)
        ax2.set_ylabel('Success Rate (%)', fontsize=12)
        ax2.set_title('Success Rate vs Graph Size', fontsize=14, fontweight='bold')
        ax2.grid(True, alpha=0.3)
        ax2.legend()
        
        plt.suptitle('Success Rate Analysis for Maximum Clique Finding', 
                    fontsize=16, fontweight='bold')
        plt.tight_layout()
        
        # Save plot
        os.makedirs(output_dir, exist_ok=True)
        plt.savefig(f'{output_dir}/success_rate_analysis.png', dpi=300, bbox_inches='tight')
        plt.savefig(f'{output_dir}/success_rate_analysis.pdf', bbox_inches='tight')
        
        if show_plot:
            plt.show()
        else:
            plt.close()
        
        print(f"  Saved: {output_dir}/success_rate_analysis.png")
    
    # =========================================================================
    # 3. COMPLEXITY ANALYSIS VISUALIZATIONS
    # =========================================================================
    
    def plot_operations_analysis(self, output_dir='keller_analysis_plots', show_plot=True):
        """Plot operations analysis across dimensions."""
        print("\nCreating operations analysis plots...")
        
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        
        # Prepare data
        dimensions = []
        avg_ops = []
        avg_ops_std = []
        vertices = []
        ops_per_second = []
        
        for dim, stats in self.comparative_stats.items():
            dimensions.append(dim)
            avg_ops.append(stats['avg_ops'])
            avg_ops_std.append(stats['avg_ops_std'])
            vertices.append(stats['vertices'])
            
            # Calculate average ops per second from data
            df = self.data[dim]
            ops_per_second.append(df['operations_per_second'].mean())
        
        # Plot 1: Average operations by dimension
        x_pos = np.arange(len(dimensions))
        bars1 = ax1.bar(x_pos, avg_ops, yerr=avg_ops_std, capsize=5,
                       color=plt.cm.coolwarm(np.linspace(0, 1, len(dimensions))))
        
        ax1.set_xlabel('Keller Graph Dimension', fontsize=12)
        ax1.set_ylabel('Average Operations', fontsize=12)
        ax1.set_title('Average Operations by Dimension', fontsize=14, fontweight='bold')
        ax1.set_xticks(x_pos)
        ax1.set_xticklabels([f'K({d})' for d in dimensions])
        ax1.set_yscale('log')
        ax1.grid(True, alpha=0.3, axis='y')
        
        # Add value labels
        for bar, ops in zip(bars1, avg_ops):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2, height * 1.05,
                    f'{ops:,.0f}', ha='center', va='bottom', fontsize=9)
        
        # Plot 2: Operations vs vertices (log-log)
        ax2.scatter(vertices, avg_ops, s=100, alpha=0.7, 
                   c=dimensions, cmap='viridis', edgecolors='black')
        
        # Add labels
        for dim, v, ops in zip(dimensions, vertices, avg_ops):
            ax2.text(v, ops, f' K({dim})', fontsize=9, va='center')
        
        # Fit power law
        if len(vertices) > 1:
            log_v = np.log10(vertices)
            log_ops = np.log10(avg_ops)
            z = np.polyfit(log_v, log_ops, 1)
            p = np.poly1d(z)
            
            x_fit = np.linspace(min(log_v), max(log_v), 100)
            ax2.plot(10**x_fit, 10**p(x_fit), 'r--', alpha=0.8, 
                    label=f'Fit: O(n^{z[0]:.2f})')
        
        ax2.set_xscale('log')
        ax2.set_yscale('log')
        ax2.set_xlabel('Number of Vertices (log scale)', fontsize=12)
        ax2.set_ylabel('Average Operations (log scale)', fontsize=12)
        ax2.set_title('Operations Complexity Analysis', fontsize=14, fontweight='bold')
        ax2.grid(True, alpha=0.3)
        ax2.legend()
        
        # Plot 3: Operations per second
        bars3 = ax3.bar(x_pos, ops_per_second, 
                       color=plt.cm.plasma(np.linspace(0, 1, len(dimensions))))
        
        ax3.set_xlabel('Keller Graph Dimension', fontsize=12)
        ax3.set_ylabel('Operations per Second', fontsize=12)
        ax3.set_title('Computational Throughput', fontsize=14, fontweight='bold')
        ax3.set_xticks(x_pos)
        ax3.set_xticklabels([f'K({d})' for d in dimensions])
        ax3.set_yscale('log')
        ax3.grid(True, alpha=0.3, axis='y')
        
        # Add value labels
        for bar, ops_sec in zip(bars3, ops_per_second):
            height = bar.get_height()
            ax3.text(bar.get_x() + bar.get_width()/2, height * 1.05,
                    f'{ops_sec:,.0f}', ha='center', va='bottom', fontsize=9)
        
        # Plot 4: Efficiency (size/operations)
        efficiencies = []
        for dim, stats in self.comparative_stats.items():
            df = self.data[dim]
            efficiency = df['size'] / df['total_operations']
            efficiencies.append(efficiency.mean() * 1e6)  # Size per million operations
        
        bars4 = ax4.bar(x_pos, efficiencies,
                       color=plt.cm.spring(np.linspace(0, 1, len(dimensions))))
        
        ax4.set_xlabel('Keller Graph Dimension', fontsize=12)
        ax4.set_ylabel('Size per Million Operations', fontsize=12)
        ax4.set_title('Algorithm Efficiency', fontsize=14, fontweight='bold')
        ax4.set_xticks(x_pos)
        ax4.set_xticklabels([f'K({d})' for d in dimensions])
        ax4.grid(True, alpha=0.3, axis='y')
        
        # Add value labels
        for bar, eff in zip(bars4, efficiencies):
            height = bar.get_height()
            ax4.text(bar.get_x() + bar.get_width()/2, height + 0.1,
                    f'{eff:.3f}', ha='center', va='bottom', fontsize=9)
        
        plt.suptitle('Computational Complexity Analysis', fontsize=16, fontweight='bold')
        plt.tight_layout()
        
        # Save plot
        os.makedirs(output_dir, exist_ok=True)
        plt.savefig(f'{output_dir}/operations_analysis.png', dpi=300, bbox_inches='tight')
        plt.savefig(f'{output_dir}/operations_analysis.pdf', bbox_inches='tight')
        
        if show_plot:
            plt.show()
        else:
            plt.close()
        
        print(f"  Saved: {output_dir}/operations_analysis.png")
    
    # =========================================================================
    # 4. SUMMARY DASHBOARD
    # =========================================================================
    
    def create_summary_dashboard(self, output_dir='keller_analysis_plots', show_plot=True):
        """Assemble the one-page overview figure and save it as PNG and PDF.

        The figure combines three bar charts (success rate, mean runtime,
        mean operations), a log-log scalability panel with twin y-axes, and a
        table of the per-dimension summary statistics.

        Parameters
        ----------
        output_dir : str
            Folder that receives ``summary_dashboard.png`` and ``.pdf``;
            created if missing.
        show_plot : bool
            Show the figure when true, otherwise close it after saving.
        """
        print("\nCreating summary dashboard...")

        def annotate_bars(axis, rects, values, lift, render):
            # Write each bar's value centred above it; `lift` maps the value
            # to the text's y position and `render` to the displayed string.
            for rect, value in zip(rects, values):
                axis.text(rect.get_x() + rect.get_width()/2, lift(value),
                          render(value), ha='center', va='bottom', fontsize=9)

        dashboard = plt.figure(figsize=(20, 16))

        # 4x4 layout shared by all panels
        grid = dashboard.add_gridspec(4, 4, hspace=0.3, wspace=0.3)

        dimensions = list(self.comparative_stats.keys())
        tick_names = [f'K({d})' for d in dimensions]
        color_ramp = np.linspace(0, 1, len(dimensions))

        # Panel 1 (row 0, col 0): success rate per dimension
        ax_success = dashboard.add_subplot(grid[0, 0])
        success_rates = [self.comparative_stats[d]['success_rate'] for d in dimensions]
        rects = ax_success.bar(tick_names, success_rates,
                               color=plt.cm.viridis(color_ramp))
        ax_success.set_ylabel('Success Rate (%)', fontsize=11)
        ax_success.set_title('Success Rate', fontsize=12, fontweight='bold')
        ax_success.set_ylim(0, 105)
        ax_success.grid(True, alpha=0.3, axis='y')
        annotate_bars(ax_success, rects, success_rates,
                      lambda rate: rate + 2, lambda rate: f'{rate:.1f}%')

        # Panel 2 (row 0, col 1): mean runtime per dimension
        ax_runtime = dashboard.add_subplot(grid[0, 1])
        avg_times = [self.comparative_stats[d]['avg_time_ms'] for d in dimensions]
        rects = ax_runtime.bar(tick_names, avg_times,
                               color=plt.cm.plasma(color_ramp))
        ax_runtime.set_ylabel('Average Runtime (ms)', fontsize=11)
        ax_runtime.set_title('Runtime Performance', fontsize=12, fontweight='bold')
        ax_runtime.grid(True, alpha=0.3, axis='y')
        annotate_bars(ax_runtime, rects, avg_times,
                      lambda ms: ms * 1.05, lambda ms: f'{ms:.2f}')

        # Panel 3 (row 0, col 2): mean operation count per dimension
        ax_ops = dashboard.add_subplot(grid[0, 2])
        avg_ops = [self.comparative_stats[d]['avg_ops'] for d in dimensions]
        rects = ax_ops.bar(tick_names, avg_ops,
                           color=plt.cm.coolwarm(color_ramp))
        ax_ops.set_ylabel('Average Operations', fontsize=11)
        ax_ops.set_title('Computational Cost', fontsize=12, fontweight='bold')
        ax_ops.set_yscale('log')
        ax_ops.grid(True, alpha=0.3, axis='y')
        annotate_bars(ax_ops, rects, avg_ops,
                      lambda count: count * 1.1, lambda count: f'{count:,.0f}')

        # Panel 4 (rows 1-2, cols 0-1): runtime and operations vs graph size
        ax_scale = dashboard.add_subplot(grid[1:3, 0:2])
        vertices = [self.comparative_stats[d]['vertices'] for d in dimensions]

        ax_scale.plot(vertices, avg_times, 'o-', linewidth=2, markersize=8,
                      label='Runtime (ms)', color='blue')
        ax_scale.set_xlabel('Number of Vertices', fontsize=11)
        ax_scale.set_ylabel('Runtime (ms)', fontsize=11, color='blue')
        ax_scale.tick_params(axis='y', labelcolor='blue')
        ax_scale.set_xscale('log')
        ax_scale.set_yscale('log')
        ax_scale.grid(True, alpha=0.3)

        # Operations share the x-axis but get their own y-axis on the right
        ax_scale_ops = ax_scale.twinx()
        ax_scale_ops.plot(vertices, avg_ops, 's--', linewidth=2, markersize=8,
                          label='Operations', color='red')
        ax_scale_ops.set_ylabel('Operations', fontsize=11, color='red')
        ax_scale_ops.tick_params(axis='y', labelcolor='red')
        ax_scale_ops.set_yscale('log')

        # Tag every runtime point with its dimension
        tag_box = dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7)
        for dim, n_vertices, runtime in zip(dimensions, vertices, avg_times):
            ax_scale.annotate(f'K({dim})', xy=(n_vertices, runtime), xytext=(5, 5),
                              textcoords='offset points', fontsize=9,
                              bbox=tag_box)

        ax_scale.set_title('Scalability Analysis', fontsize=12, fontweight='bold')

        # One legend covering the lines of both y-axes
        left_handles, left_labels = ax_scale.get_legend_handles_labels()
        right_handles, right_labels = ax_scale_ops.get_legend_handles_labels()
        ax_scale.legend(left_handles + right_handles,
                        left_labels + right_labels, loc='upper left')

        # Table panel (row 3, all columns): summary statistics
        ax_table = dashboard.add_subplot(grid[3, :])
        ax_table.axis('tight')
        ax_table.axis('off')

        headers = ['Dimension', 'Vertices', 'Target', 'Success%',
                   'Avg Time(ms)', 'Avg Ops', 'Avg Size', 'Vertices<10']

        table_data = []
        for dim in dimensions:
            summary = self.comparative_stats[dim]
            table_data.append([
                f'K({dim})',
                f'{summary["vertices"]:,}',
                f'{summary["target_size"]}',
                f'{summary["success_rate"]:.1f}%',
                f'{summary["avg_time_ms"]:.2f}',
                f'{summary["avg_ops"]:,.0f}',
                f'{summary["avg_size"]:.2f}',
                f'{summary["avg_low_indices"]:.2f}',
            ])

        table = ax_table.table(cellText=table_data, colLabels=headers,
                               loc='center', cellLoc='center')
        table.auto_set_font_size(False)
        table.set_fontsize(9)
        table.scale(1, 1.5)

        # Header row: white bold text on a dark background
        for col in range(len(headers)):
            header_cell = table[(0, col)]
            header_cell.set_facecolor('#40466e')
            header_cell.set_text_props(weight='bold', color='white')

        # Shade every second body row (table row 0 is the header)
        for row in range(2, len(dimensions) + 1, 2):
            for col in range(len(headers)):
                table[(row, col)].set_facecolor('#f2f2f2')

        ax_table.set_title('Summary Statistics', fontsize=14, fontweight='bold', y=0.95)

        plt.suptitle('Keller Graph Maximum Clique Analysis Dashboard',
                     fontsize=18, fontweight='bold', y=0.98)
        plt.tight_layout(rect=[0, 0, 1, 0.96])

        # Write both output formats
        os.makedirs(output_dir, exist_ok=True)
        plt.savefig(f'{output_dir}/summary_dashboard.png', dpi=300, bbox_inches='tight')
        plt.savefig(f'{output_dir}/summary_dashboard.pdf', bbox_inches='tight')

        if not show_plot:
            plt.close()
        else:
            plt.show()

        print(f"  Saved: {output_dir}/summary_dashboard.png")
    
    def generate_statistical_report(self, output_dir='keller_analysis_plots'):
        """Generate a comprehensive statistical report."""
        print("\nGenerating statistical report...")
        
        report_lines = []
        report_lines.append("=" * 80)
        report_lines.append("KELLER GRAPH MAXIMUM CLIQUE - STATISTICAL ANALYSIS REPORT")
        report_lines.append("=" * 80)
        report_lines.append("\n")
        
        # Overall summary
        report_lines.append("OVERALL SUMMARY")
        report_lines.append("-" * 40)
        report_lines.append(f"Dimensions analyzed: {len(self.data)}")
        report_lines.append(f"Total runs analyzed: {sum(len(df) for df in self.data.values())}")
        report_lines.append(f"Date of analysis: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report_lines.append("\n")
        
        # Detailed analysis for each dimension
        for dim, df in self.data.items():
            stats = self.comparative_stats[dim]
            
            report_lines.append(f"K({dim}) - DETAILED ANALYSIS")
            report_lines.append("-" * 40)
            report_lines.append(f"Graph size: {stats['vertices']:,} vertices")
            report_lines.append(f"Target clique size: {stats['target_size']}")
            report_lines.append(f"Number of runs: {stats['runs']}")
            report_lines.append(f"Success rate: {stats['success_rate']:.1f}%")
            report_lines.append("\n")
            
            # Performance metrics
            report_lines.append("PERFORMANCE METRICS:")
            report_lines.append(f"  Average runtime: {stats['avg_time_ms']:.2f} ms")
            report_lines.append(f"  Runtime std dev: {stats['avg_time_std']:.2f} ms")
            report_lines.append(f"  Average operations: {stats['avg_ops']:,.0f}")
            report_lines.append(f"  Operations std dev: {stats['avg_ops_std']:,.0f}")
            report_lines.append(f"  Average operations/second: {df['operations_per_second'].mean():,.0f}")
            report_lines.append("\n")
            
            # Clique statistics
            report_lines.append("CLIQUE STATISTICS:")
            report_lines.append(f"  Average size: {stats['avg_size']:.2f}")
            report_lines.append(f"  Size std dev: {stats['size_std']:.2f}")
            report_lines.append(f"  Minimum size: {df['size'].min()}")
            report_lines.append(f"  Maximum size: {df['size'].max()}")
            report_lines.append(f"  Median size: {df['size'].median():.2f}")
            report_lines.append("\n")
            
            # Clique structure
            report_lines.append("CLIQUE STRUCTURE:")
            report_lines.append(f"  Average vertices with index < 10: {stats['avg_low_indices']:.2f}")
            report_lines.append(f"  Contains vertices 0-5: {stats['contains_0_5_rate']:.1f}% of runs")
            report_lines.append(f"  Average minimum index: {df['min_index'].mean():.1f}")
            report_lines.append(f"  Average maximum index: {df['max_index'].mean():.1f}")
            report_lines.append(f"  Average index range: {df['max_index'].mean() - df['min_index'].mean():.1f}")
            report_lines.append("\n")
        
        # Comparative analysis
        report_lines.append("COMPARATIVE ANALYSIS ACROSS DIMENSIONS")
        report_lines.append("-" * 40)
        
        if len(self.data) > 1:
            # Calculate scalability metrics
            dimensions = list(self.comparative_stats.keys())
            vertices = [self.comparative_stats[d]['vertices'] for d in dimensions]
            avg_times = [self.comparative_stats[d]['avg_time_ms'] for d in dimensions]
            avg_ops = [self.comparative_stats[d]['avg_ops'] for d in dimensions]
            
            # Fit power laws
            log_v = np.log10(vertices)
            log_t = np.log10(avg_times)
            log_o = np.log10(avg_ops)
            
            z_time = np.polyfit(log_v, log_t, 1)
            z_ops = np.polyfit(log_v, log_o, 1)
            
            report_lines.append(f"Time complexity: O(n^{z_time[0]:.3f})")
            report_lines.append(f"Operation complexity: O(n^{z_ops[0]:.3f})")
            report_lines.append("\n")
            
            report_lines.append("DIMENSION | VERTICES | SUCCESS% | TIME(ms) | OPS")
            report_lines.append("-" * 60)
            
            for dim in dimensions:
                stats = self.comparative_stats[dim]
                report_lines.append(f"K({dim:1})     | {stats['vertices']:8,} | {stats['success_rate']:7.1f}% | "
                                  f"{stats['avg_time_ms']:8.2f} | {stats['avg_ops']:10,.0f}")
        
        # Save report
        os.makedirs(output_dir, exist_ok=True)
        report_path = f'{output_dir}/statistical_report.txt'
        with open(report_path, 'w') as f:
            f.write("\n".join(report_lines))
        
        print(f"  Saved: {report_path}")
        
        # Also save as CSV for easy import
        summary_df = pd.DataFrame.from_dict(self.comparative_stats, orient='index')
        summary_path = f'{output_dir}/summary_statistics.csv'
        summary_df.to_csv(summary_path, index=False)
        print(f"  Saved: {summary_path}")
    
    def run_complete_analysis(self, dimensions=[3, 4, 5, 6, 7], show_plots=True):
        """Run complete analysis with all visualizations."""
        print("=" * 80)
        print("KELLER GRAPH VISUALIZATION ANALYZER")
        print("=" * 80)
        
        # Load data
        self.load_all_data(dimensions)
        
        if not self.data:
            print("No data loaded. Exiting.")
            return
        
        # Create output directory
        output_dir = self.create_output_directory()
        
        # Generate all visualizations
        self.plot_runtime_distributions(output_dir, show_plots)
        self.plot_runtime_boxplots(output_dir, show_plots)
        self.plot_clique_size_distributions(output_dir, show_plots)
        self.plot_success_rate_analysis(output_dir, show_plots)
        self.plot_operations_analysis(output_dir, show_plots)
        self.create_summary_dashboard(output_dir, show_plots)
        self.generate_statistical_report(output_dir)
        
        print("\n" + "=" * 80)
        print("ANALYSIS COMPLETE!")
        print("=" * 80)
        print(f"\nAll visualizations and reports have been saved to: {output_dir}/")
        print("\nGenerated files:")
        print("  1. runtime_distributions.png/pdf")
        print("  2. runtime_boxplots.png/pdf")
        print("  3. clique_size_distributions.png/pdf")
        print("  4. success_rate_analysis.png/pdf")
        print("  5. operations_analysis.png/pdf")
        print("  6. summary_dashboard.png/pdf")
        print("  7. statistical_report.txt")
        print("  8. summary_statistics.csv")


# =============================================================================
# MAIN FUNCTION FOR JUPYTER
# =============================================================================

def analyze_keller_statistics(dimensions=None, show_plots=True):
    """
    Main function to run in Jupyter notebook.

    Parameters:
    -----------
    dimensions : list of int, optional
        Keller graph dimensions to analyze (default: [3, 4, 5, 6, 7])
    show_plots : bool
        Whether to display plots inline (default: True)

    Returns:
    --------
    analyzer : KellerVisualizationAnalyzer
        The analyzer object on which the complete analysis was run
    """
    # Resolve the default here instead of in the signature so that no list
    # object is shared between calls, and so a concrete list is always
    # forwarded to the analyzer.
    if dimensions is None:
        dimensions = [3, 4, 5, 6, 7]

    analyzer = KellerVisualizationAnalyzer()
    analyzer.run_complete_analysis(dimensions, show_plots)
    return analyzer

def quick_analysis():
    """Demonstration run restricted to K(5).

    Loads the K(5) data, prints three headline numbers and draws the runtime
    and clique-size plots. The analyzer is returned in every case; when no
    K(5) data was loaded nothing is printed or plotted here.
    """
    demo_dim = 5
    analyzer = KellerVisualizationAnalyzer()
    analyzer.load_all_data([demo_dim])  # Just K(5) for quick demo

    # Nothing to summarise when the K(5) data is not available
    if demo_dim not in analyzer.data:
        return analyzer

    print("\nQuick Analysis of K(5):")
    k5_runs = analyzer.data[demo_dim]

    success_pct = k5_runs['success'].mean() * 100
    print(f"Success rate: {success_pct:.1f}%")

    mean_runtime_ms = k5_runs['time'].mean() * 1000
    print(f"Average runtime: {mean_runtime_ms:.2f} ms")

    mean_clique_size = k5_runs['size'].mean()
    print(f"Average clique size: {mean_clique_size:.2f}")

    # Only the two key plots are produced for the demo
    target_dir = analyzer.create_output_directory()
    analyzer.plot_runtime_distributions(target_dir, show_plot=True)
    analyzer.plot_clique_size_distributions(target_dir, show_plot=True)

    return analyzer

# Usage hints printed once the cell has been executed
print(
    "Keller Visualization Analyzer loaded successfully!",
    "Available functions:",
    "  1. analyze_keller_statistics([3,4,5,6,7], show_plots=True)",
    "  2. quick_analysis()",
    "  3. analyzer = KellerVisualizationAnalyzer()",
    "     analyzer.load_all_data([5,6])",
    "     analyzer.plot_runtime_distributions(show_plot=True)",
    sep="\n",
)

Keller Visualization Analyzer loaded successfully!
Available functions:
  1. analyze_keller_statistics([3,4,5,6,7], show_plots=True)
  2. quick_analysis()
  3. analyzer = KellerVisualizationAnalyzer()
     analyzer.load_all_data([5,6])
     analyzer.plot_runtime_distributions(show_plot=True)


In [15]:
# FIXED Keller Statistics Generator with Correct Algorithm
import pickle
import random
import time
import csv
import statistics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from typing import List, Tuple, Dict
import warnings
# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Plot appearance: the matplotlib style is applied first, then the seaborn
# palette (the order of these two calls is kept as is)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Cell banner
print(
    "=" * 80,
    "KELLER GRAPH STATISTICS GENERATOR (FIXED VERSION)",
    "=" * 80,
    sep="\n",
)

# =============================================================================
# FIXED STATISTICS GENERATOR USING YOUR CORRECT ALGORITHM
# =============================================================================

class KellerStatisticsGeneratorFixed:
    """Randomized clique search on a Keller graph with per-trial statistics.

    Vertices are indices into ``codes_int``. Two vertices are treated as
    adjacent when the bitwise AND of their codes is non-zero; a pair whose
    AND is zero is a "conflict".
    """

    # Target clique size for each supported dimension
    MAX_CLIQUES = {3: 5, 4: 12, 5: 28, 6: 60, 7: 124}

    def __init__(self, dimension: int):
        """Load the vertex codes for ``dimension`` from the working directory.

        The codes are unpickled from ``keller_codes_d<dimension>.data``.
        A dimension missing from ``MAX_CLIQUES`` raises ``KeyError``; errors
        from opening the file are not caught here.
        """
        self.dimension = dimension
        self.MAX_CLIQUE_SIZE = self.MAX_CLIQUES[dimension]

        # NOTE(review): pickle.load can execute arbitrary code contained in
        # the file - only load code files from a trusted source.
        with open(f'keller_codes_d{dimension}.data', 'rb') as handle:
            self.codes_int = pickle.load(handle)

        self.n = len(self.codes_int)
        print(f"✓ Loaded K({dimension}): {self.n:,} vertices, target: {self.MAX_CLIQUE_SIZE}")

    def get_pair_conflict(self, u: int, v: int) -> int:
        """Return 0 when u and v share a code bit (edge), 1 otherwise (non-edge)."""
        shares_bit = (self.codes_int[u] & self.codes_int[v]) != 0
        return 0 if shares_bit else 1

    def compute_conflict_score(self, vertices: List[int]) -> int:
        """Count the non-adjacent pairs in ``vertices`` (0 for fewer than two)."""
        return sum(
            self.get_pair_conflict(first, vertices[j])
            for i, first in enumerate(vertices)
            for j in range(i + 1, len(vertices))
        )

    def get_vertex_conflict_contribution(self, vertex: int, vertices: List[int]) -> int:
        """Count the conflicts between ``vertex`` and the members of ``vertices``.

        Entries equal to ``vertex`` itself are ignored.
        """
        return sum(
            self.get_pair_conflict(vertex, other)
            for other in vertices
            if other != vertex
        )

    def find_worst_vertex(self, current_vertices: List[int]) -> int:
        """Return the position of the vertex involved in the most conflicts.

        Ties go to the earliest position; an empty list yields -1.
        """
        if not current_vertices:
            return -1

        def conflicts_at(position: int) -> int:
            rest = current_vertices[:position] + current_vertices[position + 1:]
            return self.get_vertex_conflict_contribution(current_vertices[position], rest)

        # max() keeps the first maximal element, i.e. the earliest position.
        # Every count is >= 0, so a position is always selected.
        return max(range(len(current_vertices)), key=conflicts_at)

    def find_best_replacement(self, vertices_minus_old: List[int]) -> int:
        """Return a sampled, unselected vertex giving the lowest conflict score.

        At most 2000 randomly sampled vertices are examined. Ties go to the
        vertex sampled first; -1 is returned when every sampled vertex is
        already selected.
        """
        already_chosen = set(vertices_minus_old)
        score_without = self.compute_conflict_score(vertices_minus_old)

        # Check only a subset for efficiency (max 2000 vertices)
        pool = random.sample(range(self.n), min(2000, self.n))

        def score_with(vertex: int) -> int:
            return score_without + self.get_vertex_conflict_contribution(vertex, vertices_minus_old)

        eligible = (vertex for vertex in pool if vertex not in already_chosen)
        # min() keeps the first minimal element, i.e. the earliest sampled one
        return min(eligible, key=score_with, default=-1)

    def expand_clique_fixed(self, clique: List[int]) -> List[int]:
        """Greedily grow a conflict-free vertex list up to ``MAX_CLIQUE_SIZE``.

        An empty input, or one that contains a conflict, is returned
        unchanged. Otherwise a copy is extended one vertex at a time with the
        first sampled vertex (at most 2000 sampled per step) adjacent to
        every current member, until the target size is reached or a sample
        contains no such vertex.
        """
        if not clique or self.compute_conflict_score(clique) > 0:
            return clique

        members = set(clique)
        grown = list(clique)

        # Only expand up to MAX_CLIQUE_SIZE
        while len(grown) < self.MAX_CLIQUE_SIZE:
            pool = random.sample(range(self.n), min(2000, self.n))
            addition = next(
                (
                    vertex
                    for vertex in pool
                    if vertex not in members
                    and all(self.get_pair_conflict(vertex, member) != 1 for member in grown)
                ),
                -1,
            )
            if addition == -1:
                break
            grown.append(addition)
            members.add(addition)

        return grown

    def greedy_hill_climbing_search(self, initial_size: int, iterations: int = 100) -> List[int]:
        """Hill-climb from a random vertex set towards a conflict-free one.

        Each step swaps the worst vertex for the best sampled replacement and
        is kept only when it strictly lowers the conflict count; the search
        stops at zero conflicts, at the first non-improving step, or after
        ``iterations`` steps. The returned set may still contain conflicts.
        """
        initial_size = min(initial_size, self.n)
        vertices = random.sample(range(self.n), initial_size)

        for _ in range(iterations):
            conflicts_now = self.compute_conflict_score(vertices)
            if conflicts_now == 0:
                return vertices

            worst_position = self.find_worst_vertex(vertices)
            if worst_position == -1:
                break

            remaining = vertices[:worst_position] + vertices[worst_position + 1:]
            replacement = self.find_best_replacement(remaining)
            if replacement == -1:
                break

            proposal = remaining + [replacement]
            if self.compute_conflict_score(proposal) - conflicts_now >= 0:
                # No strict improvement: stop with the current set
                break
            vertices = proposal

        return vertices

    def run_single_trial_fixed(self, trial_num: int, num_restarts: int = 50) -> Dict:
        """Run one timed trial of up to ``num_restarts`` random restarts.

        Returns a dict with the keys ``run``, ``time`` (seconds), ``size``,
        ``target_size``, ``success``, ``low_indices_0_9``, ``contains_0_5``,
        ``min_index``, ``max_index``, ``avg_index`` and ``clique_indices``
        describing the largest conflict-free set found.
        """
        started = time.perf_counter()

        best_clique = []
        best_size = 0

        # Bounds for the random initial search size (the same on every restart)
        smallest_start = max(1, self.MAX_CLIQUE_SIZE - 5)
        largest_start = min(self.MAX_CLIQUE_SIZE, self.n)

        for _ in range(num_restarts):
            if best_size == self.MAX_CLIQUE_SIZE:
                break

            start_size = random.randint(smallest_start, largest_start)
            found = self.greedy_hill_climbing_search(start_size, iterations=100)

            # Only conflict-free results are expanded and considered
            if self.compute_conflict_score(found) != 0:
                continue

            found = self.expand_clique_fixed(found)
            if len(found) > best_size:
                best_size = len(found)
                best_clique = found.copy()

        runtime = time.perf_counter() - started

        clique_size = len(best_clique)

        # Clique index statistics
        if best_clique:
            ordered = sorted(best_clique)
            low_indices = len([v for v in ordered if v < 10])
            contains_0_5 = set(range(6)) <= set(ordered)
            min_index = ordered[0]
            max_index = ordered[-1]
            avg_index = statistics.mean(ordered)
        else:
            low_indices, contains_0_5 = 0, False
            min_index, max_index, avg_index = -1, -1, 0

        return {
            'run': trial_num,
            'time': runtime,
            'size': clique_size,
            'target_size': self.MAX_CLIQUE_SIZE,
            'success': clique_size == self.MAX_CLIQUE_SIZE,
            'low_indices_0_9': low_indices,
            'contains_0_5': contains_0_5,
            'min_index': min_index,
            'max_index': max_index,
            'avg_index': avg_index,
            'clique_indices': best_clique
        }

def generate_statistics_fixed(dimension: int, num_runs: int = 10, num_restarts: int = 50) -> str:
    """Run repeated clique-search trials for K(dimension) and save them as CSV.

    Parameters
    ----------
    dimension : int
        Keller graph dimension; forwarded to ``KellerStatisticsGeneratorFixed``.
    num_runs : int
        Number of independent trials to run.
    num_restarts : int
        Restarts per trial, forwarded to ``run_single_trial_fixed``.

    Returns
    -------
    str or None
        Name of the CSV file written (``keller_<dimension>_statistics_fixed.csv``),
        or ``None`` when the generator raised ``FileNotFoundError``. Other
        exceptions raised while constructing the generator are not caught.
    """
    print(f"\n{'='*60}")
    print(f"Generating statistics for Keller K({dimension}) (FIXED ALGORITHM)")
    print(f"{'='*60}")

    try:
        generator = KellerStatisticsGeneratorFixed(dimension)
    except FileNotFoundError:
        print(f"✗ Error: keller_codes_d{dimension}.data not found!")
        return None

    results = []

    for run_num in range(1, num_runs + 1):
        result = generator.run_single_trial_fixed(run_num, num_restarts)
        results.append(result)

        # Show if we found the maximum
        if result['success']:
            print(f"    Trial {run_num}: Found maximum clique of size {result['size']}")

        # Progress is reported only after the trial has actually finished
        # (every 2 runs), so "Completed" never precedes the work it refers to.
        if run_num % 2 == 0:
            print(f"  Completed {run_num}/{num_runs} trials...")

    # Save to CSV
    filename = f'keller_{dimension}_statistics_fixed.csv'

    fieldnames = [
        'run', 'time', 'size', 'target_size', 'success',
        'low_indices_0_9', 'contains_0_5',
        'min_index', 'max_index', 'avg_index',
        'clique_indices'
    ]

    with open(filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for result in results:
            # Booleans and the index list are stored as their text form
            writer.writerow({
                **result,
                'contains_0_5': str(result['contains_0_5']),
                'success': str(result['success']),
                'clique_indices': str(result['clique_indices']),
            })

    print(f"✓ Saved {len(results)} trials to: {filename}")

    # With no trials there is nothing to average; mean() and the success
    # ratio below would both fail on an empty result list.
    if not results:
        print("  No trials were run; skipping summary.")
        return filename

    # Print summary
    sizes = [r['size'] for r in results]
    successes = sum(1 for r in results if r['success'])
    avg_time = statistics.mean(r['time'] for r in results) * 1000
    avg_size = statistics.mean(sizes)

    print(f"\n  Summary for K({dimension}):")
    print(f"    Success rate: {successes}/{len(results)} ({successes/len(results)*100:.1f}%)")
    print(f"    Average time: {avg_time:.2f} ms")
    print(f"    Average size: {avg_size:.2f}/{generator.MAX_CLIQUE_SIZE}")
    print(f"    Min size: {min(sizes)}")
    print(f"    Max size: {max(sizes)}")

    # Verify no invalid sizes
    invalid_sizes = [size for size in sizes if size > generator.MAX_CLIQUE_SIZE]
    if invalid_sizes:
        print(f"    ⚠️  WARNING: Found invalid sizes: {invalid_sizes}")

    return filename

# =============================================================================
# VISUALIZATION FUNCTIONS (SAME AS BEFORE)
# =============================================================================

def _coerce_bool_column(column):
    """Return ``column`` as a boolean Series.

    ``pd.read_csv`` parses a column holding only True/False text as booleans,
    so mapping the strings 'True'/'False' onto it would turn every value into
    NaN. Boolean and numeric columns are therefore cast directly; anything
    else is compared, case-insensitively, against the text 'true'.
    """
    if pd.api.types.is_bool_dtype(column) or pd.api.types.is_numeric_dtype(column):
        return column.astype(bool)
    return column.astype(str).str.strip().str.lower() == 'true'


def create_visualizations_fixed():
    """Create plots and a text report from the fixed-algorithm CSV files.

    Reads every ``keller_<d>_statistics_fixed.csv`` (d = 3..7) present in the
    working directory and writes into ``keller_visualizations_fixed/``:
    a runtime and a clique-size histogram per dimension, a success-rate bar
    chart plus ``comparative_statistics.csv`` when more than one dimension is
    available, and ``analysis_report.txt``.

    Returns
    -------
    str or None
        The output directory name, or ``None`` when no CSV file was found
        (the directory has already been created at that point).
    """
    print(f"\n{'='*80}")
    print("CREATING VISUALIZATIONS FROM FIXED DATA")
    print("=" * 80)

    # Create output directory
    output_dir = 'keller_visualizations_fixed'
    os.makedirs(output_dir, exist_ok=True)

    # Check which CSV files exist
    dimensions = []
    data = {}

    for d in [3, 4, 5, 6, 7]:
        filename = f'keller_{d}_statistics_fixed.csv'
        if not os.path.exists(filename):
            continue

        df = pd.read_csv(filename)

        # Normalise the flag columns regardless of how read_csv typed them
        for flag in ('contains_0_5', 'success'):
            if flag in df.columns:
                df[flag] = _coerce_bool_column(df[flag])

        dimensions.append(d)
        data[d] = df
        print(f"✓ Loaded data for K({d}): {len(df)} trials")

    if not dimensions:
        # Nothing is generated here; the caller has to produce the CSVs
        print("No CSV files found. Generate statistics first.")
        return

    print(f"\nCreating visualizations for dimensions: {dimensions}")

    # Create individual plots
    for dim, df in data.items():
        print(f"\nCreating plots for K({dim})...")

        # Runtime distribution
        plt.figure(figsize=(10, 6))
        plt.hist(df['time'] * 1000, bins=15, edgecolor='black', alpha=0.7)
        plt.xlabel('Runtime (ms)')
        plt.ylabel('Frequency')
        plt.title(f'K({dim}) - Runtime Distribution (Fixed Algorithm)')
        plt.grid(True, alpha=0.3)
        plt.savefig(f'{output_dir}/k{dim}_runtime.png', dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  ✓ Runtime distribution saved")

        # Clique size distribution
        plt.figure(figsize=(10, 6))
        target = df['target_size'].iloc[0]
        sizes = df['size']

        # One unit-wide bin centred on every integer size that occurred
        bins = np.arange(min(sizes)-0.5, max(sizes)+1.5)

        plt.hist(sizes, bins=bins, edgecolor='black', alpha=0.7)
        plt.axvline(x=target, color='red', linestyle='--', linewidth=2,
                   label=f'Target: {target}')
        plt.xlabel('Clique Size')
        plt.ylabel('Frequency')
        plt.title(f'K({dim}) - Clique Size Distribution (Fixed)')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig(f'{output_dir}/k{dim}_clique_size.png', dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  ✓ Clique size distribution saved")

    # Create comparative plots if we have multiple dimensions
    if len(dimensions) > 1:
        print(f"\n{'='*60}")
        print("Creating comparative plots across dimensions...")

        # Prepare comparative data
        comp_data = []
        for dim in dimensions:
            df = data[dim]
            comp_data.append({
                'dimension': dim,
                'vertices': 4**dim,
                'target': df['target_size'].iloc[0],
                'success_rate': df['success'].mean() * 100,
                'avg_time_ms': df['time'].mean() * 1000,
                'avg_size': df['size'].mean(),
                'max_size': df['size'].max(),
                'min_size': df['size'].min()
            })

        comp_df = pd.DataFrame(comp_data)

        # Success rate comparison
        plt.figure(figsize=(12, 6))
        colors = plt.cm.viridis(np.linspace(0, 1, len(dimensions)))
        bars = plt.bar([f'K({d})' for d in dimensions],
                      comp_df['success_rate'], color=colors)

        # Label every bar with its percentage
        for bar, rate in zip(bars, comp_df['success_rate']):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                    f'{rate:.1f}%', ha='center', va='bottom')

        plt.xlabel('Dimension')
        plt.ylabel('Success Rate (%)')
        plt.title('Success Rate Comparison (Fixed Algorithm)')
        plt.ylim(0, 105)
        plt.grid(True, alpha=0.3, axis='y')
        plt.savefig(f'{output_dir}/comparative_success.png', dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  ✓ Comparative success rate plot saved")

        # Save comparative data
        comp_df.to_csv(f'{output_dir}/comparative_statistics.csv', index=False)
        print(f"  ✓ Comparative statistics saved to CSV")

    # Generate report
    print(f"\n{'='*60}")
    print("Generating summary report...")

    report_lines = []
    report_lines.append("=" * 70)
    report_lines.append("KELLER GRAPH STATISTICAL ANALYSIS REPORT (FIXED ALGORITHM)")
    report_lines.append("=" * 70)
    report_lines.append(f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report_lines.append("")

    for dim in dimensions:
        df = data[dim]
        target = df['target_size'].iloc[0]
        report_lines.append(f"K({dim}) RESULTS:")
        report_lines.append(f"  Graph size: {4**dim:,} vertices")
        report_lines.append(f"  Target clique size: {target}")
        report_lines.append(f"  Number of trials: {len(df)}")
        report_lines.append(f"  Success rate: {df['success'].mean()*100:.1f}%")
        report_lines.append(f"  Average runtime: {df['time'].mean()*1000:.2f} ms")
        report_lines.append(f"  Runtime std dev: {df['time'].std()*1000:.2f} ms")
        report_lines.append(f"  Average clique size: {df['size'].mean():.2f}")
        report_lines.append(f"  Size std dev: {df['size'].std():.2f}")
        report_lines.append(f"  Minimum size found: {df['size'].min()}")
        report_lines.append(f"  Maximum size found: {df['size'].max()}")

        # Check for invalid sizes (larger than the target)
        invalid_count = int((df['size'] > target).sum())
        if invalid_count > 0:
            report_lines.append(f"  ⚠️  WARNING: {invalid_count} trials found invalid sizes!")

        report_lines.append("")

    # Save report. The warning line contains non-ASCII characters, so the
    # encoding is fixed instead of depending on the platform default.
    report_path = f'{output_dir}/analysis_report.txt'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(report_lines))

    print(f"✓ Summary report saved to: {report_path}")

    # Show sample data
    print(f"\n{'='*60}")
    print("SAMPLE DATA (First 3 trials from each dimension):")
    print("=" * 60)

    for dim in dimensions[:3]:  # Show first 3 dimensions
        df = data[dim]
        print(f"\nK({dim}) - First 3 trials:")
        sample_df = df.head(3)[['run', 'time', 'size', 'success']].copy()
        sample_df['time_ms'] = (sample_df['time'] * 1000).round(3)
        print(sample_df[['run', 'time_ms', 'size', 'success']].to_string())

    return output_dir

# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main_fixed():
    """Interactive entry point: read a menu choice and run the matching workflow.

    Options 1 and 2 generate statistics (10 runs, 100 restarts per run) for
    the selected dimensions and then build the visualizations. Option 3 runs
    a short K(5) test and prints a summary of its CSV. Option 4 only builds
    the visualizations from existing CSV files. Any other input falls back
    to a short K(5) statistics run.
    """
    supported = [3, 4, 5, 6, 7]

    print("\nChoose an option:")
    print("1. Generate statistics for all dimensions (3-7)")
    print("2. Generate statistics for specific dimensions")
    print("3. Quick test with K(5) only")
    print("4. Create visualizations from existing CSV files")

    choice = input("Enter choice (1-4): ").strip()

    if choice in ("1", "2"):
        if choice == "1":
            # Generate for all dimensions
            dimensions = supported
        else:
            # Get specific dimensions from user
            print("\nEnter dimensions separated by spaces (e.g., '3 4 5'):")
            try:
                requested = [int(d) for d in input("> ").split()]
            except ValueError:
                # A token that is not an integer; handled like "nothing usable"
                requested = []
            dimensions = [d for d in requested if d in supported]
            # Also covers input that parsed but named no supported dimension
            if not dimensions:
                print("Invalid input. Using default: 3, 4, 5")
                dimensions = [3, 4, 5]

        num_runs = 10
        num_restarts = 100

        for dim in dimensions:
            generate_statistics_fixed(dim, num_runs, num_restarts)

        # Create visualizations
        create_visualizations_fixed()

    elif choice == "3":
        # Quick test with K5
        print("\nQuick test with K(5)...")
        filename = generate_statistics_fixed(5, num_runs=5, num_restarts=50)

        if filename:
            # Load and display data
            df = pd.read_csv(filename)
            # read_csv may already have parsed the column as booleans, so go
            # through the text form instead of mapping 'True'/'False' keys
            # (which would turn parsed booleans into NaN).
            df['success'] = df['success'].astype(str).str.strip().str.lower() == 'true'

            print(f"\nK(5) Results Summary:")
            print(f"  Success rate: {df['success'].mean()*100:.1f}%")
            print(f"  Average time: {df['time'].mean()*1000:.2f} ms")
            print(f"  Clique sizes found: {sorted(df['size'].unique())}")

            # Check for invalid sizes
            target = df['target_size'].iloc[0]
            invalid = df[df['size'] > target]
            if len(invalid) > 0:
                print(f"  ⚠️  Found invalid sizes: {invalid['size'].tolist()}")
            else:
                print(f"  ✓ All cliques are valid (≤{target})")

    elif choice == "4":
        # Only create visualizations
        output_dir = create_visualizations_fixed()
        if output_dir:
            print(f"\n✓ Visualizations created in: {output_dir}/")

    else:
        # The fallback is a short K(5) run, and the message now says so
        print("Invalid choice. Running quick K(5) statistics instead.")
        generate_statistics_fixed(5, num_runs=5, num_restarts=50)

# For Jupyter notebook: start the interactive menu when this code runs as
# the main module
if __name__ == "__main__":
    main_fixed()

# Closing banner, printed whether or not the menu ran
print(
    f"\n{'='*80}",
    "PROGRAM EXECUTION COMPLETE",
    "=" * 80,
    sep="\n",
)

KELLER GRAPH STATISTICS GENERATOR (FIXED VERSION)

Choose an option:
1. Generate statistics for all dimensions (3-7)
2. Generate statistics for specific dimensions
3. Quick test with K(5) only
4. Create visualizations from existing CSV files


Enter choice (1-4):  1



Generating statistics for Keller K(3) (FIXED ALGORITHM)
✓ Loaded K(3): 64 vertices, target: 5
    Trial 1: Found maximum clique of size 5
  Completed 2/10 trials...
    Trial 2: Found maximum clique of size 5
    Trial 3: Found maximum clique of size 5
  Completed 4/10 trials...
    Trial 4: Found maximum clique of size 5
    Trial 5: Found maximum clique of size 5
  Completed 6/10 trials...
    Trial 6: Found maximum clique of size 5
    Trial 7: Found maximum clique of size 5
  Completed 8/10 trials...
    Trial 8: Found maximum clique of size 5
    Trial 9: Found maximum clique of size 5
  Completed 10/10 trials...
    Trial 10: Found maximum clique of size 5
✓ Saved 10 trials to: keller_3_statistics_fixed.csv

  Summary for K(3):
    Success rate: 10/10 (100.0%)
    Average time: 0.32 ms
    Average size: 5.00/5
    Min size: 5
    Max size: 5

Generating statistics for Keller K(4) (FIXED ALGORITHM)
✓ Loaded K(4): 256 vertices, target: 12
    Trial 1: Found maximum clique of size 

In [20]:
# FIXED: COMPREHENSIVE KELLER STATISTICS ANALYZER (ASCII-safe)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Plot appearance: the matplotlib style is applied first, then the seaborn
# palette (the order of these two calls is kept as is)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Cell banner
print(
    "=" * 80,
    "COMPREHENSIVE KELLER GRAPH STATISTICS ANALYZER",
    "=" * 80,
    sep="\n",
)

def load_and_fix_csv_data(dimensions=None):
    """Load the fixed-algorithm CSV files and normalise their columns.

    For every requested dimension ``d`` whose file
    ``keller_<d>_statistics_fixed.csv`` exists in the working directory, the
    file is read, the ``success`` and ``contains_0_5`` columns (when present)
    are converted to booleans, and ``clique_indices`` (when present) is
    parsed into a new ``clique_indices_parsed`` column of integer lists.

    Parameters
    ----------
    dimensions : iterable of int, optional
        Dimensions to look for. ``None`` (the default) stands for 3 to 7.

    Returns
    -------
    dict
        Maps each dimension that was found to its DataFrame.
    """
    if dimensions is None:
        dimensions = [3, 4, 5, 6, 7]

    def to_bool(column):
        # read_csv may deliver booleans, numbers or text. Only bool and
        # numeric columns can be cast directly: casting text would turn the
        # non-empty string 'False' into True.
        if pd.api.types.is_bool_dtype(column) or pd.api.types.is_numeric_dtype(column):
            return column.astype(bool)
        return column.map(lambda value: str(value).strip().lower() == 'true')

    def parse_clique_indices(x):
        # Already a list: keep it. Anything that is not bracketed text
        # (for example NaN from an empty cell) becomes an empty list.
        if isinstance(x, list):
            return x
        if not isinstance(x, str):
            return []
        text = x.strip()
        if not text.startswith('['):
            return []
        inner = text.strip('[]')
        if not inner:
            return []
        try:
            return [int(item.strip()) for item in inner.split(',')]
        except ValueError:
            # A non-integer entry makes the whole cell unusable
            return []

    print("Loading and processing CSV files...")
    print("-" * 60)

    data = {}

    for d in dimensions:
        filename = f'keller_{d}_statistics_fixed.csv'
        if not os.path.exists(filename):
            print(f"[--] K({d}): File not found")
            continue

        df = pd.read_csv(filename)

        # Fix boolean columns
        for flag in ('success', 'contains_0_5'):
            if flag in df.columns:
                df[flag] = to_bool(df[flag])

        # Parse clique_indices back to list. The parsed column is added only
        # when the source column exists, so a file without it still loads.
        if 'clique_indices' in df.columns:
            df['clique_indices_parsed'] = df['clique_indices'].apply(parse_clique_indices)

        data[d] = df
        print(f"[OK] K({d}): {len(df)} trials loaded")

    return data

def analyze_statistics(data):
    """Print per-dimension statistics, then build the plots and the report.

    ``data`` maps a dimension to its DataFrame. For each entry the headline
    numbers are printed and a summary row is collected; the rows are then
    handed to ``create_comprehensive_plots`` and
    ``generate_detailed_report_ascii``. The output directory
    ``keller_comprehensive_analysis`` is created if needed and its name is
    returned.
    """
    print(f"\n{'='*60}")
    print("STATISTICAL ANALYSIS")
    print("=" * 60)

    # Create output directory
    output_dir = 'keller_comprehensive_analysis'
    os.makedirs(output_dir, exist_ok=True)

    # One summary row per dimension
    summary_data = []

    for dim in data:
        df = data[dim]
        print(f"\nK({dim}) Analysis:")
        print("-" * 40)

        # Basic statistics
        vertices = 4**dim
        target = df['target_size'].iloc[0]
        trials = len(df)

        # Prefer the recorded flag; otherwise derive success from the size
        if 'success' in df.columns:
            success_rate = df['success'].mean() * 100
        else:
            success_rate = (df['size'] == target).mean() * 100

        runtimes = df['time']
        clique_sizes = df['size']
        avg_time = runtimes.mean() * 1000
        time_std = runtimes.std() * 1000
        avg_size = clique_sizes.mean()
        size_std = clique_sizes.std()
        fastest_ms = runtimes.min() * 1000
        slowest_ms = runtimes.max() * 1000

        print(f"  Graph size: {vertices:,} vertices")
        print(f"  Target clique: {target}")
        print(f"  Trials: {trials}")
        print(f"  Success rate: {success_rate:.1f}%")
        print(f"  Avg runtime: {avg_time:.2f} +/- {time_std:.2f} ms")
        print(f"  Avg clique size: {avg_size:.2f} +/- {size_std:.2f}")
        print(f"  Min runtime: {fastest_ms:.3f} ms")
        print(f"  Max runtime: {slowest_ms:.3f} ms")
        print(f"  Min clique size: {clique_sizes.min()}")
        print(f"  Max clique size: {clique_sizes.max()}")

        # Clique structure analysis over the indices of all trials combined
        if 'clique_indices_parsed' in df.columns:
            all_indices = [
                index
                for trial_indices in df['clique_indices_parsed']
                for index in trial_indices
            ]

            if all_indices:
                below_ten = len([index for index in all_indices if index < 10])
                print(f"  Clique index analysis:")
                print(f"    Unique vertices used: {len(set(all_indices))}")
                print(f"    Avg index: {np.mean(all_indices):.1f}")
                print(f"    Min index: {np.min(all_indices)}")
                print(f"    Max index: {np.max(all_indices)}")
                print(f"    Vertices < 10: {below_ten}")

        # Store for comparative analysis
        summary_data.append(dict(
            dimension=dim,
            vertices=vertices,
            target=target,
            trials=trials,
            success_rate=success_rate,
            avg_time_ms=avg_time,
            time_std_ms=time_std,
            avg_size=avg_size,
            size_std=size_std,
            min_time_ms=fastest_ms,
            max_time_ms=slowest_ms,
        ))

    # Create comprehensive plots
    create_comprehensive_plots(data, summary_data, output_dir)

    # Generate detailed report (ASCII-safe)
    generate_detailed_report_ascii(data, summary_data, output_dir)

    return output_dir

def create_comprehensive_plots(data, summary_data, output_dir):
    """Render the comparison figures and save them under ``output_dir``.

    Writes ``success_rate_comparison.png``, ``runtime_comparison.png``,
    ``scalability_analysis.png`` and ``runtime_distributions.png``, delegates
    the combined figure to ``create_performance_dashboard`` and finally dumps
    the summary table to ``summary_statistics.csv``.

    Args:
        data: Mapping of dimension -> DataFrame. Each frame must provide a
            ``time`` column; it is multiplied by 1000 and labelled as ms
            here (so it is presumably in seconds - confirm against the
            loader).
        summary_data: List of per-dimension dicts with at least the keys
            ``dimension``, ``vertices``, ``success_rate``, ``avg_time_ms``
            and ``time_std_ms`` (plus whatever the dashboard reads).
        output_dir: Directory that receives the output files; it is not
            created here.

    Returns:
        None. If ``summary_data`` is empty nothing is written.
    """
    print(f"\n{'='*60}")
    print("CREATING COMPREHENSIVE VISUALIZATIONS")
    print("=" * 60)

    summary_df = pd.DataFrame(summary_data)
    if summary_df.empty:
        # An empty frame has no columns, so every lookup below would raise.
        print("[SKIP] No summary data available - no plots created")
        return

    dimensions = summary_df['dimension']
    dim_labels = [f'K({d})' for d in dimensions]
    success_rates = summary_df['success_rate']
    avg_times = summary_df['avg_time_ms']
    time_stds = summary_df['time_std_ms']
    n_summary = len(summary_df)

    # 1. Success Rate Comparison
    fig, ax = plt.subplots(figsize=(14, 8))
    bars = ax.bar(dim_labels, success_rates,
                  color=plt.cm.viridis(np.linspace(0, 1, n_summary)))
    for bar, rate in zip(bars, success_rates):
        # The y axis is fixed to 0..105 (%), so a one-unit offset is safe here.
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                f'{rate:.1f}%', ha='center', va='bottom',
                fontsize=11, fontweight='bold')
    ax.set_xlabel('Keller Graph Dimension', fontsize=12)
    ax.set_ylabel('Success Rate (%)', fontsize=12)
    ax.set_title('Maximum Clique Finding Success Rate', fontsize=14, fontweight='bold')
    ax.set_ylim(0, 105)
    ax.grid(True, alpha=0.3, axis='y')
    fig.savefig(f'{output_dir}/success_rate_comparison.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("[OK] Success rate comparison plot saved")

    # 2. Runtime Comparison (with error bars)
    fig, ax = plt.subplots(figsize=(14, 8))
    x_pos = np.arange(n_summary)
    ax.bar(x_pos, avg_times, yerr=time_stds, capsize=10,
           color=plt.cm.plasma(np.linspace(0, 1, n_summary)))
    ax.set_xlabel('Keller Graph Dimension', fontsize=12)
    ax.set_ylabel('Average Runtime (ms)', fontsize=12)
    ax.set_title('Runtime Performance Across Keller Graphs', fontsize=14, fontweight='bold')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(dim_labels)
    ax.grid(True, alpha=0.3, axis='y')

    # Value labels sit a few *points* above the error bar. A fixed offset in
    # data units (ms) would push them far off the axes whenever the runtimes
    # are much smaller than that offset.
    for i, (avg, std) in enumerate(zip(avg_times, time_stds)):
        # std is NaN for a single trial; anchor on the bar top in that case.
        bar_top = avg + (std if np.isfinite(std) else 0.0)
        ax.annotate(f'{avg:.2f} +/- {std:.2f} ms', xy=(i, bar_top),
                    xytext=(0, 4), textcoords='offset points',
                    ha='center', va='bottom', fontsize=10)

    fig.savefig(f'{output_dir}/runtime_comparison.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("[OK] Runtime comparison plot saved")

    # 3. Scalability Analysis: runtime vs vertices on log-log axes
    fig, ax = plt.subplots(figsize=(14, 8))
    ax.scatter(summary_df['vertices'], avg_times, s=200,
               c=dimensions, cmap='viridis', edgecolors='black', alpha=0.8)
    for n_vertices, avg, dim in zip(summary_df['vertices'], avg_times, dimensions):
        ax.text(n_vertices, avg, f" K({int(dim)})", fontsize=11, va='center')

    # Fit a power law. log10 is only defined for strictly positive values, and
    # a line needs at least two distinct x positions.
    fit_rows = summary_df[(summary_df['vertices'] > 0) & (avg_times > 0)]
    if fit_rows['vertices'].nunique() > 1:
        log_v = np.log10(fit_rows['vertices'].to_numpy(dtype=float))
        log_t = np.log10(fit_rows['avg_time_ms'].to_numpy(dtype=float))
        slope, intercept = np.polyfit(log_v, log_t, 1)

        x_fit = np.linspace(log_v.min(), log_v.max(), 100)
        ax.plot(10**x_fit, 10**(slope * x_fit + intercept), 'r--',
                linewidth=2, alpha=0.8,
                label=f'Power law: O(n^{slope:.3f})')
        ax.legend(fontsize=11)

    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Number of Vertices (log scale)', fontsize=12)
    ax.set_ylabel('Average Runtime (ms, log scale)', fontsize=12)
    ax.set_title('Algorithm Scalability Analysis', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3, which='both')
    fig.savefig(f'{output_dir}/scalability_analysis.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("[OK] Scalability analysis plot saved")

    # 4. Runtime Distributions (one histogram per dimension)
    n_dims = len(data)
    if n_dims:
        n_cols = min(3, n_dims)
        n_rows = (n_dims + n_cols - 1) // n_cols

        # squeeze=False always yields a 2-D array, so flatten() works even
        # for a single subplot.
        fig, axes = plt.subplots(n_rows, n_cols,
                                 figsize=(6 * n_cols, 4 * n_rows), squeeze=False)
        axes = axes.flatten()

        for ax, (dim, df) in zip(axes, data.items()):
            times_ms = df['time'] * 1000
            ax.hist(times_ms, bins=15, edgecolor='black', alpha=0.7)

            # Statistics box in the top-right corner of the panel
            stats_text = (
                f"K({dim})\n"
                f"Mean: {times_ms.mean():.2f} ms\n"
                f"Std: {times_ms.std():.2f} ms\n"
                f"Min: {times_ms.min():.3f} ms\n"
                f"Max: {times_ms.max():.3f} ms"
            )
            ax.text(0.95, 0.95, stats_text, transform=ax.transAxes,
                    verticalalignment='top', horizontalalignment='right',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8),
                    fontsize=9)

            ax.set_xlabel('Runtime (ms)', fontsize=10)
            ax.set_ylabel('Frequency', fontsize=10)
            ax.set_title(f'K({dim}) Runtime Distribution', fontsize=12, fontweight='bold')
            ax.grid(True, alpha=0.3)

        # Hide the unused cells of the grid
        for spare_ax in axes[n_dims:]:
            spare_ax.set_visible(False)

        fig.suptitle('Runtime Distributions for Keller Graphs', fontsize=16, fontweight='bold')
        fig.tight_layout()
        fig.savefig(f'{output_dir}/runtime_distributions.png', dpi=300, bbox_inches='tight')
        plt.close(fig)
        print("[OK] Runtime distributions plot saved")
    else:
        # A zero-column grid cannot be built (it would divide by zero above).
        print("[SKIP] No per-trial data available - runtime distributions not plotted")

    # 5. Performance Dashboard
    create_performance_dashboard(summary_df, output_dir)

    # Save summary data
    summary_df.to_csv(f'{output_dir}/summary_statistics.csv', index=False)
    print("[OK] Summary statistics saved to CSV")

def create_performance_dashboard(summary_df, output_dir):
    """Draw the combined dashboard and save it as ``performance_dashboard.png``.

    Layout (3x3 grid): success-rate bars, runtime bars with error bars and a
    log-log vertices/runtime scatter on the top row; a min/avg/max runtime
    range plot across the middle row; a summary table across the bottom row.

    Args:
        summary_df: DataFrame with one row per dimension and the columns
            ``dimension``, ``vertices``, ``target``, ``success_rate``,
            ``avg_time_ms``, ``time_std_ms``, ``min_time_ms``,
            ``max_time_ms``, ``avg_size`` and ``size_std``.
        output_dir: Directory that receives the PNG; it is not created here.

    Returns:
        None. If ``summary_df`` is empty nothing is written.
    """
    if summary_df.empty:
        # An empty frame has no columns, so every lookup below would raise.
        print("[SKIP] No summary data available - dashboard not created")
        return

    fig = plt.figure(figsize=(20, 12))

    # Create grid layout
    gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

    dimensions = summary_df['dimension']
    dim_labels = [f'K({d})' for d in dimensions]
    n_rows = len(summary_df)
    x_pos = np.arange(n_rows)

    # 1. Success Rate (top left)
    ax1 = fig.add_subplot(gs[0, 0])
    success_rates = summary_df['success_rate']
    bars1 = ax1.bar(dim_labels, success_rates,
                    color=plt.cm.viridis(np.linspace(0, 1, n_rows)))
    ax1.set_ylabel('Success Rate (%)', fontsize=11)
    ax1.set_title('Success Rate', fontsize=12, fontweight='bold')
    ax1.set_ylim(0, 105)
    ax1.grid(True, alpha=0.3, axis='y')
    for bar, rate in zip(bars1, success_rates):
        ax1.text(bar.get_x() + bar.get_width() / 2, rate + 2,
                 f'{rate:.1f}%', ha='center', va='bottom', fontsize=10)

    # 2. Runtime (top middle)
    ax2 = fig.add_subplot(gs[0, 1])
    avg_times = summary_df['avg_time_ms']
    time_stds = summary_df['time_std_ms']
    ax2.bar(x_pos, avg_times, yerr=time_stds, capsize=5,
            color=plt.cm.plasma(np.linspace(0, 1, n_rows)))
    ax2.set_ylabel('Average Runtime (ms)', fontsize=11)
    ax2.set_title('Runtime Performance', fontsize=12, fontweight='bold')
    ax2.set_xticks(x_pos)
    ax2.set_xticklabels(dim_labels)
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Graph Size vs Runtime (top right)
    ax3 = fig.add_subplot(gs[0, 2])
    vertices = summary_df['vertices']
    ax3.scatter(vertices, avg_times, s=150, c=dimensions,
                cmap='viridis', edgecolors='black', alpha=0.8)
    ax3.set_xscale('log')
    ax3.set_yscale('log')
    ax3.set_xlabel('Vertices (log)', fontsize=11)
    ax3.set_ylabel('Runtime (ms, log)', fontsize=11)
    ax3.set_title('Scalability', fontsize=12, fontweight='bold')
    ax3.grid(True, alpha=0.3)

    # Add dimension labels next to each point
    for n_vertices, avg_t, dim in zip(vertices, avg_times, dimensions):
        ax3.text(n_vertices, avg_t, f" K({int(dim)})", fontsize=9, va='center')

    # 4. Time Range (middle row, full width)
    ax4 = fig.add_subplot(gs[1, :])
    min_times = summary_df['min_time_ms']
    max_times = summary_df['max_time_ms']

    for i, (min_t, max_t, avg_t) in enumerate(zip(min_times, max_times, avg_times)):
        ax4.plot([i, i], [min_t, max_t], 'k-', linewidth=2)
        ax4.plot(i, avg_t, 'ro', markersize=8)
        # Offsets are given in points, not data units: a fixed +/-5 ms shift
        # would throw the labels far off the axes for sub-millisecond ranges.
        ax4.annotate(f'{max_t:.1f}', xy=(i, max_t), xytext=(0, 4),
                     textcoords='offset points',
                     ha='center', va='bottom', fontsize=9)
        ax4.annotate(f'{min_t:.1f}', xy=(i, min_t), xytext=(0, -4),
                     textcoords='offset points',
                     ha='center', va='top', fontsize=9)

    ax4.set_xlabel('Dimension', fontsize=12)
    ax4.set_ylabel('Runtime Range (ms)', fontsize=12)
    ax4.set_title('Runtime Range Analysis', fontsize=14, fontweight='bold')
    ax4.set_xticks(range(n_rows))
    ax4.set_xticklabels(dim_labels)
    ax4.grid(True, alpha=0.3, axis='y')

    # 5. Summary Table (bottom row)
    ax5 = fig.add_subplot(gs[2, :])
    ax5.axis('tight')
    ax5.axis('off')

    # itertuples() keeps each column's own dtype. iterrows() would upcast an
    # all-numeric row to float and render integer columns as "64.0" / "5.0".
    table_data = [
        [
            f"K({int(row.dimension)})",
            f"{row.vertices:,}",
            f"{row.target}",
            f"{row.success_rate:.1f}%",
            f"{row.avg_time_ms:.2f} +/- {row.time_std_ms:.2f} ms",
            f"{row.avg_size:.2f} +/- {row.size_std:.2f}",
        ]
        for row in summary_df.itertuples(index=False)
    ]

    headers = ['Graph', 'Vertices', 'Target', 'Success%', 'Runtime (ms)', 'Clique Size']
    table = ax5.table(cellText=table_data, colLabels=headers,
                      loc='center', cellLoc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 1.8)

    # Style table: dark header row (row 0), light shading on even body rows
    for col in range(len(headers)):
        table[(0, col)].set_facecolor('#40466e')
        table[(0, col)].set_text_props(weight='bold', color='white')

    for body_row in range(2, n_rows + 1, 2):
        for col in range(len(headers)):
            table[(body_row, col)].set_facecolor('#f2f2f2')

    ax5.set_title('Performance Summary', fontsize=14, fontweight='bold', y=0.95)

    fig.suptitle('Keller Graph Maximum Clique Algorithm Performance Dashboard',
                 fontsize=18, fontweight='bold', y=0.98)
    # Leave the top 4% of the figure free for the suptitle
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    fig.savefig(f'{output_dir}/performance_dashboard.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("[OK] Performance dashboard saved")

def generate_detailed_report_ascii(data, summary_data, output_dir):
    """Write the ASCII-only text report and a CSV summary to ``output_dir``.

    Produces ``comprehensive_analysis_report.txt`` (UTF-8, but only ASCII
    characters are emitted) and ``detailed_summary.csv``. Every statement in
    the report is derived from ``summary_data``; nothing is asserted that the
    numbers do not support.

    Args:
        data: Mapping of dimension -> DataFrame of trials. Only the keys and
            the frame lengths are used here.
        summary_data: List of per-dimension dicts with the keys
            ``dimension``, ``vertices``, ``target``, ``trials``,
            ``success_rate`` (percent), ``avg_time_ms``, ``time_std_ms``,
            ``min_time_ms``, ``max_time_ms``, ``avg_size`` and ``size_std``.
        output_dir: Directory that receives both files; it is not created
            here.

    Returns:
        None.
    """
    print(f"\n{'='*60}")
    print("GENERATING DETAILED STATISTICAL REPORT")
    print("=" * 60)

    total_trials = sum(len(df) for df in data.values())
    # Weighted by trial count so that "x% of trials" below is literally true
    # even when the dimensions were run a different number of times.
    overall_success = _trial_weighted_success(summary_data)
    all_perfect = bool(summary_data) and all(
        sd['success_rate'] == 100 for sd in summary_data
    )

    report_lines = [
        "=" * 80,
        "COMPREHENSIVE KELLER GRAPH STATISTICAL ANALYSIS REPORT",
        "=" * 80,
        f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "EXECUTIVE SUMMARY",
        "-" * 40,
        f"Total trials analyzed: {total_trials}",
        f"Dimensions analyzed: {len(data)}",
        f"Overall success rate: {overall_success:.1f}%",
        "",
        "DETAILED RESULTS BY DIMENSION",
        "-" * 40,
    ]

    # First entry per dimension wins if a dimension appears more than once.
    summary_by_dim = {}
    for sd in summary_data:
        summary_by_dim.setdefault(sd['dimension'], sd)

    for dim in data:
        summary = summary_by_dim.get(dim)
        if summary is None:
            report_lines.append(f"\nK({dim}): no summary statistics available")
            continue

        rate = summary['success_rate']
        report_lines.extend([
            f"\nK({dim}):",
            "  Graph properties:",
            f"    Number of vertices: {4**dim:,}",
            f"    Expected maximum clique: {summary['target']}",
            "  ",
            "  Experimental results:",
            f"    Number of trials: {summary['trials']}",
            f"    Success rate: {rate:.1f}%",
            f"    Average runtime: {summary['avg_time_ms']:.2f} +/- {summary['time_std_ms']:.2f} ms",
            f"    Runtime range: {summary['min_time_ms']:.3f} - {summary['max_time_ms']:.3f} ms",
            f"    Average clique size: {summary['avg_size']:.2f} +/- {summary['size_std']:.2f}",
            "  ",
            "  Statistical analysis:",
            "    Coefficient of variation (runtime): "
            + _format_cv(summary['time_std_ms'], summary['avg_time_ms']),
        ])

        # Verdict line; every success rate gets one.
        if rate == 100:
            report_lines.append(f"    [PERFECT] All {summary['trials']} trials found maximum clique")
        elif rate >= 90:
            report_lines.append(f"    [EXCELLENT] {rate:.1f}% success rate")
        elif rate >= 80:
            report_lines.append(f"    [GOOD] {rate:.1f}% success rate")
        else:
            report_lines.append(f"    [LOW] {rate:.1f}% success rate")

    # Scalability analysis
    report_lines.extend(["\n" + "=" * 40, "SCALABILITY ANALYSIS", "=" * 40])

    vertices = np.array([sd['vertices'] for sd in summary_data], dtype=float)
    times = np.array([sd['avg_time_ms'] for sd in summary_data], dtype=float)
    # log10 needs strictly positive inputs, and a line needs two distinct x.
    usable = (vertices > 0) & (times > 0)
    if np.unique(vertices[usable]).size > 1:
        exponent = np.polyfit(np.log10(vertices[usable]), np.log10(times[usable]), 1)[0]
        report_lines.extend([
            "\nTime complexity analysis:",
            f"  Fitted power law: Runtime proportional to n^{exponent:.3f}",
            f"  Interpretation: O(n^{exponent:.3f}) time complexity",
            f"  Complexity class: {_complexity_label(exponent)}",
        ])
    elif len(summary_data) > 1:
        report_lines.append(
            "\nPower-law fit skipped: need at least two graph sizes "
            "with a positive average runtime."
        )

    # Algorithm performance summary
    report_lines.extend(["\n" + "=" * 40, "ALGORITHM PERFORMANCE SUMMARY", "=" * 40])
    if all_perfect:
        report_lines.append("\nThe hill climbing algorithm demonstrated exceptional performance:")
    else:
        report_lines.append("\nObserved performance of the hill climbing algorithm:")
    report_lines.extend(_performance_highlights(summary_data))

    report_lines.extend(["\n" + "=" * 40, "CONCLUSION", "=" * 40])
    if overall_success >= 90:
        report_lines.extend([
            "\nThe hill climbing algorithm with NAM (Non-Adjacency Mask) representation",
            "is highly effective for finding maximum cliques in Keller graphs.",
            f"The algorithm successfully found maximum cliques in {overall_success:.1f}% of trials",
            "across all tested dimensions, demonstrating both efficiency and reliability.",
        ])
    else:
        report_lines.extend([
            "\nThe hill climbing algorithm with NAM (Non-Adjacency Mask) representation",
            f"found maximum cliques in {overall_success:.1f}% of trials across all tested dimensions.",
            "The per-dimension results above show where it fell short of the target clique size.",
        ])

    # Save report with explicit encoding
    report_path = f'{output_dir}/comprehensive_analysis_report.txt'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(report_lines))

    print(f"[OK] Comprehensive report saved to: {report_path}")

    # Also save as CSV summary
    csv_rows = [
        {
            'Dimension': f"K({sd['dimension']})",
            'Vertices': sd['vertices'],
            'Target_Clique': sd['target'],
            'Trials': sd['trials'],
            'Success_Rate_%': sd['success_rate'],
            'Avg_Runtime_ms': sd['avg_time_ms'],
            'Runtime_Std_ms': sd['time_std_ms'],
            'Min_Runtime_ms': sd['min_time_ms'],
            'Max_Runtime_ms': sd['max_time_ms'],
            'Avg_Clique_Size': sd['avg_size'],
            'Size_Std': sd['size_std'],
        }
        for sd in summary_data
    ]
    pd.DataFrame(csv_rows).to_csv(f'{output_dir}/detailed_summary.csv', index=False)
    print("[OK] Detailed summary saved to CSV")


def _trial_weighted_success(summary_data):
    """Return the success rate (%) over all trials, or NaN without trials."""
    total = sum(sd['trials'] for sd in summary_data)
    if total <= 0:
        return float('nan')
    return sum(sd['success_rate'] * sd['trials'] for sd in summary_data) / total


def _format_cv(std, mean):
    """Format std/mean as a percentage; 'n/a' when the mean is zero."""
    if not mean:
        return "n/a"
    return f"{std / mean * 100:.1f}%"


def _complexity_label(exponent):
    """Name the growth regime implied by a fitted power-law exponent."""
    if exponent < 0.5:
        return "Strongly sub-linear (excellent)"
    if exponent < 1.0:
        return "Sub-linear (very good)"
    if exponent < 1.5:
        return "Linear to mildly super-linear (good)"
    if exponent < 2.0:
        return "Sub-quadratic (moderate)"
    return "Quadratic or worse (challenging for large n)"


def _performance_highlights(summary_data):
    """Build the numbered highlight lines from the measured summary values."""
    if not summary_data:
        return ["No results available to summarise."]

    ordered = sorted(summary_data, key=lambda sd: sd['dimension'])
    first_dim, last_dim = ordered[0]['dimension'], ordered[-1]['dimension']
    span = f"K({first_dim})" if first_dim == last_dim else f"K({first_dim}) through K({last_dim})"

    rates = [sd['success_rate'] for sd in ordered]
    if all(rate == 100 for rate in rates):
        success_line = f"1. 100% success rate for all Keller graphs {span}"
    else:
        success_line = (
            f"1. Success rate between {min(rates):.1f}% and {max(rates):.1f}% across {span}"
        )

    sub_ms = [f"K({sd['dimension']})" for sd in ordered if sd['avg_time_ms'] < 1.0]
    if sub_ms:
        runtime_line = f"2. Sub-millisecond average runtimes for {', '.join(sub_ms)}"
    else:
        runtime_line = "2. No dimension reached a sub-millisecond average runtime"

    avg_times = [sd['avg_time_ms'] for sd in ordered]
    total_trials = sum(sd['trials'] for sd in ordered)
    return [
        success_line,
        runtime_line,
        f"3. Average runtime between {min(avg_times):.2f} ms and {max(avg_times):.2f} ms across graph sizes",
        f"4. Results based on {total_trials} trials over {len(ordered)} dimension(s)",
    ]

# Main execution
def main():
    """Run the whole analysis: load the CSVs, analyse them, print a recap.

    Loads the per-dimension data with ``load_and_fix_csv_data``, hands it to
    ``analyze_statistics`` (which returns the output directory), lists the
    files that the analysis writes and prints a one-line summary per
    dimension. Returns early with a hint when no data could be loaded.
    """
    # Load data
    data = load_and_fix_csv_data()

    if not data:
        print("\nNo data loaded. Please run the statistics generator first.")
        return

    # Analyze statistics
    output_dir = analyze_statistics(data)

    print(f"\n{'='*80}")
    print("ANALYSIS COMPLETE!")
    print("=" * 80)
    print(f"\nAll analysis files have been saved to: {output_dir}/")
    print("\nGenerated files include:")
    generated_files = (
        "success_rate_comparison.png",
        "runtime_comparison.png",
        "scalability_analysis.png",
        "runtime_distributions.png",
        "performance_dashboard.png",
        "comprehensive_analysis_report.txt",
        "summary_statistics.csv",
        "detailed_summary.csv",
    )
    for number, filename in enumerate(generated_files, start=1):
        print(f"  {number}. {filename}")

    # Show quick summary
    print(f"\n{'='*60}")
    print("QUICK SUMMARY")
    print("=" * 60)

    for dim in sorted(data):
        df = data[dim]
        target = df['target_size'].iloc[0]
        # A trial counts as a success when it reached the target clique size.
        success_rate = (df['size'] == target).mean() * 100
        avg_time = df['time'].mean() * 1000

        # Same tiers as the written report, so that a poor result is not
        # reported as "[GOOD]" here.
        if success_rate == 100:
            status = "[PERFECT]"
        elif success_rate >= 90:
            status = "[EXCELLENT]"
        elif success_rate >= 80:
            status = "[GOOD]"
        else:
            status = "[LOW]"
        print(f"K({dim}): {status} {success_rate:.1f}% success, {avg_time:.2f} ms avg runtime")

# Run the analysis only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

COMPREHENSIVE KELLER GRAPH STATISTICS ANALYZER
Loading and processing CSV files...
------------------------------------------------------------
[OK] K(3): 10 trials loaded
[OK] K(4): 10 trials loaded
[OK] K(5): 10 trials loaded
[OK] K(6): 10 trials loaded
[OK] K(7): 10 trials loaded

STATISTICAL ANALYSIS

K(3) Analysis:
----------------------------------------
  Graph size: 64 vertices
  Target clique: 5
  Trials: 10
  Success rate: 100.0%
  Avg runtime: 0.32 +/- 0.16 ms
  Avg clique size: 5.00 +/- 0.00
  Min runtime: 0.053 ms
  Max runtime: 0.481 ms
  Min clique size: 5
  Max clique size: 5
  Clique index analysis:
    Unique vertices used: 37
    Avg index: 33.3
    Min index: 0
    Max index: 63
    Vertices < 10: 8

K(4) Analysis:
----------------------------------------
  Graph size: 256 vertices
  Target clique: 12
  Trials: 10
  Success rate: 100.0%
  Avg runtime: 0.47 +/- 0.31 ms
  Avg clique size: 12.00 +/- 0.00
  Min runtime: 0.104 ms
  Max runtime: 1.007 ms
  Min clique size