In [13]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pickle
from tqdm import tqdm

In [17]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pickle
from tqdm import tqdm

def run_rashomon_simulation(thresholds=[0.01, 0.05], 
                           sample_sizes=[500, 1000],
                           n_iterations=100,
                           n_total=10000,
                           n_features=30,
                           n_signal_vars=10,
                           k=5,
                           noise_level=1.0,
                           save_results=True,
                           random_seed=42):
    """
    Run the Rashomon effect simulation with specified parameters.
    
    Parameters:
    -----------
    thresholds : list of float
        List of threshold values (e.g., [0.01, 0.05] for 1% and 5%)
    sample_sizes : list of int
        Sample sizes to test
    n_iterations : int
        Number of iterations per sample size
    n_total : int
        Total size of the dataset to generate
    n_features : int
        Number of features in the dataset
    n_signal_vars : int
        Number of features with true signal
    k : int
        Number of variables to select in each model
    noise_level : float
        Standard deviation of noise in the data
    save_results : bool
        Whether to save results to files
    random_seed : int
        Random seed for reproducibility
    
    Returns:
    --------
    results_df : pandas.DataFrame
        DataFrame containing all results
    """
    
    # Set random seed
    np.random.seed(random_seed)
    
    # Generate dataset
    print("Generating dataset...")
    X = np.random.randn(n_total, n_features)
    true_coefs = np.zeros(n_features)
    true_coefs[:n_signal_vars] = np.random.uniform(-3, 3, n_signal_vars)
    y = X @ true_coefs + noise_level * np.random.randn(n_total)
    
    # Initialize results storage
    results = {
        'sample_size': [],
        'iteration': [],
    }
    
    # Add columns for each threshold
    for threshold in thresholds:
        results[f'count_{int(threshold*100)}pct'] = []
    
    # Run simulations
    total_combinations = combinations(range(n_features), k)
    n_combinations = int(np.math.factorial(n_features) / 
                        (np.math.factorial(k) * np.math.factorial(n_features - k)))
    
    print(f"Total possible {k}-variable subsets: {n_combinations:,}")
    print(f"Testing thresholds: {[f'{t*100:.1f}%' for t in thresholds]}")
    
    for sample_size in sample_sizes:
        print(f"\nProcessing sample size: {sample_size}")
        
        for iteration in tqdm(range(n_iterations), desc=f"Sample size {sample_size}"):
            # Sample from full dataset
            indices = np.random.choice(len(X), sample_size, replace=False)
            X_sample = X[indices]
            y_sample = y[indices]
            
            # Find best RSS and count models within thresholds
            best_rss = float('inf')
            all_rss = []
            
            # Evaluate all k-combinations
            for feature_combo in combinations(range(n_features), k):
                X_subset = X_sample[:, list(feature_combo)]
                model = LinearRegression()
                model.fit(X_subset, y_sample)
                y_pred = model.predict(X_subset)
                rss = np.sum((y_sample - y_pred) ** 2)
                all_rss.append(rss)
                if rss < best_rss:
                    best_rss = rss
            
            # Count models within each threshold
            all_rss = np.array(all_rss)
            
            # Store results
            results['sample_size'].append(sample_size)
            results['iteration'].append(iteration)
            
            for threshold in thresholds:
                threshold_value = best_rss * (1 + threshold)
                count = np.sum(all_rss <= threshold_value)
                col_name = f'count_{int(threshold*100)}pct'
                results[col_name].append(count)
    
    # Convert to DataFrame
    results_df = pd.DataFrame(results)
    
    # Save results if requested
    if save_results:
        results_df.to_csv('rashomon_results.csv', index=False)
        with open('rashomon_results.pkl', 'wb') as f:
            pickle.dump(results_df, f)
        with open('rashomon_thresholds.pkl', 'wb') as f:
            pickle.dump(thresholds, f)
        print("\nResults saved to files.")
    
    # Print summary
    print_summary(results_df, thresholds)
    
    return results_df

def print_summary(results_df, thresholds):
    """Print summary statistics of the simulation results."""
    print("\nSummary Statistics:")
    print("-" * 60)
    
    for sample_size in sorted(results_df['sample_size'].unique()):
        subset = results_df[results_df['sample_size'] == sample_size]
        print(f"\nSample size: {sample_size}")
        
        for threshold in thresholds:
            col_name = f'count_{int(threshold*100)}pct'
            mean_val = subset[col_name].mean()
            std_val = subset[col_name].std()
            print(f"  Models within {threshold*100:.1f}% of best: "
                  f"{mean_val:.1f} ± {std_val:.1f}")

def quick_rashomon_demo():
    """
    Run a quick demonstration with smaller parameters for testing.
    """
    print("Running quick demonstration with reduced parameters...")
    
    # Use smaller parameters for quick demo
    results = run_rashomon_simulation(
        thresholds=[0.01, 0.02, 0.05],
        sample_sizes=[500, 1000],
        n_iterations=10,
        n_total=2000,
        n_features=10,
        n_signal_vars=5,
        k=3,
        save_results=False
    )
    
    return results



In [18]:
# Example usage in notebook:
if __name__ == "__main__":
    # For full simulation (this will take a while):
    results = run_rashomon_simulation(thresholds=[0.01, 0.05])
    
    # For quick test:
    # results = quick_rashomon_demo()

  n_combinations = int(np.math.factorial(n_features) /
  (np.math.factorial(k) * np.math.factorial(n_features - k)))


Generating dataset...
Total possible 5-variable subsets: 142,506
Testing thresholds: ['1.0%', '5.0%']

Processing sample size: 500


Sample size 500: 100%|█████████████████████| 100/100 [6:07:02<00:00, 220.22s/it]



Processing sample size: 1000


Sample size 1000: 100%|███████████████████| 100/100 [10:12:30<00:00, 367.50s/it]


Results saved to files.

Summary Statistics:
------------------------------------------------------------

Sample size: 500
  Models within 1.0% of best: 1.3 ± 0.5
  Models within 5.0% of best: 2.2 ± 1.0

Sample size: 1000
  Models within 1.0% of best: 1.2 ± 0.4
  Models within 5.0% of best: 2.1 ± 0.9





In [20]:
results.to_csv('../results/bootstrap_sim.csv', index=False)

In [22]:
results

Unnamed: 0,sample_size,iteration,count_1pct,count_5pct
0,500,0,3,4
1,500,1,2,2
2,500,2,2,3
3,500,3,2,3
4,500,4,2,2
...,...,...,...,...
195,1000,95,1,2
196,1000,96,1,2
197,1000,97,1,1
198,1000,98,1,1
