In [1]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
import os
from typing import List, Tuple, Dict

# Breiman 2001 
*What I call the Rashomon Effect is that there is often a multitude of different descriptions \[equations f(x)] in a class of functions giving about the same minimum error rate. The most easily understood example is subset selection in linear regression. Suppose there are 30 variables and we want to find the best five variable linear regressions. There are about 140,000 five-variable subsets in competition. Usually we pick the one with the lowest residual sum-of-squares (RSS), or, if there is a test set, the lowest test error. But there may be (and generally are) many five-variable equations that have RSS within 1.0% of the lowest RSS (see Breiman, 1996a). The same is true if test set error is being measured. So here are three possible pictures with RSS or test set error within 1.0% of each other:*

*Picture 1 y = 2.1 + 3.8x3 - 0.6x8 + 83.2x12 - 2.1x17 + 3.2x27 <br>
Picture 2 y = -8.9 + 4.6x5 + 0.01x6 + 12.0x15 + 17.5X21 + 0.2X22 <br>
Picture 3 y = -76.7 + 9.3x2 + 22.0x7 - 13.2x8 + 3.4x11 + 7.2X28 <br>*

# Sampling noise is a cause of the Rashomon Effect
## The simulation demonstrates this by:
1. Generating a population of $N=10000$ observations with outcome $y$ and 30 correlated predictors $X$.
2. Drawing samples of different sizes from that population.
3. Computing the size of the Rashomon set of 5-variable subset regressions, i.e. the number of models whose RSS is within $\epsilon=0.01$ (1%) of the best-fitting model's RSS.
4. Plotting the size of the Rashomon set across the different sample sizes.

In [2]:
class RashomonDataGenerator:
    """Generates synthetic data for demonstrating the Rashomon Effect.

    A fixed population of correlated Gaussian predictors and a noisy linear
    outcome is simulated once at construction time; ``get_sample`` then draws
    rows from that population.

    All randomness comes from an instance-local ``np.random.RandomState``
    seeded with ``random_state``.  For a given seed this produces the same
    numbers as ``np.random.seed(random_state)`` followed by the global
    ``np.random.*`` calls would, but it neither reseeds nor depends on
    NumPy's process-global RNG, so draws are reproducible per instance.
    """
    
    def __init__(self, n_pop: int = 10000, n_vars: int = 30, random_state: int = 123):
        """
        Initialize the data generator.
        
        Parameters:
        -----------
        n_pop : int
            Population size
        n_vars : int
            Number of predictor variables
        random_state : int
            Random seed for reproducibility
        """
        self.n_pop = n_pop
        self.n_vars = n_vars
        self.random_state = random_state
        # Private RNG: keeps the global NumPy random state untouched.
        self._rng = np.random.RandomState(random_state)
        
        # Generate population data
        self.X_pop, self.y_pop = self._generate_population()
        
    def _create_correlation_matrix(self) -> np.ndarray:
        """
        Create correlation matrix with block structure to induce correlations.
        
        Returns:
        --------
        np.ndarray
            (n_vars, n_vars) matrix with unit diagonal, a baseline correlation
            of 0.3, and stronger within-group correlations for variables
            0-4 (0.7), 5-9 (0.6) and 10-14 (0.65).  Groups that extend past
            n_vars are clipped by the slice bounds.
        """
        Sigma = np.full((self.n_vars, self.n_vars), 0.3)  # baseline correlation
        
        # (start, stop, within-group correlation) for each block of variables
        groups = ((0, 5, 0.7), (5, 10, 0.6), (10, 15, 0.65))
        for start, stop, rho in groups:
            Sigma[start:stop, start:stop] = rho
        
        # The block assignments overwrite the diagonal, so set it once at the end
        np.fill_diagonal(Sigma, 1.0)
        
        return Sigma
    
    def _generate_population(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Generate population data with correlated predictors.
        
        Returns:
        --------
        Tuple[np.ndarray, np.ndarray]
            X of shape (n_pop, n_vars) and y of shape (n_pop,)
        """
        # Create correlation matrix
        Sigma = self._create_correlation_matrix()
        
        # Generate correlated predictors
        mean = np.zeros(self.n_vars)
        X = self._rng.multivariate_normal(mean, Sigma, size=self.n_pop)
        
        # Generate coefficients - many small effects
        true_coefs = self._rng.normal(0, 1, self.n_vars)
        
        # Make some coefficients larger (at most 10; capped so that fewer
        # than 10 predictors does not make the no-replacement draw fail)
        n_important = min(10, self.n_vars)
        important_vars = self._rng.choice(self.n_vars, n_important, replace=False)
        true_coefs[important_vars] *= 2
        
        # Generate outcome with moderate noise
        signal = X @ true_coefs
        noise_sd = np.std(signal) * 0.8  # noise is 80% of signal SD
        y = signal + self._rng.normal(0, noise_sd, self.n_pop)
        
        return X, y
    
    def get_sample(self, n_sample: int) -> pd.DataFrame:
        """
        Get a sample from the population.
        
        Rows are drawn without replacement when n_sample <= n_pop and with
        replacement otherwise.
        
        Parameters:
        -----------
        n_sample : int
            Sample size
            
        Returns:
        --------
        pd.DataFrame
            Sample data with predictors 'X0', 'X1', ..., 'X{n_vars-1}'
            followed by the outcome column 'y'
        """
        # Sampling with replacement is only needed when asking for more rows
        # than the population holds
        with_replacement = n_sample > self.n_pop
        idx = self._rng.choice(self.n_pop, n_sample, replace=with_replacement)
        
        # Create DataFrame
        data = pd.DataFrame(self.X_pop[idx], columns=[f'X{i}' for i in range(self.n_vars)])
        data['y'] = self.y_pop[idx]
        
        return data


In [3]:
class RashomonAnalyzer:
    """Analyzes the Rashomon Effect for linear regression models.

    For a given sample, every subset of ``n_vars_select`` predictors is fitted
    by ordinary least squares (with intercept) and the number of subsets whose
    residual sum of squares (RSS) lies within a relative tolerance of the best
    RSS is counted.
    """
    
    def __init__(self, n_vars_select: int = 5):
        """
        Initialize the analyzer.
        
        Parameters:
        -----------
        n_vars_select : int
            Number of variables to select in each model
        """
        self.n_vars_select = n_vars_select
        self.results = {}
        
    def calculate_rss(self, X: np.ndarray, y: np.ndarray) -> float:
        """
        Calculate residual sum of squares for a linear model with intercept.
        
        The fit is done directly with ``np.linalg.lstsq`` rather than through
        an estimator object: this method is called once per candidate subset
        (over a hundred thousand times per sample), so per-call estimator
        construction and input validation would dominate the run time.
        
        Parameters:
        -----------
        X : np.ndarray
            2-D predictor matrix of shape (n_samples, n_features)
        y : np.ndarray
            Outcome values with n_samples rows
            
        Returns:
        --------
        float
            Sum of squared residuals of the least-squares fit
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        
        # Centering X and y is equivalent to fitting an intercept
        X_centered = X - X.mean(axis=0)
        y_centered = y - y.mean(axis=0)
        
        coefs = np.linalg.lstsq(X_centered, y_centered, rcond=None)[0]
        # Compute residuals explicitly: lstsq only reports them for
        # full-rank, over-determined systems
        residuals = y_centered - X_centered @ coefs
        return float(np.sum(residuals ** 2))
    
    def find_rashomon_set(self, data: pd.DataFrame, 
                         thresholds: List[float] = [0.01, 0.02, 0.05]) -> Dict:
        """
        Find the Rashomon set for different thresholds.
        
        Parameters:
        -----------
        data : pd.DataFrame
            Sample data; the column 'y' is the outcome and every other column
            is treated as a predictor
        thresholds : List[float]
            List of relative RSS tolerances to evaluate (0.01 means "within
            1% of the best RSS").  The list is only read, never modified.
            
        Returns:
        --------
        Dict
            'min_rss': smallest RSS over all models,
            'rss_values': array with the RSS of every model,
            'threshold_counts': {threshold: number of models whose RSS is at
            most min_rss * (1 + threshold)}
            
        Raises:
        -------
        ValueError
            If n_vars_select is not between 1 and the number of predictors,
            because then there is no model to evaluate
        """
        # Get predictor names (exclude 'y')
        predictor_names = [col for col in data.columns if col != 'y']
        n_predictors = len(predictor_names)
        
        if not 1 <= self.n_vars_select <= n_predictors:
            raise ValueError(
                f"n_vars_select={self.n_vars_select} must be between 1 and the "
                f"number of predictors ({n_predictors})"
            )
        
        # Extract the arrays once; slicing the DataFrame inside the loop
        # would repeat the same conversion for every model
        X_all = data[predictor_names].to_numpy(dtype=float)
        y = data['y'].to_numpy(dtype=float)
        
        # Get all possible combinations of n_vars_select variables
        # (as column positions into X_all)
        all_combos = list(combinations(range(n_predictors), self.n_vars_select))
        n_models = len(all_combos)
        
        print(f"  Evaluating {n_models} possible {self.n_vars_select}-variable models...")
        
        # Calculate RSS for each combination
        rss_values = np.array([
            self.calculate_rss(X_all[:, list(combo)], y)
            for combo in tqdm(all_combos, desc="  Computing RSS")
        ])
        min_rss = np.min(rss_values)
        
        # Count models within each threshold
        results = {
            'min_rss': min_rss,
            'rss_values': rss_values,
            'threshold_counts': {}
        }
        
        print("  Models within various thresholds of best:")
        for threshold in thresholds:
            threshold_rss = min_rss * (1 + threshold)
            n_models_in_set = int(np.sum(rss_values <= threshold_rss))
            results['threshold_counts'][threshold] = n_models_in_set
            # ':g' instead of int(): int(0.29 * 100) truncates to 28
            print(f"    Within {threshold * 100:g}%: {n_models_in_set} models")
        
        return results

In [4]:
class RashomonSimulation:
    """Main simulation class for the Rashomon Effect.

    Draws one sample per requested sample size from a synthetic population,
    counts the models in the Rashomon set of each sample, and can write the
    collected counts to CSV.
    """
    
    def __init__(self, sample_sizes: List[int] = list(range(100, 2001, 100))):
        """
        Initialize the simulation.
        
        Parameters:
        -----------
        sample_sizes : List[int]
            List of sample sizes to test
        """
        # Copy the argument: the default list is created once and shared by
        # every call, so storing it directly would let one instance's edits
        # leak into the defaults of the next
        self.sample_sizes = list(sample_sizes)
        self.data_generator = RashomonDataGenerator()
        self.analyzer = RashomonAnalyzer()
        self.results = []
        self.detailed_results = []
        
    def run(self):
        """
        Run the full simulation.
        
        Fills ``self.results`` with one row per sample size (count at the 1%
        threshold) and ``self.detailed_results`` with one row per sample size
        and threshold.  Results of a previous call are discarded first, so
        calling ``run`` again does not produce duplicate rows.
        """
        print("Running Rashomon Effect simulation...")
        print("Data generated with correlated predictors and distributed effects\n")
        
        # Start from a clean slate so repeated runs do not accumulate rows
        self.results = []
        self.detailed_results = []
        
        for n in self.sample_sizes:
            print(f"Sample size: {n}")
            
            # Get sample
            sample_data = self.data_generator.get_sample(n)
            
            # Analyze Rashomon set
            rashomon_results = self.analyzer.find_rashomon_set(sample_data)
            
            # Store results (0.01 must be among the analyzer's thresholds)
            self.results.append({
                'sample_size': n,
                'n_models_rashomon': rashomon_results['threshold_counts'][0.01]
            })
            
            # Store detailed results
            for threshold, count in rashomon_results['threshold_counts'].items():
                self.detailed_results.append({
                    'sample_size': n,
                    'threshold': threshold,
                    'n_models': count
                })
            
            print()
        
    def save_results(self, output_dir: str = '../data'):
        """
        Save results to CSV files and print a summary.
        
        Writes 'result.csv' and 'detailed_results.csv' into output_dir
        (created if missing), then prints the 1%-threshold summary table and
        the average off-diagonal correlation of the generator's correlation
        matrix.
        
        Parameters:
        -----------
        output_dir : str
            Directory the CSV files are written to
        """
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        # Save main results
        results_df = pd.DataFrame(self.results)
        results_df.to_csv(os.path.join(output_dir, 'result.csv'), index=False)
        
        # Save detailed results
        detailed_df = pd.DataFrame(self.detailed_results)
        detailed_df.to_csv(os.path.join(output_dir, 'detailed_results.csv'), index=False)
        
        print(f"\nResults saved to '{output_dir}/result.csv' and '{output_dir}/detailed_results.csv'")
        print("\nSummary (1% threshold):")
        print(results_df)
        
        # Show correlation info (mean over the strict upper triangle, i.e.
        # each predictor pair counted once and the diagonal excluded)
        Sigma = self.data_generator._create_correlation_matrix()
        avg_corr = np.mean(Sigma[np.triu_indices_from(Sigma, k=1)])
        print(f"\nAverage correlation between predictors: {avg_corr:.3f}")
        print("This creates many competing models with similar performance")


In [5]:
def main():
    """Entry point: build a simulation with default settings, run it, then save its results."""
    simulation = RashomonSimulation()

    # Evaluate every configured sample size, then write the CSV output
    simulation.run()
    simulation.save_results()

In [6]:
# Run the simulation only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Running Rashomon Effect simulation...
Data generated with correlated predictors and distributed effects

Sample size: 100
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [01:02<00:00, 2264.95it/s]


  Models within various thresholds of best:
    Within 1%: 3 models
    Within 2%: 11 models
    Within 5%: 57 models

Sample size: 200
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [01:03<00:00, 2231.73it/s]


  Models within various thresholds of best:
    Within 1%: 4 models
    Within 2%: 7 models
    Within 5%: 60 models

Sample size: 300
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [01:08<00:00, 2085.24it/s]


  Models within various thresholds of best:
    Within 1%: 1 models
    Within 2%: 3 models
    Within 5%: 10 models

Sample size: 400
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [01:13<00:00, 1951.73it/s]


  Models within various thresholds of best:
    Within 1%: 1 models
    Within 2%: 1 models
    Within 5%: 11 models

Sample size: 500
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [01:18<00:00, 1825.08it/s]


  Models within various thresholds of best:
    Within 1%: 4 models
    Within 2%: 11 models
    Within 5%: 50 models

Sample size: 600
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [01:21<00:00, 1757.73it/s]


  Models within various thresholds of best:
    Within 1%: 2 models
    Within 2%: 5 models
    Within 5%: 26 models

Sample size: 700
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [01:25<00:00, 1660.72it/s]


  Models within various thresholds of best:
    Within 1%: 2 models
    Within 2%: 3 models
    Within 5%: 33 models

Sample size: 800
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [01:30<00:00, 1577.63it/s]


  Models within various thresholds of best:
    Within 1%: 5 models
    Within 2%: 13 models
    Within 5%: 87 models

Sample size: 900
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [01:35<00:00, 1485.50it/s]


  Models within various thresholds of best:
    Within 1%: 2 models
    Within 2%: 6 models
    Within 5%: 33 models

Sample size: 1000
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [01:40<00:00, 1425.06it/s]


  Models within various thresholds of best:
    Within 1%: 1 models
    Within 2%: 7 models
    Within 5%: 83 models

Sample size: 1100
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [01:44<00:00, 1362.80it/s]


  Models within various thresholds of best:
    Within 1%: 2 models
    Within 2%: 5 models
    Within 5%: 64 models

Sample size: 1200
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|█████████████████| 142506/142506 [16:50<00:00, 141.03it/s]


  Models within various thresholds of best:
    Within 1%: 3 models
    Within 2%: 6 models
    Within 5%: 46 models

Sample size: 1300
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [01:53<00:00, 1256.94it/s]


  Models within various thresholds of best:
    Within 1%: 3 models
    Within 2%: 13 models
    Within 5%: 124 models

Sample size: 1400
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [01:57<00:00, 1209.15it/s]


  Models within various thresholds of best:
    Within 1%: 3 models
    Within 2%: 3 models
    Within 5%: 44 models

Sample size: 1500
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [02:02<00:00, 1160.66it/s]


  Models within various thresholds of best:
    Within 1%: 1 models
    Within 2%: 3 models
    Within 5%: 62 models

Sample size: 1600
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [02:07<00:00, 1121.81it/s]


  Models within various thresholds of best:
    Within 1%: 1 models
    Within 2%: 2 models
    Within 5%: 30 models

Sample size: 1700
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [02:11<00:00, 1080.84it/s]


  Models within various thresholds of best:
    Within 1%: 1 models
    Within 2%: 5 models
    Within 5%: 33 models

Sample size: 1800
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|████████████████| 142506/142506 [02:16<00:00, 1045.34it/s]


  Models within various thresholds of best:
    Within 1%: 3 models
    Within 2%: 4 models
    Within 5%: 64 models

Sample size: 1900
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|█████████████████| 142506/142506 [02:36<00:00, 911.94it/s]


  Models within various thresholds of best:
    Within 1%: 8 models
    Within 2%: 11 models
    Within 5%: 130 models

Sample size: 2000
  Evaluating 142506 possible 5-variable models...


  Computing RSS: 100%|█████████████████| 142506/142506 [02:50<00:00, 835.36it/s]

  Models within various thresholds of best:
    Within 1%: 4 models
    Within 2%: 8 models
    Within 5%: 68 models


Results saved to '../data/result.csv' and '../data/detailed_results.csv'

Summary (1% threshold):
    sample_size  n_models_rashomon
0           100                  3
1           200                  4
2           300                  1
3           400                  1
4           500                  4
5           600                  2
6           700                  2
7           800                  5
8           900                  2
9          1000                  1
10         1100                  2
11         1200                  3
12         1300                  3
13         1400                  3
14         1500                  1
15         1600                  1
16         1700                  1
17         1800                  3
18         1900                  8
19         2000                  4

Average correlation between predictors: 0.324
This creates many competing models with similar performance


