# RashomonSetAnalyser class ver. 1.0

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import dalex as dx
import copy

In [2]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
# Load the breast-cancer data; as_frame=True requests pandas objects, so the
# feature names are available as column names for the PDP comparison below.
X, y = datasets.load_breast_cancer(return_X_y=True, as_frame=True)

# Search space for the forest size: 10, 20, 30, 40 (np.arange excludes the stop value 50).
params = {'n_estimators': list(np.arange(10, 50, 10, dtype = 'int'))}
# Estimator with default settings; passed below as the base estimator of the search.
rf = RandomForestClassifier()

In [3]:
class RashomonSetAnalyser:
    """
    Holds one base (best) model plus a set of alternative models and compares
    the partial-dependence profiles (PDP) of the alternatives with the base model.

    Every model is stored as a two-element list: ['model_name', model_object].

    Attributes (all None until the corresponding method has been run):
        base_model: the reference model, ['model_name', model_object].
        models: list of the remaining models, each ['model_name', model_object].
        rashomon_search_results: sorted search results stored by generate_rashomon_set.
        model_profiles: profiles saved by pdp_comparator(save_model_profiles=True);
            element 0 belongs to the base model, the rest follow the order of `models`.
        pdp_measures: the last data frame returned by pdp_comparator /
            pdp_comparator_change_metric.
    """

    def __init__(self):
        self.base_model = None
        self.models = None
        self.rashomon_search_results = None
        self.model_profiles = None
        self.pdp_measures = None

    def set_base_model(self, base_model):
        """
        If you want to use created (and maybe fitted before) models, you can assign them to class attributes with this method.
        With this method you assign the base model (the best one).

        argument: list: ['model_name', model_object]
        example: ['model_base', RandomForestClassifier(n_estimators = 30)]
        """
        self.base_model = base_model

    def set_models(self, models):
        """
        If you want to use created (and maybe fitted before) models, you can assign them to class attributes with this method.
        With this method you assign all models but the best one.

        argument: list of such lists: ['model_name', model_object]
        example: [['model1', RandomForestClassifier(n_estimators = 10)], ['model2', RandomForestClassifier(n_estimators = 20)]]
        """
        self.models = models

    def fit(self, X, y, *args, **kwargs):
        """
        Fits the base model and, if any are assigned, all other models.

        Extra positional and keyword arguments are forwarded to every model's fit().
        Each stored model object is replaced by the value returned from its fit().

        Raises RuntimeError if no base model has been assigned.
        """
        if self.base_model is None:
            raise RuntimeError("Models were not chosen")

        self.base_model[1] = self.base_model[1].fit(X, y, *args, **kwargs)

        # `models` may legitimately be None (only a base model assigned).
        for model in self.models or []:
            model[1] = model[1].fit(X, y, *args, **kwargs)

    def get_params(self):
        """
        Returns a dictionary {model_name: model_object.get_params()} covering the
        base model and all other assigned models.

        Raises RuntimeError if no base model has been assigned.
        """
        if self.base_model is None:
            raise RuntimeError("Models were not created.")

        params_by_name = {self.base_model[0]: self.base_model[1].get_params()}

        for model in self.models or []:
            params_by_name[model[0]] = model[1].get_params()

        return params_by_name

    @staticmethod
    def _build_models(estimator, results, rashomon_ratio):
        """
        Creates the list of non-base models from sorted search results.

        Row 0 of `results` is reserved for the base model; rows 1..n_models are
        turned into deep copies of `estimator` configured with that row's 'params'.
        The number of models is floor(len(results) * rashomon_ratio), bounded to
        at least 1 and at most len(results) - 1.
        """
        import copy
        import math

        # Same clamping for every caller, so both public entry points agree.
        rashomon_ratio = min(max(0, rashomon_ratio), 1)

        n_results = len(results.index)
        n_models = min(max(math.floor(n_results * rashomon_ratio), 1), n_results - 1)
        params = results["params"]

        models = []
        for rank in range(1, n_models + 1):
            candidate = copy.deepcopy(estimator)
            candidate.set_params(**params.iloc[rank])
            models.append(["Model " + str(rank), candidate])

        return models

    @staticmethod
    def _distance_function(metric):
        """
        Translates `metric` into a callable f(x_base, y_base, x_new, y_new).

        Built-in names:
            'abs_sum'   - sum of |y_base - y_new|
            'sum'       - sum of (y_base - y_new)
            'integrate' - sum of (y_base - y_new) * x_new
        Any callable is returned unchanged.

        Raises ValueError for an unknown name and TypeError for a value that is
        neither a string nor callable, instead of failing later at call time.
        """
        builtin_metrics = {
            'abs_sum': lambda x_base, y_base, x_new, y_new: np.sum(np.abs(y_base - y_new)),
            'sum': lambda x_base, y_base, x_new, y_new: np.sum(y_base - y_new),
            'integrate': lambda x_base, y_base, x_new, y_new: np.sum((y_base - y_new) * x_new),
        }

        if isinstance(metric, str):
            if metric not in builtin_metrics:
                raise ValueError(
                    "Unknown metric '" + metric + "'. Use one of "
                    + str(sorted(builtin_metrics)) + " or pass a callable."
                )
            return builtin_metrics[metric]

        if callable(metric):
            return metric

        raise TypeError("metric must be a metric name or a callable(x_base, y_base, x_new, y_new).")

    @staticmethod
    def _per_variable_distances(distance, base_result, new_result):
        """
        Applies `distance` to each variable's slice of two profile result frames.

        Both frames must provide the columns '_vname_', '_x_' and '_yhat_'.
        The rows are cut into consecutive chunks of equal length, one chunk per
        distinct '_vname_' of the base frame; this assumes that every variable
        contributes the same number of rows and that both frames share the same
        row layout.

        Returns a list with one distance per variable, in order of appearance.
        """
        x_base = base_result['_x_']
        y_base = base_result['_yhat_']
        x_new = new_result['_x_']
        y_new = new_result['_yhat_']

        n_chunks = len(base_result['_vname_'].unique())
        sample_length = y_base.size / base_result['_vname_'].nunique()

        distances = []
        for i in range(n_chunks):
            lower = int(i * sample_length)
            higher = int((i + 1) * sample_length)
            distances.append(distance(x_base[lower:higher], y_base[lower:higher],
                                      x_new[lower:higher], y_new[lower:higher]))

        return distances

    def generate_rashomon_set(self, X, y, base_estimator, searcher_type = 'random', rashomon_ratio = 0.1, *args, **kwargs):
        """
        Searches for the best models and keeps the best [rashomon_ratio] fraction of them.

        X, y: data passed to the search object's fit().
        base_estimator: estimator handed to the search and deep-copied for every kept model.
        searcher_type: 'random' (RandomizedSearchCV) or 'grid' (GridSearchCV).
        rashomon_ratio: fraction of search candidates kept as non-base models
            (clamped to [0, 1]; at least one and at most all-but-the-best are kept).
        *args, **kwargs: forwarded to the search object's constructor
            (e.g. param_grid / param_distributions).

        The best candidate becomes `base_model` ("Base model"), the following ones
        become `models` ("Model 1", "Model 2", ...). The created models carry the
        found parameters but are not fitted by this method; call fit() afterwards.

        Returns the search results sorted by 'mean_test_score' (descending), which
        are also stored in `rashomon_search_results`.

        Raises ValueError for an unknown searcher_type.
        """
        # Validate before importing / running anything expensive.
        if searcher_type not in ('random', 'grid'):
            raise ValueError("Wrong searcher type.")

        import copy
        from sklearn.model_selection import GridSearchCV
        from sklearn.model_selection import RandomizedSearchCV

        searcher_class = RandomizedSearchCV if searcher_type == 'random' else GridSearchCV
        searcher_object = searcher_class(base_estimator, *args, **kwargs)
        searcher_object.fit(X, y)

        results = (
            pd.DataFrame(searcher_object.cv_results_)
            .sort_values(by = 'mean_test_score', ascending = False)
            .reset_index(drop = True)
        )

        best_model = copy.deepcopy(base_estimator)
        best_model.set_params(**results["params"].iloc[0])
        self.base_model = ["Base model", best_model]

        self.models = self._build_models(base_estimator, results, rashomon_ratio)

        self.rashomon_search_results = results
        return results

    def change_rashomon_ratio(self, rashomon_ratio):
        """
        Changes the rashomon ratio after the set of models has been generated.

        Rebuilds `models` from the stored search results without running the
        search again. The new models are deep copies of the current base model
        object with the candidate parameters applied, so if the base model has
        already been fitted the copies carry that fitted state; call fit() again
        before comparing them.

        Raises RuntimeError if generate_rashomon_set has not been run.
        """
        if self.rashomon_search_results is None:
            raise RuntimeError("Models were not generated. Run generate_rashomon_set method.")

        self.models = self._build_models(self.base_model[1], self.rashomon_search_results, rashomon_ratio)

    def pdp_comparator(self, X, y, metric = 'abs_sum', save_model_profiles = False, variables = None):
        """
        Compares the pdp profile of every model with the base model's profile using the given metric.

        metric: 'abs_sum', 'sum', 'integrate' or a callable
            metric(x_base, y_base, x_new, y_new) that receives, for one variable,
            the '_x_' and '_yhat_' slices of the base and the compared profile.
        save_model_profiles: if True, the dalex model profiles are kept inside this
            object. It requires more memory, but different metrics can then be
            calculated very fast with the pdp_comparator_change_metric method.
        variables: list of feature names to restrict the profiles to. If it's None,
            all features will be calculated.

        Returns a data frame with a 'colname' column (variable names) and one column
        per model holding the distances; it is also stored in `pdp_measures`.

        Raises RuntimeError if no base model has been assigned, ValueError /
        TypeError for an invalid metric.
        """
        if self.base_model is None:
            raise RuntimeError("Models were not chosen")

        distance = self._distance_function(metric)

        import dalex as dx

        # Only pass `variables` when the caller restricted the feature set.
        profile_kwargs = {'verbose': False}
        if variables is not None:
            profile_kwargs['variables'] = variables

        def compute_profile(name, model_object):
            explainer = dx.Explainer(model_object, X, y, label = name, verbose = False)
            return explainer.model_profile(**profile_kwargs)

        profile_base = compute_profile(self.base_model[0], self.base_model[1])

        df = pd.DataFrame({'colname': profile_base.result['_vname_'].unique()})

        if save_model_profiles:
            self.model_profiles = [profile_base]

        # `models` may be None when only a base model was assigned.
        for model in self.models or []:
            profile = compute_profile(model[0], model[1])

            df[model[0]] = self._per_variable_distances(distance, profile_base.result, profile.result)

            if save_model_profiles:
                self.model_profiles.append(profile)

        self.pdp_measures = df
        return df

    def pdp_comparator_change_metric(self, metric):
        """
        Recalculates the comparison with a new metric from the saved profiles.

        You can use this method only if pdp_comparator was run with parameter
        save_model_profiles=True; no profiles are recomputed. Result columns are
        named after the current `models`, in order.

        Returns the new data frame, which is also stored in `pdp_measures`.

        Raises RuntimeError if no profiles were saved or if there are fewer
        current models than saved profiles (e.g. the models were changed after
        the profiles had been saved), ValueError / TypeError for an invalid metric.
        """
        if self.model_profiles is None:
            raise RuntimeError("Model profiles don't exist. Run pdp_comparator with parameter save_model_profiles = True to use this method.")

        distance = self._distance_function(metric)

        profile_base = self.model_profiles[0]
        other_profiles = self.model_profiles[1:]

        if len(self.models or []) < len(other_profiles):
            raise RuntimeError("Saved model profiles do not match the current models. Run pdp_comparator with parameter save_model_profiles = True again.")

        df = pd.DataFrame({'colname': profile_base.result['_vname_'].unique()})

        for model, profile in zip(self.models, other_profiles):
            df[model[0]] = self._per_variable_distances(distance, profile_base.result, profile.result)

        self.pdp_measures = df
        return df

### How does this class work?

In [4]:
rashomon = RashomonSetAnalyser()

In [5]:
rashomon.base_model is None

True

In [6]:
# Grid search over `params` (param_grid is forwarded to GridSearchCV). With
# rashomon_ratio = 1 every candidate except the best one is kept as a model.
rashomon.generate_rashomon_set(X, y, rf, searcher_type = 'grid', rashomon_ratio = 1, param_grid = params)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.031993,0.000545,0.003715,0.00024,20,{'n_estimators': 20},0.921053,0.947368,0.973684,0.973684,0.982301,0.959618,0.022572,1
1,0.04774,0.000555,0.004615,0.000216,30,{'n_estimators': 30},0.921053,0.929825,0.982456,0.982456,0.973451,0.957848,0.02681,2
2,0.061313,0.000671,0.005313,4.1e-05,40,{'n_estimators': 40},0.929825,0.947368,0.973684,0.973684,0.955752,0.956063,0.016644,3
3,0.021972,0.004642,0.0037,0.001641,10,{'n_estimators': 10},0.921053,0.938596,0.964912,0.929825,0.99115,0.949107,0.025639,4


In [7]:
rashomon.base_model

['Base model', RandomForestClassifier(n_estimators=20)]

In [8]:
rashomon.models

[['Model 1', RandomForestClassifier(n_estimators=30)],
 ['Model 2', RandomForestClassifier(n_estimators=40)],
 ['Model 3', RandomForestClassifier(n_estimators=10)]]

In [9]:
# Four search results * 0.6 -> floor(2.4) = 2, so only the two best non-base candidates remain.
rashomon.change_rashomon_ratio(0.6)

In [10]:
rashomon.models

[['Model 1', RandomForestClassifier(n_estimators=30)],
 ['Model 2', RandomForestClassifier(n_estimators=40)]]

### Fit models

In [11]:
rashomon.fit(X, y)

### Compare PDP

In [12]:
rashomon.pdp_comparator(X, y)

Unnamed: 0,colname,Model 1,Model 2
0,mean radius,1.185222,1.84525
1,mean texture,0.526611,0.7135
2,mean perimeter,0.710389,2.334167
3,mean area,2.665611,0.9975
4,mean smoothness,0.361667,0.528833
5,mean compactness,0.859444,1.929167
6,mean concavity,4.304778,3.469083
7,mean concave points,1.474833,0.570917
8,mean symmetry,0.334444,0.256917
9,mean fractal dimension,0.297,0.144583


In [13]:
rashomon.pdp_comparator(X, y, metric = 'integrate')

Unnamed: 0,colname,Model 1,Model 2
0,mean radius,-16.580795,-25.338178
1,mean texture,1.114686,-7.306956
2,mean perimeter,89.668022,249.486603
3,mean area,3897.433293,1426.948881
4,mean smoothness,-0.012758,-0.024162
5,mean compactness,0.147867,0.466473
6,mean concavity,-0.690292,-0.616823
7,mean concave points,0.145619,-0.012801
8,mean symmetry,-0.02251,-0.02598
9,mean fractal dimension,-0.023525,0.004539


### Compare PDP with parameter *save_model_profiles=True* and change metrics

In [14]:
# Keep the computed profiles on the object so that pdp_comparator_change_metric
# can apply other metrics without recomputing them.
rashomon.pdp_comparator(X, y, save_model_profiles = True)

Unnamed: 0,colname,Model 1,Model 2
0,mean radius,1.185222,1.84525
1,mean texture,0.526611,0.7135
2,mean perimeter,0.710389,2.334167
3,mean area,2.665611,0.9975
4,mean smoothness,0.361667,0.528833
5,mean compactness,0.859444,1.929167
6,mean concavity,4.304778,3.469083
7,mean concave points,1.474833,0.570917
8,mean symmetry,0.334444,0.256917
9,mean fractal dimension,0.297,0.144583


In [15]:
rashomon.pdp_comparator_change_metric(metric = 'integrate')

Unnamed: 0,colname,Model 1,Model 2
0,mean radius,-16.580795,-25.338178
1,mean texture,1.114686,-7.306956
2,mean perimeter,89.668022,249.486603
3,mean area,3897.433293,1426.948881
4,mean smoothness,-0.012758,-0.024162
5,mean compactness,0.147867,0.466473
6,mean concavity,-0.690292,-0.616823
7,mean concave points,0.145619,-0.012801
8,mean symmetry,-0.02251,-0.02598
9,mean fractal dimension,-0.023525,0.004539


In [16]:
rashomon.pdp_comparator_change_metric(metric = 'sum')

Unnamed: 0,colname,Model 1,Model 2
0,mean radius,-0.750556,-0.929417
1,mean texture,0.220389,-0.065833
2,mean perimeter,0.677056,1.4885
3,mean area,2.287167,0.954167
4,mean smoothness,-0.172222,-0.27
5,mean compactness,0.515778,1.859167
6,mean concavity,-1.451667,-1.473583
7,mean concave points,0.832389,-0.51725
8,mean symmetry,-0.098778,-0.085583
9,mean fractal dimension,-0.277444,0.062417


### Choose a subset of variables

In [17]:
# Restrict the comparison to two features; the saved profiles are replaced by
# the ones computed in this call.
rashomon.pdp_comparator(X, y, save_model_profiles = True, variables = ['mean area', 'worst area'])

Unnamed: 0,colname,Model 1,Model 2
0,mean area,2.665611,0.9975
1,worst area,4.799389,4.89325


In [18]:
rashomon.pdp_comparator_change_metric(metric = 'integrate')

Unnamed: 0,colname,Model 1,Model 2
0,mean area,3897.433293,1426.948881
1,worst area,10922.387068,10775.488239


In [19]:
rashomon.pdp_comparator_change_metric(metric = 'sum')

Unnamed: 0,colname,Model 1,Model 2
0,mean area,2.287167,0.954167
1,worst area,3.616167,3.66725
