# Module

In [1]:
import sklearn
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

class SupervisedModelComparison:
    """
    A class to compare various supervised machine learning models.

    Attributes:
    ----------
    model_type : str
        Type of the model - 'c' for classification, 'r' for regression.
    features : pd.DataFrame
        Features for the model.
    target : pd.Series
        Target variable for the model.
    random_state : int
        Random state for reproducibility.
    test_size : float
        Proportion of the dataset to include in the test split.
    stratify : pd.Series or None
        If not None, data is split in a stratified fashion using this as the class labels.

    Methods:
    -------
    split_data():
        Splits the data into training and testing sets.
    train_models():
        Trains all available classifiers or regressors based on the model type.
    evaluate_classifier(friendly_name, train_preds, test_preds):
        Evaluates the performance of a classifier.
    evaluate_regressor(friendly_name, train_preds, test_preds):
        Evaluates the performance of a regressor.
    build_residuals_df(friendly_name, train_preds, test_preds):
        Builds the residuals dataframe for regressors.
    run_evaluation():
        Runs the evaluation of the trained models and returns an EvaluationObject.
    """
    
    def __init__(self, model_type, features, target, random_state=33, test_size=0.2, stratify=None):
        """
        Constructs all the necessary attributes for the SupervisedModelComparison object.

        Parameters:
        ----------
        model_type : str
            'c' for classification, 'r' for regression.
        features : pd.DataFrame
            DataFrame containing the features.
        target : pd.Series
            Series containing the target variable.
        random_state : int, optional
            Random state for reproducibility (default is 33).
        test_size : float, optional
            Proportion of the dataset to include in the test split (default is 0.2).
        stratify : pd.Series or None, optional
            Data is split in a stratified fashion using this as the class labels if not None.
        """
        
        if model_type not in ['c','r']:
            raise ValueError("For model_type, please enter 'c' for classification or 'r' for regression")
            
        if not isinstance(features, pd.DataFrame):
            raise ValueError("For the features argument, please pass the data as a DataFrame object.")
        
        if not isinstance(target, pd.Series):
            raise ValueError("For the target argument, please pass the data as a Series object.")
        
        self.model_type = model_type
        self.features = features
        self.target = target
        self.random_state = random_state
        self.test_size = test_size
        self.stratify = stratify
        
        self.X_train, self.X_test, self.y_train, self.y_test = self.split_data()
        
        self.classifier_evaluation = pd.DataFrame(columns=['Model', 'TP_fit', 'FP_fit', 'FN_fit', 'TN_fit', 'Precision_fit', 'Recall_fit', 'Specificity_fit', 'Accuracy_fit', 'F1_fit',
                                                            'TP_test', 'FP_test', 'FN_test', 'TN_test', 'Precision_test', 'Recall_test', 'Specificity_test', 'Accuracy_test', 'F1_test'])
        self.regressor_evaluation = pd.DataFrame(columns=['Model', 'MSE_fit', 'MAPE_fit', 'MAE_fit', 'R2_fit', 'RMSE_fit', 'Adjusted_R2_fit',
                                                           'MSE_test', 'MAPE_test', 'MAE_test', 'R2_test', 'RMSE_test', 'Adjusted_R2_test'])
        
        self.train_residuals = pd.DataFrame()
        self.test_residuals = pd.DataFrame()
        
    def split_data(self):
        """
        Splits the data into training and testing sets.

        Returns:
        -------
        tuple
            X_train, X_test, y_train, y_test
        """
        return train_test_split(self.features, self.target, test_size=self.test_size, random_state=self.random_state, stratify=self.stratify)
    
    def train_models(self):
        """
        Trains all available classifiers or regressors based on the model type.
        """
        if self.model_type == 'c':
            for friendly_name, classifier in sklearn.utils.all_estimators(type_filter='classifier'):
                try:
                    clf = classifier()
                    clf.fit(self.X_train, self.y_train)
                    train_preds = clf.predict(self.X_train)
                    test_preds = clf.predict(self.X_test)
                    self.evaluate_classifier(friendly_name, train_preds, test_preds)
                except Exception as e:
                    next
        else:
            for friendly_name, regressor in sklearn.utils.all_estimators(type_filter='regressor'):
                try:
                    rgr = regressor()
                    rgr.fit(self.X_train, self.y_train)
                    train_preds = rgr.predict(self.X_train)
                    test_preds = rgr.predict(self.X_test)
                    self.evaluate_regressor(friendly_name, train_preds, test_preds)
                    self.build_residuals_df(friendly_name, train_preds, test_preds)
                except Exception as e:
                    next
                    
    def evaluate_classifier(self, friendly_name, train_preds, test_preds):
        """
        Evaluates the performance of a classifier.

        Parameters:
        ----------
        friendly_name : str
            The name of the classifier.
        train_preds : array-like
            Predictions on the training set.
        test_preds : array-like
            Predictions on the testing set.
        """
        cm_train = sklearn.metrics.confusion_matrix(self.y_train, train_preds)
        cm_test = sklearn.metrics.confusion_matrix(self.y_test, test_preds)

        TP_fit, FP_fit, FN_fit, TN_fit = cm_train.ravel()
        TP_test, FP_test, FN_test, TN_test = cm_test.ravel()

        Precision_fit = sklearn.metrics.precision_score(self.y_train, train_preds)
        Recall_fit = sklearn.metrics.recall_score(self.y_train, train_preds)
        Specificity_fit = TN_fit / (TN_fit + FP_fit)
        Accuracy_fit = sklearn.metrics.accuracy_score(self.y_train, train_preds)
        F1_fit = sklearn.metrics.f1_score(self.y_train, train_preds)

        Precision_test = sklearn.metrics.precision_score(self.y_test, test_preds)
        Recall_test = sklearn.metrics.recall_score(self.y_test, test_preds)
        Specificity_test = TN_test / (TN_test + FP_test)
        Accuracy_test = sklearn.metrics.accuracy_score(self.y_test, test_preds)
        F1_test = sklearn.metrics.f1_score(self.y_test, test_preds)
        
        self.classifier_evaluation.loc[len(self.classifier_evaluation.index)] = [friendly_name, TP_fit, FP_fit, FN_fit, TN_fit, Precision_fit, Recall_fit, Specificity_fit, Accuracy_fit, F1_fit,
                                                                                TP_test, FP_test, FN_test, TN_test, Precision_test, Recall_test, Specificity_test, Accuracy_test, F1_test]
    
    def evaluate_regressor(self, friendly_name, train_preds, test_preds):
        """
        Evaluates the performance of a regressor.

        Parameters:
        ----------
        friendly_name : str
            The name of the regressor.
        train_preds : array-like
            Predictions on the training set.
        test_preds : array-like
            Predictions on the testing set.
        """
        n_train, k_train = self.X_train.shape
        n_test, k_test = self.X_test.shape

        MSE_fit = sklearn.metrics.mean_squared_error(self.y_train, train_preds)
        MAPE_fit = sklearn.metrics.mean_absolute_percentage_error(self.y_train, train_preds)
        MAE_fit = sklearn.metrics.mean_absolute_error(self.y_train, train_preds)
        R2_fit = sklearn.metrics.r2_score(self.y_train, train_preds)
        RMSE_fit = np.sqrt(MSE_fit)
        Adjusted_R2_fit = 1 - (1 - R2_fit) * (n_train - 1) / (n_train - k_train - 1)

        MSE_test = sklearn.metrics.mean_squared_error(self.y_test, test_preds)
        MAPE_test = sklearn.metrics.mean_absolute_percentage_error(self.y_test, test_preds)
        MAE_test = sklearn.metrics.mean_absolute_error(self.y_test, test_preds)
        R2_test = sklearn.metrics.r2_score(self.y_test, test_preds)
        RMSE_test = np.sqrt(MSE_test)
        Adjusted_R2_test = 1 - (1 - R2_test) * (n_test - 1) / (n_test - k_test - 1)
        
        self.regressor_evaluation.loc[len(self.regressor_evaluation.index)] = [friendly_name, MSE_fit, MAPE_fit, MAE_fit, R2_fit, RMSE_fit, Adjusted_R2_fit,
                                                                                MSE_test, MAPE_test, MAE_test, R2_test, RMSE_test, Adjusted_R2_test]
    
    def build_residuals_df(self, friendly_name, train_preds, test_preds):
        """
        Builds the residuals dataframe for regressors.

        Parameters:
        ----------
        friendly_name : str
            The name of the regressor.
        train_preds : array-like
            Predictions on the training set.
        test_preds : array-like
            Predictions on the testing set.
        """
        train_residuals = self.y_train - train_preds
        test_residuals = self.y_test - test_preds
        self.train_residuals = pd.concat([self.train_residuals, pd.Series(train_residuals).rename(f"{friendly_name}")], axis=1)
        self.test_residuals = pd.concat([self.test_residuals, pd.Series(test_residuals).rename(f"{friendly_name}")], axis=1)
        
    def run_evaluation(self):
        """
        Runs the evaluation of the trained models and returns an EvaluationObject.

        Returns:
        -------
        EvaluationObject
            An object containing evaluation results and residuals.
        """
        self.train_models()
        evaluation = EvaluationObject(self.train_residuals, self.test_residuals, self.regressor_evaluation, self.classifier_evaluation)
        return evaluation
    
class EvaluationObject:
    """
    Packaged object that is returned by the "run_evaluation" method of the SupervisedModelComparison class.

    Attributes:
    ----------
    train_residuals : pd.DataFrame
        Residuals for the training set.
    test_residuals : pd.DataFrame
        Residuals for the testing set.
    regressor_evaluation : pd.DataFrame
        Evaluation metrics for regressors.
    classifier_evaluation : pd.DataFrame
        Evaluation metrics for classifiers.
    """
    
    def __init__(self, train_residuals=None, test_residuals=None, regressor_evaluation=None, classifier_evaluation=None):
        """
        Constructs all the necessary attributes for the EvaluationObject.

        Parameters:
        ----------
        train_residuals : pd.DataFrame, optional
            Residuals for the training set.
        test_residuals : pd.DataFrame, optional
            Residuals for the testing set.
        regressor_evaluation : pd.DataFrame, optional
            Evaluation metrics for regressors.
        classifier_evaluation : pd.DataFrame, optional
            Evaluation metrics for classifiers.
        """
        self.train_residuals = train_residuals
        self.test_residuals = test_residuals
        self.regressor_evaluation = regressor_evaluation
        self.classifier_evaluation = classifier_evaluation


# Use module for regression task (model type r)

In [2]:
X = pd.DataFrame({
    'feature1': np.random.randn(100),
    'feature2': np.random.randn(100)
})
y = pd.Series(np.random.randn(100))  # 100 target values

In [3]:
comparator = SupervisedModelComparison (model_type = 'r', features = X, target = y)

In [4]:
evaluation_object = comparator.run_evaluation()

### returns an evaluation object with regressor_evaluation, train_residuals, and test_residuals attributes

In [5]:
evaluation_object.regressor_evaluation.head()

Unnamed: 0,Model,MSE_fit,MAPE_fit,MAE_fit,R2_fit,RMSE_fit,Adjusted_R2_fit,MSE_test,MAPE_test,MAE_test,R2_test,RMSE_test,Adjusted_R2_test
0,ARDRegression,0.958354,1.840266,0.764041,9.6e-05,0.978956,-0.025876,1.01278,0.97762,0.812493,-0.650299,1.00637,-0.844451
1,AdaBoostRegressor,0.424543,1.973611,0.546802,0.557051,0.65157,0.545545,0.952486,1.098379,0.796947,-0.552051,0.975954,-0.734645
2,BaggingRegressor,0.234219,1.025277,0.344334,0.755626,0.483962,0.749279,1.03631,1.373973,0.863494,-0.688639,1.017993,-0.887303
3,BayesianRidge,0.958393,1.839182,0.764071,5.5e-05,0.978976,-0.025918,1.012902,0.97755,0.8125,-0.650497,1.00643,-0.844673
4,DecisionTreeRegressor,0.0,0.0,0.0,1.0,0.0,1.0,1.652939,2.584112,1.038538,-1.693421,1.285667,-2.010294


In [6]:
evaluation_object.train_residuals.head()

Unnamed: 0,ARDRegression,AdaBoostRegressor,BaggingRegressor,BayesianRidge,DecisionTreeRegressor,DummyRegressor,ElasticNet,ElasticNetCV,ExtraTreeRegressor,ExtraTreesRegressor,...,RANSACRegressor,RadiusNeighborsRegressor,RandomForestRegressor,Ridge,RidgeCV,SGDRegressor,SVR,TheilSenRegressor,TransformedTargetRegressor,TweedieRegressor
78,-0.905063,-0.930652,-1.110223e-16,-0.904583,0.0,-0.903925,-0.903925,-0.903925,0.0,9.992007e-16,...,-1.689546,-0.943492,-0.506344,-1.000789,-0.989092,-0.961883,-1.159519,-1.205929,-1.002291,-0.947929
97,-0.013865,-0.402132,-0.0442509,-0.013846,0.0,-0.013818,-0.013818,-0.013818,0.0,-1.94289e-16,...,-0.054949,-0.306281,0.037631,-0.017618,-0.017193,0.006223,-0.124143,-0.076923,-0.017672,-0.015634
29,0.317571,0.115082,0.5001092,0.31791,0.0,0.318355,0.318355,0.318355,0.0,6.661338e-16,...,-0.090049,0.382372,0.436197,0.249758,0.258293,0.288369,0.100053,0.177553,0.248658,0.287863
84,0.204839,-0.149827,0.2751892,0.205156,0.0,0.205568,0.205568,0.205568,0.0,3.885781e-16,...,-0.14749,0.222087,0.206822,0.141352,0.149391,0.179828,0.09986,0.087922,0.140315,0.177156
38,1.498536,1.024636,1.054288,1.498373,0.0,1.498153,1.498153,1.498153,0.0,-2.220446e-16,...,1.743521,1.515514,0.592822,1.531206,1.527153,1.548846,1.664197,1.524771,1.531728,1.513004


In [7]:
evaluation_object.test_residuals.head()

Unnamed: 0,ARDRegression,AdaBoostRegressor,BaggingRegressor,BayesianRidge,DecisionTreeRegressor,DummyRegressor,ElasticNet,ElasticNetCV,ExtraTreeRegressor,ExtraTreesRegressor,...,RANSACRegressor,RadiusNeighborsRegressor,RandomForestRegressor,Ridge,RidgeCV,SGDRegressor,SVR,TheilSenRegressor,TransformedTargetRegressor,TweedieRegressor
56,0.472494,0.451013,0.894916,0.472443,1.189212,0.472398,0.472398,0.472398,1.189212,0.823193,...,0.392351,0.781342,0.920046,0.482913,0.481339,0.500089,0.780093,0.356023,0.48312,0.476361
90,-0.837875,-0.870878,-1.066289,-0.838348,-0.301024,-0.838971,-0.838971,-0.838971,-1.193695,-0.771739,...,-0.230625,-0.888465,-0.941086,-0.743274,-0.755153,-0.739689,-0.87836,-0.732599,-0.741743,-0.796361
95,-1.322186,-0.807251,-1.118879,-1.321941,0.504849,-1.321597,-1.321597,-1.321597,-2.057941,-1.181559,...,-1.769029,-1.643519,-1.181045,-1.37103,-1.365149,-1.340971,-1.730307,-1.526044,-1.371784,-1.34429
82,0.851627,1.436675,1.129346,0.851229,0.667899,0.850718,0.850718,0.850718,1.151118,1.328505,...,1.26223,0.993062,1.13176,0.931448,0.921248,0.935271,0.995771,0.873437,0.932765,0.886187
60,0.226522,-0.215907,-0.14143,0.226623,0.158798,0.226767,0.226767,0.226767,-0.533348,-0.278304,...,0.039457,0.068252,-0.296547,0.206311,0.208731,0.232482,-0.133346,0.113315,0.206001,0.21734


# Use module for classification task (model type c) 

In [8]:
X = pd.DataFrame({
    'feature1': np.random.randn(100),
    'feature2': np.random.randn(100)
})
y = pd.Series(np.random.randint(0, 2, size=100))

In [9]:
comparator = SupervisedModelComparison(model_type = 'c', features = X, target = y)

In [10]:
evaluation_object = comparator.run_evaluation()

### returns an evaluation object with classifier_evaluation

In [11]:
evaluation_object.classifier_evaluation.head()

Unnamed: 0,Model,TP_fit,FP_fit,FN_fit,TN_fit,Precision_fit,Recall_fit,Specificity_fit,Accuracy_fit,F1_fit,TP_test,FP_test,FN_test,TN_test,Precision_test,Recall_test,Specificity_test,Accuracy_test,F1_test
0,AdaBoostClassifier,38,3,3,36,0.923077,0.923077,0.923077,0.925,0.923077,5,3,6,6,0.666667,0.5,0.666667,0.55,0.571429
1,BaggingClassifier,40,1,2,37,0.973684,0.948718,0.973684,0.9625,0.961039,6,2,10,2,0.5,0.166667,0.5,0.4,0.25
2,BernoulliNB,22,19,18,21,0.525,0.538462,0.525,0.5375,0.531646,5,3,6,6,0.666667,0.5,0.666667,0.55,0.571429
3,CalibratedClassifierCV,24,17,21,18,0.514286,0.461538,0.514286,0.525,0.486486,4,4,7,5,0.555556,0.416667,0.555556,0.45,0.47619
4,DecisionTreeClassifier,41,0,0,39,1.0,1.0,1.0,1.0,1.0,5,3,7,5,0.625,0.416667,0.625,0.5,0.5
