In [1]:
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, KFold,cross_validate
import sklearn
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.datasets import load_breast_cancer

In [2]:
# Seed shared by every stochastic estimator below so that repeated runs of the
# notebook fit identical models (without it the reported scores change on every run).
SEED = 42

# Candidate classifiers. The order must match `algorithm_names` one-to-one,
# because the two sequences are paired by position.
models = [
    KNeighborsClassifier(),
    LogisticRegression(solver='liblinear', random_state=SEED),
    DecisionTreeClassifier(random_state=SEED),
    BaggingClassifier(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    GradientBoostingClassifier(random_state=SEED),
    XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=SEED),
]

# Short display names, one per entry of `models` (same order).
algorithm_names = np.array([
    "Knn",
    "Lr",
    "Dt",
    "Bc",
    "Rf",
    "AdaB",
    "Gb",
    "XGB",
])

# scikit-learn scorer names evaluated for every model.
metrics = [
    'accuracy',
    'roc_auc',
]

# Hyper-parameter grids keyed by display name. Models without an entry here
# are evaluated with their default hyper-parameters (no inner search).
params = {
    'Knn': {
        'n_neighbors': [3, 10, 15],
    },
    'Dt': {
        'max_depth': [2, 4],
    },
    'Gb': {
        'learning_rate': [0.1, 0.3, 1],
    },
}



In [3]:
class Judge:
    """Benchmark several classifiers on one dataset with nested cross-validation.

    The object is configured through the chainable ``set_*`` methods and the
    evaluation is started with :meth:`get_final_performance`.

    Every model whose name has an entry in the parameter dictionary is wrapped
    in a cross-validated hyper-parameter search (inner folds); each model, or
    its search wrapper, is then scored on the outer folds for every metric.
    """

    # Values accepted for the ``find_params`` argument of get_final_performance.
    _SEARCH_STRATEGIES = ("GridSearchCV", "RandomizedSearchCV")

    def __init__(self, dataframe_name):
        """Create an unconfigured judge.

        Parameters
        ----------
        dataframe_name : str
            Label of the dataset; only used by ``__str__``.
        """
        self.dataframe_name = dataframe_name
        self.X = None
        self.y = None
        self.models = None
        self.algorithm_names = None
        self.params = None
        self.metrics = None

    def __str__(self):
        return f" Judging {self.dataframe_name}"

    def set_data(self, X, y):
        """Store the feature matrix ``X`` and target ``y``; returns ``self`` for chaining."""
        self.X = X
        self.y = y
        return self

    def set_algorithms_and_names(self, models, algorithm_names):
        """Store the estimators and their display names (paired by position); returns ``self``."""
        self.models = models
        self.algorithm_names = algorithm_names
        return self

    def set_params(self, params):
        """Store the ``{algorithm name: parameter grid}`` dictionary; returns ``self``."""
        self.params = params
        return self

    def set_metrics(self, metrics):
        """Store the scorer names to evaluate; returns ``self``."""
        self.metrics = metrics
        return self

    @staticmethod
    def Class_info(cls=None):
        """Return a plain-text description of the class and its methods.

        ``cls`` is unused. It used to be a required positional argument of this
        static method, which made ``Judge.Class_info()`` raise ``TypeError``;
        it is now optional so both call styles work.
        """
        class_info = """Judge class contains methods for optimizing machine learning 
        performance and metrics evaluation for binary classification. It uses nested cross validation.
        
        Methods: 
        set_data -> to introduce data. X for the independent variables, y for the target. 
        set_algorithms_and_names -> to introduce different models (list) and their names (array). 
        set_params -> giving parameters for optimizing performances (dictionary) 
        set_metrics -> to introduce metrics to evaluate
        get_final_performance -> to start the analysis 
        Hyperparameters get_final_performance:
        cv_inner_splits: inner cross validation  
        cv_outer_splits: outer cross validation 
        metric_to_optimize:
        metric to optimize during parameter tuning (default: roc_auc) 
        find_params: set algorithm for parameters search -> GridSearchCV or RandomizedSearchCV(
        default: GridSearchCV)
        Returns:
        Metric performance tab for each model (Dataframe) """

        return class_info

    def __Construct_matrix_df(self, algorithm_names, scores):
        """Assemble the result table.

        Parameters
        ----------
        algorithm_names : sequence of str
            One display name per row of ``scores``.
        scores : array-like of shape (n_models, n_metrics)
            Scores, columns ordered like ``self.metrics``.

        Returns
        -------
        pandas.DataFrame
            A ``model`` column followed by one float column per metric.
        """
        # Build the frame directly from the numeric matrix: going through
        # np.append/np.insert would cast every score to a string and could
        # truncate the "model" header to the width of the shortest-typed metric name.
        performance_df = pd.DataFrame(
            np.asarray(scores, dtype=float),
            columns=list(self.metrics),
        )
        performance_df.insert(0, "model", [str(name) for name in algorithm_names])
        return performance_df

    def __get_performance_from_algorithm(self, algorithm, grid, X, y, metrics, inner_cv, outer_cv, find_params, metric_to_optimize, random_state=None):
        """Score one algorithm on the outer folds, tuning it first when a grid is given.

        Parameters
        ----------
        algorithm : estimator
            Unfitted scikit-learn compatible classifier.
        grid : dict
            Parameter grid; an empty grid means "evaluate with current settings".
        X, y : array-like
            Features and target.
        metrics : sequence of str (or dict of scorers)
            Scorers evaluated on the outer folds.
        inner_cv, outer_cv : cross-validation splitters
            Splitters for the hyper-parameter search and for the final scoring.
        find_params : {"GridSearchCV", "RandomizedSearchCV"}
            Search strategy used when ``grid`` is not empty.
        metric_to_optimize : str
            Metric used to pick the best parameters (``refit`` of the search).
        random_state : int or None, optional
            Seed for the randomized search; ignored by the grid search.

        Returns
        -------
        numpy.ndarray
            Mean outer-fold score per metric, as a percentage rounded to 2 decimals.

        Raises
        ------
        ValueError
            If ``find_params`` is not a supported strategy.
        """
        if grid:
            if find_params == "GridSearchCV":
                estimator = GridSearchCV(
                    estimator=algorithm,
                    param_grid=grid,
                    scoring=metrics,
                    refit=metric_to_optimize,
                    cv=inner_cv,
                )
            elif find_params == "RandomizedSearchCV":
                estimator = RandomizedSearchCV(
                    estimator=algorithm,
                    param_distributions=grid,
                    scoring=metrics,
                    refit=metric_to_optimize,
                    cv=inner_cv,
                    random_state=random_state,
                )
            else:
                raise ValueError(
                    f"find_params must be one of {self._SEARCH_STRATEGIES}, got {find_params!r}"
                )
        else:
            # Nothing to tune: score the estimator as configured.
            estimator = algorithm

        cv_results = cross_validate(estimator, X, y, scoring=metrics, cv=outer_cv)
        # Pick the scores by key rather than by position so the result does not
        # depend on the ordering (or number) of the other entries in the dict.
        fold_scores = np.array([cv_results[f"test_{name}"] for name in metrics])
        return np.round(fold_scores.mean(axis=1) * 100, 2)

    def get_final_performance(self, cv_inner_splits, cv_outer_splits, metric_to_optimize="roc_auc", find_params="GridSearchCV", random_state=None):
        """Run the nested cross-validation for every configured model.

        Parameters
        ----------
        cv_inner_splits : int
            Number of folds of the inner (hyper-parameter search) K-fold.
        cv_outer_splits : int
            Number of folds of the outer (performance estimation) K-fold.
        metric_to_optimize : str, default "roc_auc"
            Metric the inner search optimizes; it must be one of the configured
            metrics whenever at least one model has a parameter grid.
        find_params : {"GridSearchCV", "RandomizedSearchCV"}, default "GridSearchCV"
            Hyper-parameter search strategy.
        random_state : int or None, default None
            Seed for fold shuffling (and the randomized search). ``None`` keeps
            the previous behaviour of drawing different folds on every call.

        Returns
        -------
        pandas.DataFrame
            One row per model with a ``model`` column and one column per metric
            (mean outer-fold score in percent).

        Raises
        ------
        ValueError
            If data, models/names or metrics were not set, if models and names
            differ in length, or if ``find_params`` is not supported.
        """
        # Validate up front so a missing setter call fails with a clear message
        # instead of an opaque TypeError deep inside the loop.
        if self.X is None or self.y is None:
            raise ValueError("No data available: call set_data(X, y) first.")
        if self.models is None or self.algorithm_names is None:
            raise ValueError("No models available: call set_algorithms_and_names(models, names) first.")
        if self.metrics is None or len(self.metrics) == 0:
            raise ValueError("No metrics available: call set_metrics(metrics) first.")
        if len(self.models) != len(self.algorithm_names):
            raise ValueError("models and algorithm_names must have the same length.")
        if find_params not in self._SEARCH_STRATEGIES:
            raise ValueError(
                f"find_params must be one of {self._SEARCH_STRATEGIES}, got {find_params!r}"
            )

        self.inner_cv = KFold(n_splits=cv_inner_splits, shuffle=True, random_state=random_state)
        self.outer_cv = KFold(n_splits=cv_outer_splits, shuffle=True, random_state=random_state)
        self.find_params = find_params
        self.metric_to_optimize = metric_to_optimize
        # Scoring matrix: one row per model, one column per metric.
        self.scores = np.zeros((len(self.models), len(self.metrics)))

        # set_params is optional: without it no model is tuned.
        param_grids = self.params if self.params is not None else {}

        for row, (name, model) in enumerate(zip(self.algorithm_names, self.models)):
            grid = param_grids.get(name, {})
            self.scores[row] = self.__get_performance_from_algorithm(
                model,
                grid,
                self.X,
                self.y,
                self.metrics,
                self.inner_cv,
                self.outer_cv,
                self.find_params,
                self.metric_to_optimize,
                random_state=random_state,
            )

        return self.__Construct_matrix_df(self.algorithm_names, self.scores)
        

In [4]:
# Load the breast-cancer dataset as a (features, target) pair.
X, y = load_breast_cancer(return_X_y=True)

In [5]:
# Create the evaluator, labelled with the name of the dataset under study.
judge = Judge(dataframe_name="Breast_Cancer")

In [6]:

# Configure the evaluator: data, candidate models, tuning grids and metrics.
(
    judge
    .set_data(X, y)
    .set_algorithms_and_names(models, algorithm_names)
    .set_params(params)
    .set_metrics(metrics)
)

<__main__.Judge at 0x1ebdb860910>

In [7]:
# Run the nested cross-validation: 5 inner folds for tuning, 5 outer folds for scoring.
judge.get_final_performance(cv_inner_splits=5, cv_outer_splits=5)

Unnamed: 0,model,accuracy,roc_auc
0,Knn,92.97,96.56
1,Lr,95.26,99.29
2,Dt,92.27,94.17
3,Bc,94.56,97.72
4,Rf,96.13,99.11
5,AdaB,95.79,98.7
6,Gb,96.66,99.36
7,XGB,97.02,99.36
