# **Experiment Parameters**

In [None]:
import os
# Print the PATH environment variable as seen by this kernel.
print(os.environ['PATH'])

In [None]:
# -----------------------------------------  Datasets
# One CSV file name per refactoring type, grouped by refactoring granularity.

CLASS_LEVEL_REFACTORINGS = [
    "Extract Class.csv",
    "Extract Interface.csv",
    "Extract Subclass.csv",
    "Extract Superclass.csv",
    "Move And Rename Class.csv",
    "Move Class.csv",
    "Rename Class.csv",
]

METHOD_LEVEL_REFACTORINGS = [
    "Extract And Move Method.csv",
    "Extract Method.csv",
    "Inline Method.csv",
    "Move Method.csv",
    "Pull Up Method.csv",
    "Push Down Method.csv",
    "Rename Method.csv",
]

VARIABLE_LEVEL_REFACTORINGS = [
    "Extract Variable.csv",
    "Inline Variable.csv",
    "Parameterize Variable.csv",
    "Rename Parameter.csv",
    "Rename Variable.csv",
    "Replace Variable With Attribute.csv",
]

# Two-file subset; not used by the selection below.
#"Extract Method.csv", 'Rename Parameter.csv',
short = ['Move And Rename Class.csv', 'Extract And Move Method.csv']

# Datasets processed by this run.  Keep this bound to one of the lists above:
# get_output_dir() looks the variable name up by object identity.
fnames = METHOD_LEVEL_REFACTORINGS

In [None]:
# ----------------  Experiment Parameters
CV  = '10SCV'    # cross-validation (CV) scheme; values: 10, 10SCV, 10x10, 5x2, LOO
dpi = 600        # resolution used when figures are saved

# Short aliases for the boolean literals.
F = False
T = True

#-----------------  Feature Selection (Gain Ratio is checked before PCA)
_GainRatio  = True
_PCA        = False
num_feature = 0.95   # passed to FS_PCA, which uses it as PCA's n_components

#-----------------  File Output
_WriteFlag = True    # when False, the Save_* helpers and figure saving are skipped

#-----------------  Visualization
_Bar = False  # Bar charts
_ROC = False  # ROC curve
_Box = True   # Boxplots
_ESD = True   # ScottKnottESD

#-----------------  Hyperparameters
_GridSearch  = False
_CompIndTune = False  # Compare tuned vs. default settings for individual models
_CompTenTune = False  # Compare tuned vs. default settings for tree-based ensemble models

#----------------- Feature transformation
# Feature_Transformation() applies only the first enabled flag, in this order.
_MinMax = True   # Min-Max normalization: scale each feature individually to the range [0, 1].
_Scaler = False  # Standard scaling: remove the mean and scale to unit variance.
_MaxAbs = False  # MaxAbs scaling: divide each feature by its maximum absolute value (range [-1, 1]).
_Robust = False  # Robust scaling: scale using statistics that are robust to outliers.
_Quanti = False  # Quantile transformation: map features to a uniform or normal distribution.
_BoxCox = False  # Box-Cox transformation: make features more Gaussian-like.
_LogTra = False  # Log transformation: apply log1p to the features.


#----------------- Sampling
_Sampling       = True   # Apply sampling techniques
_SMOTE          = False
_UnderSampling  = True
_max_rows      = None    # per-class row cap for under-sampling; None keeps the sampler default

#-----------------  Model Selection
incInd    = True       # Individual models
incBag    = False      # Bagging ensemble
incBos    = False      # Boosting ensemble
incTen    = False      # Tree-based ensembles
incSta    = True       # Stacking ensemble
incTenSta = True       # Tree-based Stacking ensemble
incVot    = True       # Voting ensemble

#-----------------  Comparison Flags
_CompAll  = True    # Compare all models
_CompInd  = False   # Compare individual models
_StatFlag = True    # Statistical analysis


#----------------- Select metrics
statMetric = 'Accuracy'
boxMetrics = ['Accuracy', 'F1-score', 'F1-micro', 'F1-weight', 'Brier', 'AUC']
barMetric = ['Accuracy']
ESDmetric = 'AUC'

# **G-Drive**

In [None]:
#%%capture
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#cd drive/My Drive/MachineLearning

In [None]:
def namestr(obj, namespace):
    """Return the first key of ``namespace`` whose value is the very object ``obj``.

    Matching is by identity (``is``), not equality.  Raises ``IndexError``
    when no key is bound to ``obj``.
    """
    matches = [key for key in namespace if namespace[key] is obj]
    return matches[0]


from datetime import datetime

def get_output_dir():
    """Build the run's output directory path (ending in '/').

    The path is ``<name of the list bound to fnames>/<method> <timestamp>/``
    where the method label follows the feature-selection flags
    (Gain Ratio first, then PCA, otherwise "NoFS").
    """
    ref_type = namestr(fnames, globals())
    version = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    if _GainRatio:
        method = '/GainRatio '
    elif _PCA:
        method = '/PCA '
    else:
        method = '/NoFS '

    return ref_type + method + version + '/'

In [None]:
# ----------------  Output directory name
output_dir = get_output_dir() #'Run1/'+ datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


# Input locations and per-experiment result locations (all end in '/').
dataPath = 'Datasets/'
prePath = 'Datasets/Preprocess/'
indPath = output_dir + 'IndivResults/'
bagPath = output_dir +'BagResults/'
bosPath = output_dir +'BosResults/'
tenPath = output_dir +'TenResults/'
staPath = output_dir +'StackResults/'
tenstaPath = output_dir +'TenStackResults/'
votPath = output_dir +'VoteResults/'
comPath = output_dir +'CompareResults/'

import os

# Directories created up front.  The Bagging/Boosting result directories are
# intentionally left commented out, as in the original list.
directories = [
    'Datasets/Preprocess',
    output_dir +'IndivResults',
    #output_dir +'BagResults',
    #output_dir +'BosResults',
    output_dir +'TenResults',
    output_dir +'StackResults',
    output_dir +'TenStackResults',
    output_dir +'VoteResults',
    output_dir +'CompareResults'
]

for dir_name in directories:
    # Test for existence BEFORE creating: after os.makedirs() the path always
    # exists, so checking afterwards could never report "created".
    already_exists = os.path.exists(dir_name)
    os.makedirs(dir_name, exist_ok=True)
    print(f"Directory '{dir_name}' already exist" if already_exists else f"Directory '{dir_name}' created")



# **Installs**

In [None]:
%%capture
#pip install info_gain
#!pip install xgboost
#!pip install catboost
#!pip install seaborn
#!pip install imblearn
#!pip install scipy==1.11.4
#!pip install openpyxl
#!pip install scikit-learn-intelex

# **Imports**

In [None]:
%%capture
%matplotlib inline
%config InlineBackend.figure_format='retina'

from sklearnex import patch_sklearn, config_context

patch_sklearn()


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import string
import time
from   datetime import date
import warnings
import logging

warnings.filterwarnings('ignore')
sys.setrecursionlimit(1000)
sns.set(style='ticks', palette='muted')

from sklearn.preprocessing       import LabelEncoder, MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler, QuantileTransformer, PowerTransformer, FunctionTransformer
from sklearn.impute              import SimpleImputer
from sklearn.decomposition       import PCA
from info_gain                   import info_gain
from imblearn.over_sampling      import SMOTE
from imblearn.under_sampling     import RandomUnderSampler, EditedNearestNeighbours
from scipy                       import interp
from scipy.stats                 import shapiro, mannwhitneyu, ttest_ind, wilcoxon, ttest_rel
from statsmodels.stats.multitest import multipletests
from statistics                  import mean
from random                      import randint
from enum                        import Enum
from collections                 import Counter

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble     import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, StackingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.calibration  import CalibratedClassifierCV
from sklearn.svm          import SVC, LinearSVC, OneClassSVM
from sklearn.naive_bayes  import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.tree         import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors    import KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid
from sklearn.mixture      import GaussianMixture
from xgboost              import XGBClassifier
from catboost             import CatBoostClassifier
from sklearn.neural_network           import MLPClassifier
from sklearn.gaussian_process         import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, Matern
from sklearn.discriminant_analysis    import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics.pairwise         import rbf_kernel, linear_kernel

from sklearn.metrics         import accuracy_score, f1_score, roc_curve, roc_auc_score, auc, brier_score_loss, confusion_matrix #plot_roc_curve 
from sklearn.model_selection import cross_validate, cross_val_score, StratifiedKFold, KFold, LeaveOneOut, StratifiedShuffleSplit, RepeatedStratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics         import make_scorer, matthews_corrcoef

In [None]:
# -----------------------------------------  ScottKnottESD installation
# Loads the R package ScottKnottESD through rpy2 when the _ESD flag is set.
if _ESD :

  # Python-side requirements (install once): rpy2, pyreadr, pandas, plotnine
  #!pip install rpy2
  #!pip install pyreadr
  #!pip install pandas plotnine

  import rpy2.robjects.packages as rpackages
  from rpy2.robjects.vectors  import StrVector

  from rpy2.robjects.packages import importr
  from rpy2.robjects import r, pandas2ri
  from plotnine import *

  # Enable the pandas <-> R data frame conversion (one call is enough).
  pandas2ri.activate()

  utils = rpackages.importr('utils')  # import R's utility package
  utils.chooseCRANmirror(ind=1)       # select the first mirror in the list

  packnames = ["ScottKnottESD"]

  # Install only what is missing.  The previous unconditional
  # install_packages(StrVector('ScottKnottESD')) re-installed on every run and
  # passed a bare string where a list of package names is expected.
  names_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
  if names_to_install:
    utils.install_packages(StrVector(names_to_install))

  try:
      sk = importr('ScottKnottESD')
  except Exception as err:
      # `sk` stays undefined in this case; report why the import failed.
      print('Cant download ScottKnottESD')
      print(err)

# **Utilities**

In [None]:
class colors:
    """ANSI escape sequences used to style console output."""

    # Text attributes
    BOLD = '\033[1m'
    UNDER = '\033[4m'
    # Reset all styling
    ENDC = '\033[0m'

    # Bright foreground colors
    RED = '\033[91m'
    GREEN = '\033[92m'
    WARN = '\033[93m'
    BLUE = '\033[94m'
    HEADER = '\033[95m'
    CYAN = '\033[96m'

In [None]:
# Reference point for the elapsed time reported by printDone().
start_time = time.time()

def printStart():
  """Print a bold blue "Start" banner framed by two rule lines."""
  style = colors.BOLD + colors.BLUE
  rule = '========================================================================'

  for text in (rule, '\t\t\t\tStart ', rule):
    print( style + text)

In [None]:
def printString(str):
  """Print ``str`` as a bold blue banner title framed by two rule lines."""
  style = colors.BOLD + colors.BLUE
  rule = '========================================================================'

  print( style + rule)
  print( style + '\t\t\t' + str)
  print( style + rule)

In [None]:
def printDone():
  """Print a bold blue "DONE" banner with the minutes elapsed since ``start_time``."""
  style = colors.BOLD + colors.BLUE
  rule = '========================================================================'

  print( style + rule)
  print( style + '\t\t\t\tDONE')
  # Elapsed wall-clock time, rounded to whole minutes.
  print( style + '\t\t\t\tTime (minutes): ', round((time.time() - start_time)/60))
  print( style + rule)

In [None]:
def Save_CSV (result, fname, mode='w'):
  """Write ``result`` (a DataFrame) to the CSV file ``fname``.

  Does nothing when ``_WriteFlag`` is off.  A new file, or ``mode='w'``, gets
  a full write with a header row; ``mode='a'`` on an existing file appends
  the rows without a header.  Any other mode on an existing file is ignored.
  """
  if not _WriteFlag:
    return

  file_exists = os.path.isfile(fname)

  if not file_exists or mode == 'w':
    # 'w' is to_csv's default mode, so both cases are the same plain write.
    result.to_csv (fname, header='column_names')
  elif mode == 'a':
    result.to_csv (fname, mode=mode, header=False)

In [None]:
def Save_Excel (result, fname, mode='w'):
  """Write ``result`` (a DataFrame) to the Excel file ``fname``.

  Does nothing when ``_WriteFlag`` is off.  A new file, or ``mode='w'``, is
  written with a header row; ``mode='a'`` on an existing file is written
  without one.

  NOTE(review): ``to_excel`` is called with a plain path in every branch, so
  the 'a' branch looks like it rewrites the file rather than appending —
  confirm whether real append behavior is wanted.
  """
  if not _WriteFlag:
    return

  file_exists = os.path.isfile(fname)

  if not file_exists or mode == 'w':
    result.to_excel (fname, header='column_names')
  elif mode == 'a':
    result.to_excel (fname, header=False)

In [None]:
def Save_File (result, fname, mode='w'):
  """Write the string ``result`` to ``fname`` using the given open ``mode``.

  Does nothing when ``_WriteFlag`` is off.
  """
  if not _WriteFlag: return

  # The context manager closes the file even if write() raises.
  with open (fname, mode) as f:
    f.write (result)

In [None]:
def getToday ():
  """Return today's date as a ``YYYY-MM-DD`` string."""
  return date.today().isoformat()

def getTime ():
  """Return the current local time as an ``HH-MM-SS`` string."""
  # strftime() formats the current local time when no time tuple is given.
  return time.strftime('%H-%M-%S')

In [None]:
def getPath (type):
  """Return the results directory for an experiment ``type``.

  Known types: 'Preprocess', 'Individual', 'Bagging', 'Boosting',
  'TreeBased', 'Stacking', 'TreeStacking', 'Voting', 'Compare'.
  Any other value yields the empty string.
  """
  paths = {
      'Preprocess'   : prePath,
      'Individual'   : indPath,
      'Bagging'      : bagPath,
      'Boosting'     : bosPath,
      'TreeBased'    : tenPath,
      'Stacking'     : staPath,
      'TreeStacking' : tenstaPath,
      'Voting'       : votPath,
      'Compare'      : comPath,
  }

  return paths.get(type, '')

In [None]:
def getDatasetName (fname):
  """Reduce a data/result file path to a bare dataset name.

  Removes, in a fixed order, the known directory fragments, the run's
  ``output_dir``, the result-file suffixes, 'PCA', every underscore, 'Ten'
  and the '.csv' extension.  The order matters because some fragments are
  substrings of others.
  """
  fragments = (
      # directory fragments
      'Datasets/', 'Preprocess/', 'IndivResults/', 'VoteResults/',
      'BagResults/', 'BosResults/', 'TenResults/', 'StackResults/',
      'TenStackResults/', 'CompareResults/',
      output_dir,
      # result-file suffixes
      '_IndxBag', '_IndxBos', '_IndxTen', '_IndxStack', '_IndxVote',
      '_IndxTune', '_TenxTune', '_Clean', '_results',
      # leftovers
      'PCA', '_', 'Ten', '.csv',
  )

  for fragment in fragments:
    fname = fname.replace (fragment, '')

  return fname

# **Data Preprocessing**

In [None]:
def Dataset_Preprocess (fname):
  """Run the preprocessing pipeline on a dataset file and return the final path.

  Steps: clean the raw file found under ``dataPath``; balance it when
  ``_Sampling`` is set; then apply Gain Ratio feature selection, or PCA when
  Gain Ratio is off and ``_PCA`` is set.  The elapsed time is logged.

  fname: dataset file name relative to ``dataPath``.
  Returns the path of the last file written by the pipeline.
  """
  started = time.time()

  path = Clean_Dataset (dataPath + fname)

  if _Sampling:
    path = Balance_Data (path)

  # Gain Ratio takes precedence over PCA when both flags are on.
  if _GainRatio:
    path = FS_GainRatio (path)
  elif _PCA:
    path = FS_PCA (path, num_feature)

  logger.info(f"Preprocessing time: {time.time() - started:.3f} seconds")

  return path

In [None]:
def prepareVars(fname):
  """Load a CSV and split it into features and target (last column).

  Returns ``(ind, dep, indTun, depTun)``.  With ``_GridSearch`` enabled a
  stratified 10% of the rows is held out as the tuning set and the remaining
  rows are re-indexed from zero; otherwise both tuning entries are ``None``.
  """
  frame = pd.read_csv (fname)
  features = frame.iloc[:, 0:-1]
  target = frame.iloc[:,-1]

  if not _GridSearch:
    return features, target, None, None

  features, tune_features, target, tune_target = train_test_split(
      features, target, test_size=0.1,  random_state=1, stratify=target)

  return (features.reset_index(drop = True),
          target.reset_index(drop = True),
          tune_features,
          tune_target)

## **Clean Data**

In [None]:
# -----------------------------------------  Fun: Handle Missing Values

def Clean_Dataset (fname):
  """Clean a raw dataset file and save the result under ``prePath``.

  Reads the file, imputes missing values, applies the configured feature
  transformation and writes ``<name>_Clean.csv`` into the preprocess
  directory.

  fname: path of the raw CSV (under ``dataPath``).
  Returns the path of the cleaned CSV.
  """
  dataset = Read_DataSet (fname)
  fname   = fname.replace(dataPath, prePath)

  dataset = Handle_Missing (dataset)

  dataset = Feature_Transformation (dataset)

  fname = fname.replace('.csv','') + '_Clean.csv'
  # Pass the separator by keyword: positional arguments after the path are
  # deprecated in pandas 2.x.
  dataset.to_csv (fname, sep=',', index = False)

  return fname

In [None]:
def Read_DataSet (fname) :
    """Load a CSV, drop duplicate rows and drop single-valued columns.

    The strings 'NA', 'n/a', 'na' and '--' are read as missing values.
    Shapes before and after, and the dropped columns, are logged.
    Returns the reduced DataFrame.
    """
    frame = pd.read_csv (fname, na_values = ['NA', 'n/a', 'na', '--'])

    rows_before, cols_before = frame.shape
    logger.info(f"Initial dataset shape: {rows_before} rows, {cols_before} columns")

    frame = frame.drop_duplicates()

    # A column with exactly one distinct (non-missing) value is dropped.
    distinct_counts = frame.nunique()
    constant_cols = distinct_counts[distinct_counts == 1].index
    frame = frame.drop(columns=constant_cols)

    if len(constant_cols) > 0:
        logger.info(f"Columns to drop: {len(constant_cols)} {list(constant_cols)}")
    else:
        logger.info("No columns to drop.")

    rows_after, cols_after = frame.shape
    logger.info(f"Final dataset shape after dropping columns and remove duplicates: {rows_after} rows, {cols_after} columns")

    return frame

In [None]:
def Handle_Missing (dataset) :
  """Replace missing feature values with the column mean and return ``dataset``.

  Only feature columns (all but the last) are touched, and the first feature
  column is excluded from imputation.

  NOTE(review): column 0 is skipped by the imputer — presumably an identifier
  or a column known to be complete; confirm against the datasets.
  """
  features = dataset.iloc[:, 0:-1].values

  mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
  features[:, 1:] = mean_imputer.fit_transform(features[:, 1:])

  dataset.iloc[:, 0:-1] = features

  return dataset

In [None]:
def Feature_Transformation (dataset) :
  """Scale/transform the feature columns and label-encode the target column.

  The first enabled flag among ``_MinMax``, ``_Scaler``, ``_MaxAbs``,
  ``_Robust``, ``_Quanti``, ``_BoxCox`` and ``_LogTra`` selects the
  transformation.  When none is enabled the features are passed through
  unchanged (previously this case failed because ``newInd`` was never
  assigned).

  Returns a new DataFrame with the same column names as ``dataset``.

  NOTE(review): the transformer is fitted on every row of the dataset, i.e.
  before any cross-validation split — confirm this is intended.
  """
  ind = dataset.iloc[:, 0:-1]
  dep = dataset.iloc[:,-1]

  if   _MinMax : newInd = MinMaxScaler().fit_transform(ind.values)
  elif _Scaler : newInd = StandardScaler().fit_transform(ind.values)
  elif _MaxAbs : newInd = MaxAbsScaler().fit_transform(ind.values)
  elif _Robust : newInd = RobustScaler().fit_transform(ind.values)
  elif _Quanti : newInd = QuantileTransformer().fit_transform(ind.values)
  elif _BoxCox : newInd = PowerTransformer(method='box-cox').fit_transform(ind.values)
  elif _LogTra : newInd = FunctionTransformer(np.log1p, validate=True).fit_transform(ind.values)
  else         : newInd = ind.values   # no transformation selected

  encodeDep = LabelEncoder()
  newDep = encodeDep.fit_transform(dep.values)

  newDS = pd.concat([pd.DataFrame(newInd), pd.DataFrame(newDep)], axis=1)
  newDS.columns = dataset.columns

  return newDS

## **Imbalance Data**

In [None]:
# -----------------------------------------  Check if dataset is imbalanced

def Check_Imbalance(fname):
  """Return True when class 0 or class 1 makes up less than 10% of the rows.

  The target is the last column of the CSV ``fname`` and is read as labels
  0 and 1.  A class that does not occur at all counts as 0% (so the dataset
  is reported as imbalanced) instead of raising ``KeyError``.  An empty
  dataset is reported as not imbalanced.
  """
  threshold = 10   # minimum share of each class, in percent
  dataset = pd.read_csv (fname)
  dep = dataset.iloc[:,-1]

  count = dep.value_counts()
  total = count.sum()
  if total == 0:
    return False

  # .get() tolerates a class that is completely absent from the data.
  ratios = [(count.get(label, 0) / total) * 100 for label in (0, 1)]

  return any(ratio < threshold for ratio in ratios)

In [None]:
# -----------------------------------------  Balance Imbalanced Data

def Balance_Data (fname):
  """Resample a dataset so its classes are balanced and save the result.

  ``_SMOTE`` selects SMOTE over-sampling; otherwise ``_UnderSampling``
  selects random under-sampling, capped at ``_max_rows`` rows per class when
  that limit is set and every class has more rows than the limit.  When
  neither flag is set nothing is resampled and ``fname`` is returned
  unchanged (previously this case failed on an unassigned sampler).

  fname: path of the CSV to balance (target in the last column).
  Returns the path of ``<name>_Balanced.csv``, or ``fname`` when no sampler
  is enabled.
  """
  dataset = pd.read_csv (fname)

  ind = dataset.iloc[:, 0:-1]
  dep = dataset.iloc[:,-1]
  logger.info(f"Instances before balancing: {Counter(dep)}")

  if _SMOTE:
      sm = SMOTE(random_state=42)
  elif _UnderSampling:
      # Per-class row counts, needed to decide whether the cap can be applied.
      unique, counts = np.unique(dep, return_counts=True)
      class_counts = dict(zip(unique, counts))

      if _max_rows is not None and all(count > _max_rows for count in class_counts.values()):
          sm = RandomUnderSampler(sampling_strategy={key: _max_rows for key in class_counts})
      else:
          # NOTE(review): no random_state is given here, unlike SMOTE above,
          # so the selected rows differ between runs — confirm this is wanted.
          sm = RandomUnderSampler()
  else:
      logger.warning("Sampling requested but neither _SMOTE nor _UnderSampling is enabled; dataset left unchanged.")
      return fname

  newInd, newDep = sm.fit_resample(ind, dep)
  logger.info(f"Instances after balancing: {Counter(newDep)}")

  balanced = pd.concat([pd.DataFrame(newInd), pd.DataFrame(newDep)], axis=1)
  balanced.columns = dataset.columns

  fname = fname.replace('.csv','') + "_Balanced.csv"
  # Separator passed by keyword: positional use is deprecated in pandas 2.x.
  balanced.to_csv (fname, sep=",", index = False)

  return fname

## **Feature Selection**

In [None]:
# -----------------------------------------  Feature Selection - Gain Ratio

def FS_GainRatio (fname, num_features=1):
  """Keep the features whose gain ratio is at least the mean gain ratio.

  Computes the gain ratio of every feature column against the target (last
  column), saves the scores to ``<name>_GainRatioValues.csv`` and writes the
  reduced dataset to ``<name>_GainRatio.csv``.

  fname: path of the input CSV.
  num_features: unused; kept for interface compatibility.
  Returns the path of the reduced CSV.

  NOTE(review): a second ``FS_GainRatio`` is defined later in this file; when
  both definitions are executed in file order, the later one replaces this one.
  """
  # -----------------------------------------  Reading dataset
  Dataset = pd.read_csv(fname)
  ind = Dataset.iloc[:, 0:-1]
  dep = Dataset.iloc[:,-1]

  logger.info(f"Total number of features before FS: {ind.shape[1]}")

  # -----------------------------------------  GR for Each Independent Var
  dataset_name = getDatasetName(fname)
  gain_ratio_scores = {}
  score_rows = []
  for col in ind.columns:
    score = info_gain.info_gain_ratio(dep, ind[col])
    gain_ratio_scores[col] = score
    score_rows.append({'Features' : col, 'Gain Ratio' : score, 'Dataset' : dataset_name})

  # Build the score table once instead of concatenating one row at a time.
  gr_score = pd.DataFrame(score_rows, columns = ['Features', 'Gain Ratio', 'Dataset'])

  # -----------------------------------------  Sort Features (GR)
  best = sorted(gain_ratio_scores.items(), key=lambda x: x[1], reverse=True)

  # -----------------------------------------  Calculate Score Threshold (Mean)
  mean_score = mean([b[1] for b in best])
  gr_score['Threshold']= mean_score

  # -----------------------------------------  Filter Features - Threshold
  best_features = [b[0] for b in best if b[1] >= mean_score]

  # -----------------------------------------  Save GR results
  Save_CSV (gr_score, fname.replace('.csv','_GainRatioValues.csv'))

  # -----------------------------------------  Save reduced file
  fname = fname.replace('.csv','_GainRatio.csv')
  ReducedDataset = pd.concat([ind[best_features], dep], axis=1, sort=False)
  # Separator passed by keyword: positional use is deprecated in pandas 2.x.
  ReducedDataset.to_csv(fname, sep=",", index = False)

  logger.info(f"Total number of features after FS: {ReducedDataset.shape[1] -1}")

  return fname

In [None]:
def FS_GainRatio(fname, num_features=1):
    """Select the features whose gain ratio reaches the mean gain ratio.

    Scores every feature column against the target (last column), saves the
    score table to ``<name>_GainRatioValues.csv`` and the reduced dataset to
    ``<name>_GainRatio.csv``.

    fname: path of the input CSV.
    num_features: unused.
    Returns the path of the reduced CSV.
    """
    frame = pd.read_csv(fname)
    ind = frame.iloc[:, :-1]
    dep = frame.iloc[:, -1]

    logger.info(f"Total number of features before FS: {ind.shape[1]}")

    # Gain ratio per feature, in column order.
    gain_ratio_scores = {}
    for col in ind.columns:
        gain_ratio_scores[col] = info_gain.info_gain_ratio(dep, ind[col])

    # Score table: one row per feature.
    score_rows = []
    for col, score in gain_ratio_scores.items():
        score_rows.append({'Features': col, 'Gain Ratio': score, 'Dataset': getDatasetName(fname)})
    gr_score = pd.DataFrame(score_rows)

    # Rank features from highest to lowest score.
    ranked = sorted(gain_ratio_scores.items(), key=lambda item: item[1], reverse=True)

    # The mean score is the selection threshold.
    mean_score = mean(score for _, score in ranked)
    gr_score['Threshold'] = mean_score

    best_features = []
    for feature, score in ranked:
        if score >= mean_score:
            best_features.append(feature)

    Save_CSV(gr_score, fname.replace('.csv', '_GainRatioValues.csv'))

    fname = fname.replace('.csv', '_GainRatio.csv')
    reduced = pd.concat([ind[best_features], dep], axis=1)
    reduced.to_csv(fname, index=False)

    logger.info(f"Total number of features after FS: {len(best_features)}")

    return fname


In [None]:
# -----------------------------------------  Feature Selection - Principal Component Analysis

def FS_PCA (fname, pca_num=2):
  """Reduce the features of a dataset with PCA and save the result.

  Writes the principal components plus the target to ``<name>_PCA.csv``,
  saves the explained-variance ratios to ``<name>_PCA_Variance.txt`` when
  ``_WriteFlag`` is set, removes classifiers flagged as unsupported and plots
  the cumulative explained variance.

  fname: path of the input CSV.
  pca_num: forwarded to ``PCA(n_components=...)``.
  Returns the path of the reduced CSV.
  """
  ind, dep, _, _ = prepareVars (fname)
  logger.info(f"Total number of features before FS: {ind.shape[1]}")

  fname = fname.replace('.csv','_PCA.csv')

  pca = PCA(n_components=pca_num)
  com = pca.fit_transform(ind)

  ReducedDataset = pd.concat([pd.DataFrame(data=com), dep], axis=1, sort=False)
  # Separator passed by keyword: positional use is deprecated in pandas 2.x.
  ReducedDataset.to_csv(fname, sep=',', index = False)
  logger.info(f"Total number of features after FS: {ReducedDataset.shape[1] -1}")

  if _WriteFlag :
    output = getDatasetName (fname) + ' : ' + str(pca.explained_variance_ratio_) + '\n'
    Save_File (output, fname.replace('.csv','_Variance.txt'))

  Remove_Unsupp_Clfs ()
  PCA_variance_ratio_plot(pca)

  return fname

In [None]:
# ------------------------- Remove Naive Bayes classifiers - when using PCA
def Remove_Unsupp_Clfs ():
    """Remove every entry of ``indClfs`` whose key contains 'NB(M)'.

    Called when PCA is used.  If ``indClfs`` has not been defined yet the
    call is a no-op.
    """
    try:
        classifiers = indClfs
    except NameError:
        # indClfs is created elsewhere; nothing to remove before it exists.
        return

    # Snapshot the matching keys first so the dict is not mutated while iterating.
    keys_to_remove = [key for key in classifiers if 'NB(M)' in key]
    for key in keys_to_remove:
        classifiers.pop(key, None)

In [None]:
def PCA_variance_ratio_plot(pca):
    """Plot the cumulative explained-variance ratio of a fitted PCA.

    Draws the cumulative curve against the number of components together with
    a horizontal line at 0.95, then shows the figure.  Also sets the global
    default figure size to 12x6.
    """
    plt.rcParams["figure.figsize"] = (12,6)

    fig, ax = plt.subplots()

    cumulative = np.cumsum(pca.explained_variance_ratio_)
    component_numbers = np.arange(1, len(cumulative)+1 , step=1)

    ax.set_ylim(0.0,1.1)
    ax.plot(component_numbers, cumulative, marker='o', linestyle='--', color='b')

    ax.set_xlabel('Number of Components')
    # Ticks start at 0 so the 1-based component numbers line up with the labels.
    ax.set_xticks(np.arange(0, len(cumulative)+1, step=1))
    ax.set_ylabel('Cumulative variance (%)')
    ax.set_title('The number of components needed to explain variance')

    # Reference line and label for the 95% threshold.
    ax.axhline(y=0.95, color='r', linestyle='-')
    ax.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize=16)

    ax.grid(axis='x')
    plt.show()

# **Visualization**

In [None]:
# -----------------------------------------  Plot BarChart

def Plot_BarChart (allRes, fname, Color=None, Hue = None):
  """Draw one horizontal bar chart per metric in ``barMetric``.

  allRes: results table with a 'Classifier' column and one column per metric.
  fname: results file path; used for the title and the image file name.
  Color, Hue: forwarded to ``sns.barplot``.
  Each chart is saved as ``<fname without _results.csv>_<metric>.png`` when
  ``_WriteFlag`` is set.
  """
  for metric in barMetric:

    plt.subplots (figsize=(8, 8))

    sns.set_color_codes ('muted')
    bars = sns.barplot (data=allRes, x=metric, y='Classifier', hue=Hue, palette='Set2', color=Color)
    sns.despine()

    plt.xlabel (metric+ ' %')
    plt.title (getDatasetName(fname), fontweight='bold',size=12)

    if not _WriteFlag:
      continue

    image_path = fname.replace('_results.csv','') + '_' + metric + '.png'
    bars.get_figure().savefig(image_path, bbox_inches = 'tight', dpi=dpi)

In [None]:
# -----------------------------------------  Plot Boxplot

def Plot_Boxplot (boxRes, fname, Color = None, Hue = None):
  """Draw one horizontal boxplot per metric in ``boxMetrics``.

  boxRes: per-fold results with a 'Classifier' column and one column per metric.
  fname: results file path; used for the title and the image file name.
  Hue: forwarded to ``sns.boxplot``.  ``Color`` is accepted but not used.
  Each plot is saved as ``<fname without _results.csv>_<metric>_boxplot.png``
  when ``_WriteFlag`` is set.
  """
  # Outlier marker styling shared by all plots.
  outlier_style = dict(markerfacecolor='0.75', markersize=5, linestyle='none')

  for metric in boxMetrics:

    plt.subplots(figsize=(8, 8))

    sns.boxplot (data=boxRes, x=metric, y='Classifier', hue=Hue, palette='vlag', flierprops=outlier_style)

    plt.ylabel ('Classifier', size=12)
    plt.xlabel (metric, size=12)
    plt.title (getDatasetName(fname), fontweight='bold',size=12)

    if not _WriteFlag:
      continue

    image_path = fname.replace('_results.csv', '_' + metric + '_boxplot') + ".png"
    plt.savefig(image_path, bbox_inches = 'tight', dpi=dpi )

In [None]:
# -----------------------------------------  Plot ROC Curve Plot

def Plot_ROC (rocRes, fname):
  """Plot one ROC curve per row of ``rocRes`` on a single figure.

  rocRes: table indexed by classifier label with 'FPR', 'TPR' and 'AUC' columns.
  fname: results file path; used for the title and the image file name.
  The figure is saved as ``<fname without _results.csv>_roc_curve.png`` when
  ``_WriteFlag`` is set.
  """
  fig = plt.figure(figsize=(8, 6))

  for clf_label in rocRes.index:
    row = rocRes.loc[clf_label]
    legend_text = '%s' % (clf_label) + ' (' + str(round(row['AUC'],2)) + ')'
    plt.plot (row['FPR'], row['TPR'], label=legend_text)

  # Diagonal reference line.
  plt.plot ([0,1], [0,1], color='black', linestyle='--')

  plt.xticks (np.arange(0.0, 1.1, step=0.1))
  plt.yticks (np.arange(0.0, 1.1, step=0.1))
  plt.xlabel ('False Positive Rate', fontsize=12)
  plt.ylabel ('True Positive Rate', fontsize=12)

  plt.title (getDatasetName(fname), fontweight='bold', fontsize=13)
  plt.legend (prop={'size':13}, loc='lower right', bbox_to_anchor=(1.37, 0))

  if _WriteFlag:
    image_path = fname.replace('_results.csv','_roc_curve') + '.png'
    fig.savefig(image_path, bbox_inches = 'tight', dpi=dpi)

In [None]:
# -----------------------------------------  Plot ScottKnottESD

def Plot_ScottKnottESD (boxplot_rank, fname):
  """Plot AUC boxplots per classifier, one facet per rank.

  boxplot_rank: table with 'Classifier', 'AUC' and 'rank' columns; its 'rank'
    column is converted to a categorical in place.
  fname: results file path; used for the title and the image file name.
  The plot is printed and, when ``_WriteFlag`` is set, saved as
  ``<fname without _results.csv>_ESD.png``.

  NOTE(review): the y axis is fixed to 'AUC' although an ``ESDmetric``
  setting exists in this file — confirm whether it should be used here.
  """
  boxplot_rank['rank'] = boxplot_rank['rank'].astype('category')

  # Shared look: rotated x labels, white panel, no legend.
  plot_theme = theme(
      axis_text_x = element_text(size=8,angle = 30, hjust = 6, color="black"),
      panel_background = element_rect(fill='white', alpha=.2),
      panel_border = element_rect(color='gray', size=1),
      legend_position = 'none',
      plot_title = element_text(size = 10, face = "bold"),
      axis_title_x = element_text(size=10, color="black"),
      axis_title_y = element_text(size=10, color="black")
    )

  # One facet column per distinct rank.
  rank_count = boxplot_rank['rank'].nunique()

  img = (
      ggplot(boxplot_rank, aes(x='Classifier', y='AUC', fill='rank'))
      + geom_boxplot(alpha=.8)
      + facet_wrap(['rank'], ncol = rank_count, scales='free_x')
      + scale_fill_brewer(palette='BuPu')
      + plot_theme
      + ggtitle(getDatasetName (fname))
  )

  print (img)

  if _WriteFlag:
    img.save(filename=fname.replace('_results.csv','_ESD') + '.png', dpi=dpi)

#  **Model Validation**

## **Cross Validation**

In [None]:
# -----------------------------------------  Cross Validation Type

def Cross_Fold ():
  """Return the cross-validation splitter selected by the ``CV`` setting.

  Supported values: '10SCV', '10', 'LOO', '5x2', '10x10'.
  Raises ``ValueError`` for any other value (previously this surfaced as an
  unassigned-variable error).
  """
  if   CV == '10SCV' : folds = StratifiedKFold(n_splits=10)
  # KFold only accepts random_state together with shuffle=True.
  elif CV == '10'    : folds = KFold(n_splits=10, shuffle=True, random_state = 42)
  elif CV == 'LOO'   : folds = LeaveOneOut()
  elif CV == '5x2'   : folds = RepeatedStratifiedKFold(n_splits=2,  n_repeats=5,  random_state=1)
  elif CV == '10x10' : folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)
  else:
    raise ValueError(f"Unknown cross-validation setting CV={CV!r}; expected one of '10SCV', '10', 'LOO', '5x2', '10x10'")

  return folds

## **Evaluate Model**

In [None]:
def mcc(tp,fp,tn,fn):
    """Return the Matthews correlation coefficient of a 2x2 confusion matrix.

    tp, fp, tn, fn: counts of true/false positives and negatives.
    Returns 0 when the coefficient is undefined (a zero marginal total).
    """
    # Multiply as floats so large NumPy integer counts cannot overflow.
    product = float(tp+fp) * float(tp+fn) * float(fn+tn) * float(fp+tn)

    # An explicit check also covers NumPy inputs, where dividing by zero
    # yields NaN instead of raising.
    if product <= 0:
        return 0

    return (tp*tn - fp*fn) / product**0.5

In [None]:
# -----------------------------------------  Fun: Evaluate ML Models

def Evaluate_Model (clf, algorithm_name, ind, dep, fname, type = 'Individual'):
  """Cross-validate ``clf`` and append its scores to the global result tables.

  clf: estimator to evaluate.
  algorithm_name: label stored in the 'Classifier' column.
  ind, dep: features and target.
  fname: dataset path; its dataset name is stored in the 'Dataset' column.
  type: experiment type stored in the 'Type' column.

  Appends one row of mean scores to ``Res`` and one row per fold to ``Det``.
  Accuracy, precision, recall and the F1 variants are stored as percentages;
  Brier, AUC and MCC are stored as raw values.  All are rounded to 2 decimals.

  NOTE(review): cross_validate runs with n_jobs=-1; if the 'Confusion' scorer
  executes in worker processes, its appends to the global ``Mat`` may not
  reach this process — confirm.
  """
  global Res, Roc, Det

  folds = Cross_Fold ()

  scoring = {
      'Accuracy'  : 'accuracy',
      'Precision' : 'precision',
      'Recall'    : 'recall',
      'F1-score'  : 'f1',
      'F1-micro'  : 'f1_micro',
      'F1-weight' : 'f1_weighted',
      'Brier'     : 'neg_brier_score',
      'AUC'       : 'roc_auc',
      'MCC'       : make_scorer(matthews_corrcoef),
      'Confusion' : make_scorer(ConfusionMatrix, greater_is_better=False, algorithm_name=algorithm_name, fname=fname, type=type)
  }

  print (clf)

  scores = cross_validate (clf, ind, dep, cv=folds, scoring=scoring, n_jobs=-1)

  acc_results = scores ['test_Accuracy']
  pre_results = scores ['test_Precision']
  rec_results = scores ['test_Recall']
  f1s_results = scores ['test_F1-score']
  f1m_results = scores ['test_F1-micro']
  f1w_results = scores ['test_F1-weight']
  bre_results = abs( scores ['test_Brier'] )   # scorer returns the negated Brier score
  auc_results = scores ['test_AUC']
  mcc_results = scores ['test_MCC']

  dataset_name = getDatasetName (fname)

  # ---- Summary row: mean over all folds
  Res = pd.concat([Res, pd.DataFrame([{'Classifier' : algorithm_name,
                    'Accuracy'   : np.round(acc_results.mean() * 100,2),
                    'Precision'  : np.round(pre_results.mean() * 100,2),
                    'Recall'     : np.round(rec_results.mean() * 100,2),
                    'F1-score'   : np.round(f1s_results.mean() * 100,2),
                    'F1-micro'   : np.round(f1m_results.mean() * 100,2),
                    'F1-weight'  : np.round(f1w_results.mean() * 100,2),
                    'Brier'      : np.round(abs(bre_results.mean()),2),
                    'AUC'        : np.round(auc_results.mean(),2),
                    'MCC'        : np.round(mcc_results.mean(), 2),
                    'Type'       : type,
                    'Dataset'    : dataset_name}])], ignore_index=True)

  # ---- Detail rows: one per fold.  MCC uses the fold's own value (the mean
  # was previously repeated on every fold row).
  fold_rows = [{'Classifier' : algorithm_name,
                'Accuracy'   : np.round(acc_results[i] * 100,2),
                'Precision'  : np.round(pre_results[i] * 100,2),
                'Recall'     : np.round(rec_results[i] * 100,2),
                'F1-score'   : np.round(f1s_results[i] * 100,2),
                'F1-micro'   : np.round(f1m_results[i] * 100,2),
                'F1-weight'  : np.round(f1w_results[i] * 100,2),
                'Brier'      : np.round(abs(bre_results[i]),2),
                'AUC'        : np.round(auc_results[i],2),
                'MCC'        : np.round(mcc_results[i], 2),
                'Type'       : type,
                'Dataset'    : dataset_name}
               for i in range(len(acc_results))]

  # Append all fold rows in a single concat instead of one concat per fold.
  if fold_rows:
    Det = pd.concat([Det, pd.DataFrame(fold_rows)], ignore_index = True)



In [None]:
def ConfusionMatrix (y_true, y_pred, algorithm_name, fname, type, **kwargs):
  """Record every (actual, predicted) pair in the global ``Mat`` table.

  Written to be wrapped by ``make_scorer``: it always returns -1 and is used
  only for its side effect.

  y_true, y_pred: actual and predicted labels of one evaluation fold.
  algorithm_name, fname, type: stored in the 'Classifier', 'Dataset' (as the
    dataset name) and 'Type' columns of every appended row.
  """
  global Mat

  y_true, y_pred = np.array(y_true), np.array(y_pred)
  dataset_name = getDatasetName (fname)

  rows = [{'Classifier' : algorithm_name,
           'Actual'     : y_true[i],
           'Pred'       : y_pred[i],
           'Type'       : type,
           'Dataset'    : dataset_name}
          for i in range(len(y_true))]

  # One concat for the whole fold; concatenating row by row copied the
  # growing table on every iteration.
  if rows:
    Mat = pd.concat([Mat, pd.DataFrame(rows)], ignore_index=True)

  return -1

## **Hyperparameters Tuning**

In [None]:
def Tune_Model (clf, algorithm_name, ind, dep, fname):
  """Grid-search hyperparameters for one classifier and return the best estimator.

  The parameter grid is chosen from ``algorithm_name``. The search uses
  ``GridSearchCV`` with ROC-AUC scoring over 2-fold stratified cross-validation
  repeated 5 times. When ``_WriteFlag`` is set, one row (dataset, classifier,
  elapsed seconds, best parameters) is appended to ``_HyperTuning.csv``.

  Args:
    clf: Estimator to tune.
    algorithm_name: Key used to pick the grid (e.g. ``'DT'``, ``'RF'``,
      ``'BG-...'``, ``'BS-...'``). Names without a matching branch get an
      empty grid, i.e. only the estimator's current settings are evaluated.
    ind: Feature matrix passed to ``GridSearchCV.fit``.
    dep: Target vector passed to ``GridSearchCV.fit``.
    fname: Dataset file name, used only for the ``Dataset`` column of the log.

  Returns:
    ``best_estimator_`` of the fitted search.

  NOTE(review): most branches compare ``algorithm_name`` with ``==``. The model
  dictionaries in this file are passed through
  ``add_feature_selection_to_names``, which appends ``-PCA``/``-GR`` to every
  key, so with feature selection enabled those branches never match and the
  grid stays empty. Left unchanged here because enabling the grids would also
  enable the ``'SVM'`` grid, whose keys look like SVC parameters while the
  registered SVM model is a calibrated LinearSVC -- confirm before changing.

  NOTE(review): several grid values ('auto' for max_features, 'none' penalty,
  'squared_loss', 'deviance', the HGB loss names) appear to be deprecated or
  removed in recent scikit-learn releases -- verify against the installed
  version.
  """

  TuneDF = prepareTuneDataFrame ()

  print( colors.BOLD + colors.RED + '======================   Hyperparameters Tuning ('+algorithm_name+'). Be patient 😊')

  folds = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=1)
  # folds = StratifiedKFold(n_splits=5, random_state=1)

  space = dict()

  if algorithm_name == 'DT':
    space = {
        'max_depth'        : [None, 2, 4, 6, 8, 10, 12],
        'splitter'         : ['best', 'random'],
        'max_features'     : [None, 'auto', 'sqrt', 'log2'],
        'criterion'        : ['gini', 'entropy']
    }

  elif algorithm_name == 'LR':
    space = {
        'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
        'penalty' : ['none', 'l1', 'l2', 'elasticnet'],
        'C'       : [0.01, 0.1, 1, 10, 100]
    }

  elif algorithm_name == 'MLP':
    space = {
        'hidden_layer_sizes' : [(100,)],
        'activation'         : ['relu', 'identity', 'logistic','tanh'],
        'solver'             : ['adam', 'lbfgs', 'sgd'],
        'alpha'              : [0.0001, 0.05],
        'learning_rate'      : ['constant', 'invscaling', 'adaptive']
    }

  elif algorithm_name == 'SGD':
    space = {
        'loss'          : ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'penalty'       : ['l1', 'l2', 'elasticnet'],
        'alpha'         : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
        'class_weight'  : [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}],
        'eta0'          : [0.01, 1, 10, 100]
    }

  elif algorithm_name == 'GP':
    space = {
        'kernel' : [None, 1.0*RBF(1.0), 1.0*Matern(1.0)]
    }

  elif algorithm_name == 'LDA':
    space = {
        'solver' : ['svd', 'lsqr', 'eigen'],
        'tol'    : [0.0001,0.0002,0.0003]
    }

  elif algorithm_name == 'KNN':
    space = {
        'n_neighbors' : range(1, 10, 2),
        'weights'     : ['uniform', 'distance'],
        'metric'      : ['euclidean', 'manhattan', 'minkowski'],
        'algorithm'   : ['auto', 'ball_tree', 'kd_tree', 'brute']
    }

  elif algorithm_name == 'SVM':
    space = {
        'kernel' : ['rbf', 'linear', 'poly', 'sigmoid'],
        'C'      : [1.0, 100, 10, 0.1, 0.001],
        'gamma'  : ['scale', 'auto']
    }

  elif 'BG-' in algorithm_name:
    space = {
        'n_estimators' : [10, 100, 1000],
        'bootstrap'    : [True, False]
    }

  # Boosted models are named 'BS-<model>', so match the prefix as a substring
  # (like the 'BG-' branch above). The previous membership test against the
  # literal 'BS-' could never be true for such names.
  elif 'BS-' in algorithm_name or algorithm_name == 'Ada':
    space = {
        'n_estimators'  : [50, 100, 1000],
        'learning_rate' : [1, 0.1, 0.001, 0.01]
    }

  elif 'Stack' in algorithm_name:
    space = {
    }

  elif 'Vote' in algorithm_name:
    space = {
    }

  elif algorithm_name == 'RF':
    space = {
        # 'n_estimators'      : [100, 50, 500, 1000],
        'n_estimators'      : [100, 50, 40, 30],
        # 'max_features'      : ['auto', 'sqrt', 'log2'],
        # 'min_samples_split' : [2, 5, 10],
        # 'min_samples_leaf'  : [1, 2, 4],
        'criterion'         : ['gini', 'entropy'],
        # 'bootstrap'         : [True, False],
        'max_depth'           : [1, 4, 8],
        'min_samples_leaf'    : [1, 10, 5]

    }

  elif algorithm_name == 'ET':
    space = {
        # 'n_estimators'      : [100, 50, 500, 1000],
        'n_estimators'      : [100, 50, 40, 30],
        #'max_features'      : ['auto', 'sqrt', 'log2'],
        #'min_samples_split' : [2, 5, 10],
        'min_samples_leaf'  : [1, 2, 4],
        'criterion'         : ['gini', 'entropy'],
        #'bootstrap'         : [False, True],
        'max_depth'           : [1, 4, 8]
    }

  elif algorithm_name == 'GB':
    space = {
        'n_estimators'      : [100, 50, 500, 1000],
        'loss'              : ['deviance', 'exponential'],
        'learning_rate'     : [0.1, 0.001, 0.01],
        # 'subsample'       : [1.0, 0.5, 0.7 ],
        'max_depth'         : [3, 7, 9],
        'min_samples_leaf'  : [1, 10, 5]
    }

  elif algorithm_name == 'HGB':
    space = {
        'loss'              : ['auto', 'binary_crossentropy', 'categorical_crossentropy'],
        'max_iter'          : [100, 50, 500, 1000],
        'learning_rate'     : [0.1, 0.001, 0.01],
        'max_depth'         : [None, 1, 3, 5],
        'min_samples_leaf'  : [20, 10, 5],
        # 'max_depth'         : [None, 25, 50, 75],
        # 'l2_regularization' : [0, 1.5]
    }

  elif algorithm_name == 'XGB':
    space = {
        'n_estimators'     : [100, 50, 500, 1000],
        'learning_rate'    : [0.3, 0.1, 0.001, 0.01],
        'max_depth'        : [6, 3, 4, 5],
        # 'subsample'        : [1.0, 0.6, 0.8],
        # 'colsample_bytree' : [1.0, 0.6, 0.8],
        # 'min_child_weight' : [1, 5, 10],
        # 'gamma'            : [0, 0.5, 1, 1.5, 2, 5]
    }

  elif algorithm_name == 'CAT':
    space = {
        'n_estimators'      : [1000, 100, 50, 500],
        'depth'             : [3, 1 , 5, 10],
        'learning_rate'     : [0.03, 0.001, 0.01, 0.1],
        'loss_function'     : ['Logloss', 'MultiClass'],
        'min_data_in_leaf'  : [1, 10],
        # 'l2_leaf_reg'       : [1, 10, 100]
    }

  start_time = time.time()

  search = GridSearchCV (clf, space, n_jobs=-1, cv=folds, scoring = 'roc_auc')
  result = search.fit (ind, dep)

  # Whole seconds spent in the search (including the final refit).
  tune_time = round((time.time() - start_time))

  if _WriteFlag:
    TuneDF = pd.concat([TuneDF, pd.DataFrame([{'Dataset'    : getDatasetName(fname),
                            'Classifier' : algorithm_name,
                            'Time'       : tune_time,
                            'Params'     : result.best_estimator_.get_params()}])]
                           ,ignore_index=True)

    # 'a' is passed through to Save_CSV; presumably append mode -- confirm.
    Save_CSV (TuneDF, '_HyperTuning.csv', 'a')

  return result.best_estimator_

# **Machine Learning Models**

## **Prepare Models**

In [None]:
def add_feature_selection_to_names(clfs):
    """Tag every key of ``clfs`` with the active feature-selection method.

    ``-PCA`` is appended when ``_PCA`` is set; otherwise ``-GR`` is appended
    when ``_GainRatio`` is set. With neither flag set, ``clfs`` is returned
    untouched (the same object); otherwise a new dict is returned.
    """
    if _PCA:
        suffix = 'PCA'
    elif _GainRatio:
        suffix = 'GR'
    else:
        return clfs

    return {f"{name}-{suffix}": clf for name, clf in clfs.items()}

In [None]:
# -----------------------------------------  Prepare Classifiers

def prepareIndividualModels ():
  """Build the stand-alone classifiers, keyed by short name.

  Keys are passed through ``add_feature_selection_to_names`` before returning,
  so they may carry a ``-PCA`` / ``-GR`` suffix.
  """
  models = dict([
      ('DT',    DecisionTreeClassifier()),
      ('LR',    LogisticRegression()),
      # Linear SVM wrapped in a calibrator (earlier alternative:
      # SVC(kernel='linear', max_iter=1000, probability = True)).
      ('SVM',   CalibratedClassifierCV(LinearSVC())),
      ('MLP',   MLPClassifier()),
      ('SGD',   SGDClassifier(loss= 'log_loss')),
      # 'GP' (GaussianProcessClassifier) is currently disabled.
      ('NB(B)', BernoulliNB()),
      ('NB(G)', GaussianNB()),
      ('NB(M)', MultinomialNB()),
      # 'LDA' (LinearDiscriminantAnalysis) is currently disabled.
      ('KNN',   KNeighborsClassifier()),
  ])

  return add_feature_selection_to_names(models)

In [None]:
def prepareBaggingEnsemble ():
  """Return the individual models re-keyed with a ``BG-`` prefix.

  The estimators are the same objects held in ``indClfs``; only the keys
  change, and the order of ``indClfs`` is kept.
  """
  return {'BG-' + base_name: model for base_name, model in indClfs.items()}

In [None]:
def prepareBoostingEnsemble ():
  """Return the individual models usable as boosting bases, keyed ``BS-<name>``.

  Models listed in ``unsupported`` are left out (the original comment states
  they do not support ``sample_weight``). Keys in ``indClfs`` may carry the
  ``-PCA`` / ``-GR`` suffix added by ``add_feature_selection_to_names``; the
  suffix is ignored for the exclusion check so that e.g. ``'MLP-GR'`` is
  filtered as well, but it is kept in the returned key.

  Returns:
    dict mapping ``'BS-' + name`` to the same estimator objects as ``indClfs``.
  """
  # ------------- Clfs that don't support sample_weight
  unsupported = {'MLP', 'GP', 'KNN', 'LDA'}
  feature_selection_suffixes = ('-PCA', '-GR')

  clfs = {}

  for name, clf in indClfs.items():

    # Strip a trailing feature-selection tag to recover the plain model name.
    base = name
    for suffix in feature_selection_suffixes:
      if base.endswith(suffix):
        base = base[:-len(suffix)]
        break

    if base in unsupported:
      continue

    clfs['BS-' + name] = clf

  return clfs

In [None]:
def prepareTreeBasedEnsembles ():
  """Build the tree-based ensemble classifiers, keyed by short name.

  Keys are passed through ``add_feature_selection_to_names`` before returning.
  """
  ensembles = dict(
      Ada = AdaBoostClassifier (algorithm="SAMME"),
      RF  = RandomForestClassifier (random_state=0),
      ET  = ExtraTreesClassifier (random_state=0),
      GB  = GradientBoostingClassifier (random_state=0),
      HGB = HistGradientBoostingClassifier (),
      # NOTE(review): XGBoost's verbosity switch may be `verbosity` rather
      # than `verbose` -- confirm against the installed version.
      XGB = XGBClassifier (verbose=0),
      CAT = CatBoostClassifier (verbose=0),
  )

  return add_feature_selection_to_names(ensembles)

In [None]:
def get_stacking_meta ():
    """Return fresh meta-learners (final estimators) for stacking, keyed by name."""
    # An SVC(kernel='linear', probability = True) meta-learner is currently disabled.
    return dict(
        LR = LogisticRegression(),
        DT = DecisionTreeClassifier(),
    )

In [None]:
def prepareStackClassifier ():
  """Prepare stacking configurations: one ``Stack-<meta>`` entry per meta-learner.

  The base estimators come from ``indClfs`` when ``incInd`` is set, otherwise
  from ``tenClfs`` when ``incTen`` is set, otherwise the list is empty. Every
  entry maps to the same list of ``[name, estimator]`` pairs.

  Returns:
    (stacks, meta): the name -> estimator-list dict (keys passed through
    ``add_feature_selection_to_names``) and the meta-learner dict.
  """
  if incInd:
    pool = indClfs
  elif incTen:
    pool = tenClfs
  else:
    pool = {}

  base_estimators = [[label, model] for label, model in pool.items()]

  meta = get_stacking_meta ()

  stacks = {'Stack' + '-' + meta_name: base_estimators for meta_name in meta}

  return add_feature_selection_to_names(stacks), meta

In [None]:
def prepareTreeStackClassifier ():
  """Prepare stacking configurations over the tree-based ensembles.

  One ``TBStack-<meta>`` entry is created per meta-learner; every entry maps to
  the same list of ``[name, estimator]`` pairs taken from ``tenClfs``.

  Returns:
    (stacks, meta): the name -> estimator-list dict (keys passed through
    ``add_feature_selection_to_names``) and the meta-learner dict.
  """
  base_estimators = [[label, model] for label, model in tenClfs.items()]

  meta = get_stacking_meta ()

  stacks = {'TBStack' + '-' + meta_name: base_estimators for meta_name in meta}

  return add_feature_selection_to_names(stacks), meta

In [None]:
def prepareVotingEnsemble ():
  """Return a one-entry dict describing the voting ensemble.

  The single ``'Vote'`` key (passed through ``add_feature_selection_to_names``)
  maps to a list of ``[name, estimator]`` pairs built from ``indClfs``.
  """
  members = [[label, model] for label, model in indClfs.items()]

  return add_feature_selection_to_names({'Vote': members})

In [None]:
def prepareModelsTuning (clfs, ind, dep, fname, type='Individual'):
  """Tune every model in ``clfs`` and swap the tuned version into the registry.

  For ``type == 'Individual'`` the previous ``indClfs`` entry is kept in
  ``unIndClfs`` under ``'D-' + name`` and replaced by the tuned estimator;
  ``type == 'TreeBased'`` does the same with ``tenClfs`` / ``unTenClfs``.
  Any other ``type`` tunes the models but stores nothing.
  """
  global indClfs, unIndClfs, tenClfs, unTenClfs

  for name, candidate in clfs.items():

    tuned = Tune_Model (candidate, name, ind, dep, fname)

    if type == 'Individual':
      unIndClfs ['D-' + name] = indClfs [name]
      indClfs [name] = tuned

    if type == 'TreeBased':
      unTenClfs ['D-' + name] = tenClfs [name]
      tenClfs [name] = tuned

In [None]:
def Create_Ensemble (clf, type, meta = ''):
  """Wrap ``clf`` in the ensemble named by ``type``.

  Args:
    clf: A single estimator (Bagging / Boosting) or a list of
      ``[name, estimator]`` pairs (Stacking / TreeStacking / Voting).
    type: One of 'Bagging', 'Boosting', 'Stacking', 'TreeStacking', 'Voting'.
    meta: Final estimator, used only by the two stacking types.

  Returns:
    The ensemble estimator, or ``clf`` unchanged for any other ``type``.

  NOTE(review): ``base_estimator`` has been renamed ``estimator`` in newer
  scikit-learn releases -- confirm against the installed version.
  """
  if type == 'Bagging':
    return BaggingClassifier  (base_estimator=clf)

  if type == 'Boosting':
    return AdaBoostClassifier (base_estimator=clf, algorithm="SAMME")

  # Both stacking flavours are built identically; they differ only in the
  # estimator list the caller supplies.
  if type in ('Stacking', 'TreeStacking'):
    return StackingClassifier (estimators=clf, final_estimator=meta)

  if type == 'Voting':
    return VotingClassifier   (estimators=clf, voting='soft')

  return clf

## **Prepare Dataframes**

In [None]:
# -----------------------------------------  Prepare Data Frames

def prepareDataFrames ():
  """Create the five empty result frames used while evaluating models.

  Returns:
    (ResDF, RocDF, DetDF, MatDF, TimeDF): summary metrics, ROC data,
    detailed metrics (same columns as the summary), actual/predicted pairs,
    and training times.
  """
  metric_columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score','F1-micro', 'F1-weight', 'Brier', 'AUC', 'MCC', 'Type', 'Dataset']

  layouts = (
      metric_columns,                                             # ResDF
      ['Classifier', 'FPR', 'TPR', 'AUC', 'Type', 'Dataset'],     # RocDF
      metric_columns,                                             # DetDF
      ['Classifier', 'Actual', 'Pred', 'Type', 'Dataset'],        # MatDF
      ['Classifier', 'Dataset', 'Training Time'],                 # TimeDF
  )

  ResDF, RocDF, DetDF, MatDF, TimeDF = (pd.DataFrame (columns = cols) for cols in layouts)

  return ResDF, RocDF, DetDF, MatDF, TimeDF

In [None]:
def prepareTuneDataFrame ():
  """Return an empty frame for hyperparameter-tuning log rows."""
  tune_columns = ['Dataset', 'Classifier', 'Time', 'Params']

  return pd.DataFrame  (columns = tune_columns)

In [None]:
# -----------------------------------------  Prepare Stat Data Frames

def prepareStatDataFrame ():
  """Return an empty frame for pairwise statistical-test results."""
  stat_columns = ['Classifier_1', 'Classifier_2', 'Test', 'Stat', 'p-value', 'Null Hypo', 'Win', 'Lost', 'Dataset']

  return pd.DataFrame (columns = stat_columns)

## **Models**

In [None]:
# -----------------------------------------  Machine Learning Models

def Machine_Learning_Models (clfs, ind, dep, fname, type = 'Individual', meta = ''):
  """Evaluate every model in ``clfs`` on one dataset and persist the results.

  The global frames ``Res``, ``Roc``, ``Det``, ``Mat`` and ``TimeDF`` are reset
  at the start. Each model is wrapped via ``Create_Ensemble`` when ``type``
  asks for it, evaluated with ``Evaluate_Model`` and timed. The frames are then
  written with ``Save_CSV`` and the plots enabled by ``_Bar`` / ``_ROC`` /
  ``_Box`` / ``_ESD`` are produced.

  Args:
    clfs: dict name -> estimator, or name -> list of ``[name, estimator]``
      pairs for the stacking and voting types.
    ind: Feature matrix forwarded to ``Evaluate_Model``.
    dep: Target vector forwarded to ``Evaluate_Model``.
    fname: Dataset file name; output paths are derived from it.
    type: 'Individual', 'Bagging', 'Boosting', 'TreeBased', 'Voting',
      'Stacking' or 'TreeStacking'.
    meta: dict of meta-learners for the stacking types. Entries are paired
      positionally with ``clfs``.
  """
  global Res, Roc, Det, Mat, TimeDF

  Res, Roc, Det, Mat, TimeDF = prepareDataFrames ()

  fname = getPath(type) + fname.replace('.csv','_results.csv')

  def _models_to_run ():
    # Yield (name, estimator) pairs; the ensemble wrapper is built lazily,
    # right before the model is evaluated and outside the timed section.
    if type in ['Stacking', 'TreeStacking']:
      for (name, clf), (m, mClf) in zip(clfs.items(), meta.items()):
        yield name, Create_Ensemble (clf, type, mClf)
    else:
      for name, clf in clfs.items():
        if type in ['Bagging', 'Boosting', 'Voting']:
          clf = Create_Ensemble (clf, type)
        yield name, clf

  for name, clf in _models_to_run ():
    start_time = time.time()
    Evaluate_Model (clf, name, ind, dep, fname, type)
    # Measure once so the logged and the recorded durations agree.
    elapsed = time.time() - start_time

    logger.info(f"Training and evaluation time for {name}: {elapsed:.3f} seconds")
    TimeDF = pd.concat([TimeDF, pd.DataFrame([{'Classifier' : name,
                  'Dataset'    : getDatasetName (fname),
                  'Training Time'  : round(elapsed, 3),
                  'Type'       : type }])], ignore_index=True)

  Roc.set_index ('Classifier', inplace=True)

  Save_CSV (Res, fname)
  Save_CSV (Det, fname.replace('results.csv', 'detailed_results.csv'))
  Save_CSV (Mat, fname.replace('results.csv', 'matrix_results.csv'))
  Save_CSV (TimeDF, fname.replace('results.csv', 'Training_Time.csv'))

  if _Bar : Plot_BarChart (Res, fname)
  if _ROC : Plot_ROC      (Roc, fname)
  if _Box : Plot_Boxplot  (Det, fname)
  if _ESD :
    # Best effort: the ranking is skipped when it cannot be computed.
    # Catch Exception rather than everything so Ctrl-C still interrupts,
    # and report the reason instead of discarding it.
    try:
      Plot_ScottKnottESD (ScottKnottESD(Det, fname), fname)
    except Exception as exc:
      print('Only one model cant run ESD')
      print(f'ScottKnottESD error: {exc!r}')

## **Compare Models**

In [None]:
# -----------------------------------------  Comparing Models Within Dataset

def Compare_Models_Within_Dataset (ind, dep, indTun, depTun, fname):
  """Run every enabled model family on one dataset and collect the results.

  Each enabled family (``incInd``, ``incBag``, ``incBos``, ``incTen``,
  ``incVot``, ``incSta``, ``incTenSta``) is evaluated through
  ``Machine_Learning_Models`` and its frames are accumulated in the per-dataset
  globals ``allRes`` / ``allRoc`` / ``allDet`` / ``allMat`` / ``allTimeDF``.
  Depending on the comparison flags, the accumulated frames are then compared,
  appended to the cross-dataset ``allDataset*`` globals, and tested
  statistically.

  The individual-model run, when enabled, re-initialises the per-dataset
  accumulators; with ``incInd`` off they keep whatever they held before the
  call.

  Args:
    ind, dep: Features / target used for evaluation.
    indTun, depTun: Features / target used for hyperparameter tuning.
    fname: Dataset file name.
  """
  global allRes, allRoc, allDet, allMat, allTimeDF, allStat, allDatasetRes, allDatasetRoc, allDatasetDet, allDatasetMat, allDatasetTimeDF, allDatasetStat

  def _collect_latest ():
    # Append the frames left by the most recent Machine_Learning_Models run.
    # Training times are included for every family; the tree-based branch
    # previously skipped them, so those timings were missing from the output.
    global allRes, allRoc, allDet, allMat, allTimeDF
    allRes     =  pd.concat([allRes, Res])
    allRoc     =  pd.concat([allRoc, Roc])
    allDet     =  pd.concat([allDet, Det])
    allMat     =  pd.concat([allMat, Mat])
    allTimeDF  =  pd.concat([allTimeDF, TimeDF])

  if _GridSearch:
    if incInd : prepareModelsTuning (indClfs, indTun, depTun, fname, 'Individual')
    if incTen : prepareModelsTuning (tenClfs, indTun, depTun, fname, 'TreeBased')

  if incInd:
    Machine_Learning_Models (indClfs, ind, dep, fname, 'Individual')
    # Start the per-dataset accumulators from the individual results.
    allRes = Res
    allRoc = Roc
    allDet = Det
    allMat = Mat
    allTimeDF = TimeDF

  if incBag:
    bagClfs = prepareBaggingEnsemble ()
    Machine_Learning_Models (bagClfs, ind, dep, fname, 'Bagging')
    _collect_latest ()

  if incBos:
    bosClfs = prepareBoostingEnsemble ()
    Machine_Learning_Models (bosClfs, ind, dep, fname, 'Boosting')
    _collect_latest ()

  if incTen:
    Machine_Learning_Models (tenClfs, ind, dep, fname, 'TreeBased')
    _collect_latest ()

  if incVot:
    votClfs = prepareVotingEnsemble ()
    Machine_Learning_Models (votClfs, ind, dep, fname, 'Voting')
    _collect_latest ()

  if incSta:
    stackClfs, metaClfs = prepareStackClassifier ()
    Machine_Learning_Models (stackClfs, ind, dep, fname, 'Stacking', metaClfs)
    _collect_latest ()

  if incTenSta:
    tenstackClfs, metaClfs = prepareTreeStackClassifier ()
    Machine_Learning_Models (tenstackClfs, ind, dep, fname, 'TreeStacking', metaClfs)
    _collect_latest ()

  # Untuned ("D-" prefixed) models are evaluated under the same Type label as
  # their tuned counterparts so they can be compared side by side.
  if _CompIndTune:
    Machine_Learning_Models (unIndClfs, ind, dep, fname, 'Individual')
    _collect_latest ()
    Compare_Tune_Vs_Default (allRes, allRoc, allDet, fname)

  if _CompTenTune:
    Machine_Learning_Models (unTenClfs, ind, dep, fname, 'TreeBased' )
    _collect_latest ()
    Compare_Tune_Vs_Default (allRes, allRoc, allDet, fname)

  if _CompInd:
    Compare_Models_Vs_Indiv (allRes, allRoc, allDet, fname)

  if _CompAll:
    Compare_Models (allRes, allRoc, allDet, fname)
    allDatasetRes  =  pd.concat([allDatasetRes, allRes])
    allDatasetRoc  =  pd.concat([allDatasetRoc, allRoc])
    allDatasetDet  =  pd.concat([allDatasetDet, allDet])
    allDatasetMat  =  pd.concat([allDatasetMat, allMat])
    allDatasetTimeDF  =  pd.concat([allDatasetTimeDF, allTimeDF])

  if _StatFlag:
    allStat = Stat_Test (allDet, fname, save=False)
    allDatasetStat = pd.concat([allDatasetStat, allStat])

In [None]:
# -----------------------------------------  Comparing Models

def Compare_Models (allRes, allRoc, allDet, fname, type = ''):
  """Plot, test and save a side-by-side comparison of the given results.

  Args:
    allRes: Summary metrics to save.
    allRoc: ROC data, plotted when ``_ROC`` is set.
    allDet: Detailed metrics, used for box plots, ScottKnottESD and the
      statistical test, and saved.
    fname: Dataset file name; the output path is derived from it.
    type: Label inserted into the output file name.

  Note that the training-time file is written from the module-level
  ``allTimeDF``, not from an argument.
  """

  fname = getPath('Compare') + fname.replace('.csv', '_' + type + '_results.csv')

  if _ROC : Plot_ROC     (allRoc, fname)
  if _Box : Plot_Boxplot (allDet, fname)
  if _ESD :
    # Best effort: skip the ranking when it cannot be computed. Catch
    # Exception rather than everything so Ctrl-C still interrupts, and
    # report the reason instead of discarding it.
    try:
      Plot_ScottKnottESD (ScottKnottESD(allDet, fname), fname)
    except Exception as exc:
      print('Cant run ESD')
      print(f'ScottKnottESD error: {exc!r}')

  # Stat_Test prepends the 'Compare' path itself, so strip it here.
  if _StatFlag : Stat_Test (allDet, fname.replace(getPath('Compare'), ''))

  Save_CSV (allRes, fname)
  Save_CSV (allDet, fname.replace('results.csv', 'detailed_results.csv'))
  Save_CSV (allTimeDF, fname.replace('results.csv', 'Training_Time.csv'))

In [None]:
# -----------------------------------------  Comparing Against Individual Models

def Compare_Models_Vs_Indiv (allRes, allRoc, allDet, fname):
  """Compare each enabled ensemble family against the individual models.

  For every enabled family, the rows whose ``Type`` is 'Individual' or that
  family are selected from the three frames and handed to ``Compare_Models``
  together with the family's label.
  """
  # (enabled flag, Type value of the family, label used in output file names)
  pairings = [
      (incBag,    'Bagging',      'IndxBag'),
      (incBos,    'Boosting',     'IndxBos'),
      (incTen,    'TreeBased',    'IndxTen'),
      (incSta,    'Stacking',     'IndxStack'),
      (incTenSta, 'TreeStacking', 'IndxTreeStack'),
      (incVot,    'Voting',       'IndxVote'),
  ]

  def _individual_or (frame, family):
    # Rows belonging to the individual models or to the given family.
    return frame.loc[(frame.Type == 'Individual') | (frame.Type == family)]

  for enabled, family, label in pairings:
    if not enabled:
      continue

    Res = _individual_or (allRes, family)
    Roc = _individual_or (allRoc, family)
    Det = _individual_or (allDet, family)
    Compare_Models (Res, Roc, Det, fname, label)

In [None]:
def Compare_Tune_Vs_Default (allRes, allRoc, allDet, fname):
  """Compare tuned and default models within one model family.

  With ``_CompIndTune`` set, the 'Individual' rows are compared; with
  ``_CompTenTune`` set, the 'TreeBased' rows are compared.
  """
  # (enabled flag, Type value to keep, label used in output file names)
  comparisons = (
      (_CompIndTune, 'Individual', 'IndxTune'),
      (_CompTenTune, 'TreeBased',  'TenxTune'),
  )

  for enabled, family, label in comparisons:
    if enabled:
      Res = allRes.loc[(allRes.Type == family)]
      Roc = allRoc.loc[(allRoc.Type == family)]
      Det = allDet.loc[(allDet.Type == family)]
      Compare_Models (Res, Roc, Det, fname, label)

In [None]:
# -----------------------------------------  Comparing Models Across Datasets

def Save_Models_All_Datasets ():
  """Write the cross-dataset result frames to timestamped Excel files.

  Files go under ``getPath('Compare')``; each name is built from
  ``getToday()``, ``getTime()`` and a fixed suffix.
  """
  folder = getPath('Compare')

  # The confusion-matrix frame (allDatasetMat, '_AllDatasetMat.xlsx') is
  # intentionally not exported at the moment.
  exports = [
      (allDatasetRes,    '_AllDatasetResults.xlsx'),
      (allDatasetRoc,    '_AllDatasetROC.xlsx'),
      (allDatasetDet,    '_AllDatasetDet.xlsx'),
      (allDatasetStat,   '_AllDatasetStatisticalAnalysis.xlsx'),
      (allDatasetTimeDF, '_AllDatasetTrainingTime.xlsx'),
  ]

  for frame, suffix in exports:
    # The timestamp is re-read for every file, as before.
    Save_Excel (frame, folder + getToday() + '_' + getTime() + suffix)

## **Statistical Analysis**

In [None]:
def Check_Normal (sample):
  """Return True when a Shapiro-Wilk test does not reject normality at 0.05.

  Args:
    sample: Sequence of observations passed to ``scipy.stats.shapiro``.

  Returns:
    bool: True if the test's p-value is greater than 0.05, else False.
  """
  significance = 0.05

  _, p_value = shapiro(sample)

  return bool(p_value > significance)

In [None]:
def Stat_Test (allDet, fname):
  """Pairwise significance tests between all classifiers on one metric.

  NOTE: a later definition of ``Stat_Test`` in this file rebinds the name, so
  this version is only in effect until that cell is executed.

  For every pair of classifiers the ``statMetric`` scores are compared with a
  paired t-test when both samples pass ``Check_Normal``, otherwise with a
  Wilcoxon signed-rank test. The significance threshold is 0.05 divided by the
  number of pairs. The result table is written next to the comparison results
  as ``*statistical_results.xlsx``.

  Args:
    allDet: Detailed results with a 'Classifier' column and the metric column.
    fname: Dataset file name; the output path is derived from it.

  Returns:
    DataFrame with one row per classifier pair.
  """

  fname   = getPath('Compare') + fname.replace('.csv','_results.csv')
  statRes = prepareStatDataFrame ()
  Metric  = statMetric #'Accuracy'

  statDet = pd.concat([allDet['Classifier'], allDet[Metric]], axis=1)
  names = statDet['Classifier'].unique().tolist()

  original_alpha = 0.05  # Original significance level
  k = len(names)  # Number of models
  m = k * (k - 1) / 2  # Number of pairwise comparisons

  # Adjust the threshold p-value. With fewer than two models there are no
  # comparisons (m == 0); keep the unadjusted level instead of dividing by zero.
  alpha_adjusted = original_alpha / m if m else original_alpha

  rows = []

  for i, name in enumerate(names):

    for nested in names[i+1:]:

      model_1 =  statDet.loc[statDet['Classifier'] == name  ][Metric]
      model_2 =  statDet.loc[statDet['Classifier'] == nested][Metric]

      m1_score = model_1.mean()
      m2_score = model_2.mean()

      win, lost = '',''

      if Check_Normal (model_1) and Check_Normal (model_2):
        test = 't-test'
        stat, p = ttest_rel(model_1, model_2)
      else:
        test = 'Wilcoxon'
        # Equal means are reported as "no difference" without running the test.
        if m1_score == m2_score : stat, p = 1, 1
        else                    : stat, p = wilcoxon (model_1, model_2)

      if p > alpha_adjusted:
        decision = 'Accept'
      else:
        decision = 'Reject'
        if   m1_score > m2_score : win, lost = name, nested
        else                     : win, lost = nested, name

      rows.append({'Classifier_1' : name,
                   'Classifier_2' : nested,
                   'Test'         : test,
                   'Stat'         : stat,
                   'p-value'      : p,
                   'Null Hypo'    : decision,
                   'Win'          : win,
                   'Lost'         : lost,
                   'Dataset'      : getDatasetName (fname)})

  # Append all rows at once instead of re-copying the frame per pair.
  if rows:
    statRes = pd.concat([statRes, pd.DataFrame(rows)], ignore_index=True)

  Save_Excel (statRes, fname.replace('results.csv', 'statistical_results.xlsx'))

  return statRes

In [None]:
def Stat_Test(allDet, fname, save=True):
    """Pairwise significance tests with Bonferroni-corrected p-values.

    For every pair of classifiers the ``statMetric`` scores are compared with a
    paired t-test when both samples pass ``Check_Normal``, otherwise with a
    Wilcoxon signed-rank test. The raw p-values are corrected with
    ``multipletests(method='bonferroni')`` and compared against 0.05.

    Args:
        allDet: Detailed results with a 'Classifier' column and the metric
            column.
        fname: Dataset file name; the output path is derived from it.
        save: When True, write the table as ``*statistical_results.xlsx``.

    Returns:
        DataFrame with one row per classifier pair. 'p-value' holds the
        corrected p-value; 'Test' and 'Stat' belong to that same pair.
    """
    fname = getPath('Compare') + fname.replace('.csv', '_results.csv')
    statRes = prepareStatDataFrame()
    Metric = statMetric  # 'Accuracy'

    statDet = pd.concat([allDet['Classifier'], allDet[Metric]], axis=1)
    names = statDet['Classifier'].unique().tolist()

    original_alpha = 0.05  # Original significance level

    p_values = []
    comparisons = []

    for i, name in enumerate(names):
        nestedNames = names[i + 1:]
        for nested in nestedNames:
            model_1 = statDet.loc[statDet['Classifier'] == name][Metric]
            model_2 = statDet.loc[statDet['Classifier'] == nested][Metric]

            normal_1 = Check_Normal(model_1)
            normal_2 = Check_Normal(model_2)

            if normal_1 and normal_2:
                test = 't-test'
                stat, p = ttest_rel(model_1, model_2)
            else:
                test = 'Wilcoxon'
                # With identical paired samples every difference is zero and
                # the signed-rank test has nothing to rank; report "no
                # difference" directly, using the same (1, 1) placeholder as
                # the earlier Stat_Test definition in this file.
                if model_1.reset_index(drop=True).equals(model_2.reset_index(drop=True)):
                    stat, p = 1, 1
                else:
                    stat, p = wilcoxon(model_1, model_2)

            p_values.append(p)
            # Keep the test name and statistic with the pair they belong to;
            # reading them after the loop would repeat the last pair's values
            # on every row.
            comparisons.append((name, nested, model_1.mean(), model_2.mean(), test, stat))

    # With fewer than two classifiers there is nothing to correct or report.
    if p_values:
        corrected_p_values = multipletests(p_values, alpha=original_alpha, method='bonferroni')[1]

        rows = []
        for (name, nested, mean_1, mean_2, test, stat), p in zip(comparisons, corrected_p_values):
            decision = 'Accept' if p > original_alpha else 'Reject'
            win, lost = ('', '') if decision == 'Accept' else ((name, nested) if mean_1 > mean_2 else (nested, name))

            rows.append({
                'Classifier_1': name,
                'Classifier_2': nested,
                'Test': test,
                'Stat': stat,
                'p-value': p,
                'Null Hypo': decision,
                'Win': win,
                'Lost': lost,
                'Dataset': getDatasetName(fname)
            })

        statRes = pd.concat([statRes, pd.DataFrame(rows)], ignore_index=True)

    if save: Save_Excel(statRes, fname.replace('results.csv', 'statistical_results.xlsx'))

    return statRes

In [None]:
def ScottKnottESD (Det, fname):
  """Rank classifiers with ``sk.sk_esd`` and attach the rank to each result row.

  Args:
    Det: Detailed results; must contain 'Classifier' and the ``ESDmetric`` column.
    fname: Results file name; the Excel output name is derived from it.

  Returns:
    ``Det`` (with renamed columns) merged with a per-classifier 'rank' column.
  """

  # Rename to the variable/value layout handed to sk.long2wide.
  boxplot_results = Det
  boxplot_results = boxplot_results.rename(columns = {'Classifier': 'variable', ESDmetric: 'value'}, inplace = False)

  # `sk` is defined elsewhere in the file -- presumably the R ScottKnottESD
  # package loaded through rpy2; confirm.
  data = sk.long2wide (boxplot_results)
  print (data)

  r_sk = sk.sk_esd(data)
  print (r_sk[2])

  # Restore the original classifier names in the result's name vector.
  r_sk[2] = fixScottNames(r_sk[2])

  # NOTE(review): `ranking` is not used afterwards.
  ranking = pd.DataFrame({'columns':r_sk[2], 'rank':list(r_sk[1])}) # long format

  # View the result as a dict keyed by component name; only 'nms', 'ord' and
  # 'groups' are used below.
  a_dict = dict(zip(r_sk.names, list(r_sk)))
  data_dict = { 'nms': a_dict['nms'], 'ord': a_dict['ord']}
  rank_val = a_dict['groups']

  # 1-based position -> classifier name.
  ord_dict = {}
  for i in range(0,len(data_dict['nms'])):
    ord_dict[i+1] = data_dict['nms'][i]

  # Re-order the names through the 'ord' component.
  ord_dict2 =  {}
  for i in range(0,len(data_dict['nms'])):
    ord_dict2[i+1] = ord_dict[a_dict['ord'][i]]

  # Pair each re-ordered name with the group value at the same position.
  rank_dict =  {}
  for i in range(0,len(data_dict['nms'])):
    rank_dict[ord_dict2[i+1]] = rank_val[i]

  print (rank_dict)

  rank = pd.DataFrame.from_dict(rank_dict, orient='index',  columns=['rank'])
  rank.reset_index(inplace=True)
  rank.rename(columns = {'index': 'variable'}, inplace = True)

  # Inner merge: rows whose classifier name has no rank entry are dropped.
  boxplot_rank = pd.merge(boxplot_results, rank, on='variable')
  # NOTE(review): 'value' is renamed to the literal 'AUC' although the column
  # came from ESDmetric; if ESDmetric is not 'AUC' this may duplicate an
  # existing 'AUC' column -- confirm.
  boxplot_rank.rename(columns = {'variable': 'Classifier', 'value': 'AUC'}, inplace = True)
  #print('boxplot_rank',boxplot_rank)

  if _WriteFlag : Save_Excel (boxplot_rank, fname.replace('results.csv', 'ScottKnotESD_results.xlsx'))

  return boxplot_rank


def fixScottNames( rank ):
  """Restore classifier names whose punctuation was replaced by dots.

  The Naive Bayes names contain parentheses (``NB(B)``) and the other names use
  hyphens (``BG-DT``, ``DT-GR``). In the incoming names those characters appear
  as dots (``NB.B.``, ``BG.DT``, ``DT.GR``). This function maps them back:

  * the dot directly after the first ``NB`` and the next dot after it become
    ``(`` and ``)``;
  * every remaining dot becomes ``-``.

  Args:
    rank: Mutable, indexable sequence of names; it is updated in place.

  Returns:
    The same ``rank`` object.
  """
  for i in range(len(rank)):
    original = rank[i]
    fixed = original

    # Anchor on the 'NB.' occurrence itself, so a prefix such as 'BG.' or
    # 'BS.' in front of it does not capture the opening parenthesis.
    nb_at = fixed.find('NB.')
    if nb_at != -1:
      open_at = nb_at + 2
      close_at = fixed.find('.', open_at + 1)
      # Restore the pair only when a closing dot exists; otherwise the dot is
      # treated like any other separator below.
      if close_at != -1:
        fixed = (fixed[:open_at] + '(' + fixed[open_at + 1:close_at] + ')'
                 + fixed[close_at + 1:])

    # All remaining dots stand for hyphens ('BG.', 'BS.', 'Stack.', '.GR', ...).
    fixed = fixed.replace('.', '-')

    # Write back only real changes.
    if fixed != original:
      rank[i] = fixed

  return rank

In [None]:
def set_logging(log_fname):
    """Configure root logging (console plus ``<log_fname>.log``) and return a module logger.

    ``logging.basicConfig`` sets the INFO level and the message format; a file
    handler with the same format is then attached to the root logger.

    Args:
        log_fname: Log file path without the ``.log`` extension.

    Returns:
        ``logging.getLogger(__name__)``.
    """
    message_format = '%(asctime)s - %(levelname)s - %(message)s'

    logging.basicConfig(level=logging.INFO, format=message_format)

    # Mirror everything at INFO and above into the log file as well.
    to_file = logging.FileHandler(log_fname + '.log')
    to_file.setLevel(logging.INFO)
    to_file.setFormatter(logging.Formatter(message_format))
    logging.getLogger().addHandler(to_file)

    return logging.getLogger(__name__)

import logging

def set_logging(log_fname):
    """Return this module's logger, writing INFO+ records to ``<log_fname>.log``.

    Calling the function again with the same file name (e.g. when a notebook
    cell is re-run) does not attach a second handler for that file, so records
    are not written twice.

    Args:
        log_fname: Log file path without the ``.log`` extension.

    Returns:
        The logger named ``__name__``, at level INFO, with propagation to the
        root logger disabled.
    """
    import os  # local import: only needed to normalise the log path

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    log_filename = log_fname + '.log'
    # FileHandler stores its target as an absolute path in `baseFilename`.
    target = os.path.abspath(log_filename)

    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == target
        for handler in logger.handlers
    )

    if not already_attached:
        file_handler = logging.FileHandler(log_filename)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logger.addHandler(file_handler)

    # Keep this logger's records out of the root logger's handlers.
    logger.propagate = False

    return logger


#     **---    Main**

In [None]:
# -----------------------------------------  Global Variables

# Model registries: short name -> estimator. Entries may later be replaced by
# tuned versions (see prepareModelsTuning).
indClfs = prepareIndividualModels ()
tenClfs = prepareTreeBasedEnsembles ()

# Frames for the current run, for the current dataset, and across all datasets.
Res, Roc, Det, Mat, TimeDF = prepareDataFrames ()
allRes, allRoc, allDet, allMat, allTimeDF = prepareDataFrames ()

allDatasetRes, allDatasetRoc, allDatasetDet, allDatasetMat, allDatasetTimeDF = prepareDataFrames ()
allDatasetStat = prepareStatDataFrame ()

# -------------- Untuned Clfs
# Filled by prepareModelsTuning with the pre-tuning estimators, keyed 'D-<name>'.
unIndClfs = {}
unTenClfs = {}

# --------------- logger
# `output_dir` and `namestr` are defined elsewhere in the file; the log file
# name is derived from the selected dataset list.
log_fname= output_dir + namestr(fnames, globals())
logger = set_logging(log_fname)

In [None]:
# ====================     Execution Starts Here  ===================

printStart()

# Process every selected dataset: preprocess, split into evaluation and tuning
# variables, then run and compare all enabled model families.
for fname in fnames:
    
  logger.info(f" ====================  {fname}  ==================== ")

  printString (getDatasetName(fname))

  newFile = Dataset_Preprocess (fname)

  ind, dep, indTun, depTun = prepareVars (newFile)

  Compare_Models_Within_Dataset (ind, dep, indTun, depTun, fname)

# Write the results accumulated across all datasets.
Save_Models_All_Datasets ()

printDone()

# ====================     Execution Ends Here  ===================