## Importing packages

In [46]:
import numpy as np
import pandas as pd
import warnings
import os
import random

from sklearn import preprocessing
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneGroupOut
from sklearn import metrics

from imblearn.pipeline import Pipeline as ImbPipeline
from itertools import product
import time

import pickle

## Importing data

In [58]:
# Moving to project directory.
# Guarded so the cell is idempotent: re-running it (or the whole notebook on a
# live kernel) must not climb one directory higher each time. We only step up
# when the processed data file read by the next cell is not reachable from the
# current working directory.
if not os.path.isfile("data/processed/data_set.pkl"):
    os.chdir('..')

In [59]:
# Load the processed table; the following cells slice it by column position.
# NOTE: unpickling can execute arbitrary code -- only read files you trust.
data_set = pd.read_pickle(filepath_or_buffer="data/processed/data_set.pkl")
data_set

Unnamed: 0,baseClassifier,datasetName,attributeName,dsMatrixCorrelSD,dsEigenvaluePropIntercept,dsEigenvalueCumulativeIntercept,dsChiSquaredMax,attChiSquaredNormalized,attClassifierLogisticNormalized,attCorrelationNormalized,attReliefFNormalized,attSymmetricalUncertNormalized,dsLOGnInstances,dsLnNumClasses,wrapperRelevance
0,simpleLogisticRegression,iris,petallength,0.666645,0.493463,0.719181,268.419048,1.000000,0.988836,1.000000,0.954787,0.925373,2.176091,1.098612,no
1,simpleLogisticRegression,iris,petalwidth,0.666645,0.493463,0.719181,268.419048,0.994371,1.000000,0.962602,1.000000,1.000000,2.176091,1.098612,yes
2,simpleLogisticRegression,iris,sepallength,0.666645,0.493463,0.719181,268.419048,0.470954,0.657097,0.777236,0.372340,0.469575,2.176091,1.098612,no
3,simpleLogisticRegression,iris,sepalwidth,0.666645,0.493463,0.719181,268.419048,0.263171,0.357257,0.645528,0.324468,0.275545,2.176091,1.098612,no
4,simpleLogisticRegression,labor_part1,wage-increase-second-year,0.406496,0.216496,0.398940,15.064286,1.000000,1.000000,0.878689,0.956530,1.000000,1.755875,0.693147,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,simpleLogisticRegression,zoo,aquatic,0.381562,0.156774,0.517723,208.165859,0.222956,0.350015,0.463506,0.363265,0.383380,2.004321,1.945910,yes
489,simpleLogisticRegression,zoo,catsize,0.381562,0.156774,0.517723,208.165859,0.178550,0.249975,0.555631,0.366118,0.299295,2.004321,1.945910,no
490,simpleLogisticRegression,zoo,venomous,0.381562,0.156774,0.517723,208.165859,0.091768,0.000000,0.297375,0.046668,0.156368,2.004321,1.945910,no
491,simpleLogisticRegression,zoo,predator,0.381562,0.156774,0.517723,208.165859,0.058555,0.000000,0.154953,0.105274,0.090641,2.004321,1.945910,no


In [49]:
# Target: the last column of the table, label-encoded to integers.
# `le` is kept around because later cells reuse the same encoder object.
le = preprocessing.LabelEncoder()
y = pd.DataFrame(le.fit_transform(data_set.iloc[:, -1]))

# Predictors (independent variables): columns from position 3 up to,
# but excluding, the target column.
X = data_set.iloc[:, 3:-1]

# Grouping key used in cross-validation: the original dataset of each row.
groups = data_set["datasetName"]

# Attribute name of each row.
attributes = data_set["attributeName"]

# Every dataset takes one turn as the held-out test set.
test_cols = groups.unique().tolist()

# For a quick smoke test, restrict X and groups to the first rows instead,
# e.g. data_set.iloc[:29, 3:-1] and data_set.iloc[:29, :].datasetName.

In [50]:
# Load one pickled frame per original dataset and expose it as a notebook-level
# variable named dataset_<datasetName> (same names the previous exec-based
# version created).
# - No exec(): the variable is bound directly through globals().
# - The path is built with os.path.join instead of a hard-coded
#   "data\processed\..." string, whose "\p" and "\d" are invalid escape
#   sequences and which only resolves on Windows.
# NOTE: unpickling can execute arbitrary code -- only read files you trust.
for i in groups.unique():
    globals()[f"dataset_{i}"] = pd.read_pickle(
        os.path.join("data", "processed", f"dataset_{i}.pkl")
    )

In [51]:
# Supressing warnings
warnings.filterwarnings("ignore")

## HistGradientBoostingClassifier Parameters

In [52]:
# Candidate hyper-parameter values for HistGradientBoostingClassifier.
# The key order matters: itertools.product varies the right-most key fastest,
# which reproduces the ordering of the previous string/eval-based construction
# (max_iter, then max_depth, then min_samples_leaf, then learning_rate).
hist_param_values = {
    "max_iter": [100, 1000],
    "max_depth": [None, 4, 6, 8],
    "min_samples_leaf": [10, 20],
    "learning_rate": [0.01, 0.1],
}

# One unfitted estimator per combination (2 * 4 * 2 * 2 = 32 candidates).
# Built by passing keyword arguments directly -- no source strings, no eval().
hist_parameters = [
    HistGradientBoostingClassifier(**dict(zip(hist_param_values, combination)))
    for combination in product(*hist_param_values.values())
]
hist_parameters

[HistGradientBoostingClassifier(learning_rate=0.01, min_samples_leaf=10),
 HistGradientBoostingClassifier(min_samples_leaf=10),
 HistGradientBoostingClassifier(learning_rate=0.01),
 HistGradientBoostingClassifier(),
 HistGradientBoostingClassifier(learning_rate=0.01, max_depth=4,
                                min_samples_leaf=10),
 HistGradientBoostingClassifier(max_depth=4, min_samples_leaf=10),
 HistGradientBoostingClassifier(learning_rate=0.01, max_depth=4),
 HistGradientBoostingClassifier(max_depth=4),
 HistGradientBoostingClassifier(learning_rate=0.01, max_depth=6,
                                min_samples_leaf=10),
 HistGradientBoostingClassifier(max_depth=6, min_samples_leaf=10),
 HistGradientBoostingClassifier(learning_rate=0.01, max_depth=6),
 HistGradientBoostingClassifier(max_depth=6),
 HistGradientBoostingClassifier(learning_rate=0.01, max_depth=8,
                                min_samples_leaf=10),
 HistGradientBoostingClassifier(max_depth=8, min_samples_leaf=10),
 H

## Modelling

In [53]:
# Pipeline skeleton: every step is a placeholder that the grid below fills in.
pipe = ImbPipeline(
    [
        ("scaling", None),
        ("balance", None),
        ("classifier", None),
    ]
)

# No scaling and no re-balancing; only the classifier configuration is searched.
param_grid = [
    {
        "scaling": ['passthrough'],
        'balance': [None],
        'classifier': hist_parameters,
    }
]

# Output locations. Built with os.path.join so the notebook is not tied to
# Windows path separators, and created up front so that a missing folder cannot
# make a run fail only after the (long) grid search has finished.
models_dir = 'models'
results_dir = os.path.join(models_dir, 'results')
os.makedirs(results_dir, exist_ok=True)

# Fit the label encoder ONCE on the complete target column. Re-fitting it on each
# test/training subset would give a different integer mapping whenever a subset
# contains a single class, silently corrupting the accuracy and the selected
# feature lists.
le.fit(data_set.iloc[:, -1])

# Leave-one-dataset-out evaluation: each dataset in turn is the held-out test set
# and the model is tuned on all the remaining ones.
for i in test_cols:

    model_path = os.path.join(models_dir, f'{i}_model.sav')

    # Skip datasets whose model was already fitted and saved by a previous run.
    if os.path.isfile(model_path):
        print(f'The {i}_model  already exists.')
        continue

    # Random seed, stored with the results so that a run can be replicated.
    SEED = random.randint(0, 10000)

    start_time = time.time()

    results_modeling = {'dataset': i, 'seed': SEED}

    # defining test_set
    print('\ntest_set:', i, '\n')

    is_test_row = data_set.datasetName.isin([i])

    test_set = data_set[is_test_row].copy()
    X_test = test_set.iloc[:, 3:-1]
    groups_test = test_set.datasetName
    attributes_test = test_set.attributeName
    y_test = le.transform(test_set.iloc[:, -1])

    # defining training_set
    training_set = data_set[~is_test_row].copy()
    X_training = training_set.iloc[:, 3:-1]
    groups_training = training_set.datasetName
    y_training = le.transform(training_set.iloc[:, -1])

    # fitting model: inner cross-validation also leaves one dataset out at a time
    np.random.seed(SEED)
    grid = GridSearchCV(
        pipe,
        n_jobs=1,
        param_grid=param_grid,
        scoring='accuracy',
        cv=LeaveOneGroupOut(),
        return_train_score=True,
    )
    grid.fit(X_training, y_training, groups=groups_training)
    print(grid.best_estimator_)
    print("Cross Validation Best Accuracy (validation_set):", grid.best_score_)

    results_modeling['CV_best_model_validation_score'] = grid.best_score_

    # save the best model to disk (context manager closes the file handle)
    with open(model_path, 'wb') as model_file:
        pickle.dump(grid.best_estimator_, model_file)

    # make predictions using the best estimator
    y_pred = grid.best_estimator_.predict(X_test)
    test_accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy Model Relevant Features (test_set):", test_accuracy)

    results_modeling['Accuracy_test_set'] = test_accuracy

    # true labels and predictions side by side, one row per attribute of the test dataset
    pred = pd.DataFrame(
        {
            'dataset': groups_test,
            'y_test': y_test,
            'y_pred': y_pred,
            'attributeName': attributes_test,
        }
    )

    if pred.y_pred.sum() == 0:
        # The model selected no feature: force the one with the highest predicted
        # probability for class 1. The accuracy reported above still reflects the
        # raw (unforced) predictions.
        proba_df = pd.DataFrame(grid.best_estimator_.predict_proba(X_test))
        forced_attribute = pred.attributeName.iloc[proba_df.iloc[:, 1].idxmax()]

        selected_attributes = [forced_attribute]
        # Compare whole names: wrapping the name in set() would split it into
        # single characters and leave the forced attribute in the remove list.
        removed_attributes = [
            name for name in pred.attributeName.unique() if name != forced_attribute
        ]
    else:
        selected_attributes = pred.loc[pred.y_pred == 1, 'attributeName'].tolist()
        # None (rather than an empty list) when there is nothing to remove
        removed_attributes = pred.loc[pred.y_pred == 0, 'attributeName'].tolist() or None

    results_modeling['variaveis_new_model_test_set'] = selected_attributes
    results_modeling['variaveis_new_model_test_set_remove'] = removed_attributes

    # Record the elapsed time BEFORE pickling, otherwise it never reaches the file.
    time_spent = time.time() - start_time
    results_modeling['time'] = time_spent

    # Save the results dictionary to disk
    results_path = os.path.join(results_dir, f'{i}_dict.pkl')
    with open(results_path, 'wb') as file:
        pickle.dump(results_modeling, file)

    print("Time spent: ", time_spent)

    print('------------------------------------------------------------------------------')
    



test_set: iris 

Pipeline(steps=[('scaling', 'passthrough'), ('balance', None),
                ('classifier',
                 HistGradientBoostingClassifier(max_depth=8, max_iter=1000,
                                                min_samples_leaf=10))])
Cross Validation Best Accuracy (validation_set): 0.678341260416732
Accuracy Model Relevant Features (test_set): 0.5
Time spent:  6173.346262216568
------------------------------------------------------------------------------

test_set: labor_part1 

Pipeline(steps=[('scaling', 'passthrough'), ('balance', None),
                ('classifier',
                 HistGradientBoostingClassifier(learning_rate=0.01,
                                                max_depth=6))])
Cross Validation Best Accuracy (validation_set): 0.6795702724948007
Accuracy Model Relevant Features (test_set): 0.625
Time spent:  6006.8883101940155
------------------------------------------------------------------------------

test_set: labor_part2 

Pipeline