<a href="https://colab.research.google.com/github/alaudee/rf_model_credit_default_classification/blob/main/random_oti_GA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import pygad as pygad
import seaborn as sns
from scipy import stats

from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Convert the whitespace-delimited .dat file into a CSV and load it as `base`.
arquivo_dat = 'australian.dat'
arquivo_csv = 'australian.csv'

# The raw file has no header row; the 15 columns are labelled A1..A15.
columns_names = ['A1','A2','A3','A4','A5','A6','A7','A8','A9','A10','A11','A12','A13','A14','A15']

# Raw string: '\s' inside a normal literal is an invalid escape sequence and
# makes Python emit a warning on this line.
df = pd.read_csv(arquivo_dat, sep=r'\s+', header=None, names=columns_names)

df.to_csv(arquivo_csv, index=False)

print(f"Arquivo '{arquivo_dat}' convertido para '{arquivo_csv}' com sucesso!")

# Re-read the file that was just written, using the same path variable.
base = pd.read_csv(arquivo_csv)

Arquivo 'australian.dat' convertido para 'australian.csv' com sucesso!


  df = pd.read_csv(arquivo_dat, sep='\s+', header = None, names = columns_names)


In [None]:
# Replace NaN entries in every column of `base` with that column's median.
# NOTE(review): the imputer is fitted on the full table (target column and
# future test rows included) before the train/test split — confirm this is intended.
column_names = base.columns

imputer = SimpleImputer(strategy='median', missing_values=np.nan)

# Fit and apply in one step, then wrap the result in a DataFrame that
# carries the original column labels.
imputed_base = imputer.fit_transform(base)
imputed_base_df = pd.DataFrame(data=imputed_base, columns=column_names)

print("Original Data:\n", base)
print("\nImputed Data:\n", imputed_base_df)

Original Data:
      A1     A2      A3  A4  A5  A6     A7  A8  A9  A10  A11  A12  A13   A14  \
0     1  22.08  11.460   2   4   4  1.585   0   0    0    1    2  100  1213   
1     0  22.67   7.000   2   8   4  0.165   0   0    0    0    2  160     1   
2     0  29.58   1.750   1   4   4  1.250   0   0    0    1    2  280     1   
3     0  21.67  11.500   1   5   3  0.000   1   1   11    1    2    0     1   
4     1  20.17   8.170   2   6   4  1.960   1   1   14    0    2   60   159   
..   ..    ...     ...  ..  ..  ..    ...  ..  ..  ...  ...  ...  ...   ...   
685   1  31.57  10.500   2  14   4  6.500   1   0    0    0    2    0     1   
686   1  20.67   0.415   2   8   4  0.125   0   0    0    0    2    0    45   
687   0  18.83   9.540   2   6   4  0.085   1   0    0    0    2  100     1   
688   0  27.42  14.500   2  14   8  3.085   1   1    1    0    2  120    12   
689   1  41.00   0.040   2  10   4  0.040   0   1    1    0    1  560     1   

     A15  
0      0  
1      0  
2 

In [None]:
# Separate the features from the target column 'A15'.
X = imputed_base_df.drop('A15', axis=1)
y = imputed_base_df['A15']

# Hold out 30% of the rows for the final test (test_size=0.3).
# random_state is fixed so that re-running the notebook reproduces the same
# partition; the value equals the notebook's SEED constant, which is only
# defined in a later cell and therefore cannot be referenced here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

# Feature scaling: the scaler is fitted on the training rows only and the
# same fitted transform is then applied to both partitions.
scaler = StandardScaler()

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Convert the targets to numpy arrays as well, so they can be indexed by
# integer position in the cross-validation loops.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [None]:
# Fixed seed used to keep the experiment reproducible.
# (The dict that stores the best hyperparameters per executed fold is
# created in the cross-validation cell, not here.)

SEED: int = 5

In [None]:
'''
Função para gerar um modelo do tipo RandomForestClassifier
'''

# Module-level placeholder kept from the original notebook; create_rf_model
# resolves the criterion locally and never reads this name.
TEMP_CRITERION = ''
def create_rf_model(hyperparameters):
  """Build an unfitted RandomForestClassifier from a GA solution vector.

  Parameters
  ----------
  hyperparameters : array-like with at least 5 numeric entries
      Positional encoding of the hyperparameters:
      [0] n_estimators, [1] max_depth, [2] min_samples_split,
      [3] min_samples_leaf, [4] criterion code (1 -> 'gini', 2 -> 'entropy').
      Values are truncated to integers before use.

  Returns
  -------
  RandomForestClassifier
      A new, unfitted classifier that uses random_state=SEED.

  Raises
  ------
  ValueError
      If the criterion code is neither 1 nor 2.
  """
  # The GA hands solutions over as floats; the model needs integers.
  solution = np.asarray(hyperparameters).astype(int)

  # pyGAD's gene_space cannot hold strings, so the criterion travels as an
  # integer code and is decoded here.
  criterion_by_code = {1: 'gini', 2: 'entropy'}
  criterion_code = int(solution[4])
  if criterion_code not in criterion_by_code:
    raise ValueError(
      f"Unknown criterion code {criterion_code}; expected one of {sorted(criterion_by_code)}"
    )

  rf = RandomForestClassifier(
    n_estimators=int(solution[0]),
    max_depth=int(solution[1]),
    min_samples_split=int(solution[2]),
    min_samples_leaf=int(solution[3]),
    criterion=criterion_by_code[criterion_code],
    random_state=SEED,
  )

  return rf

In [None]:
'''
Função fitness para otimização por AG
utilizando técnica de cluster para
acesso aos sub folds
'''

def make_fitness(X_all_values_idx, y_all_values_idx, inner_folds_list):
  """Create a fitness function bound to one set of data and inner folds.

  The returned function is a closure over the arguments given here, so the
  GA can call it with only (ga_instance, solution, solution_idx).

  Parameters
  ----------
  X_all_values_idx : indexable by the index arrays in inner_folds_list
      Feature rows available to the inner cross-validation.
  y_all_values_idx : indexable by the index arrays in inner_folds_list
      Targets aligned with X_all_values_idx.
  inner_folds_list : iterable of (train_indices, test_indices) pairs
      The inner folds to evaluate each candidate on.

  Returns
  -------
  callable
      fitness(ga_instance, solution, solution_idx) -> mean recall over the
      inner folds for the model built from `solution`.
  """
  def _recall_on_fold(solution, train_rows, test_rows):
    # Slice the fold, then train a fresh model and score it on the held-out rows.
    fold_X_train, fold_X_test = X_all_values_idx[train_rows], X_all_values_idx[test_rows]
    fold_y_train, fold_y_test = y_all_values_idx[train_rows], y_all_values_idx[test_rows]

    candidate = create_rf_model(solution)
    candidate.fit(fold_X_train, fold_y_train)

    return recall_score(fold_y_test, candidate.predict(fold_X_test))

  def fitness(ga_instance, solution, solution_idx):
    fold_recalls = [
      _recall_on_fold(solution, train_rows, test_rows)
      for train_rows, test_rows in inner_folds_list
    ]
    return np.mean(fold_recalls)

  return fitness

In [None]:
# Baseline Random Forest without hyperparameter optimisation.
rf_model = RandomForestClassifier(random_state = SEED)

# Nested cross-validation: 10 outer folds over the training partition,
# 2 inner folds inside each outer training fold (used by the GA fitness).
kf = KFold(n_splits=10, random_state = SEED, shuffle = True)
kf_inner = KFold(n_splits = 2, random_state = SEED, shuffle = True)

# Maps outer-fold recall -> best solution found for that fold. Folds that
# reach exactly the same recall share a key, so the later one overwrites the
# earlier one; the mapping is kept because the next cell selects max(key).
best_hyperparams_per_fold = {}

# Lossless record of every outer fold as (recall, solution), in fold order.
fold_results = []

for train_outer_index, test_outer_index in kf.split(X_train, y_train):

    # First split: 10 folds over the 70% available for training.
    X_outer_train, X_outer_test = X_train[train_outer_index], X_train[test_outer_index]
    y_outer_train, y_outer_test = y_train[train_outer_index], y_train[test_outer_index]

    # Second split: 2 folds over the outer training fold.
    inner_folds_list = list(kf_inner.split(X_outer_train, y_outer_train))

    fitness_fn = make_fitness(X_outer_train, y_outer_train, inner_folds_list)

    ga_instance = pygad.GA(
      num_generations=50,
      num_parents_mating=4,
      fitness_func=fitness_fn,
      sol_per_pop=10,
      num_genes=5,
      # Genes: n_estimators, max_depth, min_samples_split, min_samples_leaf,
      # criterion code (1 = 'gini', 2 = 'entropy').
      gene_space=[range(50, 201), range(10, 20), range(5, 51), range(5, 51), [1, 2]],
      parent_selection_type="tournament",
      crossover_type="uniform",
      mutation_type="random",
      # 15% of 5 genes rounds down to 0 genes, which makes pygad warn and
      # fall back to mutating one gene; request that one gene explicitly.
      mutation_num_genes=1,
      # Seed the GA so the search is reproducible like the rest of the notebook.
      random_seed=SEED
    )

    ga_instance.run()

    solution, solution_fitness, solution_idx = ga_instance.best_solution()

    # Refit the best candidate on the whole outer training fold and score it
    # on the outer held-out fold.
    rf_model_optimized = create_rf_model(solution)

    rf_model_optimized.fit(X_outer_train, y_outer_train)

    y_outer_predicts = rf_model_optimized.predict(X_outer_test)

    recall_score_outer = recall_score(y_outer_test, y_outer_predicts)

    # Store recall and solution so the best values can be recovered later.
    best_hyperparams_per_fold[recall_score_outer] = solution
    fold_results.append((recall_score_outer, solution))

If you do not want to mutate any gene, please set mutation_type=None.
If you do not want to mutate any gene, please set mutation_type=None.
If you do not want to mutate any gene, please set mutation_type=None.
If you do not want to mutate any gene, please set mutation_type=None.
If you do not want to mutate any gene, please set mutation_type=None.
If you do not want to mutate any gene, please set mutation_type=None.
If you do not want to mutate any gene, please set mutation_type=None.
If you do not want to mutate any gene, please set mutation_type=None.
If you do not want to mutate any gene, please set mutation_type=None.
If you do not want to mutate any gene, please set mutation_type=None.


{0.8571428571428571: array([141.,  12.,  47.,   7.,   2.]), 0.9090909090909091: array([155.,  14.,  45.,   5.,   1.]), 0.8: array([83., 13., 12.,  5.,  1.]), 0.7692307692307693: array([108.,  17.,  14.,   5.,   1.]), 0.8695652173913043: array([109.,  14.,  48.,   9.,   1.]), 0.85: array([143.,  17.,  49.,  12.,   1.]), 0.6666666666666666: array([74., 19., 40.,  8.,  1.]), 0.7619047619047619: array([138.,  10.,  35.,  10.,   1.]), 0.8461538461538461: array([96., 13., 10.,  5.,  2.])}


In [None]:
# Pick the entry with the highest key (outer-fold recall) and its solution.
best_key_hyperparameters, best_hyperparameters = max(
    best_hyperparams_per_fold.items(), key=lambda entry: entry[0]
)

# Final model built from the best hyperparameters found, trained on the whole
# training partition and used to predict the held-out test set.
rf_model_final = create_rf_model(best_hyperparameters)
y_predicts = rf_model_final.fit(X_train, y_train).predict(X_test)

In [None]:
# Final metrics for the GA-optimised model on the held-out test set.
# The predictions live in `y_predicts` (set when the final model is applied
# to X_test); the previous name `y_predict` was never defined.

accuracy = accuracy_score(y_test, y_predicts)
precision = precision_score(y_test, y_predicts, average = 'weighted')
recall = recall_score(y_test, y_predicts, average = 'weighted')
f1 = f1_score(y_test, y_predicts, average = 'weighted')
print(f'Acurácia : {accuracy}, Precisão : {precision}, Recall : {recall}, F1 : {f1}')

# Per-class precision / recall / F1 breakdown.
report = classification_report(y_test, y_predicts)
print(report)