# Preprocessamento e criação do modelo

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import lime
import lime.lime_tabular
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import tensorflow as tf
import joblib
from fuzzywuzzy import process 
from datetime import datetime
import random as python_random
from utils import *


In [31]:

seed = 41
np.random.seed(seed)
python_random.seed(seed)
tf.random.set_seed(seed)

In [32]:
df = pd.read_csv('./data/bronze/data.csv')

In [33]:
# Conversão de tipos
df['idade'] = df['idade'].astype(int)
df['valorsolicitado'] = df['valorsolicitado'].astype(float)
df['valortotalbem'] = df['valortotalbem'].astype(float)

## Tratamento de Nulos

In [34]:
fill_na(df)

## Tratamento de erros de digitação

In [35]:
profissoes_validas = ['Advogado','Arquiteto','Cientista de Dados',
                      'Contador','Dentista','Empresário','Engenheiro',
                      'Médico','Programador']
fix_categoric_errors(df,'profissao', profissoes_validas)

## Tratamento de Outliers

In [36]:
df = fix_outliers(df, 'idade', 18, 100)
df = fix_outliers(df, 'idade', 0, 110)

## Engenharia de features

In [37]:
df['proporcaosolicitadototal'] = (df['valorsolicitado'] / df['valortotalbem']).astype(float)

## Divisão de dados para o modelo

In [38]:
X = df.drop('classe', axis=1)
y = df['classe']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

# Normalização

X_test = save_scalers(X_test, ['tempoprofissao','renda','idade','dependentes','valorsolicitado',
                               'valortotalbem','proporcaosolicitadototal'])
X_train = save_scalers(X_train, ['tempoprofissao','renda','idade','dependentes','valorsolicitado',
                               'valortotalbem','proporcaosolicitadototal'])

# Label Encoding
mapeamento = {'ruim':0, 'bom':1}
y_train = np.array([mapeamento[classe] for classe in y_train])
y_test = np.array([mapeamento[classe] for classe in y_test])

X_train = save_encoders(X_train, ['profissao','tiporesidencia','escolaridade',
                                  'score','estadocivil','produto'])
X_test = save_encoders(X_test, ['profissao','tiporesidencia','escolaridade',
                                  'score','estadocivil','produto'])

# # Feature Selection
# rf = RandomForestClassifier()
# selector = RFE(rf, n_features_to_select=10, step=1)
# selector = selector.fit(X_train, y_train)

# # Transformação dos dados

# X_train = selector.transform(X_train)
# X_test = selector.transform(X_test)

# # Salvar seletor
# joblib.dump(selector, './models/selector.joblib')


In [39]:
# print(selector.support_)
# print(selector.ranking_)

## Matriz de confusão

**Classificações corretas:** (VP + VN) / Total

**Precisão:** VP / (VP + FP)

**Recall:** VP / (VP + FN)

In [40]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

optimazer = tf.keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer=optimazer, loss='binary_crossentropy', metrics=['accuracy'])

model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=500,
    batch_size=10,
    verbose=1
)



Epoch 1/500


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.4501 - loss: 0.7198 - val_accuracy: 0.5238 - val_loss: 0.7241
Epoch 2/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5231 - loss: 0.6818 - val_accuracy: 0.5714 - val_loss: 0.7253
Epoch 3/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6193 - loss: 0.6479 - val_accuracy: 0.5714 - val_loss: 0.7284
Epoch 4/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6144 - loss: 0.6581 - val_accuracy: 0.5714 - val_loss: 0.7279
Epoch 5/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5518 - loss: 0.6200 - val_accuracy: 0.5714 - val_loss: 0.7236
Epoch 6/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6404 - loss: 0.6023 - val_accuracy: 0.6667 - val_loss: 0.7219
Epoch 7/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x1edca807a40>

In [41]:
model.save('./models/modelo_RNA.keras')

In [42]:
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step


In [43]:
print("Avaliação do modelo nos dados de teste:")
model.evaluate(X_test, y_test)
print("\nMétricas de classificação:")
print(classification_report(y_test, y_pred))

Avaliação do modelo nos dados de teste:
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.6884 - loss: 3.2128

Métricas de classificação:
              precision    recall  f1-score   support

           0       0.64      0.50      0.56        18
           1       0.71      0.81      0.76        27

    accuracy                           0.69        45
   macro avg       0.68      0.66      0.66        45
weighted avg       0.68      0.69      0.68        45



## Função de explicabilidade do modelo

In [44]:
def model_predict(data_asarray):
    data_asframe = pd.DataFrame(data_asarray, columns=X_train.columns)
    data_asframe = save_scalers(data_asframe, ['tempoprofissao','renda','idade','dependentes','valorsolicitado',
                                                'valortotalbem','proporcaosolicitadototal'])
    data_asframe = save_encoders(data_asframe, ['profissao','tiporesidencia','escolaridade',
                                                'score','estadocivil','produto'])
    predictions = model.predict(data_asframe)
    
    return np.hstack((1-predictions, predictions))

explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values,
                                                   mode='classification',
                                                   feature_names=X_train.columns,
                                                   class_names=['ruim','bom'],)

exp = explainer.explain_instance(X_test.values[1], model_predict, num_features=10)
# Gera um html

exp.save_to_file('lime_explanation.html')

print('\n Imprimindo os recursos e seus pesos para classe Bom')
if 1 in exp.local_exp:
    for feature, weight in exp.local_exp[1]:
        print(f'{feature}: {weight}')
        
print('\n Acessar os valores das features e seus pesos para classe Bom')
features_importances = exp.as_list(label=1)
for feature, weight in features_importances:
    print(f'{feature}: {weight}')

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 834us/step

 Imprimindo os recursos e seus pesos para classe Bom
3: 0.10351140212123905
8: 0.044794680865427275
5: 0.04412222208696344
9: 0.0413317196569411
2: -0.0158944813021923
4: -0.014555652629845003
0: 0.011031123204992124
10: 0.005306995308915805
7: -0.0043042190896196335
12: 0.003788184235174128

 Acessar os valores das features e seus pesos para classe Bom
tiporesidencia <= 0.00: 0.10351140212123905
estadocivil > 2.00: 0.044794680865427275
1.00 < score <= 2.00: 0.04412222208696344
3.00 < produto <= 5.00: 0.0413317196569411
-0.91 < renda <= -0.00: -0.0158944813021923
escolaridade > 2.00: -0.014555652629845003
2.00 < profissao <= 5.00: 0.011031123204992124
valorsolicitado > 0.23: 0.005306995308915805
0.04 < dependentes <= 1.05: -0.0043042190896196335
proporcaosolicitadototal > -0.15: 0.003788184235174128
