# Preprocessamento e criação do modelo

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import tensorflow as tf
import joblib
from fuzzywuzzy import process 
from datetime import datetime
import random as python_random
from utils import *


In [73]:

seed = 41
np.random.seed(seed)
python_random.seed(seed)
tf.random.set_seed(seed)

In [74]:
df = pd.read_csv('./data/bronze/data.csv')

In [75]:
# Conversão de tipos
df['idade'] = df['idade'].astype(int)
df['valorsolicitado'] = df['valorsolicitado'].astype(float)
df['valortotalbem'] = df['valortotalbem'].astype(float)

## Tratamento de Nulos

In [76]:
fill_na(df)

## Tratamento de erros de digitação

In [77]:
profissoes_validas = ['Advogado','Arquiteto','Cientista de Dados',
                      'Contador','Dentista','Empresário','Engenheiro',
                      'Médico','Programador']
fix_categoric_errors(df,'profissao', profissoes_validas)

## Tratamento de Outliers

In [78]:
df = fix_outliers(df, 'idade', 18, 100)
df = fix_outliers(df, 'idade', 0, 110)

## Engenharia de features

In [79]:
df['proporcaosolicitadototal'] = (df['valorsolicitado'] / df['valortotalbem']).astype(float)

## Divisão de dados para o modelo

In [80]:
X = df.drop('classe', axis=1)
y = df['classe']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

# Normalização

X_test = save_scalers(X_test, ['tempoprofissao','renda','idade','dependentes','valorsolicitado',
                               'valortotalbem','proporcaosolicitadototal'])
X_train = save_scalers(X_train, ['tempoprofissao','renda','idade','dependentes','valorsolicitado',
                               'valortotalbem','proporcaosolicitadototal'])

# Label Encoding
mapeamento = {'ruim':0, 'bom':1}
y_train = np.array([mapeamento[classe] for classe in y_train])
y_test = np.array([mapeamento[classe] for classe in y_test])

X_train = save_encoders(X_train, ['profissao','tiporesidencia','escolaridade',
                                  'score','estadocivil','produto'])
X_test = save_encoders(X_test, ['profissao','tiporesidencia','escolaridade',
                                  'score','estadocivil','produto'])

# Feature Selection
rf = RandomForestClassifier()
selector = RFE(rf, n_features_to_select=10, step=1)
selector.fit(X_train, y_train)

# Transformação dos dados

X_train = selector.transform(X_train)
X_test = selector.transform(X_test)

# Salvar seletor
joblib.dump(selector, './models/selector.joblib')


['./models/selector.joblib']

In [81]:
print(selector.support_)
print(selector.ranking_)

[ True  True  True False  True  True  True False  True False  True  True
  True]
[1 1 1 2 1 1 1 4 1 3 1 1 1]


## Matriz de confusão

**Classificações corretas:** (VP + VN) / Total

**Precisão:** VP / (VP + FP)

**Recall:** VP / (VP + FN)

In [82]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

optimazer = tf.keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer=optimazer, loss='binary_crossentropy', metrics=['accuracy'])

model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=500,
    batch_size=10,
    verbose=1
)



Epoch 1/500


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.5307 - loss: 0.6859 - val_accuracy: 0.5714 - val_loss: 0.6877
Epoch 2/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6326 - loss: 0.6601 - val_accuracy: 0.5714 - val_loss: 0.6894
Epoch 3/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6280 - loss: 0.6244 - val_accuracy: 0.5714 - val_loss: 0.6923
Epoch 4/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5838 - loss: 0.6402 - val_accuracy: 0.5714 - val_loss: 0.6902
Epoch 5/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6733 - loss: 0.5834 - val_accuracy: 0.5714 - val_loss: 0.6922
Epoch 6/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6335 - loss: 0.6102 - val_accuracy: 0.6667 - val_loss: 0.6956
Epoch 7/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x1fce3a41ee0>

In [83]:
model.save('./models/modelo_RNA.keras')

In [84]:
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step


In [85]:
print("Avaliação do modelo nos dados de teste:")
model.evaluate(X_test, y_test)
print("\nMétricas de classificação:")
print(classification_report(y_test, y_pred))

Avaliação do modelo nos dados de teste:
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6528 - loss: 3.4005 

Métricas de classificação:
              precision    recall  f1-score   support

           0       0.60      0.50      0.55        18
           1       0.70      0.78      0.74        27

    accuracy                           0.67        45
   macro avg       0.65      0.64      0.64        45
weighted avg       0.66      0.67      0.66        45

