# Tratamento Inicial dos Dados

Este notebook realiza o pré-processamento inicial dos dados do projeto de previsão de churn bancário. As etapas incluem:

1. Importação de bibliotecas.
2. Leitura e limpeza da base de dados.
3. Criação de variáveis derivadas.
4. Classificação de variáveis categóricas.
5. Salvamento da base processada para uso posterior.

# Etapa 1: Imports iniciais

In [None]:
# Importação de bibliotecas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Configurações globais
warnings.filterwarnings("ignore")
BASE_DIR = Path().resolve().parent.parent

# Etapa 2: Leitura da base de dados

In [None]:
# Leitura da base de dados
DATA_PATH = BASE_DIR / "data" / "BankChurners.csv"

try:
    df = pd.read_csv(DATA_PATH)
    print(f"[✔️] Base carregada com sucesso. Formato: {df.shape}")
except FileNotFoundError:
    print(f"[❌] Arquivo não encontrado: {DATA_PATH}")
    raise

Formato da base: (10127, 23)


Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


# Etapa 3: Verificação de nulos

In [None]:
# Verificação de valores nulos
nulos = df.isnull().sum()
if nulos.sum() > 0:
    print(f"[⚠️] Existem {nulos.sum()} valores nulos na base. Substituindo por 0.")
    df = df.fillna(0)
else:
    print("[✔️] Nenhum valor nulo encontrado.")

CLIENTNUM                                                                                                                             0
Attrition_Flag                                                                                                                        0
Customer_Age                                                                                                                          0
Gender                                                                                                                                0
Dependent_count                                                                                                                       0
Education_Level                                                                                                                       0
Marital_Status                                                                                                                        0
Income_Category                                 

# Etapa 4: Criação de variáveis derivadas

In [20]:
df["Ticket_Medio"] = df["Total_Trans_Amt"] / df["Total_Trans_Ct"]
df["Transacoes_por_Mes"] = df["Total_Trans_Ct"] / df["Months_on_book"]
df["Gasto_Medio_Mensal"] = df["Total_Trans_Amt"] / df["Months_on_book"]
df["Rotativo_Ratio"] = df["Total_Revolving_Bal"] / df["Credit_Limit"]
df["Disponibilidade_Relativa"] = (df["Credit_Limit"] - df["Total_Revolving_Bal"]) / df["Credit_Limit"]
df["Caiu_Transacoes"] = (df["Total_Trans_Ct"] < df["Total_Ct_Chng_Q4_Q1"] * df["Total_Trans_Ct"]).astype(int)
df["Caiu_Valor"] = (df["Total_Trans_Amt"] < df["Total_Amt_Chng_Q4_Q1"] * df["Total_Trans_Amt"]).astype(int)


# Etapa 5: Classificações de variáveis

In [21]:
def classificar_idade(x):
    if x < 30:
        return "<30"
    elif x < 50:
        return "30-49"
    elif x < 70:
        return "50-69"
    else:
        return "70+"

def classificar_renda(x):
    if x in ["$60K - $80K", "$80K - $120K", "$120K +"]:
        return "Alta"
    elif x in ["$40K - $60K", "$20K - $40K"]:
        return "Média"
    else:
        return "Baixa"

df["Faixa_Idade"] = df["Customer_Age"].apply(classificar_idade)
df["Renda_Class"] = df["Income_Category"].apply(classificar_renda)


# Etapa 6: Exibição das novas variáveis criadas

In [22]:
novas = ["Ticket_Medio", "Transacoes_por_Mes", "Gasto_Medio_Mensal", "Rotativo_Ratio", "Disponibilidade_Relativa", 
         "Caiu_Transacoes", "Caiu_Valor", "Faixa_Idade", "Renda_Class"]
df[novas].describe(include="all")

Unnamed: 0,Ticket_Medio,Transacoes_por_Mes,Gasto_Medio_Mensal,Rotativo_Ratio,Disponibilidade_Relativa,Caiu_Transacoes,Caiu_Valor,Faixa_Idade,Renda_Class
count,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127,10127
unique,,,,,,,,4,3
top,,,,,,,,30-49,Baixa
freq,,,,,,,,6402,4673
mean,62.612717,1.92312,131.011977,0.274892,0.725108,0.065962,0.098055,,
std,26.404198,0.911977,115.7223,0.275689,0.275689,0.248228,0.297403,,
min,19.137931,0.188679,10.0,0.0,0.001232,0.0,0.0,,
25%,47.514573,1.272727,62.361111,0.022714,0.497309,0.0,0.0,,
50%,55.794872,1.857143,105.8,0.17565,0.82435,0.0,0.0,,
75%,65.476673,2.361111,141.361149,0.502691,0.977286,0.0,0.0,,


# Etapa 7: Salvamento da base processada

In [None]:
# Salvamento da base processada
OUTPUT_PATH = BASE_DIR / "data" / "BankChurners_tratado.csv"

try:
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUTPUT_PATH, index=False)
    print(f"[✔️] Base salva com sucesso em: {OUTPUT_PATH}")
except Exception as e:
    print(f"[❌] Erro ao salvar a base: {e}")
    raise

✅ Dados tratados e variáveis derivadas criadas.
