# Preprocesado aux

Este cuaderno realiza un segundo procesado de los datos a partir del preprocesado inicial. Se parte directamente de `df_train`, sobre el que ya se realizaron modificaciones.

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the labelled training split and the unlabelled test split written by the
# initial preprocessing step.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/processed/df_train.csv',
        '../../data/processed/df_test_nolabel.csv',
    )
)

In [5]:
# Summary statistics (count, mean, std, quartiles) of the training split's numeric columns.
df_train.describe()

Unnamed: 0,LoanNr_ChkDgt,ApprovalFY,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,DisbursementGross,BalanceGross,Accept
count,22835.0,22835.0,22835.0,22821.0,22835.0,22835.0,22835.0,22835.0,22743.0,22775.0,22835.0,22835.0,22835.0
mean,4367086000.0,2001.596628,12.368995,0.299417,18.479571,20.960981,3232.818831,0.832582,0.280966,0.135851,169321.3,44.027721,0.832231
std,2557573000.0,6.198582,68.14267,0.458013,377.130205,377.219172,13631.291286,0.681753,0.449481,0.342637,249108.1,6593.120964,0.373669
min,1000554000.0,1970.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2183700000.0,1997.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40000.0,0.0,1.0
50%,3647265000.0,2003.0,5.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,87500.0,0.0,1.0
75%,6282694000.0,2006.0,12.0,1.0,2.0,4.0,1.0,1.0,1.0,0.0,189189.5,0.0,1.0
max,9995603000.0,2014.0,7538.0,1.0,8800.0,8800.0,92006.0,2.0,1.0,1.0,4831510.0,996262.0,1.0


In [6]:
import pandas as pd
import numpy as np

### ─────────────────────────────
### 1. Working copies without the 'State' column (no variability)
### ─────────────────────────────
# Copy first so the loaded df_train / df_test are never modified by later steps,
# then drop 'State' on the private copy.
train = df_train.copy().drop(columns=['State'])
test = df_test.copy().drop(columns=['State'])

### ─────────────────────────────
### 2. Nulls in the binary columns become their own special category (2.0)
### ─────────────────────────────
cols_to_impute = ['NewExist', 'RevLineCr', 'LowDoc']
for frame in (train, test):
    for flag_col in cols_to_impute:
        # 2.0 is outside the observed 0/1 range, so "missing" stays distinguishable.
        frame[flag_col] = frame[flag_col].fillna(2.0)

### ─────────────────────────────
### 3. Parse date columns to datetime + impute 'DisbursementDate'
### ─────────────────────────────
date_cols = ['ApprovalDate', 'DisbursementDate']
for frame in (train, test):
    for date_col in date_cols:
        # errors='coerce' turns unparseable values into NaT instead of raising.
        frame[date_col] = pd.to_datetime(frame[date_col], errors='coerce')
    # A missing disbursement date falls back to the approval date of the same loan.
    frame['DisbursementDate'] = frame['DisbursementDate'].fillna(frame['ApprovalDate'])

### ─────────────────────────────
### 4. Label-encode 'BankState'
### ─────────────────────────────
# Build the code table from train and test together so every state seen in either
# split gets an integer; codes follow order of first appearance.
all_states = pd.concat([train['BankState'], test['BankState']]).unique()
state_mapping = dict(zip(all_states, range(len(all_states))))

for frame in (train, test):
    frame['BankState_enc'] = frame['BankState'].map(state_mapping)

### ─────────────────────────────
### 5. Binning personalizado de variables sesgadas
### ─────────────────────────────

# Definir función genérica
def bin_variable(df_train, df_test, column_name, bins, labels):
    """Discretise one numeric column of both frames with the same bin edges.

    A new column named ``f"{column_name}_bin"`` is added in place to ``df_train``
    and ``df_test``. Intervals are right-closed and the first interval also
    includes its left edge (``include_lowest=True``); values outside the edges
    become NaN.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames to modify in place; both must contain ``column_name``.
    column_name : str
        Name of the numeric column to bin.
    bins : sequence of scalars
        Bin edges passed to ``pd.cut``.
    labels : sequence
        One label per interval (``len(bins) - 1`` entries).

    Returns
    -------
    str
        Name of the column that was created.
    """
    target_col = f"{column_name}_bin"
    for frame in (df_train, df_test):
        frame[target_col] = pd.cut(
            frame[column_name],
            bins=bins,
            labels=labels,
            include_lowest=True,
        )
    return target_col

# Bin edges for each skewed variable.
# DisbursementGross: the 40000 / 87500 / 189189 edges match the quartiles shown by
# df_train.describe().
bins_disb = [0, 5000, 40000, 87500, 189189, 628623, 1280660, np.inf]
# BalanceGross: at least 75% of the training rows are 0, so the first bin
# [0, 0.01] isolates the zero balances.
bins_bal = [0, 0.01, 10000, 50000, 150000, np.inf]
# Count columns start at -1 so that 0 falls in its own bin (-1, 0].
bins_emp = [-1, 0, 2, 5, 10, 20, 50, 100, np.inf]
bins_create = [-1, 0, 1, 3, 5, 10, np.inf]
bins_retained = [-1, 0, 1, 3, 5, 10, 20, 50, np.inf]

# One integer label per interval: n edges define n - 1 bins.
labels_disb = [*range(len(bins_disb) - 1)]
labels_bal = [*range(len(bins_bal) - 1)]
labels_emp = [*range(len(bins_emp) - 1)]
labels_create = [*range(len(bins_create) - 1)]
labels_retained = [*range(len(bins_retained) - 1)]

# Add the '<column>_bin' columns to both splits, in this order.
bin_variable(train, test, 'DisbursementGross', bins_disb, labels_disb)
bin_variable(train, test, 'BalanceGross', bins_bal, labels_bal)
bin_variable(train, test, 'NoEmp', bins_emp, labels_emp)
bin_variable(train, test, 'CreateJob', bins_create, labels_create)
bin_variable(train, test, 'RetainedJob', bins_retained, labels_retained)


'RetainedJob_bin'

In [8]:
# Column dtypes and non-null counts of the working training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22835 entries, 0 to 22834
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     22835 non-null  object        
 1   LoanNr_ChkDgt          22835 non-null  int64         
 2   Name                   22835 non-null  object        
 3   City                   22835 non-null  object        
 4   Bank                   22835 non-null  object        
 5   BankState              22835 non-null  object        
 6   ApprovalDate           22835 non-null  datetime64[ns]
 7   ApprovalFY             22835 non-null  int64         
 8   NoEmp                  22835 non-null  int64         
 9   NewExist               22835 non-null  float64       
 10  CreateJob              22835 non-null  int64         
 11  RetainedJob            22835 non-null  int64         
 12  FranchiseCode          22835 non-null  int64         
 13  U

## Feature Engineering

In [None]:
### ─────────────────────────────
###  1. days_to_disbursement: days between ApprovalDate and DisbursementDate.
###     Hypothesis: long delays may indicate more complex paperwork or doubts on the bank's side.
### ─────────────────────────────
for frame in (train, test):
    disbursement_lag = frame['DisbursementDate'] - frame['ApprovalDate']
    frame['days_to_disbursement'] = disbursement_lag.dt.days

### ─────────────────────────────
### 2. job_ratio = CreateJob / (NoEmp + 1): share of newly created jobs relative to
###    the company's initial size. The +1 avoids division by zero.
### ─────────────────────────────
for frame in (train, test):
    headcount_plus_one = frame['NoEmp'] + 1
    frame['job_ratio'] = frame['CreateJob'] / headcount_plus_one

### ─────────────────────────────
### 3. retention_ratio = RetainedJob / (NoEmp + 1): how much of the staff is kept.
###    High retention could indicate stability or planning.
### ─────────────────────────────
for frame in (train, test):
    headcount_plus_one = frame['NoEmp'] + 1
    frame['retention_ratio'] = frame['RetainedJob'] / headcount_plus_one

### ─────────────────────────────
### 4. funding_ratio = DisbursementGross / (BalanceGross + 1).
###    Intended as a signal of full vs. partial disbursement; a low value could
###    indicate risk or a change of decision.
###    NOTE(review): when BalanceGross is 0 this is simply DisbursementGross —
###    confirm the ratio captures what is intended.
### ─────────────────────────────
for frame in (train, test):
    balance_plus_one = frame['BalanceGross'] + 1
    frame['funding_ratio'] = frame['DisbursementGross'] / balance_plus_one


### ─────────────────────────────
### 5. is_franchise: 1 for franchised businesses, 0 for independent ones.
### ─────────────────────────────
# Per the SBA loan data dictionary, FranchiseCode 0 *and* 1 both mean "no franchise"
# (NOTE(review): confirm this file follows that convention). The training data
# agrees: 1 is the median and 75th-percentile code in df_train.describe(), so a
# plain `!= 0` test would mark every code-1 loan as a franchise.
# Missing codes still map to 1, exactly as they did with the `!= 0` comparison.
train['is_franchise'] = (~train['FranchiseCode'].isin([0, 1])).astype(int)
test['is_franchise'] = (~test['FranchiseCode'].isin([0, 1])).astype(int)


### ─────────────────────────────
### 6. approval_year, approval_month: calendar parts of ApprovalDate, meant to
###    capture temporal patterns (recessions, policy changes, etc.)
### ─────────────────────────────
for frame in (train, test):
    approval_parts = frame['ApprovalDate'].dt
    frame['approval_year'] = approval_parts.year
    frame['approval_month'] = approval_parts.month



In [10]:
# Persist the processed splits in the current working directory.
# index=False keeps the default RangeIndex out of the files; otherwise it is written
# as an unnamed first column and comes back as 'Unnamed: 0' on pd.read_csv, unlike
# the input CSVs, which carry no index column.
# NOTE(review): any downstream reader that passes index_col=0 for these files must
# drop that argument.
train.to_csv('df_train_v2.3.csv', index=False)
test.to_csv('df_test_v2.3.csv', index=False)
