#### Imports

In [38]:
# General import and load data
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

from sklearn.utils import resample

# Training and test spliting
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Preprocessing 
from sklearn.preprocessing import StandardScaler

# Estimators
from sklearn.svm import SVC

# Evaluation
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

# Optimization
from sklearn.model_selection import GridSearchCV

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)

#### Cargar datos

In [39]:
data = "/home/ines/dev/CDAW_reto1/LBBYs/data/processed/df_train.csv"
df = pd.read_csv(data)
df.head()

Unnamed: 0,id,LoanNr_ChkDgt,Name,City,State,Bank,BankState,ApprovalDate,ApprovalFY,NoEmp,...,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,DisbursementDate,DisbursementGross,BalanceGross,Accept
0,bd9d6267ec5,1523195006,"P-SCAPE LAND DESIGN, LLC",NORTHFIELD,OH,CITIZENS BANK NATL ASSOC,RI,2005-11-01,2006,2,...,0,2,0,1,0.0,0.0,2005-12-31,8000.0,0.0,1
1,9eebf6d8098,1326365010,The Fresh & Healthy Catering C,CANTON,OH,"FIRSTMERIT BANK, N.A.",OH,2005-06-06,2005,2,...,1,2,1,1,0.0,0.0,2005-07-31,166000.0,0.0,1
2,83806858500,6179584001,AARON MASON & HOWE LLC,SAWYERWOOD,OH,"PNC BANK, NATIONAL ASSOCIATION",OH,2003-03-18,2003,2,...,4,2,1,2,1.0,0.0,2003-03-31,25000.0,0.0,1
3,a21ab9cb3af,8463493009,MID OHIO CAR WASH,COLUMBUS,OH,THE HUNTINGTON NATIONAL BANK,OH,1995-06-28,1995,2,...,0,0,1,0,0.0,0.0,1996-01-31,220100.0,0.0,1
4,883b5e5385e,3382225007,Bake N Brew LLC,Newark,OH,THE HUNTINGTON NATIONAL BANK,OH,2009-04-16,2009,0,...,0,0,0,1,0.0,0.0,2009-05-31,25000.0,0.0,0


#### Submuestreo

In [40]:
# Separar clases mayoritaria y minoritaria
df_majority = df[df["Accept"] == 1]  # Créditos aceptados (clase mayoritaria)
df_minority = df[df["Accept"] == 0]  # Créditos no aceptados (clase minoritaria)

# Aplicar Submuestreo (undersampling)
df_majority_downsampled = resample(df_majority, 
                                   replace=False,  # Sin reemplazo
                                   n_samples=len(df_minority),  # Igualar cantidad de muestras
                                   random_state=42)  # Fijar semilla

# Unir el dataset balanceado
df_balanced = pd.concat([df_majority_downsampled, df_minority])

# Mezclar los datos para evitar sesgos
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
df.dtypes

id                    object
LoanNr_ChkDgt          int64
Name                  object
City                  object
State                 object
Bank                  object
BankState             object
ApprovalDate          object
ApprovalFY             int64
NoEmp                  int64
NewExist             float64
CreateJob              int64
RetainedJob            int64
FranchiseCode          int64
UrbanRural             int64
RevLineCr            float64
LowDoc               float64
DisbursementDate      object
DisbursementGross    float64
BalanceGross         float64
Accept                 int64
dtype: object

In [41]:
# Eliminar filas con NaN en las columnas relevantes
df_balanced= df_balanced.dropna()
cols_to_drop = ['id', 'LoanNr_ChkDgt', 'Name', 'ApprovalDate', 'DisbursementDate', 'State']
df_balanced = df_balanced.drop(columns=cols_to_drop)

df_balanced.head()

Unnamed: 0,City,Bank,BankState,ApprovalFY,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,DisbursementGross,BalanceGross,Accept
0,Greenwillow,THE HUNTINGTON NATIONAL BANK,OH,2007,5,1.0,0,1,0,1,0.0,0.0,31000.0,0.0,0
2,COLUMBUS,THE HUNTINGTON NATIONAL BANK,OH,2008,4,0.0,0,4,1,1,0.0,0.0,50000.0,0.0,1
3,Reynoldsburg,"BUSINESS LOAN CENTER, LLC",FL,2006,3,1.0,0,3,0,1,0.0,0.0,25000.0,0.0,0
4,WESTLAKE,CITIZENS BANK NATL ASSOC,RI,2006,2,0.0,0,2,0,1,0.0,0.0,42000.0,0.0,1
5,SHAWNEE HILLS,THE HUNTINGTON NATIONAL BANK,OH,1999,5,1.0,5,0,1,1,0.0,0.0,150000.0,0.0,0


In [54]:
var = df_balanced["UrbanRural"].value_counts()
var

UrbanRural
1    4193
0    1974
2    1423
Name: count, dtype: int64

#### Hipótesis 1: Impacto de la ubicación (City) en la probabilidad de aprobación

In [6]:
# Primero, eliminar filas con valores faltantes en las columnas relevantes
cols_relevantes = ['UrbanRural', 'NoEmp', 'NewExist', 'Accept']
df_balanced = df_balanced.dropna(subset=cols_relevantes)

# Seleccionar variables predictoras y variable objetivo
X = df_balanced[['UrbanRural', 'NoEmp', 'NewExist']]
y = df_balanced['Accept']

# Convertir la variable categórica 'UrbanRural' a variables dummy
X = pd.get_dummies(X, columns=['UrbanRural'], drop_first=True)

# División en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Escalado de las características
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Definir el grid de hiperparámetros para el SVM
param_grid = {
    'C': [10, 100],
    'gamma': [1, 0.1, 0.01]
}

# Configurar GridSearchCV con SVC (kernel 'rbf')
grid = GridSearchCV(SVC(kernel='rbf', random_state=42, class_weight='balanced'),
                    param_grid, cv=5, verbose=2)
grid.fit(X_train_scaled, y_train)

print("Mejores parámetros encontrados:", grid.best_params_)

# Usar el mejor modelo para predecir en el conjunto de prueba
y_pred = grid.predict(X_test_scaled)

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ......................................C=10, gamma=1; total time=   0.7s
[CV] END ......................................C=10, gamma=1; total time=   0.7s
[CV] END ......................................C=10, gamma=1; total time=   0.7s
[CV] END ......................................C=10, gamma=1; total time=   0.7s
[CV] END ......................................C=10, gamma=1; total time=   0.7s
[CV] END ....................................C=10, gamma=0.1; total time=   0.6s
[CV] END ....................................C=10, gamma=0.1; total time=   0.5s
[CV] END ....................................C=10, gamma=0.1; total time=   0.5s
[CV] END ....................................C=10, gamma=0.1; total time=   0.5s
[CV] END ....................................C=10, gamma=0.1; total time=   0.5s
[CV] END ...................................C=10, gamma=0.01; total time=   0.5s
[CV] END ...................................C=10,

In [7]:
print("Matriz de Confusión:")
print(cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)
print("Reporte de clasificación:")
print(report)

Matriz de Confusión:
[[976 180]
 [703 439]]
Accuracy: 0.6157528285465622
Precision: 0.7092084006462036
Recall: 0.38441330998248685
F1-score: 0.498580352072686
Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.58      0.84      0.69      1156
           1       0.71      0.38      0.50      1142

    accuracy                           0.62      2298
   macro avg       0.65      0.61      0.59      2298
weighted avg       0.64      0.62      0.59      2298



####  Hipótesis 2: "Impacto de la ubicación geográfica (Urban vs Rural) en la aprobación

In [8]:
df_balanced['UrbanRural'] = df_balanced['UrbanRural'].astype(str).str.lower()

# Asegurarse de que la columna 'UrbanRural' esté en minúsculas
df_balanced['UrbanRural'] = df_balanced['UrbanRural'].str.lower()

# Eliminar filas con valores faltantes en las columnas relevantes
cols_relevantes = ['UrbanRural', 'Accept']
df_balanced = df_balanced.dropna(subset=cols_relevantes)

# Seleccionar la variable predictora y la variable objetivo
X2 = df_balanced[['UrbanRural']]
y2 = df_balanced['Accept']

# Convertir la variable categórica 'UrbanRural' a variables dummy
# Por ejemplo, si los valores son "urban" y "rural", se creará una columna "UrbanRural_urban"
X2 = pd.get_dummies(X2, columns=['UrbanRural'], drop_first=True)

# División en conjuntos de entrenamiento y prueba
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)

# Escalado de las características
scaler2 = StandardScaler()
X2_train_scaled = scaler2.fit_transform(X2_train)
X2_test_scaled = scaler2.transform(X2_test)

# Definir el grid de hiperparámetros para el SVM
param_grid2 = {
    'C': [1, 10, 100],
    'gamma': [1, 0.1, 0.01]
}

# Configurar GridSearchCV con SVC (kernel 'rbf') y balanceo de clases
grid2 = GridSearchCV(SVC(kernel='rbf', random_state=42, class_weight='balanced'),
                     param_grid2, cv=5, verbose=2)
grid2.fit(X2_train_scaled, y2_train)

print("Mejores parámetros encontrados para Hipótesis 2:", grid2.best_params_)

# Usar el mejor modelo para predecir en el conjunto de prueba
y2_pred = grid2.predict(X2_test_scaled)

# Evaluación del modelo
cm2 = confusion_matrix(y2_test, y2_pred)
acc2 = accuracy_score(y2_test, y2_pred)
prec2 = precision_score(y2_test, y2_pred)
rec2 = recall_score(y2_test, y2_pred)
f12 = f1_score(y2_test, y2_pred)
report2 = classification_report(y2_test, y2_pred)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[CV] END .......................................C=1, gamma=1; total time=   0.4s
[CV] END .......................................C=1, gamma=1; total time=   0.4s
[CV] END .......................................C=1, gamma=1; total time=   0.4s
[CV] END .......................................C=1, gamma=1; total time=   0.4s
[CV] END .......................................C=1, gamma=1; total time=   0.4s
[CV] END .....................................C=1, gamma=0.1; total time=   0.4s
[CV] END .....................................C=1, gamma=0.1; total time=   0.4s
[CV] END .....................................C=1, gamma=0.1; total time=   0.4s
[CV] END .....................................C=1, gamma=0.1; total time=   0.4s
[CV] END .....................................C=1, gamma=0.1; total time=   0.4s
[CV] END ....................................C=1, gamma=0.01; total time=   0.4s
[CV] END ....................................C=1, gamma=0.01; total time=   0.4s
[CV] END ...................

In [9]:
print("Matriz de Confusión (Hipótesis 2):")
print(cm2)
print("Accuracy:", acc2)
print("Precision:", prec2)
print("Recall:", rec2)
print("F1-score:", f12)
print("Reporte de clasificación:")
print(report2)

Matriz de Confusión (Hipótesis 2):
[[979 177]
 [708 434]]
Accuracy: 0.6148825065274152
Precision: 0.7103109656301145
Recall: 0.38003502626970226
F1-score: 0.49515116942384485
Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.58      0.85      0.69      1156
           1       0.71      0.38      0.50      1142

    accuracy                           0.61      2298
   macro avg       0.65      0.61      0.59      2298
weighted avg       0.64      0.61      0.59      2298



In [None]:
# Eliminar columnas irrelevantes
cols_to_drop = ['LoanNr_ChkDgt', 'Name', 'ApprovalDate', 'DisbursementDate', 'State']
df_balanced = df_balanced.drop(columns=cols_to_drop)

# Eliminar filas con valores faltantes en todas las columnas restantes
df_balanced = df_balanced.dropna()

# Separar la variable objetivo y los predictores
y = df_balanced['Accept']
X = df_balanced.drop(columns=['Accept'])

# Convertir todas las columnas categóricas a variables dummy
X = pd.get_dummies(X, drop_first=True)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Escalar las características
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Definir el grid de hiperparámetros para el SVM
param_grid = {
    'C': [10, 100],
    'gamma': [1, 0.1, 0.01]
}

# Configurar GridSearchCV con SVC (kernel 'rbf')
grid = GridSearchCV(SVC(kernel='rbf', random_state=42, class_weight='balanced'),
                    param_grid, cv=5, verbose=2)
grid.fit(X_train_scaled, y_train)

print("Mejores parámetros encontrados:", grid.best_params_)

# Usar el mejor modelo para predecir en el conjunto de prueba
y_pred = grid.predict(X_test_scaled)


In [16]:

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Matriz de Confusión:")
print(cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)
print("Reporte de clasificación:")
print(report)

Matriz de Confusión:
[[723 377]
 [471 712]]
Accuracy: 0.6285589137100307
Precision: 0.6538108356290174
Recall: 0.6018596787827557
F1-score: 0.6267605633802817
Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.61      0.66      0.63      1100
           1       0.65      0.60      0.63      1183

    accuracy                           0.63      2283
   macro avg       0.63      0.63      0.63      2283
weighted avg       0.63      0.63      0.63      2283



In [17]:
# Definir el grid de hiperparámetros para el SVM
param_grid = {
    'C': [1, 10, 100],
    'gamma': [1, 0.1, 0.01]
}

# Configurar GridSearchCV con SVC (kernel 'rbf')
grid = GridSearchCV(SVC(kernel='rbf', random_state=42, class_weight='balanced'),
                    param_grid, cv=5, verbose=2)
grid.fit(X_train_scaled, y_train)

print("Mejores parámetros encontrados:", grid.best_params_)

# Usar el mejor modelo para predecir en el conjunto de prueba
y_pred = grid.predict(X_test_scaled)


Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END .......................................C=1, gamma=1; total time=  16.0s
[CV] END .......................................C=1, gamma=1; total time=  14.8s
[CV] END .......................................C=1, gamma=1; total time=  14.7s
[CV] END .......................................C=1, gamma=1; total time=  15.3s
[CV] END .......................................C=1, gamma=1; total time=  15.6s
[CV] END .....................................C=1, gamma=0.1; total time=  14.8s
[CV] END .....................................C=1, gamma=0.1; total time=  15.3s
[CV] END .....................................C=1, gamma=0.1; total time=  15.1s
[CV] END .....................................C=1, gamma=0.1; total time=  14.9s
[CV] END .....................................C=1, gamma=0.1; total time=  14.8s
[CV] END ....................................C=1, gamma=0.01; total time=  14.1s
[CV] END ....................................C=1,

In [18]:
# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Matriz de Confusión:")
print(cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)
print("Reporte de clasificación:")
print(report)

Matriz de Confusión:
[[723 377]
 [471 712]]
Accuracy: 0.6285589137100307
Precision: 0.6538108356290174
Recall: 0.6018596787827557
F1-score: 0.6267605633802817
Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.61      0.66      0.63      1100
           1       0.65      0.60      0.63      1183

    accuracy                           0.63      2283
   macro avg       0.63      0.63      0.63      2283
weighted avg       0.63      0.63      0.63      2283



In [20]:
import pandas as pd
import datetime

# Cargar el dataset de test
df_test = pd.read_csv("/home/ines/dev/CDAW_reto1/LBBYs/data/raw/test_nolabel.csv")

# --- Preprocesamiento ---
# Debes replicar exactamente los pasos que aplicaste al entrenamiento.
# 1. Eliminar las columnas irrelevantes (como hiciste en entrenamiento)
cols_to_drop = ['id', 'LoanNr_ChkDgt', 'Name', 'ApprovalDate', 'DisbursementDate', 'State']
df_test_clean = df_test.drop(columns=cols_to_drop)

# 2. Aplicar one-hot encoding a las columnas categóricas que usaste en entrenamiento:
#    En tu entrenamiento usaste: ['Bank', 'City', 'BankState'] con drop_first=True.
df_test_clean = pd.get_dummies(df_test_clean, columns=['Bank', 'City', 'BankState'], drop_first=True)

# 3. Imputar valores nulos (igual que en entrenamiento)
df_test_clean.fillna(0, inplace=True)

# --- Alinear las features ---
# Durante el entrenamiento, tu DataFrame final balanceado 'df_clean' tenía un cierto conjunto de columnas (features).
# Asumiendo que entrenaste el modelo con:
#   X_train = df_clean.drop('Accept', axis=1)
# Guarda la lista de features de entrenamiento:
features = list(df_balanced.drop('Accept', axis=1).columns)
print("Número de features en entrenamiento:", len(features))

# Reindexa el DataFrame de test para que tenga exactamente las mismas columnas, llenando con 0 las que no estén.
df_test_clean = df_test_clean.reindex(columns=features, fill_value=0)
print("Número de features en test:", df_test_clean.shape[1])

# --- Predicción ---
# Extrae el array de features del test
X_test = df_test_clean.values

# Realiza las predicciones usando tu modelo entrenado (en este caso, 'grid')
df_test['Accept'] = grid.predict(X_test)

# Asegúrate de que la columna 'Accept' sea de tipo entero
df_test['Accept'] = df_test['Accept'].astype(int)

# --- Exportar Submission ---
# Crea el archivo CSV con las columnas requeridas: 'id' y 'Accept'
filename = f"svm_update_{datetime.datetime.now().strftime('%Y%m%d_%H_%M_%S')}.csv"
df_test.to_csv(filename, columns=['id', 'Accept'], index=False)

print("Archivo de submission 'my-model.csv' generado correctamente.")

Número de features en entrenamiento: 14
Número de features en test: 14


ValueError: could not convert string to float: 'Y'