#### Imports

In [21]:
# General import and load data
import os

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

from sklearn.utils import resample

# Training and test splitting
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Preprocessing 
from sklearn.preprocessing import StandardScaler

# Estimators
from sklearn.svm import SVC

# Evaluation
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

# Optimization
from sklearn.model_selection import GridSearchCV

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)

#### Cargar datos

In [22]:
# Location of the processed training CSV.
# The CDAW_TRAIN_CSV environment variable overrides the path so the notebook
# can run on machines other than the original author's; when it is unset the
# original absolute path is used unchanged.
data = os.environ.get(
    "CDAW_TRAIN_CSV",
    "/home/ines/dev/CDAW_reto1/LBBYs/data/processed/df_train.csv",
)
df = pd.read_csv(data)
# Last expression of the cell: show the first rows as a quick sanity check.
df.head()

Unnamed: 0,id,LoanNr_ChkDgt,Name,City,State,Bank,BankState,ApprovalDate,ApprovalFY,NoEmp,...,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,DisbursementDate,DisbursementGross,BalanceGross,Accept
0,bd9d6267ec5,1523195006,"P-SCAPE LAND DESIGN, LLC",NORTHFIELD,OH,CITIZENS BANK NATL ASSOC,RI,2005-11-01,2006,2,...,0,2,0,1,0.0,0.0,2005-12-31,8000.0,0.0,1
1,9eebf6d8098,1326365010,The Fresh & Healthy Catering C,CANTON,OH,"FIRSTMERIT BANK, N.A.",OH,2005-06-06,2005,2,...,1,2,1,1,0.0,0.0,2005-07-31,166000.0,0.0,1
2,83806858500,6179584001,AARON MASON & HOWE LLC,SAWYERWOOD,OH,"PNC BANK, NATIONAL ASSOCIATION",OH,2003-03-18,2003,2,...,4,2,1,2,1.0,0.0,2003-03-31,25000.0,0.0,1
3,a21ab9cb3af,8463493009,MID OHIO CAR WASH,COLUMBUS,OH,THE HUNTINGTON NATIONAL BANK,OH,1995-06-28,1995,2,...,0,0,1,0,0.0,0.0,1996-01-31,220100.0,0.0,1
4,883b5e5385e,3382225007,Bake N Brew LLC,Newark,OH,THE HUNTINGTON NATIONAL BANK,OH,2009-04-16,2009,0,...,0,0,0,1,0.0,0.0,2009-05-31,25000.0,0.0,0


#### Submuestreo

In [23]:
# Split the frame by target value. Accept == 1 is treated as the majority
# class and Accept == 0 as the minority class (per the original author's
# notes; the class counts are not checked here).
df_majority = df.loc[df["Accept"] == 1]
df_minority = df.loc[df["Accept"] == 0]

# Undersample the majority class, without replacement, down to the size of
# the minority class. The seed is fixed so the selection is repeatable.
minority_size = len(df_minority)
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=minority_size,
    random_state=42,
)

# Stack both classes, shuffle the rows so the classes are interleaved, and
# renumber the index from zero.
df_balanced = (
    pd.concat([df_majority_downsampled, df_minority])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Cell output: column dtypes of the original (unbalanced) frame.
df.dtypes

id                    object
LoanNr_ChkDgt          int64
Name                  object
City                  object
State                 object
Bank                  object
BankState             object
ApprovalDate          object
ApprovalFY             int64
NoEmp                  int64
NewExist             float64
CreateJob              int64
RetainedJob            int64
FranchiseCode          int64
UrbanRural             int64
RevLineCr            float64
LowDoc               float64
DisbursementDate      object
DisbursementGross    float64
BalanceGross         float64
Accept                 int64
dtype: object

In [24]:
# Drop rows with missing values in the columns used by the analysis.
# NOTE(review): rows are removed after the undersampling step, so the two
# classes may no longer have exactly the same number of rows afterwards.
columnas_relevantes = ['City', 'NoEmp', 'UrbanRural', 'NewExist', 'Accept']
df_balanced = df_balanced.dropna(subset=columnas_relevantes)

# Remove the identifier column. errors="ignore" keeps this cell re-runnable:
# on a second execution "id" is already gone and a plain drop would raise
# KeyError.
df_balanced = df_balanced.drop(columns=["id"], errors="ignore")

df_balanced.head()

Unnamed: 0,LoanNr_ChkDgt,Name,City,State,Bank,BankState,ApprovalDate,ApprovalFY,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,DisbursementDate,DisbursementGross,BalanceGross,Accept
0,2271315002,"DAS Transportation, LLC",Greenwillow,OH,THE HUNTINGTON NATIONAL BANK,OH,2007-02-12,2007,5,1.0,0,1,0,1,0.0,0.0,2007-06-30,31000.0,0.0,0
1,2740965002,"Our Family Home, Inc.",COLUMBUS,OH,JPMORGAN CHASE BANK NATL ASSOC,IL,2007-10-03,2008,7,1.0,0,7,0,1,1.0,0.0,,61461.0,0.0,1
2,3439346006,CALLAHAN CUTTING TOOLS INC,COLUMBUS,OH,THE HUNTINGTON NATIONAL BANK,OH,2008-09-25,2008,4,0.0,0,4,1,1,0.0,0.0,2008-09-30,50000.0,0.0,1
3,1563195000,"H2O Motion Oasis, LLC",Reynoldsburg,OH,"BUSINESS LOAN CENTER, LLC",FL,2005-12-02,2006,3,1.0,0,3,0,1,0.0,0.0,2005-12-31,25000.0,0.0,0
4,1687155001,LOVE PHOTOGRAPHY AND CUSTOM FR,WESTLAKE,OH,CITIZENS BANK NATL ASSOC,RI,2006-03-01,2006,2,0.0,0,2,0,1,0.0,0.0,2006-03-31,42000.0,0.0,1


#### Hipótesis 1: Impacto de la ubicación (City, aproximada mediante UrbanRural) en la probabilidad de aprobación

In [25]:
# Drop rows with missing values in the modelling columns.
cols_relevantes = ['UrbanRural', 'NoEmp', 'NewExist', 'Accept']
df_balanced = df_balanced.dropna(subset=cols_relevantes)

# Target and predictors. 'UrbanRural' is one-hot encoded and its first level
# is dropped; the other predictors are passed through as they are.
y = df_balanced['Accept']
X = pd.get_dummies(
    df_balanced[['UrbanRural', 'NoEmp', 'NewExist']],
    columns=['UrbanRural'],
    drop_first=True,
)

# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise the features: the scaler is fitted on the training rows only
# and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyper-parameter grid explored for the SVM.
param_grid = dict(C=[10, 100], gamma=[1, 0.1, 0.01])

# 5-fold grid search over an RBF-kernel SVC with balanced class weights.
svm_rbf = SVC(kernel='rbf', random_state=42, class_weight='balanced')
grid = GridSearchCV(estimator=svm_rbf, param_grid=param_grid, cv=5, verbose=2)
grid.fit(X_train_scaled, y_train)

print("Mejores parámetros encontrados:", grid.best_params_)

# Predict on the held-out rows with the best estimator found by the search.
y_pred = grid.predict(X_test_scaled)

# Evaluation metrics on the test set; they are printed by a later cell.
cm = confusion_matrix(y_test, y_pred)
acc, prec, rec, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
report = classification_report(y_test, y_pred)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ......................................C=10, gamma=1; total time=   0.7s
[CV] END ......................................C=10, gamma=1; total time=   0.7s
[CV] END ......................................C=10, gamma=1; total time=   0.6s
[CV] END ......................................C=10, gamma=1; total time=   0.6s
[CV] END ......................................C=10, gamma=1; total time=   0.7s
[CV] END ....................................C=10, gamma=0.1; total time=   0.5s
[CV] END ....................................C=10, gamma=0.1; total time=   0.5s
[CV] END ....................................C=10, gamma=0.1; total time=   0.5s
[CV] END ....................................C=10, gamma=0.1; total time=   0.4s
[CV] END ....................................C=10, gamma=0.1; total time=   0.5s
[CV] END ...................................C=10, gamma=0.01; total time=   0.5s
[CV] END ...................................C=10,

In [26]:
# Confusion matrix first, on its own line under its heading.
print("Matriz de Confusión:", cm, sep="\n")

# Scalar metrics, one "label value" line each.
for etiqueta, valor in (
    ("Accuracy:", acc),
    ("Precision:", prec),
    ("Recall:", rec),
    ("F1-score:", f1),
):
    print(etiqueta, valor)

# Full per-class report under its heading.
print("Reporte de clasificación:", report, sep="\n")

Matriz de Confusión:
[[976 180]
 [703 439]]
Accuracy: 0.6157528285465622
Precision: 0.7092084006462036
Recall: 0.38441330998248685
F1-score: 0.498580352072686
Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.58      0.84      0.69      1156
           1       0.71      0.38      0.50      1142

    accuracy                           0.62      2298
   macro avg       0.65      0.61      0.59      2298
weighted avg       0.64      0.62      0.59      2298

