In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('loan_data.csv')
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


In [4]:
len(df)

9578

In [5]:
X = df.drop('credit.policy', axis=1)
y = df['credit.policy']


In [6]:
X.head()

X['purpose'].value_counts()

purpose
debt_consolidation    3957
all_other             2331
credit_card           1262
home_improvement       629
small_business         619
major_purchase         437
educational            343
Name: count, dtype: int64

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['purpose']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough')

X = transformer.fit_transform(X)
X = pd.DataFrame(X, columns=transformer.get_feature_names_out())
X.head()

Unnamed: 0,one_hot__purpose_all_other,one_hot__purpose_credit_card,one_hot__purpose_debt_consolidation,one_hot__purpose_educational,one_hot__purpose_home_improvement,one_hot__purpose_major_purchase,one_hot__purpose_small_business,remainder__int.rate,remainder__installment,remainder__log.annual.inc,remainder__dti,remainder__fico,remainder__days.with.cr.line,remainder__revol.bal,remainder__revol.util,remainder__inq.last.6mths,remainder__delinq.2yrs,remainder__pub.rec,remainder__not.fully.paid
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.1189,829.1,11.350407,19.48,737.0,5639.958333,28854.0,52.1,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.1071,228.22,11.082143,14.29,707.0,2760.0,33623.0,76.7,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.1357,366.86,10.373491,11.63,682.0,4710.0,3511.0,25.6,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.1008,162.34,11.350407,8.1,712.0,2699.958333,33667.0,73.2,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.1426,102.92,11.299732,14.97,667.0,4066.0,4740.0,39.5,0.0,1.0,0.0,0.0


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

params = {
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'class_weight': ['balanced']
}

grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
    cv=10, # 10-fold cross validation 
    scoring='accuracy',
    n_jobs=-1
)


In [10]:
grid_search.fit(X_train, y_train)

0,1,2
,estimator,LogisticRegression()
,param_grid,"{'class_weight': ['balanced'], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'liblinear'
,max_iter,100


In [11]:
print("Mejores parámetros encontrados:")
print(grid_search.best_params_)
print(f"Mejor precisión encontrada: {grid_search.best_score_:.4f}")

y_pred = grid_search.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix



Mejores parámetros encontrados:
{'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
Mejor precisión encontrada: 0.8565


In [12]:
grid_search.score(X_test, y_test)

0.8493389004871259

In [13]:
from sklearn.metrics import classification_report, confusion_matrix

# Obtener predicciones del mejor modelo
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Reporte de métricas
print(classification_report(y_test, y_pred))

# Matriz de confusión
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.84      0.69       575
           1       0.96      0.85      0.90      2299

    accuracy                           0.85      2874
   macro avg       0.77      0.85      0.80      2874
weighted avg       0.88      0.85      0.86      2874

[[ 485   90]
 [ 343 1956]]
