In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt


## Cleaning data

## Data dictionary

|    | Variable          | Explanation                                                                                                             |
|---:|:------------------|:------------------------------------------------------------------------------------------------------------------------|
|  0 | credit_policy     | 1 if the customer meets the credit underwriting criteria; 0 otherwise.                                                  |
|  1 | purpose           | The purpose of the loan.                                                                                                |
|  2 | int_rate          | The interest rate of the loan (more risky borrowers are assigned higher interest rates).                                |
|  3 | installment       | The monthly installments owed by the borrower if the loan is funded.                                                    |
|  4 | log_annual_inc    | The natural log of the self-reported annual income of the borrower.                                                     |
|  5 | dti               | The debt-to-income ratio of the borrower (amount of debt divided by annual income).                                     |
|  6 | fico              | The FICO credit score of the borrower.                                                                                  |
|  7 | days_with_cr_line | The number of days the borrower has had a credit line.                                                                  |
|  8 | revol_bal         | The borrower's revolving balance (amount unpaid at the end of the credit card billing cycle).                           |
|  9 | revol_util        | The borrower's revolving line utilization rate (the amount of the credit line used relative to total credit available). |
| 10 | inq_last_6mths    | The borrower's number of inquiries by creditors in the last 6 months.                                                   |
| 11 | delinq_2yrs       | The number of times the borrower had been 30+ days past due on a payment in the past 2 years.                           |
| 12 | pub_rec           | The borrower's number of derogatory public records.                                                                     |
| 13 | not_fully_paid    | 1 if the loan is not fully paid; 0 otherwise.                                                                           |

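[Source](https://www.kaggle.com/itssuru/loan-data) of dataset. Note that the CSV loaded below spells these column names with dots instead of underscores (for example, `credit.policy` rather than `credit_policy`).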

In [2]:
# Fix the state of NumPy's global random generator.
np.random.seed(seed=42)

# Read the loan records from disk and preview the first five rows.
loan_data = pd.read_csv(filepath_or_buffer='loan_data.csv')
loan_data.head(n=5)

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [3]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


In [4]:
# Number of rows in the loaded frame.
loan_data.shape[0]

9578

In [5]:
# Class balance of `credit.policy`, the column used as the prediction target below.
loan_data['credit.policy'].value_counts()

credit.policy
1    7710
0    1868
Name: count, dtype: int64

## Split the data 

In [6]:
# Target: the credit-policy flag. Features: every remaining column.
y = loan_data['credit.policy']
X = loan_data.drop(columns=['credit.policy'])

In [7]:
### OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# `purpose` is the only object-dtype (text) column, so it is the only one encoded.
categorical_feature = ['purpose']
one_hot = OneHotEncoder()

# One-hot encode the categorical column; pass every other column through unchanged.
transformer = ColumnTransformer(
    transformers=[('one_hot', one_hot, categorical_feature)],
    remainder='passthrough',
)

transformed_x = transformer.fit_transform(X)
transformed_x

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.]], shape=(9578, 19))

In [8]:
# Cross-check with pandas: one indicator column per distinct `purpose` value.
dummies = loan_data[["purpose"]].pipe(pd.get_dummies)
dummies

Unnamed: 0,purpose_all_other,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,False,False,True,False,False,False,False
1,False,True,False,False,False,False,False
2,False,False,True,False,False,False,False
3,False,False,True,False,False,False,False
4,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...
9573,True,False,False,False,False,False,False
9574,True,False,False,False,False,False,False
9575,False,False,True,False,False,False,False
9576,False,False,False,False,True,False,False


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. `random_state` is pinned on the call
# itself so that re-running this cell reproduces the same partition instead of
# depending on whatever state NumPy's global generator happens to be in.
# NOTE(review): the split is not stratified; the target is roughly 80/20, so
# consider `stratify=y` if the class ratio must match exactly in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    transformed_x,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the logistic regression.
params = dict(
    penalty=['l1', 'l2'],
    solver=['liblinear'],       # liblinear supports both the L1 and the L2 penalty
    class_weight=['balanced'],  # NOTE(review): only 'balanced' is listed, so the unweighted case is never compared
)

# 10-fold cross-validated grid search, ranked by accuracy, with n_jobs=-1
# (all available processors).
grid_search = GridSearchCV(
    LogisticRegression(),
    params,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)


In [11]:
# Run the search. `fit` returns the search object itself, so the refit winner
# can be read straight off the call.
best_model = grid_search.fit(X_train, y_train).best_estimator_

print(f"Mejores parámetros: {grid_search.best_params_}")



Mejores parámetros: {'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}


In [12]:
# Evaluate on the held-out split; the search was built with scoring='accuracy'.
grid_search.score(X_test,y_test)

0.8611691022964509

In [13]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the held-out split with the best estimator found by the grid search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Metrics report first, confusion matrix second, separated by a newline.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)


              precision    recall  f1-score   support

           0       0.60      0.83      0.70       372
           1       0.95      0.87      0.91      1544

    accuracy                           0.86      1916
   macro avg       0.78      0.85      0.80      1916
weighted avg       0.89      0.86      0.87      1916

[[ 308   64]
 [ 202 1342]]
