In [1]:
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df= pd.read_csv("data/loan_approval_dataset.csv")

In [3]:
df.shape

(4269, 13)

In [4]:
df= df.dropna()

In [5]:
df.dtypes

loan_id                       int64
 no_of_dependents             int64
 education                   object
 self_employed               object
 income_annum                 int64
 loan_amount                  int64
 loan_term                    int64
 cibil_score                  int64
 residential_assets_value     int64
 commercial_assets_value      int64
 luxury_assets_value          int64
 bank_asset_value             int64
 loan_status                 object
dtype: object

In [6]:
df.shape
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [7]:
df = df.rename(columns=lambda x: x.strip())

In [8]:
df["self_employed"] = df["self_employed"].str.strip()
df["loan_status"] = df["loan_status"].str.strip()
df["education"] = df["education"].str.strip()

In [9]:
scale_mapper = {"Yes":1,"No":0}
df["self_employed"] = df["self_employed"].replace(scale_mapper)

In [10]:
scale_mapper = {"Approved":1,"Rejected":0}
df["loan_status"] = df["loan_status"].replace(scale_mapper)

In [11]:
scale_mapper = {"Graduate":1,"Not Graduate":0}
df["education"] = df["education"].replace(scale_mapper)

In [12]:
df.dtypes

loan_id                     int64
no_of_dependents            int64
education                   int64
self_employed               int64
income_annum                int64
loan_amount                 int64
loan_term                   int64
cibil_score                 int64
residential_assets_value    int64
commercial_assets_value     int64
luxury_assets_value         int64
bank_asset_value            int64
loan_status                 int64
dtype: object

In [13]:
df1= df.loc[df["loan_status"]==1,:]
df1.shape

(2656, 13)

In [14]:
df2= df.loc[df["loan_status"]==0,:]
df2.shape

(1613, 13)

In [15]:
#Hacemos underfitting

In [16]:
dfu= df1.sample(n=1613, random_state=32)
dfu.shape

(1613, 13)

In [17]:
df= pd.concat([df2,dfu]).reset_index(drop=True)
df.shape

(3226, 13)

In [18]:
X= df.loc[:,"loan_id":"bank_asset_value"]

In [19]:
y= df.loc[:,"loan_status"]

In [20]:
X.corr()>0.9

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
loan_id,True,False,False,False,False,False,False,False,False,False,False,False
no_of_dependents,False,True,False,False,False,False,False,False,False,False,False,False
education,False,False,True,False,False,False,False,False,False,False,False,False
self_employed,False,False,False,True,False,False,False,False,False,False,False,False
income_annum,False,False,False,False,True,True,False,False,False,False,True,False
loan_amount,False,False,False,False,True,True,False,False,False,False,False,False
loan_term,False,False,False,False,False,False,True,False,False,False,False,False
cibil_score,False,False,False,False,False,False,False,True,False,False,False,False
residential_assets_value,False,False,False,False,False,False,False,False,True,False,False,False
commercial_assets_value,False,False,False,False,False,False,False,False,False,True,False,False


In [None]:
#Eliminamos las variables que tiene una correlacion alta.

In [21]:
X.drop(columns=["loan_amount","luxury_assets_value"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=["loan_amount","luxury_assets_value"], inplace=True)


In [None]:
#Separamos nuestra data en entrenamiento y prueba

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)

In [23]:
y_test.value_counts()

0    484
1    484
Name: loan_status, dtype: int64

In [24]:
scaler = StandardScaler()
X_norm= scaler.fit_transform(X_train)

In [25]:
X_norm

array([[ 0.84189697,  1.49118848, -1.00354926, ..., -0.30630712,
         2.13523667,  0.16117819],
       [-0.1841604 ,  0.30995586,  0.9964633 , ..., -0.18283279,
         1.06340173,  0.59370492],
       [-1.18008751,  0.30995586, -1.00354926, ...,  0.41910459,
         0.35644678,  0.09938866],
       ...,
       [ 1.03245048, -1.46189307, -1.00354926, ..., -0.90824449,
        -0.60136317, -0.92013864],
       [ 1.18065877, -0.87127676, -1.00354926, ...,  1.3142935 ,
        -0.1224582 ,  1.39696885],
       [ 0.00965044, -0.87127676,  0.9964633 , ..., -0.27543854,
         0.53888676, -0.51850668]])

In [26]:
clf = RandomForestClassifier(criterion="entropy", random_state=0)

In [27]:
scores= cross_val_score(clf, X_norm, y_train, cv=5)

In [28]:
scores

array([0.9579646 , 0.95353982, 0.95353982, 0.96895787, 0.96230599])

In [29]:
scores.mean()

0.9592616211761473

In [30]:
scores.std()

0.005838589515250647

In [31]:
#Aplicar GridSearchCV para encontrar los mejores hiperparámetros del modelo.

In [32]:
param_grid = {
    'criterion': ["giny","entropy","log_loss"],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [33]:
clfg = GridSearchCV(clf, param_grid, cv=5)

In [34]:
clfg.fit(X_norm, y_train)

135 fits failed out of a total of 405.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 340, in fit
    self._validate_params()
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidPara

In [35]:
clfg.best_params_

{'criterion': 'entropy',
 'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 2}

In [36]:
best_rf_model = clfg.best_estimator_

In [37]:
X_test_norm= scaler.fit_transform(X_test)

In [38]:
best_rf_model.fit(X_norm,y_train)

In [39]:
y_predict= best_rf_model.predict(X_test_norm)

In [40]:
accuracy_score(y_test, y_predict)

0.9535123966942148

In [42]:
conf_matrix = confusion_matrix(y_test, y_predict)
print("Matriz de Confusión:")
print(conf_matrix)

Matriz de Confusión:
[[467  17]
 [ 28 456]]


Como se puede ver existe una presicion mayor al 95% en el modelo que hemos creado, en este caso hemos escogido los mejores hiperparametros para la creacion del modelo de RF, ademas, aunque la matriz de confusion presenta algunos valores no deseados ("falsos positivos y falsos negativos"), en relacion a la cantidad total de datos siguen siendo pocas equivocaciones por parte del modelo.