In [62]:
import pandas as pd
import numpy as np

In [63]:
loan_data = pd.read_csv("loan_data.csv")

In [64]:
loan_data.head()

Unnamed: 0.1,Unnamed: 0,Initial payment,Last payment,Credit Score,House Number,Result
0,0,201,10018,250,3046,yes
1,1,205,10016,395,3044,yes
2,2,257,10129,109,3251,yes
3,3,246,10064,324,3137,yes
4,4,117,10115,496,3094,yes


In [65]:
loan_data.shape

(1000, 6)

# From the loan dataset we are removing the 'Unnamed' and 'House Number' because they we will not considered as feature.

In [66]:
loan_data.drop(columns=["Unnamed: 0", "House Number"], inplace=True)

In [67]:
loan_data.head()

Unnamed: 0,Initial payment,Last payment,Credit Score,Result
0,201,10018,250,yes
1,205,10016,395,yes
2,257,10129,109,yes
3,246,10064,324,yes
4,117,10115,496,yes


In [68]:
loan_data["Result"].value_counts()

No     521
yes    479
Name: Result, dtype: int64

# We saw that there is high randomness in the dataset. The values of yes and no is nearly same.

In [69]:
# Checking NaN values

loan_data.isna().sum()

Initial payment    0
Last payment       0
Credit Score       0
Result             0
dtype: int64

In [70]:
# Separating features and target 

X = loan_data[["Initial payment", "Last payment", "Credit Score"]].values
Y = loan_data[["Result"]].values

In [71]:
X

array([[  201, 10018,   250],
       [  205, 10016,   395],
       [  257, 10129,   109],
       ...,
       [  316, 14872,   613],
       [  305, 14926,   897],
       [  168, 14798,   834]], dtype=int64)

In [72]:
Y[:10]

array([['yes'],
       ['yes'],
       ['yes'],
       ['yes'],
       ['yes'],
       ['yes'],
       ['yes'],
       ['yes'],
       ['yes'],
       ['yes']], dtype=object)

In [73]:
from sklearn.model_selection import train_test_split

In [74]:
# Training and Testing

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [75]:
from sklearn.tree import DecisionTreeClassifier

In [76]:
model = DecisionTreeClassifier()

In [77]:
model.fit(X_train, Y_train)

In [78]:
# Generate cost complexity pruning path to identify ccp_alphas
path = model.cost_complexity_pruning_path(X_train, Y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

In [79]:
# Train a series of models using different ccp_alpha values
models = []
for ccp_alpha in ccp_alphas:
    pruned_model = DecisionTreeClassifier(random_state=42, ccp_alpha=ccp_alpha)
    pruned_model.fit(X_train, Y_train)
    models.append(pruned_model)

In [80]:
# Evaluate all models and select the best one based on test accuracy
train_scores = [accuracy_score(Y_train, model.predict(X_train)) for model in models]
test_scores = [accuracy_score(Y_test, model.predict(X_test)) for model in models]


In [81]:
# Identify the best model
best_index = test_scores.index(max(test_scores))
best_model = models[best_index]

In [82]:
# Output the best ccp_alpha and its test accuracy
best_ccp_alpha = ccp_alphas[best_index]
print(f"Best ccp_alpha: {best_ccp_alpha}")
print(f"Best Test Accuracy: {max(test_scores)}")

Best ccp_alpha: 0.0034285714285714267
Best Test Accuracy: 0.9033333333333333


In [83]:
from sklearn import tree
text_representation = tree.export_text(model)
print(text_representation)

|--- feature_1 <= 12542.50
|   |--- feature_1 <= 11898.50
|   |   |--- feature_1 <= 11487.00
|   |   |   |--- feature_0 <= 471.50
|   |   |   |   |--- class: yes
|   |   |   |--- feature_0 >  471.50
|   |   |   |   |--- feature_0 <= 476.50
|   |   |   |   |   |--- feature_2 <= 469.50
|   |   |   |   |   |   |--- class: yes
|   |   |   |   |   |--- feature_2 >  469.50
|   |   |   |   |   |   |--- class: No
|   |   |   |   |--- feature_0 >  476.50
|   |   |   |   |   |--- class: yes
|   |   |--- feature_1 >  11487.00
|   |   |   |--- feature_2 <= 587.00
|   |   |   |   |--- class: yes
|   |   |   |--- feature_2 >  587.00
|   |   |   |   |--- feature_2 <= 615.00
|   |   |   |   |   |--- class: No
|   |   |   |   |--- feature_2 >  615.00
|   |   |   |   |   |--- feature_0 <= 416.00
|   |   |   |   |   |   |--- feature_1 <= 11493.00
|   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |--- feature_1 >  11493.00
|   |   |   |   |   |   |   |--- feature_0 <= 368.00
|   |   |   |

In [84]:
# Cheching prediction of model

Y_pred = model.predict(X_test)

In [85]:
# Check accuracy of model
from sklearn.metrics import accuracy_score

In [86]:
print("Accuracy is ", accuracy_score(Y_test, Y_pred)*100)

Accuracy is  84.66666666666667


In [87]:
#  Checking the performance of model on training data

Y_train_pred = model.predict(X_train)

In [88]:
print("Accuracy is ", accuracy_score(Y_train, Y_train_pred)*100)

Accuracy is  100.0


# Overfitting of model: When the model try to ocuupie all the variability in the dataset. Creating a single equation for every possible h case.  

# Pruning: It is a technique that removes or cut the some subtree or brach of the tree. 

In [89]:
# Saving the model

import joblib

In [90]:
joblib.dump(model, r"C:\Users\Lenovo\OneDrive\Desktop\Juypter_projects\loan_model_DecisionTree.pkl" )

['C:\\Users\\Lenovo\\OneDrive\\Desktop\\Juypter_projects\\loan_model_DecisionTree.pkl']