In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score


In [2]:
# Load the tab-separated lending data set; the first row holds the column names.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (and the
# DtypeWarning pandas emits for them) at the cost of higher peak memory.
df = pd.read_csv(
    "XYZCorp_LendingData.txt",
    header=0,
    index_col=None,
    sep="\t",
    low_memory=False,
)

  df = pd.read_csv("XYZCorp_LendingData.txt", header=0, index_col=None, sep="\t")


In [3]:
# Quick size check of the loaded frame: (number of rows, number of columns).
df.shape

(855969, 73)

### I used only 20 features: asking a user to fill in all 72 columns in the prediction form would be too much, and even after removing the unnecessary columns it would not be possible to get below 30. That is why I limited the model to 20 features.

In [4]:
# Maps the feature name used in the rest of this notebook (keyword) to the
# column name it is read from in the loaded data set (value).
# Insertion order matters: it fixes the column order of the feature matrix.
feature_map = dict(
    loan_amount='loan_amnt',
    term='term',
    interest_rate='int_rate',
    installment='installment',
    grade='grade',
    emp_length='emp_length',
    home_ownership='home_ownership',
    annual_income='annual_inc',
    verification_status='verification_status',
    dti='dti',
    delinq_2yrs='delinq_2yrs',
    open_acc='open_acc',
    pub_rec='pub_rec',
    revol_util='revol_util',
    purpose='purpose',
    initial_list_status='initial_list_status',
    total_rec_late_fee='total_rec_late_fee',
    recoveries='recoveries',
    acc_now_delinq='acc_now_delinq',
    total_coll_amt='tot_coll_amt',
)

## I got this dataset from Kaggle; below is Kaggle's description of every column that I used for prediction.
### loan_amnt :- The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.
### term:- The number of payments on the loan. Values are in months and can be either 36 or 60.
### int_rate :- Interest Rate on the loan.
### installment:- The monthly payment owed by the borrower if the loan originates.
### grade :- XYZ Corp. assigned loan grade (categorical loan risk rating such as A, B, C...).
### emp_length :- Employment length in years. Possible values are between 0 and 10, where 0 means less than one year and 10 means ten or more years.
### home_ownership :- The home ownership status provided by the borrower during registration. Values include RENT, OWN, MORTGAGE, OTHER.
### annual_income :- The self-reported annual income provided by the borrower during registration.
### verification_status :- Indicates if the income source was verified, not verified, or source verified by XYZ corp.
### dti :- A ratio calculated using the borrower’s total monthly debt payments (excluding mortgage and the requested loan), divided by the borrower’s self-reported monthly income.
### delinq_2yrs :- The number of 30+ days past-due incidences of delinquency in the borrower's credit file over the past 2 years.
### open_acc :- The number of open credit lines in the borrower's credit file.
### pub_rec :- The number of derogatory public records (e.g., bankruptcies, liens, or judgments).
### revol_util :- Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.
### purpose :- A category provided by the borrower for the loan purpose (e.g., debt consolidation, home improvement, etc.).
### initial_list_status :- The initial listing status of the loan. Possible values are: W (whole loan) and F (fractional).
### total_rec_late_fee :- Late fees received to date from the borrower.
### recoveries :- Post charge-off gross recovery amount.
### acc_now_delinq :- The number of accounts on which the borrower is currently delinquent.
### total_coll_amt :- Total collection amounts ever owed by the borrower.


In [5]:
# Feature names in the order they are declared in feature_map.
selected_features = list(feature_map)

# Pull the matching data set columns into an independent frame, then relabel
# them with the notebook's feature names (same order as selected_features).
source_columns = [feature_map[name] for name in selected_features]
X = df.loc[:, source_columns].copy()
X.columns = selected_features

# Target column.
y = df["default_ind"]

print("Using this featyures:", list(X.columns))

Using this featyures: ['loan_amount', 'term', 'interest_rate', 'installment', 'grade', 'emp_length', 'home_ownership', 'annual_income', 'verification_status', 'dti', 'delinq_2yrs', 'open_acc', 'pub_rec', 'revol_util', 'purpose', 'initial_list_status', 'total_rec_late_fee', 'recoveries', 'acc_now_delinq', 'total_coll_amt']


### Filling the missing values in the numerical columns with the column median

In [6]:
# Numeric features whose missing values are filled in below.
numerical_cols = [
    'loan_amount', 'interest_rate', 'installment', 'dti', 'delinq_2yrs',
    'open_acc', 'pub_rec', 'revol_util', 'total_rec_late_fee',
    'recoveries', 'acc_now_delinq', 'total_coll_amt', 'annual_income',
]

# One median per numeric column, computed over every row currently in X;
# each NaN is then replaced by the median of its own column.
# NOTE(review): the medians are taken before the train/test split, so test
# rows influence the fill values -- consider computing them on the training
# partition only.
column_medians = X[numerical_cols].median()
X[numerical_cols] = X[numerical_cols].fillna(column_medians)

### Converting the categorical columns to numbers using LabelEncoder

In [7]:
# Features holding category labels rather than numbers.
categorical_cols = ['term', 'grade', 'emp_length', 'home_ownership', 'verification_status', 'purpose', 'initial_list_status']

from sklearn.preprocessing import LabelEncoder

# Keep one fitted encoder per column, keyed by column name. A single shared
# encoder is re-fit on every iteration, so only the last column's mapping would
# survive and the category -> integer codes of the other columns could not be
# reproduced when encoding new inputs for prediction.
label_encoders = {}
for col in categorical_cols:
    if col in X.columns:
        le = LabelEncoder()
        # Replace the labels with integer codes in place.
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

### Splitting the data 80/20: 80% is training data and 20% is testing data

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### I am using StandardScaler to standardize the features of the dataset.

In [9]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training data shape:", X_train_scaled.shape)
print("Test data shape:", X_test_scaled.shape)

Training data shape: (684775, 20)
Test data shape: (171194, 20)


### Using a simple neural network for binary classification with Keras: the model has several Dense layers with ReLU activation, Dropout for regularization, and a sigmoid output.

In [10]:
# Seed the Python, NumPy and backend random generators before any weights are
# created, so that weight initialisation, dropout masks and batch shuffling are
# repeatable across "Restart & Run All". (Some hardware-level nondeterminism
# may still remain.)
keras.utils.set_random_seed(42)

# Feed-forward classifier: three ReLU hidden layers (64 -> 32 -> 16), dropout
# after the first two, and a single sigmoid unit as output.
model = keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),  # one input per feature column
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])


# Binary cross-entropy pairs with the sigmoid output; precision and recall are
# tracked alongside accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 'precision', 'recall']
)

model.summary()

# validation_split=0.2 holds back 20% of the training data to produce the
# val_* metrics reported after each epoch.
history = model.fit(
    X_train_scaled, y_train,
    epochs=30,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

Epoch 1/30
[1m17120/17120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 482us/step - accuracy: 0.9624 - loss: 0.1436 - precision: 0.8619 - recall: 0.4232 - val_accuracy: 0.9720 - val_loss: 0.1159 - val_precision: 1.0000 - val_recall: 0.4819
Epoch 2/30
[1m17120/17120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 489us/step - accuracy: 0.9718 - loss: 0.1171 - precision: 0.9956 - recall: 0.4808 - val_accuracy: 0.9724 - val_loss: 0.1148 - val_precision: 1.0000 - val_recall: 0.4892
Epoch 3/30
[1m17120/17120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 497us/step - accuracy: 0.9722 - loss: 0.1156 - precision: 0.9934 - recall: 0.4908 - val_accuracy: 0.9734 - val_loss: 0.1113 - val_precision: 0.9989 - val_recall: 0.5082
Epoch 4/30
[1m17120/17120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 495us/step - accuracy: 0.9729 - loss: 0.1137 - precision: 0.9946 - recall: 0.5005 - val_accuracy: 0.9736 - val_loss: 0.1106 - val_precision: 1.0000 - val_recall: 0.5107


In [14]:
# Raw sigmoid outputs of the network for the scaled test rows.
y_pred_prob = model.predict(X_test_scaled)

# Collapse to one dimension, then label a row 1 when its score exceeds 0.5
# and 0 otherwise.
y_pred = (y_pred_prob.ravel() > 0.5).astype(int)

[1m5350/5350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 192us/step


### Model Performance

In [15]:
# Metrics based on the thresholded 0/1 predictions.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", test_confusion)

test_report = classification_report(y_test, y_pred)
print("Classification Report:\n", test_report)

# ROC AUC is computed from the predicted scores, not the thresholded labels.
test_roc_auc = roc_auc_score(y_test, y_pred_prob)
print("ROC AUC Score:", test_roc_auc)

Accuracy: 0.9737198733600476
Confusion Matrix:
 [[161791      0]
 [  4499   4904]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99    161791
           1       1.00      0.52      0.69      9403

    accuracy                           0.97    171194
   macro avg       0.99      0.76      0.84    171194
weighted avg       0.97      0.97      0.97    171194

ROC AUC Score: 0.8775697109343291


In [13]:
# Persistence step, currently disabled. Uncomment the lines below to write the
# trained network to 'neural_network_model.h5' and the fitted scaler to
# 'neural_network_scaler.pkl'.
# NOTE(review): nothing here saves the integer encoding applied to the
# categorical columns -- confirm how new inputs are meant to be encoded before
# relying on these two files alone.
# model.save('neural_network_model.h5')
# import joblib
# joblib.dump(scaler, 'neural_network_scaler.pkl')

# print("\nNeural Network Model saved as 'neural_network_model.h5'")
# print("Neural Network Scaler saved as 'neural_network_scaler.pkl'")