# Predictive Analysis on Cleaned Data

In [1]:
#Import Libraries 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Logistic Regression Analysis

In [2]:
# Read the preprocessed loan records from the CSV (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='preprocessed_loan_data.csv')

In [3]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,56,85994,50587,520,80,4,15.23,36,0.44,0,0,0,1,1,4,1,0
1,69,50432,124440,458,15,1,4.81,60,0.68,2,0,1,0,0,4,1,0
2,46,84208,129188,451,26,3,21.17,24,0.31,2,3,0,1,1,0,0,1
3,32,31713,44799,743,0,3,7.07,24,0.23,1,0,1,0,0,1,0,0
4,60,20437,9139,633,8,4,6.51,48,0.73,0,3,0,0,1,0,0,0


In [None]:
# One-hot encode the integer-coded categorical columns.
# drop_first=True keeps k-1 indicators per encoded column, so the indicators
# are not perfectly collinear (a two-level column yields a single indicator);
# this keeps the logistic-regression coefficients interpretable.
# dtype=int produces 0/1 integers rather than booleans so the later numeric
# steps (median fill, scaling) operate on plain numbers.
# NOTE(review): this cell has no execution count, and the recorded coefficient
# table further down lists the un-encoded column names, so the saved outputs
# appear to come from a run that skipped this step — re-run the notebook to
# refresh them.
df = pd.get_dummies(
    df,
    columns=['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner'],
    drop_first=True,
    dtype=int,
)

In [None]:
# Preview the first five rows after the encoding step
df.head(n=5)

In [4]:
# Pull out the 'Default' label column, and keep every other column as a predictor
y = df['Default']
X = df.drop(columns='Default')

In [5]:
# Preview the first five rows of the predictor frame
X.head(n=5)

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,56,85994,50587,520,80,4,15.23,36,0.44,0,0,0,1,1,4,1
1,69,50432,124440,458,15,1,4.81,60,0.68,2,0,1,0,0,4,1
2,46,84208,129188,451,26,3,21.17,24,0.31,2,3,0,1,1,0,0
3,32,31713,44799,743,0,3,7.07,24,0.23,1,0,1,0,0,1,0
4,60,20437,9139,633,8,4,6.51,48,0.73,0,3,0,0,1,0,0


In [6]:
# Preview the first five target labels
y.head(n=5)

0    0
1    0
2    1
3    0
4    0
Name: Default, dtype: int64

In [7]:
# Fill any missing entries with the median of their own column
X = X.fillna(value=X.median(axis=0))

In [8]:
# Standardise the feature columns with a StandardScaler (default settings)
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [9]:
# Split the raw features 80/20 first, then scale.
# The scaler is re-fit on the training rows only, and the test rows are
# transformed with those training statistics, so no test-set information
# leaks into preprocessing. `scaler` is the StandardScaler instance created
# in the scaling cell above. test_size and random_state are unchanged, so the
# row partition should match the previous one.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [10]:
# Fit a logistic-regression classifier, allowing up to 1000 solver iterations
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)

In [11]:
# Predict class labels for the held-out test rows
y_pred = model.predict(X=X_test)

In [12]:
# Print the confusion matrix, then the per-class precision/recall/F1 report
print(
    confusion_matrix(y_true=y_test, y_pred=y_pred),
    classification_report(y_true=y_test, y_pred=y_pred),
    sep='\n',
)

[[45058   112]
 [ 5717   183]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45170
           1       0.62      0.03      0.06      5900

    accuracy                           0.89     51070
   macro avg       0.75      0.51      0.50     51070
weighted avg       0.86      0.89      0.84     51070



In [13]:
# Grab the fitted coefficient row and the column names it lines up with
coefficients = model.coef_[0, :]
feature_names = X.columns

In [14]:
# Pair each feature with its coefficient, add the magnitude, and rank by it (largest first)
feature_importance = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    .assign(AbsoluteCoefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values(by='AbsoluteCoefficient', ascending=False)
)

In [15]:

# Show the features ranked by absolute coefficient, largest first
print(feature_importance)

           Feature  Coefficient  AbsoluteCoefficient
0              Age    -0.588019             0.588019
6     InterestRate     0.449855             0.449855
1           Income    -0.345758             0.345758
4   MonthsEmployed    -0.332339             0.332339
2       LoanAmount     0.297019             0.297019
10  EmploymentType     0.147228             0.147228
15     HasCoSigner    -0.139216             0.139216
13   HasDependents    -0.120308             0.120308
3      CreditScore    -0.120074             0.120074
5   NumCreditLines     0.097799             0.097799
9        Education    -0.077142             0.077142
12     HasMortgage    -0.074434             0.074434
8         DTIRatio     0.061739             0.061739
14     LoanPurpose    -0.037372             0.037372
11   MaritalStatus    -0.029366             0.029366
7         LoanTerm     0.002665             0.002665


<h3>Variables with the Strongest Effect on Loan Default</h3>
<table border="1">
    <thead>
        <tr>
            <th>Feature</th>
            <th>Coefficient</th>
            <th>Absolute Coefficient</th>
            <th>Impact</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>InterestRate</td>
            <td>0.449855</td>
            <td>0.449855</td>
            <td>High positive effect; higher interest rates increase the likelihood of default.</td>
        </tr>
        <tr>
            <td>Age</td>
            <td>-0.588019</td>
            <td>0.588019</td>
            <td>High negative effect; older individuals are less likely to default.</td>
        </tr>
        <tr>
            <td>Income</td>
            <td>-0.345758</td>
            <td>0.345758</td>
            <td>High negative effect; higher income decreases the likelihood of default.</td>
        </tr>
        <tr>
            <td>MonthsEmployed</td>
            <td>-0.332339</td>
            <td>0.332339</td>
            <td>High negative effect; more months employed decreases the likelihood of default.</td>
        </tr>
        <tr>
            <td>LoanAmount</td>
            <td>0.297019</td>
            <td>0.297019</td>
            <td>Positive effect; larger loan amounts increase the likelihood of default.</td>
        </tr>
    </tbody>
</table>

<h3>Variables with Minimal Effect</h3>
<table border="1">
    <thead>
        <tr>
            <th>Feature</th>
            <th>Coefficient</th>
            <th>Absolute Coefficient</th>
            <th>Impact</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>EmploymentType</td>
            <td>0.147228</td>
            <td>0.147228</td>
            <td>Minimal positive effect.</td>
        </tr>
        <tr>
            <td>HasCoSigner</td>
            <td>-0.139216</td>
            <td>0.139216</td>
            <td>Minimal negative effect.</td>
        </tr>
        <tr>
            <td>HasDependents</td>
            <td>-0.120308</td>
            <td>0.120308</td>
            <td>Minimal negative effect.</td>
        </tr>
        <tr>
            <td>CreditScore</td>
            <td>-0.120074</td>
            <td>0.120074</td>
            <td>Minimal negative effect.</td>
        </tr>
        <tr>
            <td>NumCreditLines</td>
            <td>0.097799</td>
            <td>0.097799</td>
            <td>Minimal positive effect.
