# Credit Default Prediction using Machine Learning

**Objective:**  
This project aims to predict whether a customer will default on a loan using historical credit and personal data.  
We will perform exploratory data analysis (EDA), clean the data, build models, and evaluate their performance.


In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data understanding


In [2]:

# Load the raw credit-risk records; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("credit_risk.csv")

In [3]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,Id,Age,Income,Home,Emp_length,Intent,Amount,Rate,Status,Percent_income,Default,Cred_length
0,0,22,59000,RENT,123.0,PERSONAL,35000,16.02,1,0.59,Y,3
1,1,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,0.1,N,2
2,2,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,0.57,N,3
3,3,23,65500,RENT,4.0,MEDICAL,35000,15.23,1,0.53,N,2
4,4,24,54400,RENT,8.0,MEDICAL,35000,14.27,1,0.55,Y,4


In [4]:
# Preview the last five rows (five is the default for tail()).
df.tail()

Unnamed: 0,Id,Age,Income,Home,Emp_length,Intent,Amount,Rate,Status,Percent_income,Default,Cred_length
32576,32576,57,53000,MORTGAGE,1.0,PERSONAL,5800,13.16,0,0.11,N,30
32577,32577,54,120000,MORTGAGE,4.0,PERSONAL,17625,7.49,0,0.15,N,19
32578,32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,35000,10.99,1,0.46,N,28
32579,32579,56,150000,MORTGAGE,5.0,PERSONAL,15000,11.48,0,0.1,N,26
32580,32780,66,42000,RENT,2.0,MEDICAL,6475,9.99,0,0.15,N,30


In [5]:
# List the column labels of the loaded frame.
df.columns

Index(['Id', 'Age', 'Income', 'Home', 'Emp_length', 'Intent', 'Amount', 'Rate',
       'Status', 'Percent_income', 'Default', 'Cred_length'],
      dtype='object')

In [6]:
# Look at five randomly chosen rows. The seed is fixed so that the same rows
# are drawn on every Restart & Run All; without it the displayed sample
# changes on each execution.
df.sample(5, random_state=42)

Unnamed: 0,Id,Age,Income,Home,Emp_length,Intent,Amount,Rate,Status,Percent_income,Default,Cred_length
9007,9007,23,54500,RENT,2.0,DEBTCONSOLIDATION,8000,,0,0.15,N,2
7128,7128,22,51800,MORTGAGE,4.0,EDUCATION,7200,11.71,0,0.14,N,3
7711,7711,25,30000,RENT,4.0,MEDICAL,7000,7.74,0,0.23,N,2
4660,4660,25,26880,RENT,1.0,VENTURE,4900,14.26,0,0.18,Y,4
5948,5948,22,47748,MORTGAGE,5.0,PERSONAL,4500,6.39,0,0.09,N,4


In [7]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(32581, 12)

In [8]:
# Column labels again. NOTE(review): this repeats the earlier `df.columns`
# cell and shows nothing new; consider removing one of the two.
df.columns

Index(['Id', 'Age', 'Income', 'Home', 'Emp_length', 'Intent', 'Amount', 'Rate',
       'Status', 'Percent_income', 'Default', 'Cred_length'],
      dtype='object')

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; with default arguments the object-typed columns are left out.
df.describe()

Unnamed: 0,Id,Age,Income,Emp_length,Amount,Rate,Status,Percent_income,Cred_length
count,32581.0,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,16290.006139,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,9405.479594,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,0.0,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,8145.0,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,16290.0,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,24435.0,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,32780.0,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [10]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Id              32581 non-null  int64  
 1   Age             32581 non-null  int64  
 2   Income          32581 non-null  int64  
 3   Home            32581 non-null  object 
 4   Emp_length      31686 non-null  float64
 5   Intent          32581 non-null  object 
 6   Amount          32581 non-null  int64  
 7   Rate            29465 non-null  float64
 8   Status          32581 non-null  int64  
 9   Percent_income  32581 non-null  float64
 10  Default         32581 non-null  object 
 11  Cred_length     32581 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 3.0+ MB


In [11]:
# Per-column dtypes as a Series (the same dtype information is also part of
# the `df.info()` report).
df.dtypes

Id                  int64
Age                 int64
Income              int64
Home               object
Emp_length        float64
Intent             object
Amount              int64
Rate              float64
Status              int64
Percent_income    float64
Default            object
Cred_length         int64
dtype: object

In [12]:
# Count the missing values in each column.
df.isna().sum()

Id                   0
Age                  0
Income               0
Home                 0
Emp_length         895
Intent               0
Amount               0
Rate              3116
Status               0
Percent_income       0
Default              0
Cred_length          0
dtype: int64

In [13]:
# Number of rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

np.int64(0)

In [None]:
def detect_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is flagged when it lies strictly below ``Q1 - factor * IQR`` or
    strictly above ``Q3 + factor * IQR``, where Q1 and Q3 are the 25th and
    75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan. It is not modified.
    column : str
        Label of a numeric column in ``df``.
    factor : float, default 1.5
        Fence width in IQR units. The default reproduces the original
        hard-coded 1.5 multiplier.

    Returns
    -------
    pandas.DataFrame
        The subset of rows (original index preserved) whose value falls
        outside the fences. Rows with a missing value in ``column`` are never
        flagged, because comparisons against NaN evaluate to False.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.

    Notes
    -----
    Prints a one-line summary with the number of flagged rows.
    """
    # Read the quartiles from the ``df`` argument. The previous version read
    # them from an undefined name ``data``, which raised NameError (or, worse,
    # silently used an unrelated global of that name).
    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - factor * IQR
    upper = Q3 + factor * IQR
    outliers = df[(values < lower) | (values > upper)]
    print(f"📌 {column}: {len(outliers)} outliers")
    return outliers
