In [2]:
#data loading

import pandas as pd

# Read the raw loan-risk CSV into the working frame and preview its first rows.
raw_csv_path = 'Data_08_Simulated Loan Risk Assessment Data.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,31.0,male,Bachelor,96852.0,10,MORTGAGE,5000.0,PERSONAL,6.91,0.05,9.0,682,Yes,0
1,29.0,female,High School,96737.0,6,MORTGAGE,6000.0,DEBTCONSOLIDATION,7.14,0.06,8.0,519,No,0
2,32.0,male,Bachelor,96948.0,6,MORTGAGE,12000.0,DEBTCONSOLIDATION,12.68,0.12,10.0,668,Yes,0
3,29.0,female,Associate,96865.0,4,MORTGAGE,20000.0,VENTURE,14.96,0.21,10.0,679,No,0
4,33.0,female,Bachelor,96986.0,15,MORTGAGE,8000.0,EDUCATION,20.0,0.08,5.0,667,No,0


In [3]:
# Check data types: list each column's dtype so numeric columns can be told
# apart from object (string) columns before any cleaning is applied.

df.dtypes

person_age                        float64
person_gender                      object
person_education                   object
person_income                     float64
person_emp_exp                      int64
person_home_ownership              object
loan_amnt                         float64
loan_intent                        object
loan_int_rate                     float64
loan_percent_income               float64
cb_person_cred_hist_length        float64
credit_score                        int64
previous_loan_defaults_on_file     object
loan_status                         int64
dtype: object

In [10]:
# Missing-value treatment, step 1: count the null entries in every column.

df.isnull().sum()



person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64

In [11]:
# Numeric columns: replace any missing value with that column's median.
# NOTE(review): `num_cols` is every numeric column, which per the dtype listing
# includes the `loan_status` label — confirm that imputing the label is intended.

num_cols = df.select_dtypes(include='number').columns
column_medians = df[num_cols].median()
for name, median_value in column_medians.items():
    df[name] = df[name].fillna(median_value)

In [13]:
# Categorical (object-dtype) columns: replace any missing value with the
# lowercase placeholder string "missing".

cat_cols = df.select_dtypes(include='object').columns
df[cat_cols] = df[cat_cols].fillna("missing")

In [14]:
#Handle Outliers

import numpy as np

# Cap person_income at its 99th percentile: values above the cut-off are set
# to the cut-off itself; everything else is left as it was.
income_values = df['person_income']
upper_limit = income_values.quantile(0.99)
exceeds_cap = income_values > upper_limit
df['person_income'] = np.where(exceeds_cap, upper_limit, income_values)

In [15]:
# Cap loan_amnt at its 99th percentile so extreme loan sizes are pulled down
# to the cut-off value.
loan_amounts = df['loan_amnt']
upper_limit_loan = loan_amounts.quantile(0.99)
df['loan_amnt'] = loan_amounts.clip(upper=upper_limit_loan)

In [16]:
# Monthly income: the person_income figure spread evenly over 12 months.

df['monthly_income'] = df['person_income'].div(12)

In [18]:
# Estimated monthly loan payment.
# The formula is one year of simple interest (amount * rate%) divided by 12;
# it contains no principal-repayment term, so it is an interest-only estimate.

yearly_interest = df['loan_amnt'] * df['loan_int_rate'] / 100
df['monthly_payment_est'] = yearly_interest / 12


In [19]:
# Payment-to-income ratio: estimated monthly payment as a share of monthly income.
# A monthly income of 0 would make the division yield inf (or NaN for 0/0),
# and inf would then flow into the exported data. Blank out zero incomes first
# so the ratio is NaN (undefined) for those rows instead.

nonzero_monthly_income = df['monthly_income'].where(df['monthly_income'] != 0)
df['payment_to_income'] = df['monthly_payment_est'] / nonzero_monthly_income

In [21]:
#Age Buckets

import pandas as pd

# Bucket ages into labelled bands. pd.cut builds right-closed intervals and by
# default excludes the lowest edge, so include_lowest=True is required for an
# age of exactly 18 to land in '18–25' instead of becoming NaN. The last edge is
# open-ended so the '65+' band really covers every age above 65 (previously
# anything over 100 fell outside all bins). Ages below 18 still map to NaN.
df['age_bucket'] = pd.cut(
    df['person_age'],
    bins=[18, 25, 35, 45, 55, 65, float('inf')],
    labels=['18–25', '26–35', '36–45', '46–55', '56–65', '65+'],
    include_lowest=True,
)

In [22]:
# Credit score buckets: Poor [300, 500], Fair (500, 650], Good (650, 750],
# Excellent (750, 850].
# include_lowest=True closes the first interval on the left; without it a score
# of exactly 300 falls outside (300, 500] and is bucketed as NaN.
# Scores outside 300-850 still map to NaN.

df['credit_bucket'] = pd.cut(
    df['credit_score'],
    bins=[300, 500, 650, 750, 850],
    labels=['Poor', 'Fair', 'Good', 'Excellent'],
    include_lowest=True,
)


In [23]:
# Encode binary variables: previous_loan_defaults_on_file becomes 1 (Yes) / 0 (No).
# Series.map turns every value that is not a key of the mapping into NaN, so the
# mapping also lists the already-encoded 1 and 0. This makes the cell safe to
# re-run: a second execution leaves the column unchanged instead of wiping it.
# Any other value (e.g. the "missing" placeholder used for categorical gaps)
# still becomes NaN.

default_flag_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
df['previous_loan_defaults_on_file'] = df['previous_loan_defaults_on_file'].map(default_flag_codes)


In [26]:
# Write the cleaned, feature-enriched frame to disk without the row index.
cleaned_csv_path = 'cleaned_loan_data.csv'
df.to_csv(cleaned_csv_path, index=False)