In [72]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder , StandardScaler , OneHotEncoder
import joblib

In [73]:
df = pd.read_csv("loan.csv")

In [74]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [76]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# Handling Missing Values

In [77]:
df["Gender"].fillna("Unknown", inplace=True)
df["Married"].fillna("Unknown", inplace=True)
df["Dependents"].fillna("Unknown", inplace=True)
df["Self_Employed"].fillna("Unknown", inplace=True)
df["LoanAmount"].fillna(df["LoanAmount"].median(), inplace=True)  # Median is more robust
df["Loan_Amount_Term"].fillna(df["Loan_Amount_Term"].median(), inplace=True)
df["Credit_History"].fillna(df["Credit_History"].mode()[0], inplace=True)  # Mode for categorical

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Gender"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Married"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

In [78]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# Remove str from Loan_ID

In [79]:
df["Loan_ID"] = df["Loan_ID"].str.extract("(\d+)").astype(int)

# Converting Categorical columns into numerical ones

In [80]:
# Encode categorical variables
label_encoder = LabelEncoder()

df["Gender"] = label_encoder.fit_transform(df["Gender"])
df["Married"] = label_encoder.fit_transform(df["Married"])
df["Education"] = label_encoder.fit_transform(df["Education"])
df["Self_Employed"] = label_encoder.fit_transform(df["Self_Employed"])
df["Loan_Status"] = label_encoder.fit_transform(df["Loan_Status"])

In [81]:
# Use One-Hot Encoding for Property_Area (Better than manual mapping)
df = pd.get_dummies(df, columns=["Property_Area"], drop_first=True)

In [82]:
# Convert Dependents with mapping
dependents_mapping = {"0": 0, "1": 1, "2": 2, "3+": 3, "Unknown": -1}
df["Dependents"] = df["Dependents"].map(dependents_mapping)

In [83]:
# Standard Scaling
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

In [84]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Semiurban,Property_Area_Urban
0,1002,1,0,0,0,0,5849,0.0,128.0,360.0,1.0,1,False,True
1,1003,1,2,1,0,0,4583,1508.0,128.0,360.0,1.0,0,False,False
2,1005,1,2,0,0,2,3000,0.0,66.0,360.0,1.0,1,False,True
3,1006,1,2,0,1,0,2583,2358.0,120.0,360.0,1.0,1,False,True
4,1008,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,1,False,True


In [85]:
# Save the scaler for future use
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [86]:
# Convert back to DataFrame with column names
df_preprocessed = pd.DataFrame(df_scaled, columns=df.columns)

In [87]:
# Save the preprocessed data
df_preprocessed.to_csv("df_scaled.csv", index=False)

# Verify preprocessing
print("Preprocessing complete. Preprocessed data saved as 'df_scaled.csv'.")

Preprocessing complete. Preprocessed data saved as 'df_scaled.csv'.


In [None]:
df_preprocessed.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Semiurban,Property_Area_Urban
0,-1.760893,0.382616,-1.368405,-0.693376,-0.528362,-0.458646,0.072991,-0.554487,-0.211241,0.273231,0.411733,0.674519,-0.782016,1.428147
1,-1.759129,0.382616,0.734726,0.269821,-0.528362,-0.458646,-0.134412,-0.038732,-0.211241,0.273231,0.411733,-1.482537,-0.782016,-0.700208
2,-1.755599,0.382616,0.734726,-0.693376,-0.528362,2.414909,-0.393747,-0.554487,-0.948996,0.273231,0.411733,0.674519,-0.782016,1.428147
3,-1.753834,0.382616,0.734726,-0.693376,1.892641,-0.458646,-0.462062,0.25198,-0.306435,0.273231,0.411733,0.674519,-0.782016,1.428147
4,-1.750305,0.382616,-1.368405,-0.693376,-0.528362,-0.458646,0.097728,-0.554487,-0.056551,0.273231,0.411733,0.674519,-0.782016,1.428147


: 