In [96]:
import pandas as pd

In [97]:
# Load the raw loan-approval data from the local dataset folder.
# Source: https://www.kaggle.com/datasets/architsharma01/loan-approval-prediction-dataset
RAW_DATA_PATH = "dataset/loan_approval_dataset.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [98]:
# Print a structural summary of the frame: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


In [99]:
# Count missing entries per column (`isna` is the same check as `isnull`).
df.isna().sum()

loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

In [100]:
# Show the column labels exactly as they were read from the CSV (handy for spotting stray whitespace).
df.columns

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [101]:
# The column names contain leading whitespace, so trim it from every label.
df.columns = [label.strip() for label in df.columns]

In [102]:
# Display the column labels again to confirm the whitespace was removed.
df.columns

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')

In [103]:
# Preview the first three rows of the frame.
df.head(3)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected


In [104]:
# Income, loan amount and asset-value columns whose skewness is inspected.
numeric_columns = [
    'income_annum',
    'loan_amount',
    'residential_assets_value',
    'commercial_assets_value',
    'luxury_assets_value',
    'bank_asset_value',
]
# Skewness of each selected column.
df.loc[:, numeric_columns].skew()

income_annum               -0.012814
loan_amount                 0.308724
residential_assets_value    0.978451
commercial_assets_value     0.957791
luxury_assets_value         0.322208
bank_asset_value            0.560725
dtype: float64

In [105]:
categorical_columns = ['education', 'self_employed', 'loan_status']

# List the distinct labels found in each categorical column.
for column_name in categorical_columns:
    distinct_values = df[column_name].unique()
    print(f"Unique values in {column_name} are: {distinct_values}")

Unique values in education are: [' Graduate' ' Not Graduate']
Unique values in self_employed are: [' No' ' Yes']
Unique values in loan_status are: [' Approved' ' Rejected']


In [106]:
# All object-typed categorical features have only two unique values, so plain
# label encoding is sufficient. The integer codes are collected in a separate
# mapping (`directory`) so the original data in `df` is left unchanged.
from sklearn.preprocessing import LabelEncoder

directory = {}       # column name -> array of integer codes
label_encoders = {}  # column name -> fitted encoder, kept so codes can be mapped back to labels

for column in ('education', 'self_employed', 'loan_status'):
    # Use a fresh encoder per column: re-fitting one shared instance overwrites
    # its `classes_`, which would discard the code -> label mapping of the
    # columns encoded earlier.
    label_encoder = LabelEncoder()
    directory[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

In [107]:
# Assemble the label-encoded arrays into their own frame (one column per feature)
# and report the skewness of each encoded column.
df_copy = pd.DataFrame(data=directory)
df_copy.skew(axis=0)

education        0.008905
self_employed   -0.014529
loan_status      0.504087
dtype: float64

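# From the skewness values observed above, every numeric and label-encoded categorical feature lies within the range (-1, 1), so none of the features is heavily skewed. Note that low skewness only indicates an approximately symmetric distribution; on its own it does not show that the data is normally distributed.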

In [108]:
# Write the preprocessed data to disk.
# Only the column *names* were stripped earlier; the text values themselves still
# start with a space (e.g. ' Graduate', ' No', ' Approved'). Trim them on a copy
# so the saved file holds clean labels while `df` stays as displayed above.
df_preprocessed = df.copy()
text_columns = df_preprocessed.select_dtypes(include="object").columns
for column in text_columns:
    df_preprocessed[column] = df_preprocessed[column].str.strip()

# index=False: the RangeIndex carries no information, so it is not written out.
df_preprocessed.to_csv('dataset/preprocessed_data.csv', index=False)

The dataset is already clean (no missing values) and its features show low skewness, so not much further preprocessing is needed.