In [2]:
import altair as alt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

##### Load the Data

In [5]:
# Location of the loan-approval CSV. This is an absolute, machine-specific path;
# anyone running the notebook elsewhere has to point it at their own copy.
loan_csv_path = r'C:\Users\Blake Dennett\Downloads\Summer2023\loan_approval_dataset.csv'

# Read the whole file into the working frame and preview the first five rows.
df = pd.read_csv(loan_csv_path)
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


##### Check for any missing values in the data

In [29]:
# One boolean per column: True if that column contains at least one NaN.
df.isna().any()
# Every column reports False, so there are no missing values.

loan_id                      False
 no_of_dependents            False
 education                   False
 self_employed               False
 income_annum                False
 loan_amount                 False
 loan_term                   False
 cibil_score                 False
 residential_assets_value    False
 commercial_assets_value     False
 luxury_assets_value         False
 bank_asset_value            False
 loan_status                 False
dtype: bool

##### Check categorical features

In [37]:
# Frequency table for each categorical feature. The first two are printed with a
# blank line after each; the last is left as the cell's displayed result.
for cat_col in (' loan_status', ' self_employed'):
    print(df[cat_col].value_counts())
    print()
df[' education'].value_counts()
# Each feature has exactly two levels and no unexpected categories.
# NOTE(review): the column names carry a leading space, and the printed labels
# appear to as well -- consider stripping whitespace before modelling.

 Approved    2656
 Rejected    1613
Name:  loan_status, dtype: int64

 Yes    2150
 No     2119
Name:  self_employed, dtype: int64



 Graduate        2144
 Not Graduate    2125
Name:  education, dtype: int64

##### Check for values outside of 3 standard deviations

In [86]:
# Flag every numeric value lying more than N_STD standard deviations from its
# column mean, report it, and cap it at the nearest limit in `df`.
#
# NOTE: the limits are computed from the current contents of `df`, so this cell
# is not idempotent -- re-running it after capping recomputes tighter limits and
# caps again. Run it once on freshly loaded data.
N_STD = 3

numerical_df = df.drop(columns=[' loan_status', ' education', ' self_employed'])

# Column names already start with a space, so joining them with no separator
# yields a readable header line.
print(''.join(numerical_df.columns))
print()

count = 0
for col in numerical_df.columns:
    values = numerical_df[col]
    stdv = values.std()
    avg = values.mean()
    upper_limit, lower_limit = avg + N_STD * stdv, avg - N_STD * stdv

    # Vectorized test instead of a row-by-row scan of the whole frame.
    is_outlier = (values > upper_limit) | (values < lower_limit)
    for value in values[is_outlier]:
        print(f'{col} {value}')
    count += int(is_outlier.sum())

    # Cap high outliers at the upper limit and low outliers at the lower limit.
    # (Previously low outliers were also overwritten with the upper limit.)
    # Columns without outliers are left untouched so they keep their dtype.
    if is_outlier.any():
        df[col] = values.clip(lower=lower_limit, upper=upper_limit)
print(count)

# Author's note from the original run: 33 outliers were found in the numerical
# data, about half in residential assets and the rest in commercial assets.
# The affected rows are not dropped; because the values are not far beyond the
# limit, they are capped at the limit instead.

loan_id no_of_dependents income_annum loan_amount loan_term cibil_score residential_assets_value commercial_assets_value luxury_assets_value bank_asset_value

 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 commercial_assets_value 18117781.090732403
 commercial_assets_value 18117781.090732403
 commercial_assets_value 18117781

##### Checking for inappropriate negative values

In [85]:
# NOTE(review): this section is titled as a check for negative values, but the
# original cell only repeated the 3-standard-deviation scan. The scan is kept
# (same printed output and `count`) and the negative-value check is added below.
N_STD = 3
categorical_cols = [' education', ' self_employed', ' loan_status']
numeric_cols = [col for col in df.columns if col not in categorical_cols]

# Read-only re-scan for values beyond N_STD standard deviations; `df` is not modified.
count = 0
for col in numeric_cols:
    values = df[col]
    stdv = values.std()
    avg = values.mean()
    upper_limit, lower_limit = avg + N_STD * stdv, avg - N_STD * stdv

    outliers = values[(values > upper_limit) | (values < lower_limit)]
    for value in outliers:
        print(f'{col} {value}')
    count += len(outliers)
print(count)

# Number of negative entries per numeric column, showing only columns that have
# any. The describe() summary reports a minimum of -100000 for
# residential_assets_value, so at least that column is expected to appear.
# Presumably none of these quantities should be negative -- confirm against the
# data dictionary before correcting or dropping rows.
negative_counts = df[numeric_cols].lt(0).sum()
negative_counts[negative_counts > 0]

 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 residential_assets_value 26944478.621436063
 commercial_assets_value 18117781.090732403
 commercial_assets_value 18117781.090732403
 commercial_assets_value 18117781.090732403
 commercial_assets_value 18117781.090732403
 commercial_assets_value 18117781.090732403
 commercial_assets_value 18117781.090732403
 commercial_asse