In [423]:
import pandas as pd
from scipy.stats import pearsonr

##### Load the Data

In [424]:
df = pd.read_csv(r'C:\Users\Blake Dennett\Downloads\Summer2023\loan_approval_dataset.csv')
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [425]:
df.describe()

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


# Data Cleaning

##### Check for any missing values in the data

In [426]:
df.isna().any()
# no missing values

loan_id                      False
 no_of_dependents            False
 education                   False
 self_employed               False
 income_annum                False
 loan_amount                 False
 loan_term                   False
 cibil_score                 False
 residential_assets_value    False
 commercial_assets_value     False
 luxury_assets_value         False
 bank_asset_value            False
 loan_status                 False
dtype: bool

##### Check categorical features

In [427]:
print(df[' loan_status'].value_counts())
print()
print(df[' self_employed'].value_counts())
print()
df[' education'].value_counts()
# no issues found

 Approved    2656
 Rejected    1613
Name:  loan_status, dtype: int64

 Yes    2150
 No     2119
Name:  self_employed, dtype: int64



 Graduate        2144
 Not Graduate    2125
Name:  education, dtype: int64

##### Check for values outside of 3 standard deviations

In [428]:
numerical_df = df.drop(columns=[' loan_status', ' education', ' self_employed'])

std_factor = 3

stdv_df = numerical_df.std()   # creates a series of standard deviations for each column
avg_df = numerical_df.mean()

upper_limits = avg_df + std_factor * stdv_df     # creates a series of upper limits for each column
lower_limits = avg_df - std_factor * stdv_df

numerical_cols = numerical_df.columns

# Create a condition for numeric columns only
condition = (numerical_df[numerical_cols] > upper_limits) | (numerical_df[numerical_cols] < lower_limits)

# Update the values in the original DataFrame 'df' with the capped values
df[numerical_cols] = df[numerical_cols].where(~condition, other=upper_limits, axis=0)

# count the number of capped values
count = condition.sum().sum()

print(count)

# there were 33 outliers in the numerical data, about half of which were residential assets, the other were commercial assets
# the outliers were replaced with the upper limit of the data

33


##### Checking for inappropriate negative values

In [429]:
numerical_df = df.drop(columns=[' loan_status', ' education', ' self_employed'])
numerical_cols = numerical_df.columns

count = 0
for col in numerical_df.columns:
    for row in numerical_df.iterrows():
        if row[1][col] < 0:
            print(row)
            count += 1

df[numerical_cols] = df[numerical_cols].applymap(lambda x: x if x >= 0 else 0)

print(count)

# there were 28 instances of a negative value and all were in the residential asset column, and they were all the exact same value 
# so I am just going to change them all to 0

(59, loan_id                            60.0
 no_of_dependents                   4.0
 income_annum                 5500000.0
 loan_amount                 18200000.0
 loan_term                         16.0
 cibil_score                      797.0
 residential_assets_value     -100000.0
 commercial_assets_value      4900000.0
 luxury_assets_value         18600000.0
 bank_asset_value             4800000.0
Name: 59, dtype: float64)
(196, loan_id                          197.0
 no_of_dependents                  4.0
 income_annum                 400000.0
 loan_amount                 1500000.0
 loan_term                         2.0
 cibil_score                     669.0
 residential_assets_value    -100000.0
 commercial_assets_value      600000.0
 luxury_assets_value          900000.0
 bank_asset_value             500000.0
Name: 196, dtype: float64)
(559, loan_id                         560.0
 no_of_dependents                 2.0
 income_annum                200000.0
 loan_amount              

# Analysis and Feature Engineering

##### Total Collateral

In [430]:
collateral_df = df[[' residential_assets_value',  ' commercial_assets_value', ' bank_asset_value', ' luxury_assets_value']]
df[' total_collateral'] = collateral_df.apply(lambda x: x.sum(), axis=1)

##### Loan Collateral Ratio

In [431]:
df[' loan_coll_ratio'] = df[' loan_amount'] / df[' total_collateral']
df[' loan_coll_ratio'].describe()
# lower is better

count    4269.000000
mean        0.488241
std         0.145773
min         0.176471
25%         0.384217
50%         0.467290
75%         0.568182
max         1.333333
Name:  loan_coll_ratio, dtype: float64

##### Loan Amount by Income

In [432]:
df[' loan_income_ratio'] = df[' loan_amount'] / df[' income_annum']
df[' loan_income_ratio'].describe()
# lower is better

count    4269.000000
mean        2.984807
std         0.595496
min         1.500000
25%         2.464286
50%         3.000000
75%         3.500000
max         4.000000
Name:  loan_income_ratio, dtype: float64

##### Credit Score Odd Values (See Cibil score graph)

In [434]:
rejected_df = df[df[' loan_status'] == ' Rejected']

rejec_cibil_above_550 = rejected_df[rejected_df[' cibil_score'] >= 550]

rejec_cibil_above_550

# there are 13 values where the credit score is above 550 and was still rejected

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,total_collateral,loan_coll_ratio,loan_income_ratio
67,68,0,Not Graduate,Yes,2700000,10100000,20,693,200000.0,1700000.0,5900000,1400000,Rejected,9200000.0,1.097826,3.740741
236,237,3,Graduate,Yes,900000,3500000,4,770,700000.0,0.0,2000000,600000,Rejected,3300000.0,1.060606,3.888889
348,349,0,Not Graduate,No,2000000,7000000,6,666,0.0,1200000.0,4300000,1400000,Rejected,6900000.0,1.014493,3.5
395,396,2,Graduate,Yes,800000,3200000,10,584,0.0,200000.0,2100000,700000,Rejected,3000000.0,1.066667,4.0
559,560,2,Graduate,Yes,200000,500000,6,885,0.0,0.0,300000,200000,Rejected,500000.0,1.0,2.5
911,912,0,Graduate,Yes,1300000,4800000,4,835,500000.0,100000.0,3300000,800000,Rejected,4700000.0,1.021277,3.692308
1446,1447,1,Graduate,No,300000,1200000,18,747,100000.0,0.0,700000,200000,Rejected,1000000.0,1.2,4.0
1950,1951,0,Graduate,Yes,200000,700000,10,587,0.0,100000.0,500000,100000,Rejected,700000.0,1.0,3.5
2856,2857,3,Not Graduate,Yes,8300000,31400000,6,674,1000000.0,1600000.0,17200000,6100000,Rejected,25900000.0,1.212355,3.783133
3180,3181,4,Graduate,No,3200000,12200000,14,683,0.0,1100000.0,7500000,2900000,Rejected,11500000.0,1.06087,3.8125


##### Approvals Below 550 Credit Score

In [435]:
approved_df = df[df[' loan_status'] == ' Approved']
rejected_df = df[df[' loan_status'] == ' Rejected']

aprov_cibil_below_550 = approved_df[approved_df[' cibil_score'] < 550]
rejec_cibil_below_550 = rejected_df[rejected_df[' cibil_score'] < 550]
aprov_cibil_below_550 = aprov_cibil_below_550.drop(columns=[' loan_status', ' education', ' self_employed'])
rejec_cibil_below_550 = rejec_cibil_below_550.drop(columns=[' loan_status', ' education', ' self_employed'])

total_means = numerical_df.mean()
approved_means = aprov_cibil_below_550.mean()
rejected_means = rejec_cibil_below_550.mean()

approved_df / total_means
approved_means - rejected_means

# out of 1785 rows that were below a 550 credit score, only 185 were approved (10.36%)

loan_id                      5.091850e+01
 no_of_dependents            3.150338e-02
 income_annum               -1.936875e+05
 loan_amount                 2.253062e+06
 loan_term                  -8.613176e+00
 cibil_score                -5.461334e+00
 residential_assets_value    8.436149e+04
 commercial_assets_value     2.325422e+05
 luxury_assets_value        -4.745389e+05
 bank_asset_value           -1.970895e+05
 total_collateral           -3.547247e+05
 loan_coll_ratio             8.155683e-02
 loan_income_ratio           5.803107e-01
dtype: float64

##### Approved Vs. Rejected Loans

In [436]:
rejected_df = df[df[' loan_status'] == ' Rejected']
approved_df = df[df[' loan_status'] == ' Approved']

rejected_df = rejected_df.drop(columns=[' loan_status', ' education', ' self_employed'])
approved_df = approved_df.drop(columns=[' loan_status', ' education', ' self_employed'])

median_rejected = rejected_df.median()
median_approved = approved_df.median()

median_difference = median_approved - median_rejected

median_difference

loan_id                          60.500000
 no_of_dependents                -0.500000
 income_annum               -100000.000000
 loan_amount                 100000.000000
 loan_term                       -2.000000
 cibil_score                    282.000000
 residential_assets_value   -600000.000000
 commercial_assets_value          0.000000
 luxury_assets_value        -400000.000000
 bank_asset_value           -100000.000000
 total_collateral           -900000.000000
 loan_coll_ratio                  0.016411
 loan_income_ratio                0.174691
dtype: float64

##### High Collateral and Income Rejection

In [437]:
odd_point = df[(df[' loan_coll_ratio'] == 0.25) & (df[' loan_income_ratio'] == 1.5)]

odd_point

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,total_collateral,loan_coll_ratio,loan_income_ratio
674,675,0,Not Graduate,No,200000,300000,6,472,400000.0,100000.0,700000,0,Rejected,1200000.0,0.25,1.5


##### Zero Residential Assets

In [438]:
zero_res = numerical_df[numerical_df[' residential_assets_value'] == 0]
zero_res_means = zero_res.mean()
zero_res_means / total_means

# 89 rows with 0 residential assets value, 33 rejected, 56 approved

loan_id                      0.939027
 no_of_dependents            1.031643
 income_annum                0.589035
 loan_amount                 0.606310
 loan_term                   1.007094
 cibil_score                 1.006700
 residential_assets_value    0.000000
 commercial_assets_value     0.666789
 luxury_assets_value         0.557675
 bank_asset_value            0.561730
dtype: float64

##### Zero Commercial Assets

In [439]:
zero_commercial = numerical_df[numerical_df[' commercial_assets_value'] == 0]

zero_commercial_means = zero_commercial.mean()
zc_cibil_avg = zero_commercial_means[" cibil_score"]
total_cibil_avg = total_means[" cibil_score"]

zero_commercial_means / total_means

# 124 rows of zero commercial assets, 73 are approved and 51 are rejected

loan_id                      0.947720
 no_of_dependents            0.856516
 income_annum                0.577286
 loan_amount                 0.551542
 loan_term                   0.994558
 cibil_score                 1.017024
 residential_assets_value    0.657758
 commercial_assets_value     0.000000
 luxury_assets_value         0.572933
 bank_asset_value            0.546473
dtype: float64

##### Zero Bank Assets

In [440]:
zero_bank = df[df[' bank_asset_value'] == 0]

zero_bank_means = zero_bank.mean()
zc_cibil_avg = zero_bank_means[" cibil_score"]
total_cibil_avg = total_means[" cibil_score"]

zero_bank_means / total_means
# 8 rows with no bank assets, 3 were rejected, 5 were approved

  zero_bank_means = zero_bank.mean()


 bank_asset_value            0.000000
 cibil_score                 1.063655
 commercial_assets_value     0.017790
 income_annum                0.039533
 loan_amount                 0.035517
 loan_coll_ratio                  NaN
 loan_income_ratio                NaN
 loan_term                   1.055003
 luxury_assets_value         0.033055
 no_of_dependents            0.600309
 residential_assets_value    0.049016
 total_collateral                 NaN
loan_id                      0.954274
dtype: float64

##### Correlation and P-value

In [441]:

for col in numerical_cols:
    correlation_coefficient, p_value = pearsonr(df[' cibil_score'], df[col])

    print(col)
    print("Pearson correlation coefficient:", correlation_coefficient)
    print("p-value:", p_value)
    print()

loan_id
Pearson correlation coefficient: 0.016323386342227002
p-value: 0.28629286440127394

 no_of_dependents
Pearson correlation coefficient: -0.009998469677432271
p-value: 0.5136912487840075

 income_annum
Pearson correlation coefficient: -0.02303442169986018
p-value: 0.1323826070379217

 loan_amount
Pearson correlation coefficient: -0.017034787023534382
p-value: 0.2658086684763575

 loan_term
Pearson correlation coefficient: 0.007809878429870913
p-value: 0.609955527226155

 cibil_score
Pearson correlation coefficient: 1.0
p-value: 0.0

 residential_assets_value
Pearson correlation coefficient: -0.02414121331360626
p-value: 0.11477332645819846

 commercial_assets_value
Pearson correlation coefficient: 0.002559122155018209
p-value: 0.8672455979184643

 luxury_assets_value
Pearson correlation coefficient: -0.028617627946119942
p-value: 0.06153354602205643

 bank_asset_value
Pearson correlation coefficient: -0.01547827134015627
p-value: 0.31197944849413606

