In [1]:
import pandas as pd
from scipy.stats import pearsonr
from scipy.stats.mstats import winsorize

##### Load the Data

In [2]:
df = pd.read_csv(r'C:\Users\Blake Dennett\Downloads\Summer2023\loan_approval_dataset.csv')
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [3]:
df.describe()

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


# Data Cleaning

##### Check for any missing values in the data

In [4]:
df.isna().any()
# no missing values

loan_id                      False
 no_of_dependents            False
 education                   False
 self_employed               False
 income_annum                False
 loan_amount                 False
 loan_term                   False
 cibil_score                 False
 residential_assets_value    False
 commercial_assets_value     False
 luxury_assets_value         False
 bank_asset_value            False
 loan_status                 False
dtype: bool

##### Check categorical features

In [5]:
print(df[' loan_status'].value_counts())
print()
print(df[' self_employed'].value_counts())
print()
df[' education'].value_counts()
# no issues found

 Approved    2656
 Rejected    1613
Name:  loan_status, dtype: int64

 Yes    2150
 No     2119
Name:  self_employed, dtype: int64



 Graduate        2144
 Not Graduate    2125
Name:  education, dtype: int64

##### Check for values outside of 3 standard deviations

In [6]:
numerical_df = df.drop(columns=[' loan_status', ' education', ' self_employed'])

std_factor = 3

stdv_df = numerical_df.std()   # creates a series of standard deviations for each column
avg_df = numerical_df.mean()

upper_limits = avg_df + std_factor * stdv_df     # creates a series of upper limits for each column
lower_limits = avg_df - std_factor * stdv_df

numerical_cols = numerical_df.columns

# Create a condition for numeric columns only
condition = (numerical_df[numerical_cols] > upper_limits) | (numerical_df[numerical_cols] < lower_limits)

# Update the values in the original DataFrame 'df' with the capped values
df[numerical_cols] = df[numerical_cols].where(~condition, other=upper_limits, axis=0)

# count the number of capped values
count = condition.sum().sum()

print(count)

# there were 33 outliers in the numerical data, about half of which were residential assets, the other were commercial assets
# the outliers were replaced with the upper limit of the data

33


##### Do a similar thing using the winsorize package

In [7]:
for col in numerical_cols:
    df[col] = winsorize(df[col], limits=(0.03, 0.97))

##### Checking for inappropriate negative values

In [8]:
numerical_df = df.drop(columns=[' loan_status', ' education', ' self_employed'])
numerical_cols = numerical_df.columns

count = 0
for col in numerical_df.columns:
    for row in numerical_df.iterrows():
        if row[1][col] < 0:
            print(row)
            count += 1

df[numerical_cols] = df[numerical_cols].applymap(lambda x: x if x >= 0 else 0)

print(count)

# there were 28 instances of a negative value and all were in the residential asset column, and they were all the exact same value 
# so I am just going to change them all to 0

0


# Analysis and Feature Engineering

##### Total Collateral

In [9]:
collateral_df = df[[' residential_assets_value',  ' commercial_assets_value', ' bank_asset_value', ' luxury_assets_value']]
df[' total_collateral'] = collateral_df.apply(lambda x: x.sum(), axis=1)

##### Loan Collateral Ratio

In [10]:
df[' loan_coll_ratio'] = df[' loan_amount'] / df[' total_collateral']
df[' loan_coll_ratio'].describe()
# lower is better

count    4.269000e+03
mean     6.666667e-01
std      1.110353e-16
min      6.666667e-01
25%      6.666667e-01
50%      6.666667e-01
75%      6.666667e-01
max      6.666667e-01
Name:  loan_coll_ratio, dtype: float64

##### Loan Amount by Income

In [11]:
df[' loan_income_ratio'] = df[' loan_amount'] / df[' income_annum']
df[' loan_income_ratio'].describe()
# lower is better

count    4.269000e+03
mean     2.400000e+00
std      4.441412e-16
min      2.400000e+00
25%      2.400000e+00
50%      2.400000e+00
75%      2.400000e+00
max      2.400000e+00
Name:  loan_income_ratio, dtype: float64

##### Credit Score Odd Values (See Cibil score graph)

In [12]:
rejected_df = df[df[' loan_status'] == ' Rejected']

rejec_cibil_above_550 = rejected_df[rejected_df[' cibil_score'] >= 550]

rejec_cibil_above_550

# there are 13 values where the credit score is above 550 and was still rejected

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,total_collateral,loan_coll_ratio,loan_income_ratio


##### Approvals Below 550 Credit Score

In [13]:
approved_df = df[df[' loan_status'] == ' Approved']
rejected_df = df[df[' loan_status'] == ' Rejected']

aprov_cibil_below_550 = approved_df[approved_df[' cibil_score'] < 550]
rejec_cibil_below_550 = rejected_df[rejected_df[' cibil_score'] < 550]
aprov_cibil_below_550 = aprov_cibil_below_550.drop(columns=[' loan_status', ' education', ' self_employed'])
rejec_cibil_below_550 = rejec_cibil_below_550.drop(columns=[' loan_status', ' education', ' self_employed'])

total_means = numerical_df.mean()
approved_means = aprov_cibil_below_550.mean()
rejected_means = rejec_cibil_below_550.mean()

approved_df / total_means
approved_means - rejected_means

# out of 1785 rows that were below a 550 credit score, only 185 were approved (10.36%)

loan_id                      0.000000e+00
 no_of_dependents            0.000000e+00
 income_annum                0.000000e+00
 loan_amount                 0.000000e+00
 loan_term                   0.000000e+00
 cibil_score                 0.000000e+00
 residential_assets_value    0.000000e+00
 commercial_assets_value     0.000000e+00
 luxury_assets_value         0.000000e+00
 bank_asset_value            0.000000e+00
 total_collateral            0.000000e+00
 loan_coll_ratio             1.110223e-16
 loan_income_ratio           0.000000e+00
dtype: float64

##### Approved Vs. Rejected Loans

In [14]:
rejected_df = df[df[' loan_status'] == ' Rejected']
approved_df = df[df[' loan_status'] == ' Approved']

rejected_df = rejected_df.drop(columns=[' loan_status', ' education', ' self_employed'])
approved_df = approved_df.drop(columns=[' loan_status', ' education', ' self_employed'])

median_rejected = rejected_df.median()
median_approved = approved_df.median()

median_difference = median_approved - median_rejected

median_difference

loan_id                      0.0
 no_of_dependents            0.0
 income_annum                0.0
 loan_amount                 0.0
 loan_term                   0.0
 cibil_score                 0.0
 residential_assets_value    0.0
 commercial_assets_value     0.0
 luxury_assets_value         0.0
 bank_asset_value            0.0
 total_collateral            0.0
 loan_coll_ratio             0.0
 loan_income_ratio           0.0
dtype: float64

##### High Collateral and Income Rejection

In [15]:
odd_point = df[(df[' loan_coll_ratio'] == 0.25) & (df[' loan_income_ratio'] == 1.5)]
odd_point

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,total_collateral,loan_coll_ratio,loan_income_ratio


##### Zero Residential Assets

In [16]:
zero_res = numerical_df[numerical_df[' residential_assets_value'] == 0]
zero_res_means = zero_res.mean()
zero_res_means / total_means

# 89 rows with 0 residential assets value, 33 rejected, 56 approved

loan_id                     NaN
 no_of_dependents           NaN
 income_annum               NaN
 loan_amount                NaN
 loan_term                  NaN
 cibil_score                NaN
 residential_assets_value   NaN
 commercial_assets_value    NaN
 luxury_assets_value        NaN
 bank_asset_value           NaN
dtype: float64

##### Zero Commercial Assets

In [17]:
zero_commercial = numerical_df[numerical_df[' commercial_assets_value'] == 0]

zero_commercial_means = zero_commercial.mean()
zc_cibil_avg = zero_commercial_means[" cibil_score"]
total_cibil_avg = total_means[" cibil_score"]

zero_commercial_means / total_means

# 124 rows of zero commercial assets, 73 are approved and 51 are rejected

loan_id                     NaN
 no_of_dependents           NaN
 income_annum               NaN
 loan_amount                NaN
 loan_term                  NaN
 cibil_score                NaN
 residential_assets_value   NaN
 commercial_assets_value    NaN
 luxury_assets_value        NaN
 bank_asset_value           NaN
dtype: float64

##### Zero Bank Assets

In [18]:
zero_bank = df[df[' bank_asset_value'] == 0]

zero_bank_means = zero_bank.mean()
zc_cibil_avg = zero_bank_means[" cibil_score"]
total_cibil_avg = total_means[" cibil_score"]

zero_bank_means / total_means
# 8 rows with no bank assets, 3 were rejected, 5 were approved

 bank_asset_value            NaN
 cibil_score                 NaN
 commercial_assets_value     NaN
 education                   NaN
 income_annum                NaN
 loan_amount                 NaN
 loan_coll_ratio             NaN
 loan_income_ratio           NaN
 loan_status                 NaN
 loan_term                   NaN
 luxury_assets_value         NaN
 no_of_dependents            NaN
 residential_assets_value    NaN
 self_employed               NaN
 total_collateral            NaN
loan_id                      NaN
dtype: object

##### Correlation and P-value

In [19]:

for col in numerical_cols:
    correlation_coefficient, p_value = pearsonr(df[' cibil_score'], df[col])

    print(col)
    print("Pearson correlation coefficient:", correlation_coefficient)
    print("p-value:", p_value)
    print()

loan_id
Pearson correlation coefficient: nan
p-value: nan

 no_of_dependents
Pearson correlation coefficient: nan
p-value: nan

 income_annum
Pearson correlation coefficient: nan
p-value: nan

 loan_amount
Pearson correlation coefficient: nan
p-value: nan

 loan_term
Pearson correlation coefficient: nan
p-value: nan

 cibil_score
Pearson correlation coefficient: nan
p-value: nan

 residential_assets_value
Pearson correlation coefficient: nan
p-value: nan

 commercial_assets_value
Pearson correlation coefficient: nan
p-value: nan

 luxury_assets_value
Pearson correlation coefficient: nan
p-value: nan

 bank_asset_value
Pearson correlation coefficient: nan
p-value: nan



