<h2>Problem Statement</h2>

<b>Can we predict if an applicant is approved for a loan, and if so, what factors influence loan approval?</b>

<h2>EDA</h2>

<h3>Cleaning & Pre-Processing</h3>
<i>All of these steps should've been done in the last assignment - sorry for doing them here!</i>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, FunctionTransformer, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Evaluation metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

# For combining pipelines after encoding
from sklearn.compose import make_column_selector as selector

sns.set(style="whitegrid")

In [2]:
df = pd.read_csv('playground-series-s4e10/train.csv')
df.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [3]:
# Renaming columns for easier understanding
df = df.rename(columns={'cb_person_default_on_file': 'history_of_default', 'cb_person_cred_hist_length': 'cred_hist_length'})
df.head() # Confirming changes

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,history_of_default,cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [4]:
df.describe()

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cred_hist_length,loan_status
count,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0
mean,29322.0,27.550857,64046.17,4.701015,9217.556518,10.677874,0.159238,5.813556,0.142382
std,16929.497605,6.033216,37931.11,3.959784,5563.807384,3.034697,0.091692,4.029196,0.349445
min,0.0,20.0,4200.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,14661.0,23.0,42000.0,2.0,5000.0,7.88,0.09,3.0,0.0
50%,29322.0,26.0,58000.0,4.0,8000.0,10.75,0.14,4.0,0.0
75%,43983.0,30.0,75600.0,7.0,12000.0,12.99,0.21,8.0,0.0
max,58644.0,123.0,1900000.0,123.0,35000.0,23.22,0.83,30.0,1.0


In [5]:
# From the correlation map from last assignment, person_age and credit_hist_length have a high correlation (> 0.88)
# This essentially shows us that these predictors measure the same thing. I will drop person_age
# Since credit_hist_length will be more directly related to loan decisions
df = df.drop(['person_age'], axis=1)

In [6]:
# Using Z-Scores to remove outliers
# These predictors are most likely to have outliers and affect the loan status
from scipy import stats
z_emp_length = np.abs(stats.zscore(df['person_emp_length']))
print(f'Z-Scores of Employment Length:\n {z_emp_length} \n')

df['z_emp_length'] = z_emp_length
df.head() # Confirming changes

Z-Scores of Employment Length:
 0        1.187200
1        0.328047
2        0.833130
3        2.348377
4        0.682117
           ...   
58640    0.075506
58641    1.187200
58642    0.580588
58643    0.682117
58644    0.682117
Name: person_emp_length, Length: 58645, dtype: float64 



Unnamed: 0,id,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,history_of_default,cred_hist_length,loan_status,z_emp_length
0,0,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0,1.1872
1,1,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0,0.328047
2,2,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0,0.83313
3,3,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0,2.348377
4,4,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0,0.682117


In [7]:
# +-3.00 of a Z-score is a good indicator for outliers
print(df.loc[df['z_emp_length'] >= 3.00]) # 627 rows
print(df.loc[df['z_emp_length'] <= -3.00]) # 0 rows

          id  person_income person_home_ownership  person_emp_length  \
86        86         112000              MORTGAGE               21.0   
104      104          46000              MORTGAGE               17.0   
184      184          65000              MORTGAGE               21.0   
222      222          82000              MORTGAGE               21.0   
278      278          85000              MORTGAGE               18.0   
...      ...            ...                   ...                ...   
58372  58372          96000                   OWN               17.0   
58421  58421          78000              MORTGAGE               17.0   
58460  58460          95000              MORTGAGE               17.0   
58527  58527         120000              MORTGAGE               17.0   
58609  58609          43000                  RENT               18.0   

             loan_intent loan_grade  loan_amnt  loan_int_rate  \
86               MEDICAL          A       5000           7.90   
104  

In [8]:
# Dropping outliers by keeping rows with Z-scores with < 3.00
df = df.loc[(df['z_emp_length'] < 3.00)]
df.shape # Confirming rows were dropped

(58018, 13)

In [9]:
df.describe(include='all')

Unnamed: 0,id,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,history_of_default,cred_hist_length,loan_status,z_emp_length
count,58018.0,58018.0,58018,58018.0,58018,58018,58018.0,58018.0,58018.0,58018,58018.0,58018.0,58018.0
unique,,,4,,6,7,,,,2,,,
top,,,RENT,,EDUCATION,A,,,,N,,,
freq,,,30429,,12200,20721,,,,49398,,,
mean,29302.941311,63904.98,,4.537575,,,9208.954583,10.681245,0.159392,,5.74794,0.142507,0.73698
std,16926.086284,37783.95,,3.58839,,,5551.280454,3.030996,0.091658,,3.975145,0.349573,0.528945
min,0.0,4200.0,,0.0,,,500.0,5.42,0.0,,2.0,0.0,0.075506
25%,14636.25,42000.0,,2.0,,,5000.0,7.88,0.09,,3.0,0.0,0.328047
50%,29302.5,58000.0,,4.0,,,8000.0,10.75,0.14,,4.0,0.0,0.682117
75%,43952.75,75000.0,,7.0,,,12000.0,12.99,0.21,,8.0,0.0,1.085671


In [10]:
# person_income is right skewed, so using a log transformation with make it normally distributed
df['log_person_income'] = np.log1p(df['person_income'])

In [12]:
target_cols = ['loan_status']
features = [col for col in df.columns if col not in target_cols]

cat_features = [col for col in features if df[col].dtype == "object"]
num_features = [col for col in features if df[col].dtype != "object"]

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", RobustScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ],
    sparse_threshold=0
)

X_processed = preprocessor.fit_transform(df[features])
new_feature_names = (
    num_features + 
    list(preprocessor.named_transformers_['cat'].get_feature_names_out(cat_features))
)
df_transformed = pd.DataFrame(X_processed, columns=new_feature_names)

print(df_transformed.head())

         id  person_income  person_emp_length  loan_amnt  loan_int_rate  \
0 -0.999522      -0.696970               -0.8  -0.285714       0.144814   
1 -0.999488      -0.060606                0.4  -0.571429       0.508806   
2 -0.999454      -0.884848                0.8  -0.285714      -0.362035   
3 -0.999420       0.363636                2.0   0.571429       0.070450   
4 -0.999386       0.060606               -0.4  -0.285714      -0.749511   

   loan_percent_income  cred_hist_length  z_emp_length  log_person_income  \
0             0.250000               2.0      0.666667          -0.871122   
1            -0.583333              -0.4     -0.467343          -0.060521   
2             0.583333               1.2      0.199324          -1.207383   
3             0.250000               0.2      2.199324           0.324330   
4            -0.333333              -0.2      0.000000           0.058469   

   person_home_ownership_MORTGAGE  ...  loan_intent_VENTURE  loan_grade_A  \
0        