In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [2]:
app_df=pd.read_csv('applicant.csv')
loan_df=pd.read_csv('loan.csv')

In [3]:
loandata_df=pd.merge(app_df,loan_df,on='applicant_id')

# Data Cleaning, dealing with null values

In [4]:
loandata_df.isnull().sum()

applicant_id                                                  0
Primary_applicant_age_in_years                                0
Gender                                                        0
Marital_status                                                0
Number_of_dependents                                          0
Housing                                                       0
Years_at_current_residence                                    0
Employment_status                                             0
Has_been_employed_for_at_least                               62
Has_been_employed_for_at_most                               253
Telephone                                                   596
Foreign_worker                                                0
Savings_account_balance                                     183
Balance_in_existing_bank_account_(lower_limit_of_bucket)    668
Balance_in_existing_bank_account_(upper_limit_of_bucket)    457
loan_application_id                     

### Columns with 50% or more null data will be dropped

In [5]:
loandata_df.drop('Other_EMI_plans', axis='columns',inplace=True)

In [15]:
loandata_df.drop('Telephone',axis='columns',inplace=True)

In [17]:
loandata_df.drop(['Balance_in_existing_bank_account_(lower_limit_of_bucket)','Balance_in_existing_bank_account_(upper_limit_of_bucket)'],
                 axis='columns',inplace=True)

### Preprocessing Columns with String inputs which should be read as float inputs

In [7]:
def remove_years(string):
    if type(string) is float:
        return np.nan
    return int(string.split()[0])

In [8]:
loandata_df['Has_been_employed_for_at_least']=loandata_df['Has_been_employed_for_at_least'].apply(func=remove_years)

In [9]:
loandata_df['Has_been_employed_for_at_most']=loandata_df['Has_been_employed_for_at_most'].apply(func=remove_years)

### Imputing Data for Columns with very few null values

In [11]:
loandata_df['Has_been_employed_for_at_least'].fillna(loandata_df['Has_been_employed_for_at_least'].median(),inplace=True)

In [12]:
loandata_df['Has_been_employed_for_at_most'].fillna(loandata_df['Has_been_employed_for_at_most'].median(),inplace=True)

In [13]:
loandata_df['Savings_account_balance'].fillna(loandata_df['Savings_account_balance'].mode(),inplace=True)

In [16]:
loandata_df['Property'].fillna(loandata_df['Property'].mode().iloc[0],inplace=True)

### Removing Rows where Purpose is a null value (only 12 rows)

In [14]:
loandata_df=loandata_df[loandata_df['Purpose'].notna()]

### Removing remaining null values 

 As this does not cause a class imbalance in the target variable, with the ratio of defaulters to non defaulters roughly the same as the original dataset, this is an acceptable view of the data for analysis

In [None]:
loandata_df.dropna(inplace=True)

In [19]:
loandata_df.isnull().sum()

applicant_id                                   0
Primary_applicant_age_in_years                 0
Gender                                         0
Marital_status                                 0
Number_of_dependents                           0
Housing                                        0
Years_at_current_residence                     0
Employment_status                              0
Has_been_employed_for_at_least                 0
Has_been_employed_for_at_most                  0
Foreign_worker                                 0
Savings_account_balance                        0
loan_application_id                            0
Months_loan_taken_for                          0
Purpose                                        0
Principal_loan_amount                          0
EMI_rate_in_percentage_of_disposable_income    0
Property                                       0
Has_coapplicant                                0
Has_guarantor                                  0
Number_of_existing_l

# Pre-processing for categorical variables

In [21]:
y=loandata_df.high_risk_applicant
X=loandata_df.drop(['applicant_id','loan_application_id','high_risk_applicant'],axis='columns')

In [22]:
ohe=OneHotEncoder()

In [23]:
col_trans=make_column_transformer( (OneHotEncoder(),['Gender','Marital_status','Housing','Employment_status',
                                                     'Savings_account_balance','Purpose',
                                                     'Property','Loan_history']) 
                                                      , remainder='passthrough')

In [24]:
X_t=col_trans.fit_transform(X)

# Training a regularized Logistic Regression Model

The Rationale behind using a regularized Logistic Regression framework is that the regularization will force most coefficients associated with the features will be close to 0, but the features that strongly affect the target value will have high or low coeffecients, so its easy to segment which features are more important 

In [25]:
logreg_ridged=RidgeClassifier(class_weight={0:0.2,1:0.8})

In [26]:
logreg_ridged.fit(X_t,y)

In [27]:
logreg_ridged.coef_

array([[-1.52164349e-02,  1.52164349e-02,  1.94472227e-01,
        -1.52164349e-02, -1.54517835e-02, -1.63804008e-01,
         3.53400383e-02, -9.54087093e-02,  6.00686710e-02,
         3.36381042e-03,  2.22720373e-02, -1.06802757e-01,
         8.11669096e-02, -4.66588408e-02,  2.47560639e-01,
         4.44185093e-02, -2.45320307e-01, -6.71116429e-04,
        -9.35323557e-02, -2.59845122e-01,  1.11223158e-01,
         3.25079384e-01, -1.23712828e-01,  2.26246533e-01,
         2.06712387e-01, -3.91500040e-01,  5.85460253e-02,
         2.15197038e-02, -8.00657291e-02,  3.31801865e-01,
        -4.27595023e-01, -5.21889637e-02, -4.56649591e-02,
         1.93647081e-01, -6.46330898e-03,  9.25485783e-02,
         1.58657204e-02, -3.39625190e-03, -5.08725383e-02,
         3.19839361e-01,  1.57897798e-02,  2.55646708e-08,
         9.75400584e-02,  2.44600292e-01, -2.09427329e-01,
         1.10661293e-01]])

In [35]:
ranked_features=np.argsort(logreg_ridged.coef_)

In [39]:
ranked_features

array([[30, 25, 19, 16, 44,  5, 22, 11,  7, 18, 28, 31, 38, 13, 32,  4,
         3,  0, 34, 37, 17, 41,  9,  1, 40, 36, 27, 10,  6, 15, 26,  8,
        12, 35, 42, 45, 20, 33,  2, 24, 23, 43, 14, 39, 21, 29]],
      dtype=int64)

### Top 3 segments

In [44]:
col_trans.get_feature_names_out()[29]

'onehotencoder__Loan_history_all loans at this bank paid back duly'

In [45]:
col_trans.get_feature_names_out()[21]

'onehotencoder__Purpose_education'

In [46]:
col_trans.get_feature_names_out()[39]

'remainder__Foreign_worker'

### Bottom 3 segments

In [49]:
col_trans.get_feature_names_out()[30]

'onehotencoder__Loan_history_critical/pending loans at other banks'

In [50]:
col_trans.get_feature_names_out()[25]

'onehotencoder__Purpose_used vehicle'

In [51]:
col_trans.get_feature_names_out()[19]

'onehotencoder__Purpose_career development'

Therefore, top 3 segments who would be considered credit worthy would be: 1) customers who have paid all their loans at the bank they're taking the loan from, 2) If the loan is for education purposes and 3) If they are a foreign Worker.

Whereas, segments that would not be deemed credit worthy would be, 1) customers who have critical or pending loans at other banks, 2) If the loan is for a used Vehicle, 3) if the loan is for the purposes of career development