In [94]:
import pandas as pd
import plotly.express as px

In [95]:
from sklearn.preprocessing import OrdinalEncoder

In [96]:
import warnings
warnings.filterwarnings(action='ignore')

In [97]:
pd.set_option('display.max_columns', 500)

---

# <center>**Feature Engineering**</center>

---

In [98]:
data = pd.read_csv('../../data/interim/loan_borrower.csv')

---

In [99]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   loan_id                     100000 non-null  int64  
 1   member_id                   100000 non-null  int64  
 2   date                        100000 non-null  object 
 3   purpose                     100000 non-null  object 
 4   is_joint_application        99029 non-null   float64
 5   loan_amount                 99983 non-null   float64
 6   term                        99983 non-null   object 
 7   term_in_months              99983 non-null   float64
 8   interest_rate               100000 non-null  float64
 9   monthly_payment             100000 non-null  int64  
 10  grade                       100000 non-null  object 
 11  loan_status                 100000 non-null  object 
 12  residential_state           100000 non-null  object 
 13  years_employmen

In [100]:
data.head()

Unnamed: 0,loan_id,member_id,date,purpose,is_joint_application,loan_amount,term,term_in_months,interest_rate,monthly_payment,grade,loan_status,residential_state,years_employment,home_ownership,annual_income,income_verified,dti_ratio,length_credit_history,n_total_credit_lines,n_open_credit_lines,n_open_credit_lines_1_year,revolving_balance,revolving_utilization_rate,n_derogatory_record,n_delinquency_2_years,n_charge_off_1_year,n_inquiries_6_months
0,1888978,2305095,2014-12-10,Debt Consolidation,0.0,25190.0,60 months,60.0,6.25,490,E3,Current,NM,10+ years,rent,56471,1,16.8,6,11,9.0,6,14301,49.02,0,19,10,0
1,1299695,2610493,2014-09-15,Debt Consolidation,0.0,21189.0,60 months,60.0,10.49,455,B3,Current,WA,2-5 years,rent,55038,0,19.99,22,8,7.0,4,18262,72.4,1,0,0,0
2,1875016,2491679,2014-09-11,Debt Consolidation,0.0,29908.0,60 months,60.0,9.11,622,B2,Current,MS,< 1 year,rent,56610,1,14.33,5,8,5.0,5,10799,66.27,0,1,1,0
3,1440478,2092798,2016-04-22,Home Improvement,0.0,13053.0,48 months,48.0,11.89,343,B3,Current,TX,6-9 years,own,54887,1,14.8,12,14,7.0,3,15272,61.05,1,0,0,3
4,1124634,2633077,2016-02-03,Debt Consolidation,0.0,24613.0,60 months,60.0,15.13,587,A3,Current,MA,2-5 years,rent,53522,1,10.14,4,21,19.0,10,19316,56.39,2,14,7,1


---

In [101]:
loan_borrower = data.copy()

### **Actions Required**

- Drop the loan_id, member_id, date, and term columns
- Drop rows where both loan_amount and term_in_months are missing
- Impute missing values in is_joint_application with the mode value
- Impute missing values in n_open_credit_lines with the values from n_open_credit_lines_1_year
- Encode years_employment with OrdinalEncoder
- Encode the rest of the categorical variables with get_dummies

In [102]:
loan_borrower = loan_borrower.drop(columns=['loan_id', 'member_id', 'date', 'term'])

In [103]:
loan_borrower = loan_borrower.loc[~loan_borrower['loan_amount'].isna() & ~loan_borrower['term_in_months'].isna()]

In [104]:
loan_borrower['is_joint_application'].fillna(0, inplace=True)

In [105]:
open_credit_na = loan_borrower.loc[loan_borrower['n_open_credit_lines'].isna(), 'n_open_credit_lines_1_year']
loan_borrower.loc[open_credit_na.index, 'n_open_credit_lines'] = open_credit_na

In [106]:
ordinal_encoder = OrdinalEncoder(categories=[['< 1 year', '1 year', '2-5 years', '6-9 years', '10+ years']])
years_employment = ordinal_encoder.fit_transform(loan_borrower.years_employment.apply(lambda x: [x]).to_list())

In [107]:
loan_borrower.drop(columns='years_employment', inplace=True)

In [108]:
loan_borrower = pd.get_dummies(loan_borrower)

In [109]:
loan_borrower['years_employment'] = pd.Series(years_employment.flatten())

In [110]:
cols = loan_borrower.columns.tolist()
loan_borrower = loan_borrower[cols[:8] + [cols[-1]] + cols[8:-1]]

In [111]:
loan_borrower.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99983 entries, 0 to 99999
Data columns (total 95 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   is_joint_application        99983 non-null  float64
 1   loan_amount                 99983 non-null  float64
 2   term_in_months              99983 non-null  float64
 3   interest_rate               99983 non-null  float64
 4   monthly_payment             99983 non-null  int64  
 5   annual_income               99983 non-null  int64  
 6   income_verified             99983 non-null  int64  
 7   dti_ratio                   99983 non-null  float64
 8   years_employment            99966 non-null  float64
 9   length_credit_history       99983 non-null  int64  
 10  n_total_credit_lines        99983 non-null  int64  
 11  n_open_credit_lines         99983 non-null  float64
 12  n_open_credit_lines_1_year  99983 non-null  int64  
 13  revolving_balance           999

---

### **Load the engineered dataset to the data/processed folder for training model**

In [None]:
loan_borrower.to_csv('../../data/processed/loan_borrower.csv', index=False, mode='w+')