In [256]:
import numpy as np
import pandas as pd

In [257]:
pd.set_option('display.max_columns', 500)

---

In [258]:
data_loan_prod = pd.read_csv('../../data/raw/Loan_Prod.txt', sep='\t')
data_borrower_prod = pd.read_csv('../../data/raw/Borrower_Prod.txt', sep='\t')

---

In [259]:
data_loan_prod.drop(columns='Unnamed: 11', inplace=True)
data_loan_prod.drop(index=3000, inplace=True)
data_borrower_prod.fillna(0, inplace=True)

In [260]:
data_loan_borrower_prod = pd.merge(data_loan_prod, data_borrower_prod, on='memberId')

---

In [261]:
from typing import Optional, Union
def calculate_loan_info(row: pd.Series, info: str) -> Optional[int]:
   """Calculate the missing loan amount or number of terms given its number
   of terms in months or loan amount, respectively, yearly interest rate, and 
   amortized monthly payment.

   - info == 'amount' / 'terms'
   """
   emi = row['monthly_payment']  # amortized equal monthly installment
   r = row['interest_rate'] / 1200  # monthly interest rate

   try:
      if info == 'amount':
         n = row['term_in_months']  # number of terms 
         principal = emi * (1 - (1/(1+r)) ** n) / r  # total loan amount
         return np.round(principal)
      elif info == 'terms':
         p = row['loan_amount']  # total loan amount
         numerator = emi / (emi - p * r)
         terms_in_months = np.log(numerator) / np.log(1 + r)  # number of terms
         return np.round(terms_in_months)
      else:
         print('info has to be either "amount" or "terms". Try again.')
   except RuntimeError or ValueError as error:
      return np.NaN

In [262]:
loan_borrower_prod = data_loan_borrower_prod.copy()

In [263]:
loan_borrower_prod['monthlyPayment'].fillna(444, inplace=True)

**Column Names**

In [264]:
column_names = ['loan_id', 'member_id', 'date', 'purpose', 'is_joint_application', 'loan_amount', 'term', 'interest_rate', 'monthly_payment', 'grade', 'loan_status', 'residential_state', 'years_employment', 'home_ownership', 'annual_income', 'income_verified', 'dti_ratio', 'length_credit_history', 'n_total_credit_lines', 'n_open_credit_lines', 'n_open_credit_lines_1_year', 'revolving_balance', 'revolving_utilization_rate', 'n_derogatory_record', 'n_delinquency_2_years', 'n_charge_off_1_year', 'n_inquiries_6_months']

In [265]:
loan_borrower_prod.columns = column_names

**Columns**

**Strip All Categorical Variables of Whitespaces**

In [266]:
cat_columns = loan_borrower_prod.columns[data_loan_borrower_prod.dtypes == 'O']
for cat_col in cat_columns:
   loan_borrower_prod[cat_col] = loan_borrower_prod[cat_col].str.strip()

In [267]:
loan_borrower_prod['loan_amount'] = loan_borrower_prod['loan_amount'].str.strip().apply(lambda x: np.NaN if x == '' else float(x))

In [268]:
loan_borrower_prod['term_in_months'] = loan_borrower_prod['term'].str.split(' ').str[0].apply(lambda x: int(x) if not pd.isnull(x) else x) 

In [269]:
purpose_space = {'debtconsolidation': 'Debt Consolidation', 'homeimprovement': 'Home Improvement'}
loan_borrower_prod['purpose'] = loan_borrower_prod['purpose'].apply(lambda x: purpose_space[x] if x in purpose_space else x.capitalize())

In [270]:
missing_term = loan_borrower_prod.loc[loan_borrower_prod['term'].isna()].index
missing_amount = loan_borrower_prod.loc[loan_borrower_prod['loan_amount'].isna()].index

In [271]:
loan_info = ['loan_amount', 'term_in_months', 'interest_rate', 'monthly_payment']
loan_borrower_prod.loc[missing_term, 'term_in_months'] = loan_borrower_prod.loc[missing_term, loan_info].apply(lambda row: calculate_loan_info(row, info='terms'), axis=1)
loan_borrower_prod.loc[missing_amount, 'loan_amount'] = loan_borrower_prod.loc[missing_amount, loan_info].apply(lambda row: calculate_loan_info(row, info='amount'), axis=1)

  terms_in_months = np.log(numerator) / np.log(1 + r)  # number of terms


**Filter out records where terms cannot be calculated**

In [272]:
loan_borrower_prod = loan_borrower_prod.loc[~loan_borrower_prod['term_in_months'].isna()]

**Reorder Column Names**

In [273]:
cols = loan_borrower_prod.columns.to_list()
cols = cols[:7] + [cols[-1]] + cols[7:-1]
loan_borrower_prod = loan_borrower_prod[cols]

---

In [274]:
from sklearn.preprocessing import OrdinalEncoder

**Engineer Features for Production Model**

In [275]:
loan_borrower_prod = loan_borrower_prod.drop(columns=['loan_id', 'member_id', 'date', 'term'])

In [276]:
loan_borrower_prod.years_employment.unique()

array(['6-9 years', '2-5 years', '10+ years', '1 year', '< 1 year'],
      dtype=object)

In [277]:
ordinal_encoder = OrdinalEncoder(categories=[['< 1 year', '1 year', '2-5 years', '6-9 years', '10+ years']])
years_employment = ordinal_encoder.fit_transform(loan_borrower_prod.years_employment.apply(lambda x: [x]).to_list())

In [278]:
loan_borrower_prod.drop(columns='years_employment', inplace=True)

In [279]:
loan_borrower_prod = pd.get_dummies(loan_borrower_prod)

In [280]:
loan_borrower_prod['years_employment'] = years_employment.flatten()

In [281]:
cols = loan_borrower_prod.columns.tolist()
loan_borrower_prod = loan_borrower_prod[cols[:8] + [cols[-1]] + cols[8:-1]]

In [284]:
loan_borrower_prod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2992 entries, 0 to 2999
Data columns (total 83 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   is_joint_application        2992 non-null   float64
 1   loan_amount                 2992 non-null   float64
 2   term_in_months              2992 non-null   float64
 3   interest_rate               2992 non-null   float64
 4   monthly_payment             2992 non-null   float64
 5   loan_status                 0 non-null      float64
 6   annual_income               2992 non-null   int64  
 7   income_verified             2992 non-null   int64  
 8   years_employment            2992 non-null   float64
 9   dti_ratio                   2992 non-null   float64
 10  length_credit_history       2992 non-null   int64  
 11  n_total_credit_lines        2992 non-null   int64  
 12  n_open_credit_lines         2992 non-null   int64  
 13  n_open_credit_lines_1_year  2992 

In [None]:
loan_borrower_prod.to_csv('../../data/processed/loan_borrower_prod.csv', index=False, mode='w+')