In [302]:
import numpy as np
import pandas as pd

In [303]:
pd.set_option('display.max_columns', 500)

---

In [304]:
data_loan_prod = pd.read_csv('../../data/raw/Loan_Prod.txt', sep='\t')
data_borrower_prod = pd.read_csv('../../data/raw/Borrower_Prod.txt', sep='\t')

---

In [305]:
data_loan_prod.drop(columns='Unnamed: 11', inplace=True)
data_loan_prod.drop(index=3000, inplace=True)
data_borrower_prod.fillna(0, inplace=True)

In [306]:
data_loan_borrower_prod = pd.merge(data_loan_prod, data_borrower_prod, on='memberId')

---

In [307]:
from typing import Optional, Union
def calculate_loan_info(row: pd.Series, info: str) -> Optional[int]:
   """Calculate the missing loan amount or number of terms given its number
   of terms in months or loan amount, respectively, yearly interest rate, and 
   amortized monthly payment.

   - info == 'amount' / 'terms'
   """
   emi = row['monthly_payment']  # amortized equal monthly installment
   r = row['interest_rate'] / 1200  # monthly interest rate

   try:
      if info == 'amount':
         n = row['term_in_months']  # number of terms 
         principal = emi * (1 - (1/(1+r)) ** n) / r  # total loan amount
         return np.round(principal)
      elif info == 'terms':
         p = row['loan_amount']  # total loan amount
         numerator = emi / (emi - p * r)
         terms_in_months = np.log(numerator) / np.log(1 + r)  # number of terms
         return np.round(terms_in_months)
      else:
         print('info has to be either "amount" or "terms". Try again.')
   except RuntimeError or ValueError as error:
      return np.NaN

In [308]:
loan_borrower_prod = data_loan_borrower_prod.copy()

In [309]:
loan_borrower_prod['monthlyPayment'].fillna(444, inplace=True)

**Column Names**

In [310]:
column_names = ['loan_id', 'member_id', 'date', 'purpose', 'is_joint_application', 'loan_amount', 'term', 'interest_rate', 'monthly_payment', 'grade', 'loan_status', 'residential_state', 'years_employment', 'home_ownership', 'annual_income', 'income_verified', 'dti_ratio', 'length_credit_history', 'n_total_credit_lines', 'n_open_credit_lines', 'n_open_credit_lines_1_year', 'revolving_balance', 'revolving_utilization_rate', 'n_derogatory_record', 'n_delinquency_2_years', 'n_charge_off_1_year', 'n_inquiries_6_months']

In [311]:
loan_borrower_prod.columns = column_names

**Columns**

**Strip All Categorical Variables of Whitespaces**

In [312]:
cat_columns = loan_borrower_prod.columns[data_loan_borrower_prod.dtypes == 'O']
for cat_col in cat_columns:
   loan_borrower_prod[cat_col] = loan_borrower_prod[cat_col].str.strip()

In [313]:
loan_borrower_prod['loan_amount'] = loan_borrower_prod['loan_amount'].str.strip().apply(lambda x: np.NaN if x == '' else float(x))

In [314]:
loan_borrower_prod['term_in_months'] = loan_borrower_prod['term'].str.split(' ').str[0].apply(lambda x: int(x) if not pd.isnull(x) else x) 

In [315]:
purpose_space = {'debtconsolidation': 'Debt Consolidation', 'homeimprovement': 'Home Improvement'}
loan_borrower_prod['purpose'] = loan_borrower_prod['purpose'].apply(lambda x: purpose_space[x] if x in purpose_space else x.capitalize())

In [316]:
missing_term = loan_borrower_prod.loc[loan_borrower_prod['term'].isna()].index
missing_amount = loan_borrower_prod.loc[loan_borrower_prod['loan_amount'].isna()].index

In [317]:
loan_info = ['loan_amount', 'term_in_months', 'interest_rate', 'monthly_payment']
loan_borrower_prod.loc[missing_term, 'term_in_months'] = loan_borrower_prod.loc[missing_term, loan_info].apply(lambda row: calculate_loan_info(row, info='terms'), axis=1)
loan_borrower_prod.loc[missing_amount, 'loan_amount'] = loan_borrower_prod.loc[missing_amount, loan_info].apply(lambda row: calculate_loan_info(row, info='amount'), axis=1)

  terms_in_months = np.log(numerator) / np.log(1 + r)  # number of terms


**Filter out records where terms cannot be calculated**

In [318]:
loan_borrower_prod = loan_borrower_prod.loc[~loan_borrower_prod['term_in_months'].isna()]

**Reorder Column Names**

In [319]:
cols = loan_borrower_prod.columns.to_list()
cols = cols[:7] + [cols[-1]] + cols[7:-1]
loan_borrower_prod = loan_borrower_prod[cols]

In [320]:
loan_borrower_prod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2992 entries, 0 to 2999
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_id                     2992 non-null   float64
 1   member_id                   2992 non-null   float64
 2   date                        2992 non-null   object 
 3   purpose                     2992 non-null   object 
 4   is_joint_application        2992 non-null   float64
 5   loan_amount                 2992 non-null   float64
 6   term                        2976 non-null   object 
 7   term_in_months              2992 non-null   float64
 8   interest_rate               2992 non-null   float64
 9   monthly_payment             2992 non-null   float64
 10  grade                       2992 non-null   object 
 11  loan_status                 0 non-null      float64
 12  residential_state           2992 non-null   object 
 13  years_employment            2992 

In [326]:
missing_term = loan_borrower_prod[loan_borrower_prod['term'].isna()].index
loan_borrower_prod.loc[missing_term, 'term'] = loan_borrower_prod.loc[missing_term, 'term_in_months'].apply(lambda x: f'{int(x)} months')

In [328]:
loan_borrower_prod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2992 entries, 0 to 2999
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_id                     2992 non-null   float64
 1   member_id                   2992 non-null   float64
 2   date                        2992 non-null   object 
 3   purpose                     2992 non-null   object 
 4   is_joint_application        2992 non-null   float64
 5   loan_amount                 2992 non-null   float64
 6   term                        2992 non-null   object 
 7   term_in_months              2992 non-null   float64
 8   interest_rate               2992 non-null   float64
 9   monthly_payment             2992 non-null   float64
 10  grade                       2992 non-null   object 
 11  loan_status                 0 non-null      float64
 12  residential_state           2992 non-null   object 
 13  years_employment            2992 

In [329]:
loan_borrower_prod.drop(columns='term_in_months', inplace=True)

In [330]:
loan_borrower_prod.head()

Unnamed: 0,loan_id,member_id,date,purpose,is_joint_application,loan_amount,term,interest_rate,monthly_payment,grade,loan_status,residential_state,years_employment,home_ownership,annual_income,income_verified,dti_ratio,length_credit_history,n_total_credit_lines,n_open_credit_lines,n_open_credit_lines_1_year,revolving_balance,revolving_utilization_rate,n_derogatory_record,n_delinquency_2_years,n_charge_off_1_year,n_inquiries_6_months
0,10000000.0,16334480.0,7/2/2016,Debt Consolidation,0.0,23765.0,60 months,9.38,498.0,D1,,IL,6-9 years,mortgage,50975,0,17.71,1,11,9,5,15868,83.01,0,0,5,7.0
1,10000001.0,16334481.0,7/3/2016,Debt Consolidation,0.0,24302.0,60 months,6.84,479.0,C3,,TN,2-5 years,mortgage,51887,1,23.84,5,19,16,12,12709,65.63,0,0,14,0.0
2,10000002.0,16334482.0,7/4/2016,Debt Consolidation,0.0,18395.0,60 months,15.67,444.0,E1,,PA,10+ years,mortgage,53254,1,22.53,2,22,19,12,16928,78.35,0,20,0,6.0
3,10000003.0,16334483.0,7/5/2016,Debt Consolidation,0.0,19621.0,48 months,7.48,474.0,A1,,TX,1 year,own,59643,1,15.26,6,10,7,6,17301,61.74,0,0,12,0.0
4,10000004.0,16334484.0,7/6/2016,Debt Consolidation,0.0,20577.0,48 months,9.26,515.0,E3,,CA,< 1 year,mortgage,60630,1,9.67,4,16,15,10,16655,64.82,0,7,0,0.0


In [None]:
loan_borrower_prod.to_csv('../../data/interim/loan_borrower_prod.csv', index=False, mode='w+')

---