In [1]:
import numpy as np
import pandas as pd

In [22]:
pd.set_option('display.max_columns', 500)

---

In [2]:
data_loan_prod = pd.read_csv('../../data/raw/Loan_Prod.txt', sep='\t')
data_borrower_prod = pd.read_csv('../../data/raw/Borrower_Prod.txt', sep='\t')

---

In [4]:
data_loan_prod.drop(columns='Unnamed: 11', inplace=True)
data_loan_prod.drop(index=3000, inplace=True)
data_borrower_prod.fillna(0, inplace=True)

In [19]:
data_loan_borrower_prod = pd.merge(data_loan_prod, data_borrower_prod, on='memberId')

---

In [26]:
from typing import Optional
def calculate_loan_info(row: pd.Series, info: str) -> Optional[int]:
   """Calculate the missing loan amount or number of terms given its number
   of terms in months or loan amount, respectively, yearly interest rate, and 
   amortized monthly payment.

   - info == 'amount' / 'terms'
   """
   emi = row['monthly_payment']  # amortized equal monthly installment
   r = row['interest_rate'] / 1200  # monthly interest rate

   if info == 'amount':
      n = row['term_in_months']  # number of terms 
      principal = emi * (1 - (1/(1+r)) ** n) / r  # total loan amount
      return np.round(principal)
   elif info == 'terms':
      p = row['loan_amount']  # total loan amount
      numerator = emi / (emi - p * r)
      terms_in_months = np.log(numerator) / np.log(1 + r)  # number of terms
      return np.round(terms_in_months)
   else:
      print('info has to be either "amount" or "terms". Try again.')

In [20]:
loan_borrower_prod = data_loan_borrower_prod.copy()

In [31]:
loan_borrower_prod['monthlyPayment'].fillna(444, inplace=True)

In [34]:
column_names = ['loan_id', 'member_id', 'date', 'purpose', 'is_joint_application', 'loan_amount', 'term', 'interest_rate', 'monthly_payment', 'grade', 'loan_status', 'residential_state', 'years_employment', 'home_ownership', 'annual_income', 'income_verified', 'dti_ratio', 'length_credit_history', 'n_total_credit_lines', 'n_open_credit_lines', 'n_open_credit_lines_1_year', 'revolving_balance', 'revolving_utilization_rate', 'n_derogatory_record', 'n_delinquency_2_years', 'n_charge_off_1_year', 'n_inquiries_6_months']

In [35]:
loan_borrower_prod.columns = column_names

In [39]:
loan_borrower_prod['term_in_months'] = loan_borrower_prod['term'].str.split(' ').str[0].apply(lambda x: int(x) if not pd.isnull(x) else x) 

In [41]:
purpose_space = {'debtconsolidation': 'Debt Consolidation', 'homeimprovement': 'Home Improvement'}
loan_borrower_prod['purpose'] = loan_borrower_prod['purpose'].apply(lambda x: purpose_space[x] if x in purpose_space else x.capitalize())

In [42]:
loan_borrower_prod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 2999
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_id                     3000 non-null   float64
 1   member_id                   3000 non-null   float64
 2   date                        3000 non-null   object 
 3   purpose                     3000 non-null   object 
 4   is_joint_application        3000 non-null   float64
 5   loan_amount                 3000 non-null   object 
 6   term                        2976 non-null   object 
 7   interest_rate               3000 non-null   float64
 8   monthly_payment             3000 non-null   float64
 9   grade                       3000 non-null   object 
 10  loan_status                 0 non-null      float64
 11  residential_state           3000 non-null   object 
 12  years_employment            3000 non-null   object 
 13  home_ownership              3000 

In [45]:
missing_term = loan_borrower_prod.loc[loan_borrower_prod['term'].isna()].index

In [46]:
loan_info = ['loan_amount', 'term_in_months', 'interest_rate', 'monthly_payment']
loan_borrower_prod.loc[missing_term, 'term_in_months'] = loan_borrower_prod.loc[missing_term, loan_info].apply(lambda row: calculate_loan_info(row, info='terms'), axis=1)

TypeError: can't multiply sequence by non-int of type 'float'