### Bank Loans test

In [93]:
import numpy as np
import pandas as pd

In [94]:
loans = pd.read_csv('loans.csv', header=0, sep = ',')
pd.options.display.max_columns = 100 # display all columns
loans.head(5)
#loans.describe()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_length,installment_rate,personal_status,other_debtors,residence_history,property,age,installment_plan,housing,existing_credits,default,dependents,telephone,foreign_worker,job
0,< 0 DM,6,critical,radio/tv,1169,unknown,> 7 yrs,4,single male,none,4,real estate,67,none,own,2,1,1,yes,yes,skilled employee
1,1 - 200 DM,48,repaid,radio/tv,5951,< 100 DM,1 - 4 yrs,2,female,none,2,real estate,22,none,own,1,2,1,none,yes,skilled employee
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 yrs,2,single male,none,3,real estate,49,none,own,1,1,2,none,yes,unskilled resident
3,< 0 DM,42,repaid,furniture,7882,< 100 DM,4 - 7 yrs,2,single male,guarantor,4,building society savings,45,none,for free,1,1,2,none,yes,skilled employee
4,< 0 DM,24,delayed,car (new),4870,< 100 DM,1 - 4 yrs,3,single male,none,4,unknown/none,53,none,for free,2,2,2,none,yes,skilled employee


In [95]:
#loans.columns[-1]
print("checking_balance:",loans['checking_balance'].unique())
print("savings_balance:",loans['savings_balance'].unique())
print("months_loan_duration:",loans['months_loan_duration'].unique()) # can be considered continuous
print("credit_history:",loans['credit_history'].unique()) # consolidate: critical, repaid, delayed [labels]
print("purpose:",loans['purpose'].unique())
print("employment_length:",loans['employment_length'].unique()) # encode as numbers from 0 (unemployed) to 4(>7 yrs)
print("installment_rate:",loans['installment_rate'].unique())
print("personal_status:",loans['personal_status'].unique()) # split into gender and personal status
# .... check the rest of the columns
print("jobs:",loans['job'].unique())

checking_balance: ['< 0 DM' '1 - 200 DM' 'unknown' '> 200 DM']
savings_balance: ['unknown' '< 100 DM' '501 - 1000 DM' '> 1000 DM' '101 - 500 DM']
months_loan_duration: [ 6 48 12 42 24 36 30 15  9 10  7 60 18 45 11 27  8 54 20 14 33 21 16  4 47
 13 22 39 28  5 26 72 40]
credit_history: ['critical' 'repaid' 'delayed' 'fully repaid' 'fully repaid this bank']
purpose: ['radio/tv' 'education' 'furniture' 'car (new)' 'car (used)' 'business'
 'domestic appliances' 'repairs' 'others' 'retraining']
employment_length: ['> 7 yrs' '1 - 4 yrs' '4 - 7 yrs' 'unemployed' '0 - 1 yrs']
installment_rate: [4 2 3 1]
personal_status: ['single male' 'female' 'divorced male' 'married male']
jobs: ['skilled employee' 'unskilled resident' 'mangement self-employed'
 'unemployed non-resident']


In [96]:
loans['checking_balance'].value_counts()

unknown       394
< 0 DM        274
1 - 200 DM    269
> 200 DM       63
Name: checking_balance, dtype: int64

In [97]:
loans['savings_balance'].value_counts()

< 100 DM         603
unknown          183
101 - 500 DM     103
501 - 1000 DM     63
> 1000 DM         48
Name: savings_balance, dtype: int64

In [98]:
loans['credit_history'].value_counts()

repaid                    530
critical                  293
delayed                    88
fully repaid this bank     49
fully repaid               40
Name: credit_history, dtype: int64

In [99]:
# parse the personal status into gender and status
# female, male or N(one)
# we already know the options from above, all lower case, so no further text processing
def getGender(txt):
    if "female" in txt:
        return "F"
    if "male" in txt:
        return "M"
    return "N"

# married, single, divorced or U(nknown)
def getStatus(txt):
    if "maried" in txt:
        return "M"
    if "single" in txt:
        return "S"
    if "divorced" in txt:
        return "D"    
    return "U"

def getHistory(txt):
    if "repaid" in txt:
        return 1  
    return 0

In [100]:
loans["gender"] = loans["personal_status"].apply(getGender)
loans["pstatus"] = loans["personal_status"].apply(getStatus)
loans["label"] = loans["credit_history"].apply(getHistory)

In [101]:
loans.head(5)

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_length,installment_rate,personal_status,other_debtors,residence_history,property,age,installment_plan,housing,existing_credits,default,dependents,telephone,foreign_worker,job,gender,pstatus,label
0,< 0 DM,6,critical,radio/tv,1169,unknown,> 7 yrs,4,single male,none,4,real estate,67,none,own,2,1,1,yes,yes,skilled employee,M,S,0
1,1 - 200 DM,48,repaid,radio/tv,5951,< 100 DM,1 - 4 yrs,2,female,none,2,real estate,22,none,own,1,2,1,none,yes,skilled employee,F,U,1
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 yrs,2,single male,none,3,real estate,49,none,own,1,1,2,none,yes,unskilled resident,M,S,0
3,< 0 DM,42,repaid,furniture,7882,< 100 DM,4 - 7 yrs,2,single male,guarantor,4,building society savings,45,none,for free,1,1,2,none,yes,skilled employee,M,S,1
4,< 0 DM,24,delayed,car (new),4870,< 100 DM,1 - 4 yrs,3,single male,none,4,unknown/none,53,none,for free,2,2,2,none,yes,skilled employee,M,S,0


In [102]:
# not sure about existing_credits
dummy_columns = ['gender','checking_balance','purpose','savings_balance','employment_length',
                 'installment_rate','other_debtors','residence_history', 'property','installment_plan',
                'housing','telephone','foreign_worker','job','gender','pstatus']
new_loans = pd.get_dummies(loans, columns=dummy_columns, drop_first=True)
new_loans.head(5)

Unnamed: 0,months_loan_duration,credit_history,amount,personal_status,age,existing_credits,default,dependents,label,gender_M,checking_balance_< 0 DM,checking_balance_> 200 DM,checking_balance_unknown,purpose_car (new),purpose_car (used),purpose_domestic appliances,purpose_education,purpose_furniture,purpose_others,purpose_radio/tv,purpose_repairs,purpose_retraining,savings_balance_501 - 1000 DM,savings_balance_< 100 DM,savings_balance_> 1000 DM,savings_balance_unknown,employment_length_1 - 4 yrs,employment_length_4 - 7 yrs,employment_length_> 7 yrs,employment_length_unemployed,installment_rate_2,installment_rate_3,installment_rate_4,other_debtors_guarantor,other_debtors_none,residence_history_2,residence_history_3,residence_history_4,property_other,property_real estate,property_unknown/none,installment_plan_none,installment_plan_stores,housing_own,housing_rent,telephone_yes,foreign_worker_yes,job_skilled employee,job_unemployed non-resident,job_unskilled resident,gender_M.1,pstatus_S,pstatus_U
0,6,critical,1169,single male,67,2,1,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,1,0,1,0,1,1,1,0,0,1,1,0
1,48,repaid,5951,female,22,1,2,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,1,0,1,0,0,1,1,0,0,0,0,1
2,12,critical,2096,single male,49,1,1,2,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,1,0,0,1,0,0,1,1,1,0
3,42,repaid,7882,single male,45,1,1,2,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0
4,24,delayed,4870,single male,53,2,2,2,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0
