## Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
import plotly.io as pio

pio.templates.default = 'plotly_white'
pd.set_option('display.float_format', '{:.5f}'.format)

## Read data

In [2]:
df = pd.read_csv("../data/interim/loans_int.csv")


Columns (7) have mixed types.Specify dtype option on import or set low_memory=False.



In [3]:
data_dict = pd.read_csv('../references/data-dictionary.csv')

## Objectives

- Understand the data
- Check data for completeness na's and cells with whitespace and resolve
- Check object data type and resolve

Overall picture:

- Want to answer: How to predict if an individual will default or pay their full loan?
- Model to be used as a tool to support analysts => simple model to use and understand
- Worth assessing the monetary value of model

## Data dictionary - data understanding

- When someone applies for a loan we would not know their loan status! So this would be a type of data leakage and should not be present in the dataset 

- 

In [12]:
df.head()

Unnamed: 0,account_id,installment,loan_amount,interest_rate,term,purpose,issue_date,description,title,home_ownership,...,credit_card_usage,credit_card_balance,total_current_balance,nr_accounts,loan_status,amount_payed,year,district,postcode_district,credit_score
0,125968,829.1,25000.0,11.89,36 months,debt_consolidation,Aug-2009,Due to a lack of personal finance education an...,Debt consolidation for on-time payer,RENT,...,52.1,28854.0,,42.0,Fully Paid,29324.32,2009,Liverpool,L69,710.69972
1,128479,40.5,1200.0,13.11,36 months,debt_consolidation,Mar-2010,"If funded, I would use this loan consolidate t...",zxcvb,OWN,...,40.4,2584.0,,31.0,Fully Paid,1457.31,2010,Redbridge,IG9,416.08693
2,128650,366.86,10800.0,13.57,36 months,debt_consolidation,Nov-2009,I currently have a personal loan with Citifina...,Nicolechr1978,RENT,...,25.6,3511.0,,40.0,Fully Paid,13195.27,2009,Rugby,CV21,354.22883
3,129758,264.11,7200.0,19.05,36 months,debt_consolidation,Aug-2012,"Credit cards are out of here, I am tired of be...",caminijio,RENT,...,90.1,3874.0,154930.0,25.0,Fully Paid,9100.80466,2012,West Lindsey,DN21,697.21416
4,130240,102.92,3000.0,14.26,36 months,credit_card,Sep-2009,I am seeking to refinance a credit account whi...,Rejecting new cardmember agreement,MORTGAGE,...,39.5,4740.0,,23.0,Fully Paid,3703.38,2009,South Derbyshire,DE11,799.4287


In [4]:
for idx in range(data_dict.shape[0]):
    variable = data_dict.iloc[idx,0]
    description = data_dict.iloc[idx,1]
    print(f'{variable}:\n{description}\n\n')

annual_income:
The self-reported annual income provided by the borrower during registration.


delinquency_2y:
The number of 30+ days past-due incidences of delinquency in the borrower's credit file for the past 2 years.


description:
Loan description provided by the borrower.


debt_to_income:
A ratio calculated using the borrower's total monthly debt payments on the total debt obligations, excluding mortgage and the requested loan, divided by the borrower’s self-reported monthly income.


earliest_credit_line:
The month the borrower's earliest reported credit line was opened.


employment_length:
Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.


job_title:
The job title supplied by the borrower when applying for the loan.


home_ownership:
The home ownership status provided by the borrower during registration. The values are: RENT, OWN, MORTGAGE, OTHER.


inquiries_6m:
The number of inquiries in past 6

## Data completeness

In [40]:
df_nan = [col for col in df.columns if df[col].isnull().any()==True]
print(df_nan)

['description', 'title', 'employment_length', 'job_title', 'last_record_months', 'last_delinquency_months', 'last_derog_months', 'credit_card_usage', 'total_current_balance', 'district']


## Data types

In [39]:
df_obj_type = [col for col in df.columns if df[col].dtypes=='object']
print(df_obj_type)

['term', 'purpose', 'issue_date', 'description', 'title', 'home_ownership', 'annual_income', 'employment_length', 'job_title', 'earliest_credit_line', 'loan_status', 'district', 'postcode_district']


## Resolve completeness

- Need to resolve variables with nan in columns
- Intuitively speaking if 

In [57]:
df[['description']].isna().sum()

description    159498
dtype: int64

In [71]:
dict_cols_resolve = {}
dropped = []
impute = []
sample_drop = []

for col in df_nan:
    num_nan = df[[col]].isna().sum()
    total = df.shape[0]
    nan_pct = (num_nan/total * 100).values[0] 
    
    if nan_pct >= 50:
        dropped.append(col)
    elif nan_pct < 50 and nan_pct >=20:
        impute.append(col)
    else:
        sample_drop.append(col)

dict_cols_resolve['Drop'] = dropped
dict_cols_resolve['Impute'] = impute
dict_cols_resolve['Drop Samples'] = sample_drop

print(dict_cols_resolve)

{'Drop': ['description', 'last_record_months', 'last_delinquency_months', 'last_derog_months'], 'Impute': ['total_current_balance'], 'Drop Samples': ['title', 'employment_length', 'job_title', 'credit_card_usage', 'district']}


## Resolve Datatypes