# imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.io as pio

# Make 'plotly_white' the default plotly template.
pio.templates.default = 'plotly_white'
# Display floats in pandas output with five decimal places.
pd.set_option('display.float_format', '{:.5f}'.format)

# read data

In [2]:
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is what triggers the "mixed types"
# DtypeWarning on this file.
df = pd.read_csv("../data/interim/loans_int.csv", low_memory=False)


Columns (7) have mixed types.Specify dtype option on import or set low_memory=False.



# functions to clean data

In [3]:
def lower_case_cols(df):
    """Lower-case every object-dtype column of ``df`` in place.

    Columns are selected with ``select_dtypes(include='object')`` and each one
    is overwritten with the result of ``.str.lower()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    """
    for name in df.select_dtypes(include='object').columns:
        df[name] = df[name].str.lower()

def remove_whitespace(df):
    """Strip leading and trailing whitespace from object-dtype columns in place.

    Columns are selected with ``select_dtypes(include='object')`` and each one
    is overwritten with the result of ``.str.strip()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    """
    for name in df.select_dtypes(include='object').columns:
        df[name] = df[name].str.strip()


def make_col_numeric(df, col):
    """Cast column ``col`` of ``df`` to ``int`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    col : str
        Name of the column to cast.
    """
    as_int = df[col].astype(int)
    df[col] = as_int

def income_to_numeric(annual_income):
    """Convert a raw annual-income value to a float.

    A string is split on whitespace and its last token is parsed, so both
    ``'85000'`` and a value with leading text such as ``'approx 85000'`` are
    accepted.  Missing input gives ``np.nan`` instead of raising.

    Parameters
    ----------
    annual_income : str, number or None
        Raw value from the ``annual_income`` column.

    Returns
    -------
    float
        The parsed income, or ``np.nan`` for blank strings, ``None``, NaN and
        other non-string values that cannot be converted.

    Raises
    ------
    ValueError
        If the last token of a non-blank string is not a number.
    """
    if isinstance(annual_income, str):
        tokens = annual_income.split()
        # A blank string has no token to parse; treat it as missing.
        return float(tokens[-1]) if tokens else np.nan

    # Non-string input: numbers pass through (NaN stays NaN); None and other
    # unconvertible values are reported as missing.
    try:
        return float(annual_income)
    except (TypeError, ValueError):
        return np.nan

def to_datetime(df, col):
    """Parse column ``col`` of ``df`` with ``pd.to_datetime`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    col : str
        Name of the column to parse.
    """
    parsed = pd.to_datetime(df[col])
    df[col] = parsed

    
def categorise_employment_length(employment_length):
    """Map a raw employment-length label to one of five ``EL_*`` buckets.

    Returns ``'EL_10plus'``, ``'EL_less1'``, ``'EL_1to3'``, ``'EL_4to6'`` or
    ``'EL_7to9'``.  Any value that matches none of the known labels yields
    ``None``.
    """
    prefix = 'EL'
    # (raw labels, bucket suffix), checked in order.
    buckets = (
        (('10+ years',), '10plus'),
        (('< 1 year',), 'less1'),
        (('1 year', '2 years', '3 years'), '1to3'),
        (('4 years', '5 years', '6 years'), '4to6'),
        (('7 years', '8 years', '9 years'), '7to9'),
    )
    for labels, suffix in buckets:
        if employment_length in labels:
            return f'{prefix}_{suffix}'
    return None
    
def categorise_home_ownership(home_ownership):
    """Keep ``'mortgage'``, ``'rent'`` and ``'own'``; map anything else to ``'other'``."""
    kept = ('mortgage', 'rent', 'own')
    return home_ownership if home_ownership in kept else 'other'

def categorise_inquiries(inquiries_6m):
    """Bucket an inquiry count into a label.

    Parameters
    ----------
    inquiries_6m : number or missing
        Value from the ``inquiries_6m`` column.

    Returns
    -------
    str or float
        ``'no inquiry'`` for 0, ``'1 inquiry'`` for 1, ``'2+ inquiry'`` for any
        other non-missing value, and ``np.nan`` when the input is missing.
    """
    # A missing count must stay missing: without this guard NaN/None would fall
    # through to the final branch and be labelled '2+ inquiry'.
    if pd.isna(inquiries_6m):
        return np.nan
    if inquiries_6m == 0:
        return 'no inquiry'
    if inquiries_6m == 1:
        return '1 inquiry'
    return '2+ inquiry'
    
def categorise_purpose(purpose):
    """Keep ``'debt_consolidation'`` and ``'credit_card'``; map anything else to ``'other'``."""
    named = ('debt_consolidation', 'credit_card')
    return purpose if purpose in named else 'other'
    
def categorise_term(term):
    """Return the text before the first space in ``term`` as a float."""
    leading, _, _ = term.partition(' ')
    return float(leading)


def make_binary_class(loan_status):
    """Turn a loan status into a binary target.

    ``'fully paid'`` gives 0; ``'charged off'``, ``'late (> 90 days)'`` and
    ``'default'`` give 1; ``'ongoing'`` gives ``np.nan``.  Any other status
    yields ``None``.
    """
    if loan_status == 'fully paid':
        return 0
    if loan_status == 'ongoing':
        return np.nan
    if loan_status in ('charged off', 'late (> 90 days)', 'default'):
        return 1
    return None
    
def create_credit_age(df, col, cutoff):
    """Add a ``credit_age`` column equal to ``cutoff`` minus the year of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    col : str
        Name of a datetime column (accessed through ``.dt.year``).
    cutoff : int
        Year the column's year is subtracted from.
    """
    start_year = df[col].dt.year
    df['credit_age'] = cutoff - start_year


def impute_col(df, col, segment, method):
    """Fill missing values of ``col`` in place, segment by segment.

    Each missing value is first replaced by the ``method`` statistic of the
    rows that share its ``segment`` value.  Anything still missing afterwards
    (for instance when a whole segment has no observed value) is replaced by
    the same statistic computed over the whole column as it was before
    imputation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    col : str
        Column whose missing values are filled.
    segment : str
        Column used to group rows for the first fill.
    method : {'median', 'mean'}
        Statistic used for both fills.

    Raises
    ------
    ValueError
        If ``col`` or ``segment`` is not a column of ``df``, or if ``method``
        is neither ``'median'`` nor ``'mean'``.
    """
    # Test both names explicitly; `col and segment not in df.columns` would
    # only ever check `segment`.
    if col not in df.columns or segment not in df.columns:
        raise ValueError(f'columns not in dataframe, was given {col} {segment}')

    if method not in ('median', 'mean'):
        raise ValueError(f'method takes only "median" or "mean", was given {method}')

    # Column-wide fallback, computed before any value is filled.
    overall_fill = df[col].median() if method == 'median' else df[col].mean()
    segment_fill = df.groupby(segment)[col].transform(method)

    # Assign the result back instead of calling fillna(..., inplace=True) on
    # df[col]: an in-place call on a selected column is not guaranteed to
    # write through to df.
    df[col] = df[col].fillna(segment_fill).fillna(overall_fill)

    return

def drop_na_cols(df, pct_thresh):
    """Drop, in place, columns with too few non-missing values.

    A column is kept only if its count of non-missing values is at least
    ``pct_thresh`` times the number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    pct_thresh : float
        Required fraction of non-missing values per column.
    """
    n_rows = len(df)
    df.dropna(axis=1, thresh=n_rows * pct_thresh, inplace=True)

def drop_na_rows(df):
    """Drop, in place, every row of ``df`` that has at least one missing value."""
    df.dropna(axis='index', how='any', inplace=True)

def drop_cols(df, cols_to_drop):
    """Remove the columns named in ``cols_to_drop`` from ``df`` in place."""
    df.drop(columns=cols_to_drop, inplace=True)

In [4]:
# Column groups used by the cleaning cell; 'drop' lists the columns that are
# passed to drop_cols().
columns = dict(
    drop=[
        'title', 'job_title', 'district',
        'issue_date', 'postcode_district', 'loan_status',
        'year', 'earliest_credit_line', 'amount_payed',
    ],
)

# data cleaning

 - Remove highly correlated variables 
 - To decide which variable of a correlated pair to remove, check its correlation with the other variables
 
 - proceed to train the model via backward/forward selection or recursive feature selection
 - make sure methods are motivated vs other methods

In [5]:
# Rows and columns of the frame as loaded, before any cleaning.
df.shape

(237436, 32)

In [6]:
# NOTE: every step below mutates `df` in place and the cell ends by dropping
# columns it reads earlier (e.g. 'earliest_credit_line', 'loan_status'), so it
# cannot be re-run without reloading the data first.

# Drop columns whose non-missing count is below 50% of the rows.
drop_na_cols(df, 0.5)

# Normalise text first: the categorise_* helpers and make_binary_class compare
# against lower-case literals.
lower_case_cols(df)
remove_whitespace(df)
make_col_numeric(df, 'credit_score')
# Needed as datetime because create_credit_age reads `.dt.year` from it.
to_datetime(df, 'earliest_credit_line')
df.annual_income = df.annual_income.apply(income_to_numeric)

# Collapse raw values into coarser categories / numeric term.
df.employment_length = df.employment_length.apply(categorise_employment_length)
df.home_ownership = df.home_ownership.apply(categorise_home_ownership)
df.inquiries_6m = df.inquiries_6m.apply(categorise_inquiries)
df.purpose = df.purpose.apply(categorise_purpose)
df.term = df.term.apply(categorise_term)

# Must run before drop_cols: 'district' is the grouping column and is dropped below.
impute_col(df, 'total_current_balance', 'district', 'median')

# 2015 is the cutoff year — presumably the snapshot year of the data; confirm.
create_credit_age(df, 'earliest_credit_line', 2015)
# Derive the target before 'loan_status' is dropped.
df['class'] = df.loan_status.apply(make_binary_class)


drop_cols(df, columns['drop'])
# Removes every row with a remaining missing value, which includes 'ongoing'
# loans (class is NaN) and rows a categorise_* helper could not map.
drop_na_rows(df)

In [7]:
# Rows and columns left after the cleaning cell.
df.shape

(222655, 21)

In [8]:
# Columns remaining after cleaning.
df.columns

Index(['account_id', 'installment', 'loan_amount', 'interest_rate', 'term',
       'purpose', 'home_ownership', 'annual_income', 'employment_length',
       'public_records', 'delinquency_2y', 'inquiries_6m', 'open_accounts',
       'debt_to_income', 'credit_card_usage', 'credit_card_balance',
       'total_current_balance', 'nr_accounts', 'credit_score', 'credit_age',
       'class'],
      dtype='object')

In [9]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,account_id,installment,loan_amount,interest_rate,term,purpose,home_ownership,annual_income,employment_length,public_records,...,inquiries_6m,open_accounts,debt_to_income,credit_card_usage,credit_card_balance,total_current_balance,nr_accounts,credit_score,credit_age,class
0,125968,829.1,25000.0,11.89,36.0,debt_consolidation,rent,85000.0,EL_less1,0.0,...,no inquiry,10.0,19.48,52.1,28854.0,99434.5,42.0,710,21,0.0
1,128479,40.5,1200.0,13.11,36.0,debt_consolidation,own,54000.0,EL_10plus,0.0,...,no inquiry,5.0,5.47,40.4,2584.0,71876.0,31.0,416,30,0.0
2,128650,366.86,10800.0,13.57,36.0,debt_consolidation,rent,32000.0,EL_4to6,0.0,...,1 inquiry,14.0,11.63,25.6,3511.0,89230.0,40.0,354,19,0.0
3,129758,264.11,7200.0,19.05,36.0,debt_consolidation,rent,58000.0,EL_7to9,0.0,...,no inquiry,6.0,2.05,90.1,3874.0,154930.0,25.0,697,21,0.0
4,130240,102.92,3000.0,14.26,36.0,credit_card,mortgage,80800.0,EL_1to3,0.0,...,no inquiry,13.0,14.97,39.5,4740.0,87881.0,23.0,799,17,0.0


In [10]:
# dtype of each remaining column.
df.dtypes

account_id                 int64
installment              float64
loan_amount              float64
interest_rate            float64
term                     float64
purpose                   object
home_ownership            object
annual_income            float64
employment_length         object
public_records           float64
delinquency_2y           float64
inquiries_6m              object
open_accounts            float64
debt_to_income           float64
credit_card_usage        float64
credit_card_balance      float64
total_current_balance    float64
nr_accounts              float64
credit_score               int64
credit_age                 int64
class                    float64
dtype: object

In [11]:
# Number of distinct values per column.
df.nunique()

account_id               222655
installment               41930
loan_amount                1320
interest_rate               413
term                          2
purpose                       3
home_ownership                4
annual_income             16439
employment_length             5
public_records               14
delinquency_2y               24
inquiries_6m                  3
open_accounts                58
debt_to_income             3963
credit_card_usage          1193
credit_card_balance       45886
total_current_balance    125006
nr_accounts                 105
credit_score               1464
credit_age                   61
class                         2
dtype: int64

In [12]:
# Columns that still contain at least one missing value; an empty Index means none do.
df.columns[df.isna().any()]

Index([], dtype='object')