# Data Preparation

## Import Libraries

In [None]:
import numpy as np
import pandas as pd

## Import Data

In [None]:
# Read the raw loan CSV into a backup frame; the working frame is copied from it afterwards.
loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')

In [None]:
# Work on a copy so the frame read from disk (loan_data_backup) stays unmodified.
loan_data = loan_data_backup.copy()

## Explore Data

In [None]:
# Display the working DataFrame with the current display options.
loan_data

In [None]:
# Remove the limit on displayed columns so wide frames are not truncated.
pd.options.display.max_columns = None

In [None]:
# Display the DataFrame again, now that the column display limit has been removed.
loan_data

In [None]:
# First rows of the data.
loan_data.head()

In [None]:
# Last rows of the data.
loan_data.tail()

In [None]:
# Array of all column names.
loan_data.columns.values

In [None]:
# Print a per-column summary (dtype and non-null count) of the frame.
loan_data.info()

## General Preprocessing

### Preprocessing a few continuous variables

In [None]:
# Distinct raw values of emp_length, to see which text patterns have to be cleaned.
loan_data['emp_length'].unique()

In [None]:
# Reduce the textual employment length to a string that holds only the number of years.
# Every pattern is a plain substring, so regex = False is passed explicitly: the result no
# longer depends on the pandas default for `regex` (which differs between versions), and the
# literal '+' needs no escaping.
# Order matters: '+ years' and ' years' must be removed before the shorter ' year'.
loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('+ years', '', regex = False)
# "Less than one year" and the textual 'n/a' marker are both mapped to 0 years.
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0), regex = False)
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a',  str(0), regex = False)
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '', regex = False)
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '', regex = False)

In [None]:
# Python type of the value at label 0, checked before the column is passed to pd.to_numeric.
type(loan_data['emp_length_int'][0])

In [None]:
# Convert the cleaned employment-length strings to a numeric dtype.
loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])

In [None]:
# Type of the value at label 0 again, to confirm the numeric conversion took effect.
type(loan_data['emp_length_int'][0])

In [None]:
# Raw earliest_cr_line values, to see the text format before parsing them as dates.
loan_data['earliest_cr_line']

In [None]:
# Parse the text as dates using abbreviated month name + two-digit year ('%b-%y').
# NOTE(review): '%y' carries no century, so credit lines opened many decades ago are presumably
# parsed into the future (20xx instead of 19xx); the later check for negative month counts
# suggests this does happen in the data - confirm and consider correcting the century here.
loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')

In [None]:
# Type of the parsed value at label 0, to confirm the column now holds datetime values.
type(loan_data['earliest_cr_line_date'][0])

In [None]:
# Preview: time elapsed between the fixed reference date 2017-12-01 and each earliest credit line.
pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']

In [None]:
# Months elapsed between the fixed reference date 2017-12-01 and the earliest credit line,
# rounded to whole months.
# One "month" is the average Gregorian month (365.2425 / 12 = 30.436875 days), i.e. the same
# length NumPy assigns to np.timedelta64(1, 'M'). An explicit pd.Timedelta is used as the divisor
# because recent pandas versions reject the ambiguous 'M' unit in timedelta arithmetic.
loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / pd.Timedelta(days = 365.2425 / 12)))

In [None]:
# Summary statistics of the new column; a negative minimum would indicate dates parsed into the future.
loan_data['mths_since_earliest_cr_line'].describe()

In [None]:
# Show the raw text, the parsed date and the month count for every row whose month count is
# negative, selecting rows and columns in a single .loc call.
loan_data.loc[loan_data['mths_since_earliest_cr_line'] < 0,
              ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']]

In [None]:
# Replace negative month counts with the largest observed month count.
# A single .loc[row_mask, column] assignment writes into loan_data itself; the chained form
# loan_data[col][mask] = ... assigns to an intermediate object, which triggers
# SettingWithCopyWarning and does not update the frame when pandas Copy-on-Write is active.
loan_data.loc[loan_data['mths_since_earliest_cr_line'] < 0, 'mths_since_earliest_cr_line'] = loan_data['mths_since_earliest_cr_line'].max()

In [None]:
# Smallest month count after the correction (should no longer be negative).
# Series.min() is vectorised and skips missing values, whereas the builtin min() iterates
# element by element and gives an order-dependent answer when NaN is present.
loan_data['mths_since_earliest_cr_line'].min()

### Homework

In [None]:
# Raw term values, to see the text format before cleaning.
loan_data['term']

In [None]:
# Summary of the term column (count, number of distinct values, most frequent value).
loan_data['term'].describe()

In [None]:
# Drop the literal ' months' suffix so that only the number of months remains (still as text).
loan_data['term_int'] = loan_data['term'].str.replace(' months', '', regex = False)

In [None]:
# Inspect the stripped term values.
loan_data['term_int']

In [None]:
# Type of the value at label 25 after stripping ' months', checked before numeric conversion.
type(loan_data['term_int'][25])

In [None]:
# Rebuild term_int from the raw column: strip the literal ' months' suffix, then convert the
# remaining text to a numeric dtype.
loan_data['term_int'] = pd.to_numeric(
    loan_data['term'].str.replace(' months', '', regex = False)
)
# Display the converted column.
loan_data['term_int']

In [None]:
# Type of the value at label 0, to confirm term_int is now numeric.
type(loan_data['term_int'][0])

In [None]:
# Raw issue_d values, to see the text format before parsing them as dates.
loan_data['issue_d']

In [None]:
# Parse the issue date (abbreviated month name + two-digit year).
loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')
# Months elapsed between the fixed reference date 2017-12-01 and the issue date, rounded to
# whole months. One "month" is the average Gregorian month (365.2425 / 12 = 30.436875 days),
# the same length NumPy assigns to np.timedelta64(1, 'M'); an explicit pd.Timedelta divisor is
# used because recent pandas versions reject the ambiguous 'M' unit in timedelta arithmetic.
loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / pd.Timedelta(days = 365.2425 / 12)))
# Summary statistics of the new column.
loan_data['mths_since_issue_d'].describe()

### Preprocessing a few discrete variables

In [None]:
# Print the per-column summary again, this time to pick out the discrete columns to encode.
loan_data.info()

In [None]:
# Preview: one indicator column per distinct grade value, with default column names.
pd.get_dummies(loan_data['grade'])

In [None]:
# Same preview, with columns named '<prefix>:<value>' (here 'grade:<value>').
pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')

In [None]:
# Build one indicator-variable frame per discrete column. Each frame's columns are named
# '<column>:<value>', so the originating variable stays recognisable after concatenation.
loan_data_dummies = [
    pd.get_dummies(loan_data[column_name], prefix = column_name, prefix_sep = ':')
    for column_name in (
        'grade',
        'sub_grade',
        'home_ownership',
        'verification_status',
        'loan_status',
        'purpose',
        'addr_state',
        'initial_list_status',
    )
]

In [None]:
# Join the list of indicator frames side by side (axis = 1) into a single DataFrame.
loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)

In [None]:
# Confirm the concatenated result is a DataFrame rather than a list.
type(loan_data_dummies)

In [None]:
# Append the indicator columns to the working frame (column-wise concatenation).
loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)

In [None]:
# Column names after the indicator columns have been added.
loan_data.columns.values

### Check for missing values and clean

In [None]:
# Boolean frame of the same shape as loan_data: True where a value is missing.
loan_data.isnull()

In [None]:
# Lift the row display limit so the count for every column is shown.
pd.options.display.max_rows = None
# Number of missing values per column.
loan_data.isnull().sum()

In [None]:
# Put a cap of 100 back on the number of displayed rows.
pd.options.display.max_rows = 100

In [None]:
# Fill missing total_rev_hi_lim values with the funded amount of the same row.
# The filled column is assigned back to the frame: fillna(inplace = True) on a selected column
# mutates an intermediate object, which raises a FutureWarning in recent pandas and leaves
# loan_data unchanged when Copy-on-Write is active.
loan_data['total_rev_hi_lim'] = loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'])

In [None]:
# Count of values still missing in total_rev_hi_lim after the fill.
loan_data['total_rev_hi_lim'].isnull().sum()

### Homework

In [None]:
# Fill missing annual income with the mean of the observed incomes.
# The filled column is assigned back to the frame: fillna(inplace=True) on a selected column
# mutates an intermediate object, which raises a FutureWarning in recent pandas and leaves
# loan_data unchanged when Copy-on-Write is active.
loan_data['annual_inc'] = loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean())

In [None]:
# Fill the remaining missing values in these numeric columns with 0.
# Each filled column is assigned back to the frame: fillna(inplace=True) on a selected column
# mutates an intermediate object, which raises a FutureWarning in recent pandas and leaves
# loan_data unchanged when Copy-on-Write is active.
loan_data['mths_since_earliest_cr_line'] = loan_data['mths_since_earliest_cr_line'].fillna(0)
loan_data['acc_now_delinq'] = loan_data['acc_now_delinq'].fillna(0)
loan_data['total_acc'] = loan_data['total_acc'].fillna(0)
loan_data['pub_rec'] = loan_data['pub_rec'].fillna(0)
loan_data['open_acc'] = loan_data['open_acc'].fillna(0)
loan_data['inq_last_6mths'] = loan_data['inq_last_6mths'].fillna(0)
loan_data['delinq_2yrs'] = loan_data['delinq_2yrs'].fillna(0)
loan_data['emp_length_int'] = loan_data['emp_length_int'].fillna(0)

# PD model

## Data preparation

### Dependent Variable: Good/Bad (Default) Definition. Default and Non-default Accounts

In [None]:
# Distinct loan statuses, the basis for deciding which ones count as "bad".
loan_data['loan_status'].unique()

In [None]:
# Number of accounts per loan status.
loan_data['loan_status'].value_counts()

In [None]:
# Share of accounts per loan status: counts divided by the number of non-missing statuses.
loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()

In [None]:
# Dependent variable: 0 when the loan status is one of the statuses listed below (treated as
# bad / default), 1 for every other status (treated as good).
default_statuses = [
    'Charged Off',
    'Default',
    'Does not meet the credit policy. Status:Charged Off',
    'Late (31-120 days)',
]
is_default = loan_data['loan_status'].isin(default_statuses)
loan_data['good_bad'] = np.where(is_default, 0, 1)

In [None]:
# Inspect the resulting 0/1 flag.
loan_data['good_bad']