In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on the Drive can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Path of the gzip-compressed CSV under the mounted Drive; the file name
# suggests accepted-loan records for 2007 through 2018 Q4.
accepted_path = '/content/drive/My Drive/Datasets/accepted_2007_to_2018Q4.csv.gz'

Import Libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix,precision_recall_curve
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
from sklearn.calibration import calibration_curve

In [4]:
# Read the whole gzip-compressed CSV, then keep only a reproducible 10% random
# sample (fixed random_state); the full frame is not retained under any name.
df = (
    pd.read_csv(accepted_path, compression='gzip', low_memory=False)
    .sample(frac=0.1, random_state=42)
)
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
392949,39651438,,32000.0,32000.0,32000.0,60 months,10.49,687.65,B,B3,...,,,Cash,N,,,,,,
1273506,16411620,,9600.0,9600.0,9600.0,36 months,12.99,323.42,C,C1,...,,,Cash,N,,,,,,
324024,45122316,,4000.0,4000.0,4000.0,36 months,6.68,122.93,A,A3,...,,,Cash,N,,,,,,
2066630,125356772,,6025.0,6025.0,6025.0,36 months,10.91,197.0,B,B4,...,,,Cash,N,,,,,,
477199,128490686,,25000.0,25000.0,25000.0,60 months,26.3,752.96,E,E5,...,,,Cash,N,,,,,,


In [5]:
# Peek at the first few values of the raw loan_status column.
df['loan_status'].head()

Unnamed: 0,loan_status
392949,Current
1273506,Fully Paid
324024,Fully Paid
2066630,Fully Paid
477199,Current


In [6]:
# Add derived variables to df: a binary target plus two engineered features
# (numeric employment length and a coarse US region).

# --- Target (dependent variable y) ------------------------------------------
# Statuses counted as a bad loan. Every other status -- including a missing
# one -- is counted as a good loan.
# NOTE(review): 'Current' loans therefore land in the good class even though
# their final outcome is not yet known -- confirm this is intended.
bad_loan = ["Charged Off", "Default", "Does not meet the credit policy. Status:Charged Off", "In Grace Period",
            "Late (16-30 days)", "Late (31-120 days)"]
# Vectorized membership test; same 0/1 result as a per-row lambda, without
# calling a Python function for every row.
df['loan_condition_int'] = df['loan_status'].isin(bad_loan).astype(int)
df['loan_condition'] = np.where(df['loan_condition_int'] == 0, 'Good Loan', 'Bad Loan')

# --- emp_length -> numeric years ---------------------------------------------
# '< 1 year' is encoded as 0.5 and '10+ years' is capped at 10.
# Series.map leaves any value that is not a key here as NaN, so genuinely
# missing emp_length values stay NaN (the 'n/a' key only matches the literal
# string 'n/a').
emp_length_mapping = {
    '10+ years': 10,
    '9 years': 9,
    '8 years': 8,
    '7 years': 7,
    '6 years': 6,
    '5 years': 5,
    '4 years': 4,
    '3 years': 3,
    '2 years': 2,
    '1 year': 1,
    '< 1 year': 0.5,
    'n/a': 0
}
df['emp_length_int'] = df['emp_length'].map(emp_length_mapping)

# --- addr_state -> region ----------------------------------------------------
# Two-letter state code to one of five regions; states missing from this
# mapping would get NaN for region.
state_to_region = {
    'CA': 'West', 'OR': 'West', 'UT': 'West', 'WA': 'West', 'CO': 'West',
    'NV': 'West', 'AK': 'West', 'MT': 'West', 'HI': 'West', 'WY': 'West', 'ID': 'West',
    'AZ': 'SouthWest', 'TX': 'SouthWest', 'NM': 'SouthWest', 'OK': 'SouthWest',
    'GA': 'SouthEast', 'NC': 'SouthEast', 'VA': 'SouthEast', 'FL': 'SouthEast', 'KY': 'SouthEast',
    'SC': 'SouthEast', 'LA': 'SouthEast', 'AL': 'SouthEast', 'WV': 'SouthEast', 'DC': 'SouthEast',
    'AR': 'SouthEast', 'DE': 'SouthEast', 'MS': 'SouthEast', 'TN': 'SouthEast',
    'IL': 'MidWest', 'MO': 'MidWest', 'MN': 'MidWest', 'OH': 'MidWest', 'WI': 'MidWest',
    'KS': 'MidWest', 'MI': 'MidWest', 'SD': 'MidWest', 'IA': 'MidWest', 'NE': 'MidWest',
    'IN': 'MidWest', 'ND': 'MidWest',
    'CT': 'NorthEast', 'NY': 'NorthEast', 'PA': 'NorthEast', 'NJ': 'NorthEast', 'RI': 'NorthEast',
    'MA': 'NorthEast', 'MD': 'NorthEast', 'VT': 'NorthEast', 'NH': 'NorthEast', 'ME': 'NorthEast'
}
df['region'] = df['addr_state'].map(state_to_region)

# Show the first five rows transposed so the wide frame (including the new
# columns at the bottom) can be read vertically.
df.head().transpose()

Unnamed: 0,392949,1273506,324024,2066630,477199
id,39651438,16411620,45122316,125356772,128490686
member_id,,,,,
loan_amnt,32000.0,9600.0,4000.0,6025.0,25000.0
funded_amnt,32000.0,9600.0,4000.0,6025.0,25000.0
funded_amnt_inv,32000.0,9600.0,4000.0,6025.0,25000.0
...,...,...,...,...,...
settlement_term,,,,,
loan_condition_int,0,0,0,0,0
loan_condition,Good Loan,Good Loan,Good Loan,Good Loan,Good Loan
emp_length_int,10.0,,4.0,10.0,10.0
