In [1]:
import pandas as pd
import numpy as np

In [2]:
loans = pd.read_csv('lending-club-data.csv', low_memory=False)

In [3]:
print loans.columns

Index([u'id', u'member_id', u'loan_amnt', u'funded_amnt', u'funded_amnt_inv',
       u'term', u'int_rate', u'installment', u'grade', u'sub_grade',
       u'emp_title', u'emp_length', u'home_ownership', u'annual_inc',
       u'is_inc_v', u'issue_d', u'loan_status', u'pymnt_plan', u'url', u'desc',
       u'purpose', u'title', u'zip_code', u'addr_state', u'dti',
       u'delinq_2yrs', u'earliest_cr_line', u'inq_last_6mths',
       u'mths_since_last_delinq', u'mths_since_last_record', u'open_acc',
       u'pub_rec', u'revol_bal', u'revol_util', u'total_acc',
       u'initial_list_status', u'out_prncp', u'out_prncp_inv', u'total_pymnt',
       u'total_pymnt_inv', u'total_rec_prncp', u'total_rec_int',
       u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee',
       u'last_pymnt_d', u'last_pymnt_amnt', u'next_pymnt_d',
       u'last_credit_pull_d', u'collections_12_mths_ex_med',
       u'mths_since_last_major_derog', u'policy_code', u'not_compliant',
       u'status', u'inactiv

In [4]:
# safe_loans =  1 => safe
# safe_loans = -1 => risky
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)

In [5]:
target = 'safe_loans'
features = ['grade',                     # grade of the loan (categorical)
            'sub_grade_num',             # sub-grade of the loan as a number from 0 to 1
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'payment_inc_ratio',         # ratio of the monthly payment to income
            'delinq_2yrs',               # number of delinquincies
             'delinq_2yrs_zero',          # no delinquincies in last 2 years
            'inq_last_6mths',            # number of creditor inquiries in last 6 months
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'open_acc',                  # number of open credit accounts
            'pub_rec',                   # number of derogatory public records
            'pub_rec_zero',              # no derogatory public records
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
            'int_rate',                  # interest rate of the loan
            'total_rec_int',             # interest received to date
            'annual_inc',                # annual income of borrower
            'funded_amnt',               # amount committed to the loan
            'funded_amnt_inv',           # amount committed by investors for the loan
            'installment',               # monthly payment owed by the borrower
           ]

In [6]:
loans_with_na = loans.copy()

In [7]:
loans = loans[[target] + features].dropna()

In [9]:
# Count the number of rows with missing data
num_rows_with_na = len(loans_with_na)
num_rows = len(loans)
print 'Original %s observations; keeping %s ' % (num_rows_with_na, num_rows)

Original 122607 observations; keeping 122578 


In [10]:
train_index = pd.read_json('module-8-assignment-1-train-idx.json')
train_index.columns = ['indexvalue']
train_idx = train_index.indexvalue.tolist()

validation_index = pd.read_json('module-8-assignment-1-validation-idx.json')
validation_index.columns = ['indexvalue']
validation_idx = validation_index.indexvalue.tolist()

train_data = loans.iloc[train_idx]
validation_data = loans.iloc[validation_idx]

In [23]:
train_data.shape, validation_data.shape, train_data[features].shape, validation_data[features].shape

((37219, 25), (9284, 25), (37219, 24), (9284, 24))

In [14]:
train_data.columns

Index([u'safe_loans', u'grade', u'sub_grade_num', u'short_emp',
       u'emp_length_num', u'home_ownership', u'dti', u'purpose',
       u'payment_inc_ratio', u'delinq_2yrs', u'delinq_2yrs_zero',
       u'inq_last_6mths', u'last_delinq_none', u'last_major_derog_none',
       u'open_acc', u'pub_rec', u'pub_rec_zero', u'revol_util',
       u'total_rec_late_fee', u'int_rate', u'total_rec_int', u'annual_inc',
       u'funded_amnt', u'funded_amnt_inv', u'installment'],
      dtype='object')

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(max_depth=6, n_estimators=5)

In [20]:
#Prepare data

#One-Hot-Encoding
from sklearn.feature_extraction import DictVectorizer

def encode_data(data, categorical_type):
    
    data_cat = data[categorical_type].T.to_dict().values()
    vect = DictVectorizer(sparse=False)
    data_vector = vect.fit_transform(data_cat)
    
    #Merge/replace vector data into existing dataframe
    #data_num = data.drop(categorical_type + outcome_var, axis=1)
    #X_train = np.concatenate((data_num.values, data_vector), axis=1)
    
    return data_vector

In [21]:
def df_encode_data(data, categorical_type):
    return pd.get_dummies(data[categorical_type])

In [24]:
category = features

X_train_vector = df_encode_data(train_data[features], features)
validation_vector = df_encode_data(validation_data[features], features)

In [25]:
X_train_vector.shape,validation_vector.shape

((37219, 44), (9284, 44))

In [26]:
X_train_vector.columns

Index([u'sub_grade_num', u'short_emp', u'emp_length_num', u'dti',
       u'payment_inc_ratio', u'delinq_2yrs', u'delinq_2yrs_zero',
       u'inq_last_6mths', u'last_delinq_none', u'last_major_derog_none',
       u'open_acc', u'pub_rec', u'pub_rec_zero', u'revol_util',
       u'total_rec_late_fee', u'int_rate', u'total_rec_int', u'annual_inc',
       u'funded_amnt', u'funded_amnt_inv', u'installment', u'grade_A',
       u'grade_B', u'grade_C', u'grade_D', u'grade_E', u'grade_F', u'grade_G',
       u'home_ownership_MORTGAGE', u'home_ownership_OTHER',
       u'home_ownership_OWN', u'home_ownership_RENT', u'purpose_car',
       u'purpose_credit_card', u'purpose_debt_consolidation',
       u'purpose_home_improvement', u'purpose_house',
       u'purpose_major_purchase', u'purpose_medical', u'purpose_moving',
       u'purpose_other', u'purpose_small_business', u'purpose_vacation',
       u'purpose_wedding'],
      dtype='object')

In [27]:
for col in train_data.columns:
    print str(col) + ' ' + str(train_data[col].dtype)

safe_loans int64
grade object
sub_grade_num float64
short_emp int64
emp_length_num int64
home_ownership object
dti float64
purpose object
payment_inc_ratio float64
delinq_2yrs float64
delinq_2yrs_zero float64
inq_last_6mths float64
last_delinq_none int64
last_major_derog_none int64
open_acc float64
pub_rec float64
pub_rec_zero float64
revol_util float64
total_rec_late_fee float64
int_rate float64
total_rec_int float64
annual_inc float64
funded_amnt int64
funded_amnt_inv int64
installment float64


In [32]:
X_train = X_train_vector
X_validation = validation_vector
y_train = train_data[target]
y_validation = validation_data[target]

In [33]:
X_train.shape, X_validation.shape, y_train.shape, y_validation.shape

((37219, 44), (9284, 44), (37219,), (9284,))

In [34]:
gbc.fit(X_train, y_train)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=6, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=5, presort='auto',
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [35]:
y_pred = gbc.predict(X_validation)

In [37]:
errors = y_pred == y_validation

In [39]:
len(errors[errors==False])

3143

In [42]:
# Mistake % and Accuracy %
float(len(errors[errors==False])) / float(len(errors)), float(len(errors[errors==True])) / float(len(errors))

(0.3385394226626454, 0.6614605773373546)

In [43]:
# Accuracy score
gbc.score(X_validation, y_validation)

0.66146057733735464

In [44]:
# Confusion matrix
pd.crosstab(y_validation, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,-1,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,3020,1652,4672
1,1491,3121,4612
All,4511,4773,9284


In [52]:
c = 5
print "With number of tree = %i, the accuracy score = %f" % (c, gbc.score(X_validation, y_validation))

With number of tree = 5, the accuracy score = 0.661461


In [54]:
for n in [10,50,100,200,500]:
    gbtree = GradientBoostingClassifier(max_depth=6, n_estimators=n)
    gbtree.fit(X_train, y_train)
    print "With number of tree = %i, the accuracy score = %f" % (n, gbtree.score(X_validation, y_validation))

With number of tree = 10, the accuracy score = 0.666415
With number of tree = 50, the accuracy score = 0.684511
With number of tree = 100, the accuracy score = 0.689143
With number of tree = 200, the accuracy score = 0.685480
With number of tree = 500, the accuracy score = 0.689573
