# ML Final Project 

## Using GraphLab with the Lending Club loan dataset


In [20]:
import graphlab
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

## import dataset

In [38]:
# Load the loan data as an SFrame from the 'lending-club-data.gl/' directory
# (the path is relative to the notebook's working directory).
loans = graphlab.SFrame('lending-club-data.gl/')

### output all columns

In [39]:
# Display the name of every column in the loaded SFrame.
loans.column_names()

['id',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'is_inc_v',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'policy_code',
 'not_compliant',
 'status',
 'inactive_loans',
 'bad_loans',
 'emp_length_num',
 'grade_num',
 'sub_grade_num',
 'delinq_2yrs_zero',
 'pub_rec_zero',
 ...]

## Modify the data
### The column "bad_loans" is the target. In the original data it is encoded as 0 or 1, where 0 means safe and 1 means bad.
### We recode it as a new column "safe_loans" with values +1 or -1, where +1 means safe and -1 means bad.


In [40]:
# Name of the label column used throughout the rest of the notebook.
target = 'safe_loans'
# Recode the 0/1 `bad_loans` flag as a signed label: +1 = safe, -1 = bad.
loans[target] = loans['bad_loans'].apply(lambda is_bad: -1 if is_bad != 0 else +1)
# Drop the original flag now that the recoded label exists.
loans = loans.remove_column('bad_loans')

In [41]:
# Columns kept as input features; they are selected together with `target`
# when rows with missing values are dropped in a later cell.
features = ['grade',                     # grade of the loan (categorical)
            'sub_grade_num',             # sub-grade of the loan as a number from 0 to 1
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home ownership status: own, mortgage or rent
            'dti',                       # debt-to-income ratio
            'purpose',                   # the purpose of the loan
            'payment_inc_ratio',         # ratio of the monthly payment to income
            'delinq_2yrs',               # number of delinquencies
            'delinq_2yrs_zero',          # no delinquencies in the last 2 years
            'inq_last_6mths',            # number of creditor inquiries in the last 6 months
            'last_delinq_none',          # has the borrower had a delinquency
            'last_major_derog_none',     # has the borrower had a 90-day or worse rating
            'open_acc',                  # number of open credit accounts
            'pub_rec',                   # number of derogatory public records
            'pub_rec_zero',              # no derogatory public records
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to date
            'int_rate',                  # interest rate of the loan
            'total_rec_int',             # interest received to date
            'annual_inc',                # annual income of the borrower
            'funded_amnt',               # amount committed to the loan
            'funded_amnt_inv',           # amount committed by investors for the loan
            'installment',               # monthly payment owed by the borrower
           ]

In [42]:
# Total number of rows (loans) before rows with missing values are removed
len(loans)

122607

### remove missing data

In [43]:
# Keep only the target and feature columns, then split the rows into those
# without missing values (`loans`) and those containing missing values
# (`loans_with_na`). Note that this rebinds `loans` to the reduced frame.
loans, loans_with_na = loans[[target] + features].dropna_split()
# Number of rows that were split off because of missing values
len(loans_with_na)

29

In [46]:
# Number of rows remaining after the rows with missing values were split off
len(loans)

122578

In [49]:
safe_loans = loans[loans[target] == 1]
risky_loans = loans[loans[target] == -1]
print "safe loans:",len(safe_loans)
print "risky loans",len(risky_loans)

safe loans: 99431
risky loans 23147


### Balance the data, because there are far more safe loans than risky loans

In [50]:
percentage = len(risky_loans)/float(len(safe_loans))
safe_loans_bal = safe_loans.sample(percentage, seed = 1)
risky_loans_bal = risky_loans
loans_bal = safe_loans_bal.append(risky_loans_bal)
print len(safe_loans_bal),len(risky_loans_bal)

23356 23147


In [59]:
# Display the balanced SFrame (the sampled safe loans followed by the risky loans).
loans_bal

safe_loans,grade,sub_grade_num,short_emp,emp_length_num,home_ownership,dti,purpose
1,B,0.6,0,11,OWN,11.18,credit_card
1,B,0.2,0,3,MORTGAGE,29.44,credit_card
1,B,0.6,1,1,RENT,12.19,credit_card
1,A,0.8,0,6,MORTGAGE,14.03,debt_consolidation
1,C,1.0,0,8,RENT,6.35,credit_card
1,B,0.4,0,11,RENT,11.8,credit_card
1,B,0.8,0,2,RENT,10.62,debt_consolidation
1,A,0.4,0,6,RENT,10.85,debt_consolidation
1,B,0.2,0,2,RENT,8.11,credit_card
1,B,0.4,0,3,RENT,19.14,credit_card

payment_inc_ratio,delinq_2yrs,delinq_2yrs_zero,inq_last_6mths,last_delinq_none,last_major_derog_none,open_acc
7.93824,0,1,0,1,1,8
6.30496,0,1,0,1,1,8
13.4952,0,1,0,1,1,8
15.9331,0,1,0,1,1,12
8.68129,0,1,1,1,1,6
11.8218,0,1,2,1,1,9
6.52882,0,1,1,1,1,7
5.79,0,1,0,1,1,5
10.0878,0,1,1,1,1,11
7.60482,0,1,3,1,1,10

pub_rec,pub_rec_zero,revol_util,total_rec_late_fee,int_rate,total_rec_int,annual_inc,funded_amnt,funded_amnt_inv
0,1,82.4,0.0,11.71,1902.56,50000,10000,10000
0,1,93.9,0.0,9.91,823.48,92000,15000,15000
0,1,59.1,0.0,11.71,1622.21,25000,8500,8500
0,1,27.4,0.0,7.9,3061.08,75000,31825,31825
0,1,60.5,0.0,15.96,1848.94,34000,7000,7000
0,1,57.2,0.0,10.65,2137.46,41000,12400,12400
0,1,66.5,0.0,12.42,1125.28,36852,6000,6000
0,1,36.4,0.0,6.62,1157.38,70000,11000,11000
0,1,52.1,0.0,9.91,1748.21,46000,12000,12000
0,1,59.1,0.0,10.65,575.94,51400,10000,10000

installment
330.76
483.38
281.15
995.82
245.97
403.91
200.5
337.75
386.7
325.74


## Convert the data to a NumPy array

In [52]:
# Convert the balanced SFrame to a NumPy array and show its dimensions.
# NOTE(review): the recorded array output has dtype '|S21', i.e. every value,
# including the numeric columns, is stored as a string -- presumably because
# the frame mixes string and numeric columns. Confirm before using this array
# for numeric work.
loans_data = loans_bal.to_numpy()
loans_data.shape

(46503, 25)

In [57]:
# Display the converted array (NumPy abbreviates the middle rows and columns with "...").
loans_data

array([['1', 'B', '0.6', ..., '10000', '10000', '330.76'],
       ['1', 'B', '0.2', ..., '15000', '15000', '483.38'],
       ['1', 'B', '0.6', ..., '8500', '8500', '281.15'],
       ..., 
       ['-1', 'E', '1.0', ..., '6000', '6000', '170.53'],
       ['-1', 'D', '0.6', ..., '8525', '8525', '217.65'],
       ['-1', 'D', '1.0', ..., '22000', '22000', '582.5']], 
      dtype='|S21')

### save as csv

In [69]:
# Write the balanced dataset to "loans_data.csv" (path relative to the working directory).
loans_bal.export_csv("loans_data.csv")

## Train Model