In [41]:
import pandas as pd
%matplotlib inline
import seaborn as sns
import re
import numpy as np
import matplotlib.pyplot as plt

In [42]:
date_cols = ['issue_d','earliest_cr_line','last_pymnt_d','next_pymnt_d','last_credit_pull_d']
df = pd.read_csv('../Niteo/Lending/loan.csv/loan.csv',parse_dates=date_cols)

In [43]:
df1=df.copy()

In [44]:
relevant_cols = ['id','member_id','loan_amnt','term','installment',\
                'emp_title','emp_length','home_ownership','annual_inc',\
                'loan_status','purpose','zip_code','addr_state',\
                'dti','delinq_2yrs','earliest_cr_line','inq_last_6mths',\
                'open_acc','pub_rec','revol_bal','revol_util','total_acc']

In [45]:
df = df[relevant_cols]

In [46]:
#Filter for Sub Grade C1
compl_loan = [loan for loan in df.loan_status.value_counts().index if loan not
              in \
                      ['Current','Issued','Does not meet the credit policy. Status:Fully Paid']]
df = df[df.loan_status.isin(compl_loan)]

In [47]:
default_loans = [loan for loan in df.loan_status.unique() if loan not in ['Fully Paid']]

In [48]:
df.loan_status = df.loan_status.apply(lambda x:'Default' if x in default_loans else 'Fully Paid')

In [49]:
df.loan_status.value_counts()

Fully Paid    207723
Default        67429
Name: loan_status, dtype: int64

In [50]:
#Missing Values Treatment
df.isnull().sum()[df.isnull().sum()>0]

emp_title           15280
delinq_2yrs             3
earliest_cr_line        3
inq_last_6mths          3
open_acc                3
pub_rec                 3
revol_util            219
total_acc               3
dtype: int64

In [51]:
df.shape

(275152, 22)

In [52]:
df.emp_title.fillna('Unknown',inplace=True)
df.revol_util.fillna(df.revol_util.mean(),inplace=True)

In [53]:
df = df.dropna()

In [54]:
df.isnull().sum()[df.isnull().sum()>0]

Series([], dtype: int64)

In [55]:
df.purpose.value_counts()

debt_consolidation    162372
credit_card            54005
home_improvement       16213
other                  15501
major_purchase          6685
small_business          5156
car                     3794
medical                 3089
moving                  2210
wedding                 1979
house                   1783
vacation                1720
educational              357
renewable_energy         285
Name: purpose, dtype: int64

In [56]:
#Feature Engineering
df.columns

Index([u'id', u'member_id', u'loan_amnt', u'term', u'installment',
       u'emp_title', u'emp_length', u'home_ownership', u'annual_inc',
       u'loan_status', u'purpose', u'zip_code', u'addr_state', u'dti',
       u'delinq_2yrs', u'earliest_cr_line', u'inq_last_6mths', u'open_acc',
       u'pub_rec', u'revol_bal', u'revol_util', u'total_acc'],
      dtype='object')

In [57]:
df.zip_code=df.zip_code.apply(lambda x:x[:-2])

In [58]:
df.term = df.term.apply(lambda x:x[:3]).astype(int)

In [59]:
df.emp_length = df.emp_length.str.replace('n/a','0')
df.emp_length = df.emp_length.apply(lambda x:re.findall(r'[0-9]+',x)[0]).astype(int)

In [60]:
df.emp_length = df.emp_length.replace(0,df.emp_length.mean())

In [61]:
    #Normalize Annual Income
df['log_annual_inc']=np.log(df.annual_inc)

In [62]:
df['earliest_cr_line_year']=df.earliest_cr_line.dt.year
df['earliest_cr_line_month']=df.earliest_cr_line.dt.month
df.drop('earliest_cr_line',inplace=True,axis=1)

In [67]:
df.to_csv('../Niteo/Lending_Clean.csv')

In [23]:
#Model Building
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.ensemble import BaggingClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score,train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsRestClassifier,OneVsOneClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.grid_search import GridSearchCV

In [24]:
print list(df.columns)

['id', 'member_id', 'loan_amnt', 'term', 'installment', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'loan_status', 'purpose', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'log_annual_inc', 'earliest_cr_line_year', 'earliest_cr_line_month']


In [25]:
features = ['installment','emp_title','emp_length',\
           'home_ownership','open_acc','purpose','zip_code','addr_state',\
           'dti','delinq_2yrs','inq_last_6mths','pub_rec','revol_bal',\
           'revol_util','total_acc','log_annual_inc','earliest_cr_line_year',\
           'earliest_cr_line_month']

In [26]:
#Create Target var to 0 and 1
df.loan_status=df.loan_status.map({'Fully Paid':0,'Default':1})

In [27]:
df.loan_status.value_counts()

0    207723
1     67426
Name: loan_status, dtype: int64

In [28]:
#Balance Dataset
df = pd.concat([df[df.loan_status==0].sample(70000),df[df.loan_status==1]])

In [29]:
X=df[features]
y=df.loan_status

In [35]:
print "Label Encoding..."
for var in X.columns:
    if X[var].dtypes == object:
        print var
        lb=LabelEncoder()
        full_var_data = X[var].astype('str')
        lb.fit(np.array(full_var_data))
        X[var]=lb.transform(np.array(X[var]).astype('str'))

Label Encoding...
emp_title


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


home_ownership
purpose
zip_code
addr_state


In [36]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=17)

In [37]:
y_train.value_counts()

0    55971
1    53969
Name: loan_status, dtype: int64

In [38]:
y_test.value_counts()

0    14029
1    13457
Name: loan_status, dtype: int64

In [39]:
X_test.head()

Unnamed: 0,installment,emp_title,emp_length,home_ownership,open_acc,purpose,zip_code,addr_state,dti,delinq_2yrs,inq_last_6mths,pub_rec,revol_bal,revol_util,total_acc,log_annual_inc,earliest_cr_line_year,earliest_cr_line_month
377151,634.98,58859,5.0,5,6.0,2,94,34,8.19,0.0,6.0,0.0,12971.0,61.8,15.0,11.112448,1997,5
760929,790.82,41143,10.0,1,22.0,2,92,34,33.75,0.0,0.0,0.0,203683.0,44.9,33.0,11.695247,1974,6
35904,385.14,19160,1.0,1,11.0,2,44,6,11.76,0.0,2.0,0.0,54707.0,50.6,24.0,11.082143,1988,8
156828,349.43,23516,10.0,5,8.0,2,310,9,22.2,0.0,0.0,0.0,1995.0,39.9,17.0,10.308953,1999,2
202993,641.62,63457,10.0,4,7.0,2,329,1,16.91,0.0,6.0,0.0,8024.0,47.2,17.0,11.225243,1986,9


In [None]:
svm = SVC()
model=svm.fit(X_train,y_train)
pred = model.predict(X_test)
score = accuracy_score(pred,y_test)
print score
svm_class = classification_report(pred,y_test)
print svm_class