In [2]:
import requests
import os
import pandas as pd
import numpy as np
import json
from dateutil import relativedelta as rdelta
from datetime import date
from datetime import datetime
from afinn import Afinn


#sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,roc_curve, auc
from sklearn.preprocessing import StandardScaler

#visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
#authorizing
key_df = pd.read_csv("/Users/ajaykliyara/Documents/lc_key.txt",header=None)
key = key_df.iloc[:,:].values[0][0]

In [4]:
#reading from API loan listing
s = requests.Session()
s.headers.update({'Authorization': key})
r = s.get('https://api.lendingclub.com/api/investor/v1/loans/listing')


In [5]:
data = json.loads(r.text)

In [104]:
None is None

True

In [110]:
def process_data(loan_data):
    #initial dataframe
    loan_df_ini = pd.DataFrame([loan_data])
    
    #dataframe to store processed data
    loan_df = pd.DataFrame()
    #loan_df = dict()
    
    #home ownership
    ownership_lookup = {'MORTGAGE': 2, 'NONE': 4, 'OTHER': 3, 'OWN': 1, 'RENT': 0}
    loan_df['homeOwnership'] = loan_df_ini['homeOwnership']
    loan_df['homeOwnership'].replace(ownership_lookup, inplace=True)
    loan_df=loan_df.rename(columns = {'homeOwnership':'home_ownership'})
    loan_df['home_ownership']
    
    #verification status
    if(str.upper(loan_df_ini['isIncV'].values[0]) == 'NOT VERIFIED'):
        loan_df['verification_status_Not Verified'] = 1
    else:
        loan_df['verification_status_Not Verified'] = 0

    if(str.upper(loan_df_ini['isIncV'].values[0]) == 'VERIFIED'):
        loan_df['verification_status_Verified'] = 1
    else:
        loan_df['verification_status_Verified'] = 0

    if(str.upper(loan_df_ini['isIncV'].values[0]) == 'SOURCE VERIFIED'):
        loan_df['verification_status_Source Verified'] = 1
    else:
        loan_df['verification_status_Source Verified'] = 0
        
        
    #has delinq hist
    loan_df['has_delinquent_hist'] = loan_df_ini['mthsSinceLastDelinq'].apply(lambda s : int(s!=None))
    
    #mths_since_last_record
    loan_df['has_record_hist'] = loan_df_ini['mthsSinceLastRecord'].apply(lambda s : int(s!=None))
    
    #loan amount
    loan_df['loan_amnt'] = loan_df_ini['loanAmount']
    
    #per_act_open
    loan_df['per_act_open'] = loan_df_ini.openAcc/loan_df_ini.totalAcc
    
    #inqLast6Mths 
    loan_df['inq_last_6mths'] = loan_df_ini['inqLast6Mths']
    
    
    #avg_fico
    loan_df['avg_fico'] = (loan_df_ini['ficoRangeHigh'] +  loan_df_ini['ficoRangeLow'])/2
    
    
    #revol_util
    loan_df['revol_util'] = loan_df_ini['revolUtil']
    
    #revol_bal revolBal
    loan_df['revol_bal'] = loan_df_ini['revolBal'] 
    
    #dti
    loan_df['dti'] = loan_df_ini['dti'] 
    
    #years_since_first_credit
    loan_df_ini['earliestCrLine'].unique()
    ear_cl_df = loan_df_ini['earliestCrLine'].apply(lambda s : datetime.strptime(s[:7],'%Y-%m'))
    yr_diff = [ rdelta.relativedelta(date(2016,1,1),pd.Timestamp(i).to_pydatetime()).years for i in ear_cl_df.values] 
    ear_cl_df = pd.DataFrame(yr_diff)
    ear_cl_df.columns = ['years_since_first_credit']
    loan_df['years_since_first_credit'] = ear_cl_df['years_since_first_credit']
    
    
    #int_rate
    loan_df['int_rate'] = loan_df_ini['intRate']
    
    #print(loan_df_ini['empLength'].values[0])
    #print(type(loan_df_ini['empLength'].values[0]))
    if loan_df_ini['empLength'].values[0] is None:
        print(loan_df_ini['empLength'].values[0])
        loan_df['emp_length'] = -1
    else:
        loan_df['emp_length'] = loan_df_ini['empLength']
    #loan_df['emp_length'].replace(emp_dict, inplace=True)
    
    #desc_senti_score
    afinn = Afinn()
    def sent_score(s):
        if s == None:
            return 0
        else:
            afinn.score(s)

    s_score = [sent_score(st) for st in loan_df_ini['desc'] ]
    loan_df['desc_senti_score'] = s_score
    
    
    #reading zipcode data
    zip_code_agg = pd.read_csv('data/zip3_agg.csv')
    #join with final df
    loan_df_ini['zip_3'] = [int(zip[:3]) for zip in loan_df_ini.addrZip.values]
    join_df = pd.merge(loan_df_ini,zip_code_agg,how='inner',on='zip_3')
    join_df = join_df.rename(columns={'Median':'zip_median_income','Pop':'zip_pop'})
    loan_df['zip_median_income'] = join_df['zip_median_income']
    loan_df['zip_pop'] = join_df['zip_pop']
    
    
    #delinq2Yrs delinq_2yrs
    loan_df['delinq_2yrs'] = loan_df_ini['delinq2Yrs']
    
    #grade
    if(str.upper(loan_df_ini['grade'].values[0]) == 'A'):
        loan_df['grade_A'] = 1
    else:
        loan_df['grade_A'] = 0

    if(str.upper(loan_df_ini['grade'].values[0]) == 'B'):
        loan_df['grade_B'] = 1
    else:
        loan_df['grade_B'] = 0


    if(str.upper(loan_df_ini['grade'].values[0]) == 'C'):
        loan_df['grade_C'] = 1
    else:
        loan_df['grade_C'] = 0


    if(str.upper(loan_df_ini['grade'].values[0]) == 'D'):
        loan_df['grade_D'] = 1
    else:
        loan_df['grade_D'] = 0

    if(str.upper(loan_df_ini['grade'].values[0]) == 'E'):
        loan_df['grade_E'] = 1
    else:
        loan_df['grade_E'] = 0

    if(str.upper(loan_df_ini['grade'].values[0]) == 'E'):
        loan_df['grade_E'] = 1
    else:
        loan_df['grade_E'] = 0

    if(str.upper(loan_df_ini['grade'].values[0]) == 'F'):
        loan_df['grade_F'] = 1
    else:
        loan_df['grade_F'] = 0

    if(str.upper(loan_df_ini['grade'].values[0]) == 'G'):
        loan_df['grade_G'] = 1
    else:
        loan_df['grade_G'] = 0
        
        
    #term term_ 36 months term_ 60 months
    if(loan_df_ini['term'].values[0] == 36):
        loan_df['term_ 36 months'] = 1
    else:
        loan_df['term_ 36 months'] = 0

    if(loan_df_ini['term'].values[0] == 60):
        loan_df['term_ 60 months'] = 1
    else:
        loan_df['term_ 60 months'] = 0
        
        
    #purpose
    if(loan_df_ini['purpose'].values[0] == 'credit_card'):
        loan_df['purpose_credit_card'] = 1
    else:
        loan_df['purpose_credit_card'] = 0

    if(loan_df_ini['purpose'].values[0] == 'debt_consolidation'):
        loan_df['purpose_debt_consolidation'] = 1
    else:
        loan_df['purpose_debt_consolidation'] = 0

    if(loan_df_ini['purpose'].values[0] == 'moving'):
        loan_df['purpose_moving'] = 1
    else:
        loan_df['purpose_moving'] = 0

    if(loan_df_ini['purpose'].values[0] == 'other'):
        loan_df['purpose_other'] = 1
    else:
        loan_df['purpose_other'] = 0

    if(loan_df_ini['purpose'].values[0] == 'home_improvement'):
        loan_df['purpose_home_improvement'] = 1
    else:
        loan_df['purpose_home_improvement'] = 0

    if(loan_df_ini['purpose'].values[0] == 'major_purchase'):
        loan_df['purpose_major_purchase'] = 1
    else:
        loan_df['purpose_major_purchase'] = 0


    if(loan_df_ini['purpose'].values[0] == 'car'):
        loan_df['purpose_car'] = 1
    else:
        loan_df['purpose_car'] = 0

    if(loan_df_ini['purpose'].values[0] == 'medical'):
        loan_df['purpose_medical'] = 1
    else:
        loan_df['purpose_medical'] = 0

    if(loan_df_ini['purpose'].values[0] == 'small_business'):
        loan_df['purpose_small_business'] = 1
    else:
        loan_df['purpose_small_business'] = 0

    if(loan_df_ini['purpose'].values[0] == 'educational'):
        loan_df['purpose_educational'] = 1
    else:
        loan_df['purpose_educational'] = 0

    if(loan_df_ini['purpose'].values[0] == 'vacation'):
        loan_df['purpose_vacation'] = 1
    else:
        loan_df['purpose_vacation'] = 0

    if(loan_df_ini['purpose'].values[0] == 'wedding'):
        loan_df['purpose_wedding'] = 1
    else:
        loan_df['purpose_wedding'] = 0


    if(loan_df_ini['purpose'].values[0] == 'house'):
        loan_df['purpose_house'] = 1
    else:
        loan_df['purpose_house'] = 0

    if(loan_df_ini['purpose'].values[0] == 'renewable_energy'):
        loan_df['purpose_renewable_energy'] = 1
    else:
        loan_df['purpose_renewable_energy'] = 0
        
    #bankruptcies
    loan_df['bankruptcies'] = loan_df_ini['pubRecBankruptcies']
    
    #id
    loan_df['loan_id'] = loan_df_ini['id']
    
    #print(loan_df)
    
    return loan_df.copy()
    



In [88]:
#total number of listings
len(data['loans'])

138

In [111]:
#parsing loan listing to dataframe suitable for predicting
loan_list = []
for loan_data in data['loans']:
    loan_list.append(process_data(loan_data))


None
None
None
None
None
None
None
None
None
None
None
None


In [112]:
loan_df = pd.concat(loan_list)
loan_df.head(2)

Unnamed: 0,home_ownership,verification_status_Not Verified,verification_status_Verified,verification_status_Source Verified,has_delinquent_hist,has_record_hist,loan_amnt,per_act_open,inq_last_6mths,avg_fico,...,purpose_car,purpose_medical,purpose_small_business,purpose_educational,purpose_vacation,purpose_wedding,purpose_house,purpose_renewable_energy,bankruptcies,loan_id
0,0,0,0,0,1,0,5250.0,0.692308,0,672.0,...,0,0,0,0,0,0,0,0,0,93890963
0,2,0,1,0,0,0,18000.0,0.533333,0,672.0,...,0,0,0,0,0,0,0,0,0,94045662


In [114]:
loan_df.shape

(138, 45)

In [115]:
loan_df.emp_length.unique()

array([ 24, 120,  72,  96,  12,  48,  -1, 108,  36,   0,  84,  60])

In [116]:
#creating a copy of data
loan_df_test = loan_df.copy()

#### Normalize Data

In [117]:
#reading training set
df_processed = pd.read_csv('data/loan_fteng_data_text.csv')
df_processed.shape

(39301, 67)

In [118]:
#all features
all_features = list(df_processed.columns)
tfidf_features = ['bill', 'card', 'consolid', 'credit', 'current', 'debt', 'fund', 'help',
       'interest', 'job', 'loan', 'money', 'month', 'pay', 'payment', 'plan',
       'rate', 'thank', 'time', 'work', 'year']
del_columns = ['funded_amnt_inv'] + tfidf_features
#Using intution to pick key predictors
predictors_intution = ['home_ownership','verification_status_Not Verified',
       'verification_status_Source Verified', 'verification_status_Verified','has_delinquent_hist','has_record_hist']
# % and whole numbers that can be standardized
predictors_standardizable = ['loan_amnt','per_act_open','inq_last_6mths','avg_fico','revol_util','revol_bal','dti','years_since_first_credit','int_rate','emp_length','desc_senti_score','zip_median_income', 'zip_pop','delinq_2yrs','bankruptcies']
predictors_remaining = [i for i in all_features if i not in predictors_intution and i not in predictors_standardizable and i not in del_columns]

In [119]:
#validating if all columns are there
test = predictors_intution + predictors_standardizable + predictors_remaining
len(test)

45

In [120]:
predictors_remaining

['loan_status',
 'grade_A',
 'grade_B',
 'grade_C',
 'grade_D',
 'grade_E',
 'grade_F',
 'grade_G',
 'term_ 36 months',
 'term_ 60 months',
 'purpose_car',
 'purpose_credit_card',
 'purpose_debt_consolidation',
 'purpose_educational',
 'purpose_home_improvement',
 'purpose_house',
 'purpose_major_purchase',
 'purpose_medical',
 'purpose_moving',
 'purpose_other',
 'purpose_renewable_energy',
 'purpose_small_business',
 'purpose_vacation',
 'purpose_wedding']

In [121]:
full_df = df_processed.loc[:,test]
predictor_columns = list(full_df.columns)
predictor_columns.remove('loan_status')
len(predictor_columns)

44

In [122]:
#standardizing data
for col in predictors_standardizable:
    print(col)
    valstrain=full_df.loc[:,col]
    valstest=loan_df.loc[:,col]
    scaler=StandardScaler().fit(valstrain.reshape(-1,1))
    outtrain=scaler.transform(valstrain.reshape(-1,1))
    outtest=scaler.fit_transform(valstest.reshape(-1,1))
    #out=np.empty(mask.shape[0])
    #out.flat[mask]=outtrain
    #out.flat[~mask]=outtest
    full_df[col]=outtrain
    loan_df[col]=outtest

loan_amnt
per_act_open
inq_last_6mths
avg_fico
revol_util
revol_bal
dti
years_since_first_credit
int_rate
emp_length
desc_senti_score
zip_median_income
zip_pop
delinq_2yrs
bankruptcies




In [124]:
#predictor vs response
X =full_df[predictor_columns]
Y =full_df['loan_status']

#### balancing data

In [125]:
#balancing dataset
jtrain=np.arange(0, Y.shape[0])
n_pos=len(jtrain[Y.values==1])
n_neg=len(jtrain[Y.values==0])
print(n_pos, n_neg)


5652 33649


In [126]:
ineg = np.random.choice(jtrain[Y.values==0], n_pos, replace=False)
ineg

array([  509, 17587,  2905, ..., 16837,  9868,   618])

In [127]:
alli=np.concatenate((jtrain[Y.values==1], ineg))
alli.shape

(11304,)

In [128]:
X_new = X.iloc[alli,:]
Y_new = Y.iloc[alli]
X_new.shape, Y_new.shape

((11304, 44), (11304,))

In [129]:
#best classifier
clf = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [130]:
#fitting data
clf.fit(X_new,Y_new)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [131]:
#accuracy
y_predict = clf.predict(X)
np.mean(Y==y_predict)

0.63535279000534339

In [132]:
cm = confusion_matrix(Y,y_predict)
cm

array([[21263, 12386],
       [ 1945,  3707]])

In [133]:
#profit matrix
u = np.array([[ 2207.,    -0.],
       [-3901.,  3901.]])
u

array([[ 2207.,    -0.],
       [-3901.,  3901.]])

In [134]:
#calculating average profit per loan
def average_profit_pl(y, ypred, u):
    c=confusion_matrix(y,ypred)
    score=np.sum(c*u)/np.sum(c)
    return score

#profit 
average_profit_pl(Y_new, clf.predict(X_new), u)

1308.3926928520877

In [135]:
sen = round(cm[(1,1)]/(cm[(1,0)] + cm[(1,1)]),2)
sen

0.66000000000000003

In [137]:
fpr, tpr, thresholds=roc_curve(Y_new, clf.predict_proba(X_new)[:,1])
auc(fpr,tpr)

0.69722975389280306

#### Predicting New Loan

In [139]:
loan_df.columns

Index(['home_ownership', 'verification_status_Not Verified',
       'verification_status_Verified', 'verification_status_Source Verified',
       'has_delinquent_hist', 'has_record_hist', 'loan_amnt', 'per_act_open',
       'inq_last_6mths', 'avg_fico', 'revol_util', 'revol_bal', 'dti',
       'years_since_first_credit', 'int_rate', 'emp_length',
       'desc_senti_score', 'zip_median_income', 'zip_pop', 'delinq_2yrs',
       'grade_A', 'grade_B', 'grade_C', 'grade_D', 'grade_E', 'grade_F',
       'grade_G', 'term_ 36 months', 'term_ 60 months', 'purpose_credit_card',
       'purpose_debt_consolidation', 'purpose_moving', 'purpose_other',
       'purpose_home_improvement', 'purpose_major_purchase', 'purpose_car',
       'purpose_medical', 'purpose_small_business', 'purpose_educational',
       'purpose_vacation', 'purpose_wedding', 'purpose_house',
       'purpose_renewable_energy', 'bankruptcies', 'loan_id'],
      dtype='object')

In [148]:
#predicting new loan
X_test =loan_df[predictor_columns]
#Y_test =loan_df['loan_status']

#removing any
X_test = X_test.loc[~(X_test.home_ownership == 'ANY'),:]
loan_df = loan_df.loc[~(loan_df.home_ownership == 'ANY'),:]

In [152]:
clf.classes_

array([0, 1])

In [156]:
test_pred = clf.predict(X_test)
test_prob = clf.predict_proba(X_test)[:,1]

In [157]:
loan_df['pred'] = test_pred
loan_df['prob'] = test_prob

In [158]:
np.mean(test_pred)

0.35294117647058826

In [169]:
loan_df.loan_id[:3]

0    93890963
0    94045662
0    93881434
Name: loan_id, dtype: int64

In [214]:
def predict_loan():
    loan_id = input('Enter Loan ID obtained from LendingClub?\n\n')

    
    df = loan_df.loc[loan_df.loan_id == int(loan_id),:]
    
    if df.shape[0] == 0:
        print('Loan ID {} Invalid'.format(loan_id))
        return
    else:
        risk = input('\n Enter : \n 1 for Loans with high risk \n 2 for Loans with low risk \n\n')
        print('\n')
        
        if risk == '1':
            if 1-df.prob.values[0] >= 0.5:
                print('Go ahead and fund the loan. Remember carries high risk, probability of failure {}'.format(df.prob.values[0]))
            else:
                print('Do not fund the loan. Probaility of failure {} \n \n'.format(df.prob.values[0]))
                high_risk_df = loan_df.loc[loan_df.prob < 0.5 ,['loan_id','loan_amnt','int_rate','prob']]
                if high_risk_df.shape[0] == 0:
                    print('\n Currently there are no High risk loans available to recommend \n')
                else:
                    print('\n Here are the recommendation of available loans for selected Risk Level : High \n')
                    print(high_risk_df)
                
        elif risk == '2':
            if 1-df.prob.values[0] >= 0.8:
                print('Go ahead and fund the loan. Low Risk, probability of failure {}'.format(df.prob.values[0]))
            else:
                print('Do not fund the loan. Probaility of failure {}'.format(df.prob.values[0]))
                low_risk_df = loan_df.loc[loan_df.prob < 0.2 ,['loan_id','loan_amnt','int_rate','prob']]
                if low_risk_df.shape[0] == 0:
                    print('\n Currently there are no Low risk loans available to recommend \n')
                else:
                    print('\n Here are the recommendation of available loans for selected Risk Level : Low \n')
                    print(low_risk_df)
        else:
            print('\n Invalid Entry. Restart \n')
            predict_loan()

In [215]:
predict_loan()

Enter Loan ID obtained from LendingClub?

93890963

 Enter : 
 1 for Loans with high risk 
 2 for Loans with low risk 

2


Do not fund the loan. Probaility of failure 0.4851765793286381

 Here are the recommendation of available loans for selected Risk Level : Low 

    loan_id  loan_amnt  int_rate      prob
0  94283076  -1.328810 -1.272497  0.183893
0  94449481  -1.063573 -1.613410  0.141648
0  94911281  -1.004914 -1.351048  0.188034
0  95207123  -0.196451 -1.613410  0.180808
0  94951209  -0.043430 -1.613410  0.151892
0  94941287  -0.007725 -1.272497  0.194057
0  94743648   2.506927 -1.272497  0.133162


In [None]:
#replace loan df with the one before standardization.
#sort dataframe in decreasign order of interest

In [213]:
#loan_df.loc[loan_df.prob < 0.5 ,['loan_id','loan_amnt','int_rate','prob']]
