In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
# Load the labelled training set and the unlabelled test set from the working directory.
# NOTE(review): only the training file is read with an explicit encoding — confirm the
# test file really decodes correctly with the default.
train = pd.read_csv(
    'train_data.csv',
    encoding="ISO-8859-1",
    low_memory=False,
)
test = pd.read_csv('test_data.csv')

In [3]:
# Separate the label from the features: pop() removes 'loan_status' from train
# in place and returns the column, whose values become the target array.
target = train.pop('loan_status').values

In [4]:
# Record the row counts of both sets; size_train marks where the training rows
# end inside the combined frame.
size_train = train.shape[0]
size_test = test.shape[0]

# Stack test below train so both sets receive identical cleaning and encoding.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row labels; sort=False keeps the column order.
full_df = pd.concat([train, test], ignore_index=True, sort=False)

### Data Cleansing

In [5]:
# Columns that are not used as model features.
drop_cols = ['emp_title', 'batch_enrolled', 'title', 'pymnt_plan']
full_df = full_df.drop(columns=drop_cols)

In [6]:
# Helper functions to clean data

def clean_emp_length(raw):
    """Convert an employment-length label into a number of years.

    Parameters
    ----------
    raw : object
        Cell value from the ``emp_length`` column, e.g. ``'10+ years'``,
        ``'< 1 year'``, ``'2 years'``, ``'n/a'`` or a missing value.

    Returns
    -------
    int or float
        ``10`` when the label contains ``'+'``, ``0`` when it contains ``'<'``,
        otherwise the digits in the label read as one integer. ``np.nan`` is
        returned for ``'n/a'``, for non-string input (NaN / None), and for
        strings that contain no digits.
    """
    # read_csv may already have turned 'n/a' into NaN (a float); the string
    # checks below would raise TypeError on it, so treat non-strings as missing.
    if not isinstance(raw, str):
        return np.nan
    if raw == 'n/a':
        return np.nan
    if '+' in raw:
        return 10
    if '<' in raw:
        return 0
    digits = re.sub(r"\D", "", raw)
    # int('') would raise ValueError; a label without digits is treated as missing.
    return int(digits) if digits else np.nan
    
def clean_last_week_pay(raw):
    """Extract the week number from a label such as ``'44th week'``.

    Parameters
    ----------
    raw : object
        Cell value from the ``last_week_pay`` column.

    Returns
    -------
    int
        The digits found in ``raw`` read as one integer, or the sentinel
        ``-99999`` when ``raw`` is not a string or contains no digits.
    """
    try:
        return int(re.sub(r"\D", "", raw))
    except (TypeError, ValueError):
        # TypeError: raw is not a string (e.g. NaN / None).
        # ValueError: nothing but non-digits, so int('') fails.
        return -99999

def clean_verification_status_joint(raw):
    """Flag whether a joint verification status is present.

    Parameters
    ----------
    raw : object
        Cell value from the ``verification_status_joint`` column.

    Returns
    -------
    int
        ``1`` when ``raw`` is a string (any status text, including str
        subclasses), otherwise the sentinel ``-99999`` (e.g. for NaN / None).
    """
    # isinstance also accepts str subclasses, which an exact type() comparison rejects.
    return 1 if isinstance(raw, str) else -99999

In [7]:
# Convert the text columns to numbers, one column at a time.
# term: keep only the digits of the label (e.g. '36 months' -> 36). The pattern is a
# raw string because "\D" in a normal string literal is an invalid escape sequence.
full_df['term'] = full_df['term'].apply(lambda x: int(re.sub(r"\D", "", x)))
# The remaining columns go through the helper functions defined for them.
full_df['emp_length'] = full_df['emp_length'].apply(clean_emp_length)
full_df['last_week_pay'] = full_df['last_week_pay'].apply(clean_last_week_pay)
full_df['verification_status_joint'] = full_df['verification_status_joint'].apply(clean_verification_status_joint)

In [8]:
# Collect the names of all columns whose dtype is still object.

cols_list = full_df.columns
obj_list = full_df.dtypes.values
cols_obj = [name for name, dtype in zip(cols_list, obj_list) if dtype == object]

In [9]:
# Replace the values of every object-dtype column with integer category codes.

from sklearn import preprocessing

# A single encoder instance is refit on each column in turn, so after the loop
# it only holds the mapping learned from the last column in cols_obj.
le = preprocessing.LabelEncoder()
for col in cols_obj:
    # full_df contains train and test rows, so the codes are learned from both.
    full_df[col] = le.fit_transform(full_df[col])

In [10]:
# Replace missing values in the listed columns with the -99999 sentinel.
cols_fill = ['emp_length', 'annual_inc','inq_last_6mths', 'tot_coll_amt', 'tot_cur_bal']
for col in cols_fill:
    full_df[col] = full_df[col].fillna(-99999)

### Feature Engineering

In [11]:
# funded_amnt * int_rate * term / 1200 — presumably simple interest over the whole
# term, with int_rate in percent per year and term in months; TODO confirm units.
full_df['total_interest'] = (
    full_df['funded_amnt']
    .mul(full_df['int_rate'])
    .mul(full_df['term'])
    .div(100 * 12)
)

In [12]:
# Fraction of total_interest that has not been received yet
# (total_interest minus total_rec_int, divided by total_interest).
full_df['unpaid_int_perc'] = (
    full_df['total_interest']
    .sub(full_df['total_rec_int'])
    .div(full_df['total_interest'])
)

In [13]:
# Requested loan amount relative to annual income.
full_df['loan_annual_inc_ratio'] = full_df['loan_amnt'].div(full_df['annual_inc'])

In [14]:
# Gap between loan_amnt and funded_amnt.
full_df['funded_amnt_diff'] = full_df['loan_amnt'].sub(full_df['funded_amnt'])

In [15]:
# Gap between funded_amnt and funded_amnt_inv.
full_df['funded_amnt_funded_amnt_inv_diff'] = full_df['funded_amnt'].sub(full_df['funded_amnt_inv'])

In [16]:
# Preview the first rows of the combined frame after cleaning and feature engineering.
full_df.head()

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,grade,emp_length,home_ownership,annual_inc,...,application_type,verification_status_joint,last_week_pay,tot_coll_amt,tot_cur_bal,total_interest,unpaid_int_perc,loan_annual_inc_ratio,funded_amnt_diff,funded_amnt_funded_amnt_inv_diff
0,45670841,35000,35000,35000.0,36,18.84,4,10.0,1,250000.0,...,0,-99999,44,0.0,1172167.0,19782.0,0.753143,0.14,0,0.0
1,69604705,18000,18000,18000.0,60,7.89,0,0.0,1,90000.0,...,0,-99999,4,0.0,427348.0,7101.0,0.984444,0.2,0,0.0
2,55346416,16175,16175,16175.0,36,13.99,2,0.0,5,46000.0,...,0,-99999,31,0.0,27540.0,6788.6475,0.820799,0.35163,0,0.0
3,18614915,10000,10000,10000.0,60,15.61,3,10.0,1,90000.0,...,0,-99999,83,0.0,62780.0,7805.0,0.717438,0.111111,0,0.0
4,18473862,17325,17325,17325.0,36,17.57,3,4.0,5,78000.0,...,0,-99999,87,0.0,15720.0,9132.0075,0.567355,0.222115,0,0.0


In [17]:
# Split the combined frame back apart by position: the first size_train rows
# are the training rows, everything after them is the test set.
train_clean = full_df.iloc[:size_train]
test_clean = full_df.iloc[size_train:]

### Gradient Boosting Classifier

In [18]:
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# 10-fold cross-validation; shuffling uses a fixed seed so the fold assignment is repeatable.
kf = KFold(n_splits=10, random_state=1, shuffle=True)

accuracy_sum = 0
for train_index, test_index in kf.split(train_clean):
    # Fit a fresh model on this fold's training rows and score it on the held-out rows.
    gbk = GradientBoostingClassifier()
    gbk.fit(train_clean.iloc[train_index].values, target[train_index])
    pred = gbk.predict(train_clean.iloc[test_index].values)
    # accuracy_score takes (y_true, y_pred); the result is reported as a percentage.
    acc_gbk = round(accuracy_score(target[test_index], pred) * 100, 2)
    print("accuracy:", acc_gbk)
    accuracy_sum += acc_gbk

# Average over the configured number of folds instead of a hard-coded 10, so
# changing n_splits above cannot silently skew the mean.
accuracy_avg = round(accuracy_sum / kf.get_n_splits(), 2)
print("avg_accuracy:", accuracy_avg)

accuracy: 88.62
accuracy: 88.59
accuracy: 88.85
accuracy: 88.73
accuracy: 88.64
accuracy: 89.48
accuracy: 88.35
accuracy: 88.99
accuracy: 88.26
accuracy: 88.87
avg_accuracy: 88.74


### Final Prediction On Test Data

In [19]:
# Train on every labelled row (fit returns the fitted estimator itself).
gbk = GradientBoostingClassifier().fit(train_clean.values, target)

# Alternative: class probabilities instead of hard labels
# pred = gbk.predict_proba(test_clean.values)[:,1]

# Hard class labels for the test rows
pred = gbk.predict(test_clean.values)

print(pred)

[0 1 1 ..., 1 0 1]


In [20]:
# Sanity check: there should be exactly one prediction per test row (displays True if so).
len(pred) == test_clean.shape[0]

True

In [21]:
# Re-read the untouched test file and append the predictions as a new
# 'loan_status' column.

final_data = pd.read_csv('test_data.csv').assign(loan_status=pred)

In [22]:
# Preview the first test rows together with their loan_status column.
final_data.head()

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,batch_enrolled,int_rate,grade,emp_title,emp_length,...,zip_code,inq_last_6mths,total_rec_int,total_rec_late_fee,application_type,verification_status_joint,last_week_pay,tot_coll_amt,tot_cur_bal,loan_status
0,46202640,35000,35000,35000.0,60 months,,12.69,C,Owner,2 years,...,303xx,0,3445.95,0.0,INDIVIDUAL,,44th week,0.0,37591.0,0
1,635220,16000,16000,15825.0,36 months,BAT5785461,16.82,E,TurboChef Technologies Inc,1 year,...,750xx,5,4467.49,0.0,INDIVIDUAL,,161th week,,,1
2,327746,9600,9600,1964.43,36 months,BAT5278942,13.87,D,St. Elizabeth Medical Centers,10+ years,...,410xx,3,2152.8,16.375499,INDIVIDUAL,,139th week,,,1
3,46182333,9000,9000,9000.0,36 months,BAT4268557,7.89,A,salesman,10+ years,...,284xx,0,493.45,0.0,INDIVIDUAL,,39th week,725.0,24270.0,0
4,71128448,21000,21000,21000.0,60 months,BAT6004464,7.89,A,ROUTE SALES,10+ years,...,322xx,0,128.88,0.0,INDIVIDUAL,,4th week,0.0,248545.0,0


In [23]:
# Save the test rows plus predictions to prediction.csv; index = False leaves the
# DataFrame index out of the file.
final_data.to_csv('prediction.csv', index = False)