In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time

%matplotlib inline
# Allow up to 1000 characters per cell when pandas prints a column, so long text values are shown in full.
pd.set_option('display.max_colwidth', 1000)

In [2]:
# Load the loan data and renumber the rows 0..n-1.
# NOTE(review): unpickling can execute arbitrary code; only load 'data_cleaned.pkl'
# when it comes from a trusted source.
df = pd.read_pickle('data_cleaned.pkl').reset_index(drop=True)

# Load the dictionary from Lending Club data.
# The path is passed directly so pandas opens and closes the workbook itself;
# handing it an open() handle that is never closed leaks the file descriptor.
loan_dict = pd.read_excel('LCDataDictionary.xlsx', sheet_name='LoanStats')

# Remove loan_status as it is one of the items we will predict
df = df.drop(columns='loan_status')

In [3]:
# For every column of the loan data, print its name, the matching
# 'Description' entry from the data dictionary, and a separator line.
for column_name in df.columns:
    is_match = loan_dict['LoanStatNew'] == column_name
    print(column_name, loan_dict.loc[is_match, 'Description'], '------', sep='\n')

loan_amnt
41    The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.
Name: Description, dtype: object
------
funded_amnt
23    The total amount committed to that loan at that point in time.
Name: Description, dtype: object
------
funded_amnt_inv
24    The total amount committed by investors for that loan at that point in time.
Name: Description, dtype: object
------
term
94    The number of payments on the loan. Values are in months and can be either 36 or 60.
Name: Description, dtype: object
------
int_rate
34    Interest Rate on the loan
Name: Description, dtype: object
------
installment
33    The monthly payment owed by the borrower if the loan originates.
Name: Description, dtype: object
------
grade
25    LC assigned loan grade
Name: Description, dtype: object
------
sub_grade
92    LC assigned loan subgrade
Name: Description, dtype: object
------
emp_length
19 

In [4]:
# Of the above, the following are not available during the loan process. So, dropping them.
to_drop_list = ('out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_amnt', 'last_pymnt_d_month', 'last_pymnt_d_year')

# Show the data-dictionary description of each column that is about to be removed.
for f in to_drop_list:
    print(f)
    print(loan_dict.loc[loan_dict['LoanStatNew'] == f, 'Description'])

# Drop all of them in one call, naming the axis through `columns=`.
# Passing the axis positionally (`df.drop(f, 1)`) raises TypeError on pandas >= 2.0,
# and dropping one column per iteration copies the whole frame each time.
df = df.drop(columns=list(to_drop_list))

out_prncp
80    Remaining outstanding principal for total amount funded
Name: Description, dtype: object
out_prncp_inv
81    Remaining outstanding principal for portion of total amount funded by investors
Name: Description, dtype: object
total_pymnt
105    Payments received to date for total amount funded
Name: Description, dtype: object
total_pymnt_inv
106    Payments received to date for portion of total amount funded by investors
Name: Description, dtype: object
total_rec_prncp
109    Principal received to date
Name: Description, dtype: object
total_rec_int
107    Interest received to date
Name: Description, dtype: object
total_rec_late_fee
108    Late fees received to date
Name: Description, dtype: object
recoveries
89    post charge off gross recovery
Name: Description, dtype: object
collection_recovery_fee
11    post charge off collection fee
Name: Description, dtype: object
last_pymnt_amnt
39    Last total payment amount received
Name: Description, dtype: object
last_pymnt_d_mon

In [5]:
# Collect the names of all object-dtype columns; these are treated as the categorical features.
cat_features = df.select_dtypes(include='object').columns
print(cat_features)

Index(['grade', 'sub_grade', 'emp_length', 'home_ownership',
       'verification_status', 'purpose', 'addr_state', 'initial_list_status'],
      dtype='object')


In [6]:
# Now encode the categorical features.
# One get_dummies call replaces every listed column with '<column>_<value>' indicator
# columns, placed after the untouched columns in the order the features are listed.
# Doing it in a single call avoids joining and dropping one feature at a time, which
# copies the whole frame once per feature.
df = pd.get_dummies(df, columns=list(cat_features))

In [7]:
# Separate the predictors from the target: `features` is every column
# except 'int_rate', and `int_rate` is that column on its own.
features = df.drop(columns=['int_rate'])
int_rate = df.loc[:, 'int_rate']

In [8]:
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20,
# so importing from it fails on current releases.
from sklearn.model_selection import train_test_split

# Split the features and int_rate data into training and testing sets:
# 20% of the rows are held out for testing, and the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, int_rate, test_size=0.2, random_state=0
)

print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))



Training set has 615832 samples.
Testing set has 153958 samples.


In [9]:
from sklearn.metrics import mean_squared_error
import math

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    '''
    Fit `learner` on the first `sample_size` training rows, then time and
    evaluate it on the full training and testing sets.

    inputs:
       - learner: the learning algorithm to be trained and predicted on;
                  it must provide fit(X, y), predict(X) and score(X, y)
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: target training set
       - X_test: features testing set
       - y_test: target testing set (must support reset_index, e.g. a
                 pandas Series); it is left unmodified

    returns:
       dict with the keys
       - 'train_time': seconds spent inside learner.fit
       - 'pred_time': seconds spent predicting the testing set plus the
                      first 300 training rows
       - 'rmsd_train', 'rmsd_test': root-mean-square deviation of the
                      predictions on the full training / testing set
       - 'score_train', 'score_test': learner.score on the full
                      training / testing set

    Also prints the learner's class name with the sample size, and a table
    of the actual test targets next to the predicted values.
    '''

    results = {}

    # Fit the learner to the training data using slicing with 'sample_size'
    start = time()  # Get start time
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time()  # Get end time

    # Training time
    results['train_time'] = end - start

    # Time the predictions on the test set and on the first 300 training samples.
    start = time()  # Get start time
    predictions_test = learner.predict(X_test)
    learner.predict(X_train[:300])  # timed only; the result itself is not used
    end = time()  # Get end time

    # Total prediction time
    results['pred_time'] = end - start

    # RMSD on the full training set (not just the 300 timed rows)
    predictions_train = learner.predict(X_train)
    results['rmsd_train'] = math.sqrt(mean_squared_error(y_train, predictions_train))

    # RMSD on the test set, reusing the predictions computed above
    results['rmsd_test'] = math.sqrt(mean_squared_error(y_test, predictions_test))

    # Learner's own score on both sets
    results['score_train'] = learner.score(X_train, y_train)
    results['score_test'] = learner.score(X_test, y_test)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Line the actual targets up with the predictions by position. A re-indexed
    # copy is used so the caller's y_test keeps its index and stays aligned
    # with X_test after this function returns.
    actual = y_test.reset_index(drop=True)
    join_df = pd.concat([actual, pd.Series(predictions_test)], axis=1)
    print(join_df)

    # Return the results
    return results

In [10]:
from sklearn.linear_model import LinearRegression

# Fit a plain linear regression on every training row and collect its
# timing and error metrics.
clf = LinearRegression()

result = train_predict(
    learner=clf,
    sample_size=len(y_train),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
result

LinearRegression trained on 615832 samples.
        int_rate          0
0           9.75  11.150931
1          11.53  11.575572
2           6.89   6.439062
3          15.27  14.451910
4          13.65  13.134515
5           6.89   6.771915
6          17.09  15.957289
7          15.57  20.163815
8          13.11  12.496151
9          12.99  12.989650
10         16.49  17.934903
11          6.97   6.988404
12         15.99  15.235336
13         15.61  15.375147
14         12.49  12.505585
15          9.17   9.279707
16          8.90   8.558552
17          6.62   6.581677
18         10.99  10.499396
19         11.67  11.449944
20          9.99  10.659759
21         10.99  11.133884
22          7.66   8.050691
23          9.76   9.072993
24         10.15  10.019304
25         13.67  14.319516
26         14.99  14.436336
27          9.17   8.511939
28         17.99  17.356198
29          9.17   9.290013
...          ...        ...
153928     13.33  13.720895
153929     17.56  17.359113
1539

{'train_time': 20.14993119239807,
 'pred_time': 0.7215864658355713,
 'rmsd_train': 0.8058626066319577,
 'rmsd_test': 0.8070056447306481,
 'score_train': 0.9693721333826163,
 'score_test': 0.9693350278533293}