In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time

%matplotlib inline
# Use the fully qualified option key: abbreviated keys are resolved by pattern
# matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 1000)

In [2]:
# Read the pickled loan table and renumber its rows 0..n-1, discarding the stored index
df = pd.read_pickle('data_cleaned.pkl').reset_index(drop=True)

In [3]:
# Categorical features from data-cleanup.ipynb
cat_features = ['grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status',
                'purpose', 'addr_state', 'initial_list_status', 'application_type', 'disbursement_method',
                'loan_status']

# One-hot encode every categorical column in a single pass. Each listed column is
# replaced by indicator columns named '<column>_<value>', placed after the untouched
# columns in the order of cat_features. A single call avoids re-copying the whole
# frame once per feature, which a per-column join/drop loop does.
# NOTE(review): 'loan_status' is used as a predictor of int_rate; presumably it is only
# known after the loan is issued -- confirm this is not target leakage.
df = pd.get_dummies(df, columns=cat_features)

In [4]:
# Separate the predictor columns from the regression target ('int_rate')
features = df.drop(columns=['int_rate'])
int_rate = df['int_rate']

In [5]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(features, int_rate, test_size=0.2, random_state=0)

print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))



Training set has 664548 samples.
Testing set has 166137 samples.


In [6]:
from sklearn.metrics import mean_squared_error
import math

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    '''
    Fit `learner` on the first `sample_size` training rows, then report timing
    and fit quality on the training and test sets.

    inputs:
       - learner: estimator exposing fit(X, y), predict(X) and score(X, y)
       - sample_size: number of leading training rows to fit on
       - X_train: features training set
       - y_train: target training set
       - X_test: features testing set
       - y_test: target testing set (left unmodified)

    returns:
       dict with keys
       - 'train_time': seconds spent in learner.fit
       - 'pred_time': seconds spent predicting the test set plus the first
         300 training rows
       - 'rmsd_train', 'rmsd_test': root-mean-square error on the full
         training set and on the test set
       - 'score_train', 'score_test': learner.score on the full training set
         and on the test set

    Also prints a confirmation line and a table of actual vs. predicted
    test-set values.
    '''

    results = {}

    # Fit on the leading `sample_size` rows only and time the fit
    start = time()
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time()
    results['train_time'] = end - start

    # Time prediction on the test set plus the first 300 training rows.
    # The 300-row call is kept solely so 'pred_time' measures the same work
    # as before; its result is not needed.
    start = time()
    predictions_test = learner.predict(X_test)
    learner.predict(X_train[:300])
    end = time()
    results['pred_time'] = end - start

    # Root-mean-square error. Note the training error is measured on the FULL
    # training set, not just the `sample_size` rows that were fitted.
    results['rmsd_train'] = math.sqrt(mean_squared_error(y_train, learner.predict(X_train)))
    # Reuse the test predictions instead of predicting the test set again
    results['rmsd_test'] = math.sqrt(mean_squared_error(y_test, predictions_test))

    # Learner-defined score on the full training set and on the test set
    results['score_train'] = learner.score(X_train, y_train)
    results['score_test'] = learner.score(X_test, y_test)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Re-index a copy for the side-by-side table: resetting y_test in place
    # would break its index alignment with X_test for the caller.
    actual = y_test.reset_index(drop=True)
    join_df = pd.concat([actual, pd.Series(predictions_test)], axis=1)
    print(join_df)

    # Return the results
    return results

In [7]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares fitted on the entire training split
clf = LinearRegression()

result = train_predict(
    learner=clf,
    sample_size=len(y_train),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
result

LinearRegression trained on 664548 samples.
        int_rate          0
0             13  13.784202
1             16  16.828891
2             11  11.727507
3              6   6.015516
4             19  20.322893
5              7   7.232547
6              7   7.369115
7              8   7.683018
8             19  17.710133
9             24  23.888291
10            12  11.718896
11            24  25.035283
12            17  16.148187
13            16  16.177130
14            14  13.644842
15             8   8.214519
16            30  24.477509
17            17  15.833370
18             6   5.149336
19            19  18.990720
20            12  11.917043
21            15  14.306424
22            16  16.154196
23            16  15.726226
24             7   7.098337
25            12  12.030966
26            10   9.210803
27            10   9.656572
28            11  10.934964
29            15  14.631769
...          ...        ...
166107        18  19.327682
166108        10   9.812098
1661

{'train_time': 3.2793219089508057,
 'pred_time': 0.1743617057800293,
 'rmsd_train': 0.8853781456372407,
 'rmsd_test': 0.8872887395342114,
 'score_train': 0.9631325799234893,
 'score_test': 0.963158387391249}