In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time

%matplotlib inline
# Let long text cells render (up to 1000 characters) instead of being truncated
pd.set_option('display.max_colwidth', 1000)

In [2]:
# Read the cleaned loan data and renumber its rows 0..n-1.
# NOTE(review): unpickling executes arbitrary code -- only load a file you produced yourself.
df = pd.read_pickle('data_cleaned.pkl').reset_index(drop=True)

In [3]:
# Categorical features from data-cleanup.ipynb
cat_features = ['grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status', 
                'purpose', 'addr_state', 'initial_list_status', 'application_type', 'disbursement_method',
                'loan_status']

# One-hot encode every categorical column in a single pass. The source columns are
# dropped and the indicator columns (named '<column>_<value>') are appended after the
# remaining columns, in the order given by cat_features. This avoids copying the whole
# frame once per feature, which a join/drop loop would do.
df = pd.get_dummies(df, columns=cat_features)

In [4]:
# Predictors are every column except the interest rate; the interest rate is the target
features = df.drop(labels=['int_rate'], axis=1)
int_rate = df['int_rate']

In [5]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split now lives
# in sklearn.model_selection. Fall back to the old module only on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split the features and int_rate data into training and testing sets
# (20% held out for testing; random_state fixed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(features, int_rate, test_size = 0.2, random_state = 0)

print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))



Training set has 664548 samples.
Testing set has 166137 samples.


In [6]:
from sklearn.metrics import mean_squared_error
import math

def _rmse(y_true, y_pred):
    '''Return the root-mean-squared error between two equal-length sequences of numbers.'''
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean(residuals ** 2)))


def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): 
    '''
    Fit a regressor on the leading rows of the training data, then report
    timings and error metrics.

    inputs:
       - learner: estimator exposing fit(X, y), predict(X) and score(X, y)
       - sample_size: number of leading training rows used for fitting
       - X_train: features training set
       - y_train: target training set
       - X_test: features testing set
       - y_test: target testing set (a pandas Series; it is not modified)

    returns:
       dict with the keys
       - 'train_time': seconds spent in learner.fit
       - 'pred_time': seconds spent predicting the test set plus the first
         300 training rows
       - 'rmsd_train', 'rmsd_test': root-mean-squared error of the predictions
       - 'score_train', 'score_test': value of learner.score
       The two train metrics are computed over all of X_train, not only the
       rows the learner was fitted on.

    Side effect: prints the learner's class name and a frame that pairs the
    test targets with the test predictions.
    '''
    
    results = {}
    
    # Fit on the first `sample_size` training rows and time the fit
    start = time()
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    results['train_time'] = time() - start
        
    # Time prediction on the test set and on the first 300 training rows.
    # The test predictions are kept and reused below so the (potentially
    # expensive) predict call on X_test runs only once.
    start = time()
    predictions_test = learner.predict(X_test)
    learner.predict(X_train[:300])  # only timed; the result is not needed
    results['pred_time'] = time() - start
            
    # Root-mean-squared error on the whole training set and on the test set
    results['rmsd_train'] = _rmse(y_train, learner.predict(X_train))
    results['rmsd_test'] = _rmse(y_test, predictions_test)
    
    # Learner-defined score on both sets
    results['score_train'] = learner.score(X_train, y_train)
    results['score_test'] = learner.score(X_test, y_test)
    
    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
    
    # Show actual vs. predicted side by side. reset_index returns a copy, so the
    # caller's y_test keeps its original index; the renumbering is only needed
    # so the two columns line up row by row.
    actual = y_test.reset_index(drop=True)
    join_df = pd.concat([actual, pd.Series(predictions_test)], axis=1)
    print(join_df)
        
    # Return the results
    return results

In [7]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares fitted on every training row
clf = LinearRegression()
result = train_predict(
    learner=clf,
    sample_size=len(y_train),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
result

LinearRegression trained on 664548 samples.
        int_rate          0
0             11  11.311425
1             10   9.453917
2             13  13.961180
3              7   7.198704
4             13  13.931096
5             12  12.235610
6             14  13.186475
7              6   5.804977
8              6   6.573946
9              8   7.717676
10            20  20.353740
11             9   8.827323
12            17  17.143427
13            15  14.525209
14            11  10.350809
15            16  15.916940
16             7   7.535429
17             6   6.208537
18            10  10.260861
19            14  13.478954
20            12  11.918019
21            10   9.439450
22            12  12.263324
23            13  13.064919
24             8   7.618672
25            13  13.040381
26            13  13.773460
27            13  12.994089
28             9   9.773574
29            17  16.371885
...          ...        ...
166107        14  14.099928
166108        22  22.823580
1661

{'train_time': 38.606746673583984,
 'pred_time': 1.8010108470916748,
 'rmsd_train': 0.8861268533480376,
 'rmsd_test': 0.88417360179591,
 'score_train': 0.9630688771414035,
 'score_test': 0.963422512110516}