In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time

%matplotlib inline
pd.set_option('max_colwidth',1000)

In [2]:
# Load the loan data
df = pd.read_pickle('data_cleaned.pkl')
df.reset_index(drop=True, inplace=True)

# Remove loan_status as it is one of the items we will predict
df.drop('loan_status', axis=1, inplace=True)

In [3]:
# Let's get all the categorical features
cat_features = df.select_dtypes(include=['object']).columns
print(cat_features)
# for y in df.select_dtypes(include=['object']).columns:
    # print(y, df[y].dtype)

Index(['grade', 'sub_grade', 'emp_length', 'home_ownership',
       'verification_status', 'pymnt_plan', 'purpose', 'addr_state',
       'initial_list_status', 'application_type'],
      dtype='object')


In [4]:
# Now encode the categorical features

for y in cat_features:
    df = df.join(pd.get_dummies(df[y], prefix=y))
    df.drop(y, axis=1, inplace=True)

In [5]:
# Split the data in features and target label
int_rate = df['int_rate']
features = df.drop('int_rate', axis=1)

In [6]:
from sklearn.cross_validation import train_test_split

# Split the features and int_rate data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, int_rate, test_size = 0.2, random_state = 0)

print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))



Training set has 628476 samples.
Testing set has 157119 samples.


In [7]:
from sklearn.metrics import mean_squared_error
import math

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): 
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    
    results = {}
    
    # TODO: Fit the learner to the training data using slicing with 'sample_size'
    start = time() # Get start time
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time() # Get end time
    
    # TODO: Calculate the training time
    results['train_time'] = end - start
        
    # TODO: Get the predictions on the test set,
    #       then get predictions on the first 300 training samples
    start = time() # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    end = time() # Get end time
       
    # TODO: Calculate the total prediction time
    results['pred_time'] = end - start
            
    # TODO: Compute accuracy on the first 300 training samples
    results['rmsd_train'] = math.sqrt(mean_squared_error(y_train, learner.predict(X_train)))
        
    # TODO: Compute accuracy on test set
    results['rmsd_test'] = math.sqrt(mean_squared_error(y_test, learner.predict(X_test)))
    
    # TODO: Compute accuracy on test set
    results['score_train'] = learner.score(X_train, y_train)
    results['score_test'] = learner.score(X_test, y_test)
    
    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
    
    y_test.reset_index(drop=True, inplace=True)
    join_df = pd.concat([y_test, pd.Series(learner.predict(X_test))], axis=1)
    print(join_df)
        
    # Return the results
    return results

In [8]:
from sklearn.linear_model import LinearRegression

clf = LinearRegression()

result = train_predict(clf, len(y_train), X_train, y_train, X_test, y_test)
result

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').