In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time

%matplotlib inline
# Show up to 1000 characters per cell when displaying frames. The fully
# qualified option key is used because abbreviated keys are resolved by
# pattern matching and can become ambiguous if pandas adds similar options.
pd.set_option('display.max_colwidth', 1000)

In [43]:
# Read the cleaned loan data and replace its index with a fresh 0..n-1 range.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by a trusted source.
df = pd.read_pickle('data_cleaned.pkl').reset_index(drop=True)

In [44]:
# Categorical features from data-cleanup.ipynb
cat_features = ['grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status',
                'purpose', 'addr_state', 'initial_list_status', 'application_type', 'disbursement_method']

# One-hot encode all categorical columns in a single call. get_dummies removes
# each listed column and appends its indicator columns (named
# '<column>_<value>') in the order given by cat_features. This avoids
# re-copying the whole frame once per feature, which a join/drop loop does.
df = pd.get_dummies(df, columns=cat_features)

In [45]:
# Overall row count
n_records = df.shape[0]

# Rows whose loan status is 'Fully Paid'
n_fully_paid = int((df.loan_status == 'Fully Paid').sum())

# Rows whose loan status is 'Charged Off'
n_charged_off = int((df.loan_status == 'Charged Off').sum())

# Charged-off loans as a percentage of all records
loan_default = 100 * float(n_charged_off) / n_records

print("Total number of records: {}".format(n_records))
print("Total number of good loans: {}".format(n_fully_paid))
print("Total number of bad loans: {}".format(n_charged_off))
print("Percentage of bad loans: {}".format(loan_default))

Total number of records: 830685
Total number of good loans: 659424
Total number of bad loans: 171261
Percentage of bad loans: 20.616840318532297


In [46]:
# Separate the predictors from the target column
features = df.drop(columns=['loan_status'])
loan_status_raw = df['loan_status']

# Binary-encode the target:
#   'Fully Paid'      -> 1
#   any other status  -> 0 (i.e. 'Charged Off')
loan_status = loan_status_raw.apply(lambda status: 1 if status == 'Fully Paid' else 0)

In [47]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (available since 0.18). The fallback keeps
# the cell working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(features, loan_status, test_size=0.2, random_state=0)

print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 664548 samples.
Testing set has 166137 samples.


In [48]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test,
                  train_eval_size=300, beta=0.5):
    '''
    Fit a learner on a leading slice of the training data, then time and
    score its predictions on the test set and on a small training subset.

    inputs:
       - learner: the learning algorithm to be trained and predicted on
                  (must provide fit(X, y) and predict(X))
       - sample_size: the number of leading training samples used for fitting
       - X_train: features training set
       - y_train: target labels for the training set
       - X_test: features testing set
       - y_test: target labels for the testing set
       - train_eval_size: the number of leading training samples used to
                          compute the training-side scores (default 300)
       - beta: the beta passed to fbeta_score (default 0.5)

    returns:
       dict with keys 'train_time', 'pred_time' (seconds), 'acc_train',
       'acc_test' (accuracy_score) and 'f_train', 'f_test' (fbeta_score).
    '''

    results = {}

    # Fit on the first `sample_size` rows and record how long fitting takes
    start = time()
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time()
    results['train_time'] = end - start

    # Training subset used for the training-side scores
    X_train_eval = X_train[:train_eval_size]
    y_train_eval = y_train[:train_eval_size]

    # Predict once per dataset and time both calls together; the stored
    # predictions are reused for every metric below instead of calling
    # predict() again for each score.
    start = time()
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train_eval)
    end = time()
    results['pred_time'] = end - start

    # Accuracy on the training subset and on the test set
    results['acc_train'] = accuracy_score(y_train_eval, predictions_train)
    results['acc_test'] = accuracy_score(y_test, predictions_test)

    # F-beta score on the training subset and on the test set
    results['f_train'] = fbeta_score(y_train_eval, predictions_train, beta=beta)
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=beta)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Return the results
    return results

In [50]:
from sklearn.naive_bayes import GaussianNB

# Baseline model: Gaussian Naive Bayes fitted on every training sample.
# NOTE(review): the recorded run below reports ~0.39 test accuracy, while about
# 79% of the loans are 'Fully Paid' -- i.e. worse than always predicting the
# majority class. Worth investigating before relying on this baseline.
clf = GaussianNB()
result = train_predict(clf, y_train.shape[0], X_train, y_train, X_test, y_test)
result

GaussianNB trained on 664548 samples.


{'train_time': 2.950104236602783,
 'pred_time': 0.6878616809844971,
 'acc_train': 0.45,
 'acc_test': 0.3935727742766512,
 'f_train': 0.6629834254143647,
 'f_test': 0.6091418829195067}