# Using Future Data As A Test Set
### Model #1: (Classification - Fully Paid vs. Default) 
### Model #2: ( Regression - Annualized Return %)

In [1]:
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
from functions_and_objects import (impute_means_zeros_maxs_X,parse_percentage,
                                   clean_LC_data_classification_eval,preprocessing_future_test, 
                                   columns_list, nan_max_cols, nan_mean_cols, nan_zero_cols, dtype,
                                   one_hot_encode_current, concat_X_and_6ohe_dfs,
                                   prep_all_df_for_classification)
from functions_and_objects import (prep_df_for_regression_current,
                                   calc_annu_return,impute_annu_return_to_y,
                                   scale_current)

import joblib
from sklearn.metrics import (accuracy_score, precision_score, recall_score, #classification
                             f1_score, roc_auc_score, roc_curve,precision_recall_fscore_support, #classification
                             r2_score,mean_squared_error) #regression

%load_ext autoreload
%autoreload 2

### Import New Data from 2018-2019

In [2]:
raw_q12019 = pd.read_csv('~/peervest/lc_data/LoanStats_securev1_2019Q1.csv',header=1)
raw_q42018 = pd.read_csv('~/peervest/lc_data/LoanStats_securev1_2018Q4.csv',header=1)
raw_q32018 = pd.read_csv('~/peervest/lc_data/LoanStats_securev1_2018Q3.csv',header=1)
raw_q22018 = pd.read_csv('~/peervest/lc_data/LoanStats_securev1_2018Q2.csv',header=1)
raw_q12018 = pd.read_csv('~/peervest/lc_data/LoanStats_securev1_2018Q1.csv',header=1)
dfs_list = [raw_q42018,
            raw_q32018,
            raw_q22018,
            raw_q12018]

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


### Clean Data

In [3]:
#use eval instead of current because 
#we're making predictions on already completed loans & comparing
#rather than predicting blindly (which the app will do)

clean_lc_df_future = clean_LC_data_classification_eval(dfs_list) #takes in dfs_list, outputs clean_lc_df

  raw_lc_df['earliest_cr_line'] = pd.to_timedelta(pd.to_datetime(raw_lc_df['earliest_cr_line'])).dt.days


### Preprocess, no train-test-split: entire dataset is our test data

In [4]:
X_future, y_future = preprocessing_future_test(clean_lc_df_future)# takes in clean_lc_df_current, outputs X_current, y_current

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  y_future['loan_status'] = y_future['loan_status'].astype(int)


### One Hot Encode for both Models

In [5]:
(ohe_home_ownership, ohe_purpose, ohe_zip_code, 
 ohe_application_type, ohe_sub_grade, ohe_emp_title_2) = one_hot_encode_current(X_future)

In [6]:
X_future_classif = concat_X_and_6ohe_dfs(X_future, ohe_home_ownership, ohe_purpose, ohe_zip_code, 
                                         ohe_application_type, ohe_sub_grade, ohe_emp_title_2)
X_future_regr = concat_X_and_6ohe_dfs(X_future, ohe_home_ownership, ohe_purpose, ohe_zip_code, 
                                      ohe_application_type, ohe_sub_grade, ohe_emp_title_2)

### Prep X_future for Classification

- reset index

In [7]:
X_future_classif.set_index('index',inplace=True)

- drop non-numeric & OHE source columns

In [8]:
prep_all_df_for_classification(X_future_classif)

In [9]:
X_future_classif.shape

(67586, 1124)

### X_future is ready to input into Classification Model

###### Load Model JobLib

In [10]:
loaded_log_reg_v1 = joblib.load('log_reg_v1.joblib')

In [11]:
future_class_preds = loaded_log_reg_v1.predict(X_future_classif)
future_class_preds_proba = loaded_log_reg_v1.predict_proba(X_future_classif)

In [91]:
loaded_log_reg_v1.predict_proba(X_future_classif)

array([[2.89095603e-003, 9.97109044e-001],
       [1.80524177e-006, 9.99998195e-001],
       [1.37538903e-007, 9.99999862e-001],
       ...,
       [1.00000000e+000, 6.90269850e-235],
       [2.59676371e-004, 9.99740324e-001],
       [4.60361519e-004, 9.99539638e-001]])

In [12]:
print ("Precision Fully Paid: {}".format(precision_score(y_future,future_class_preds)))
print ("Recall Fully Paid: {}".format(recall_score(y_future,future_class_preds)))
print ("F-1 Score Fully Paid: {}".format(f1_score(y_future,future_class_preds)))
print ("ROC-AUC Score: {}".format(roc_auc_score(y_future,future_class_preds)))

Precision Fully Paid: 0.9999470815473356
Recall Fully Paid: 0.9998412614424044
F-1 Score Fully Paid: 0.9998941686951001
ROC-AUC Score: 0.9997828770248113


In [13]:
#precision, recall, f1-score
print ("Precision, Recall, F, & Support By Class [0,1] aka [Default,Fully Paid]: {}".format(
    precision_recall_fscore_support(y_future,future_class_preds)))

print ("Precision, Recall, F, & Support Weighted Average by Support: {}".format(
    precision_recall_fscore_support(y_future,future_class_preds,average='weighted')))

Precision, Recall, F, & Support By Class [0,1] aka [Default,Fully Paid]: (array([0.99917393, 0.99994708]), array([0.99972449, 0.99984126]), array([0.99944914, 0.99989417]), array([10889, 56697]))
Precision, Recall, F, & Support Weighted Average by Support: (0.9998225170729338, 0.9998224484360666, 0.9998224681900859, None)


- add confusion matrix
- add ROC Curve

### Prep X_future for Regression

- reset index

In [14]:
X_future_regr.set_index('index',inplace=True)

- calculate annual return & create new target dataframe

In [15]:
# returns the annualized return column as a DF (y_future_regr) & both target columns as a dataframe (y_future)
y_future_regr, y_future = impute_annu_return_to_y(X_future_regr,y_future)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  target_df['annu_return'] = calc_annu_return(input_df)


- drop non-numeric & OHE source columns

In [16]:
prep_df_for_regression_current(X_future_regr)

In [17]:
X_future_regr_scaled = scale_current(X_future_regr)

###### Load Ridge Regression Model JobLib

In [18]:
loaded_ridge_reg_v1 = joblib.load('ridge_lin_reg_v1.joblib')

In [19]:
future_return_preds = loaded_ridge_reg_v1.predict(X_future_regr_scaled)

### Evaluate Regression Model - No Filter/All Completed Loans

In [57]:
print("FUTURE TEST - ERROR METRICS:")
print("R-Squared: {}".format(r2_score(y_future_regr, future_return_preds)))
print("Mean Squared Error: {}".format(mean_squared_error(y_future_regr, future_return_preds)))
print("Root Mean Squared Error: {}".format(sqrt(mean_squared_error(y_future_regr, future_return_preds))))

FUTURE TEST - ERROR METRICS:
R-Squared: 0.6899769300520536
Mean Squared Error: 0.043807439892345
Root Mean Squared Error: 0.2093022692001809


### Conclusion:
- My Logistic Classification Model is generalizable with future data 
- My Ridge Regression Model is also generalizable with future data
- The Regression Model performs worse when you filter out by Probability of Default, though it will be more useful to train a model like that first

##### Extra Scratch Work

### Evaluate Regression Model - Probability of Default Filter

In [59]:
y_future_regr['prob_default'] = future_class_preds_proba[:,0].reshape(-1,1)

In [63]:
#Add Predicted Probability of Default column to X_future_regr
future_return_preds_def = pd.DataFrame(np.hstack((future_return_preds,future_class_preds_proba[:,0].reshape(-1,1))),
                                       index=X_future_regr_scaled.index,
                                       columns=['annu_return','prob_default'])

In [102]:
future_return_preds_filter = future_return_preds_def['annu_return'][future_return_preds_def['prob_default'] < 0.3]
y_future_regr_filter = y_future_regr['annu_return'][y_future_regr['prob_default'] < 0.3]

In [104]:
print("FUTURE TEST - ERROR METRICS - PROBABILITY of DEFAULT FILTER taking < 0.3:")
print("R-Squared: {}".format(r2_score(y_future_regr_filter, future_return_preds_filter)))
print("Mean Squared Error: {}".format(mean_squared_error(y_future_regr_filter, future_return_preds_filter)))
print("Root Mean Squared Error: {}".format(sqrt(mean_squared_error(y_future_regr_filter, future_return_preds_filter))))

FUTURE TEST - ERROR METRICS - PROBABILITY of DEFAULT FILTER taking < 0.3:
R-Squared: -0.4493404600061739
Mean Squared Error: 0.003906704780431792
Root Mean Squared Error: 0.06250363813756597


In [101]:
y_future[y_future['loan_status'] == 0].sort_values(by='annu_return',ascending=False).head(22)

Unnamed: 0,loan_status,annu_return
39790,0,1.120226
24309,0,0.612155
47174,0,0.476052
61623,0,0.382404
36896,0,0.231999
47931,0,0.195198
46818,0,0.1843
65316,0,0.165008
43972,0,0.15669
44689,0,0.150734


- Positive annu_return values where loan_status==0 likely indicates a successful debt collection process