# Using Current Data To Make Recommendations
### Model #1: (Classification - Fully Paid vs. Default) 
### Model #2: ( Regression - Annualized Return %)

In [1]:
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
from functions_and_objects import (impute_means_zeros_maxs_X,parse_percentage,
                                   clean_new_LC_data_classification_current,preprocessing_current, 
                                   columns_list, nan_max_cols, nan_mean_cols, nan_zero_cols, dtype,
                                   one_hot_encode_current, concat_X_and_6ohe_dfs,
                                   prep_all_df_for_classification)
from functions_and_objects import (prep_df_for_regression_current,
                                   calc_annu_return,impute_annu_return_to_y,
                                   scale_current)

import joblib
from sklearn.metrics import (accuracy_score, precision_score, recall_score, #classification
                             f1_score, roc_auc_score, roc_curve,precision_recall_fscore_support, #classification
                             r2_score,mean_squared_error) #regression

%load_ext autoreload
%autoreload 2

### Import New Data from 2018-2019

- These files contain complete loan data for all loans issued through the time period stated, including the current loan status (Current, Late, Fully Paid, etc.) and latest payment information. The file containing loan data through the "present" contains complete loan data for all loans issued through the previous completed calendar quarter.

In [2]:
raw_q12019 = pd.read_csv('~/peervest/lc_data/LoanStats_securev1_2019Q1.csv',header=1)
raw_q42018 = pd.read_csv('~/peervest/lc_data/LoanStats_securev1_2018Q4.csv',header=1)
raw_q32018 = pd.read_csv('~/peervest/lc_data/LoanStats_securev1_2018Q3.csv',header=1)
raw_q22018 = pd.read_csv('~/peervest/lc_data/LoanStats_securev1_2018Q2.csv',header=1)
raw_q12018 = pd.read_csv('~/peervest/lc_data/LoanStats_securev1_2018Q1.csv',header=1)
dfs_list = [raw_q42018,
            raw_q32018,
            raw_q22018,
            raw_q12018]

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


### Clean Data

In [3]:
#use cleaning function "current" because 
#we're making predictions on current loans
#rather than using labelled/completed data

clean_lc_df_current = clean_new_LC_data_classification_current(dfs_list) #takes in dfs_list, outputs clean_lc_df

  raw_lc_df['earliest_cr_line'] = pd.to_timedelta(pd.to_datetime(raw_lc_df['earliest_cr_line'])).dt.days


### Preprocess, no train-test-split: entire dataset is our test data

In [4]:
# takes in clean_lc_df_current, outputs X_current, y_current
X_current, y_current = preprocessing_current(clean_lc_df_current)

In [5]:
print(X_current.shape)
print(y_current.shape)

(361423, 101)
(361423, 1)


### One Hot Encode for both Models

In [6]:
(ohe_home_ownership, ohe_purpose, ohe_zip_code, 
 ohe_application_type, ohe_sub_grade, ohe_emp_title_2) = one_hot_encode_current(X_current)

In [7]:
X_current_classif = concat_X_and_6ohe_dfs(X_current, ohe_home_ownership, ohe_purpose, ohe_zip_code,
                                          ohe_application_type, ohe_sub_grade, ohe_emp_title_2)
X_current_regr = concat_X_and_6ohe_dfs(X_current, ohe_home_ownership, ohe_purpose, ohe_zip_code, 
                                       ohe_application_type, ohe_sub_grade, ohe_emp_title_2)

In [8]:
print(X_current_classif.shape)
print(X_current_regr.shape)

(361423, 1133)
(361423, 1133)


### Prep X_current for Classification

- reset index

In [9]:
X_current_classif.set_index('index',inplace=True)

- drop non-numeric & OHE source columns

In [10]:
prep_all_df_for_classification(X_current_classif) #drops columns inplace

In [11]:
X_current_classif.shape

(361423, 1124)

### X_current is ready to input into Classification Model

In [22]:
print(y_current.shape)
y_current.head()

(361423, 1)


Unnamed: 0,class_pred
0,
1,
2,
3,
4,


###### Load Model JobLib

In [15]:
loaded_log_reg_v1 = joblib.load('log_reg_v1.joblib')

In [17]:
current_class_preds = loaded_log_reg_v1.predict(X_current_classif)
current_class_preds_proba = loaded_log_reg_v1.predict_proba(X_current_classif)

In [18]:
len(current_class_preds_proba)

361423

In [28]:
loaded_log_reg_v1.classes_

array([0, 1])

In [50]:
current_class_preds_proba[:,0]

array([1., 1., 1., ..., 1., 1., 1.])

In [57]:
y_current['prob_default'] = current_class_preds_proba[:,0]

### Prep X_current for Regression

- reset index

In [52]:
X_current_regr.set_index('index',inplace=True)

- calculate annual return & create new target dataframe

In [53]:
# returns the annualized return column as a DF (y_future_regr) & both target columns as a dataframe (y_future)
y_current_regr, y_current = impute_annu_return_to_y(X_current_regr,y_current)

- drop non-numeric & OHE source columns

In [54]:
prep_df_for_regression_current(X_current_regr)

In [55]:
X_current_regr_scaled = scale_current(X_current_regr)

In [68]:
X_current_regr_scaled.shape

(361423, 1124)

###### Load Ridge Regression Model JobLib

In [65]:
loaded_ridge_reg_v1 = joblib.load('ridge_lin_reg_v1.joblib')

In [66]:
current_return_preds = loaded_ridge_reg_v1.predict(X_current_regr_scaled)

- Positive annu_return values where loan_status==0 likely indicates a successful debt collection process

In [71]:
y_current

Unnamed: 0,annu_return,prob_default
0,-0.963885,1.000000
1,-0.972364,1.000000
2,-0.859431,1.000000
3,-0.980185,1.000000
4,-0.982816,1.000000
5,-0.963888,1.000000
6,-0.988391,1.000000
7,-0.983512,1.000000
8,-0.984809,1.000000
9,-0.978654,1.000000


In [72]:
current_return_preds

array([[ 0.01616499],
       [-0.02710525],
       [ 0.25884256],
       ...,
       [ 0.09232062],
       [ 0.2400825 ],
       [ 0.29361946]])

In [73]:
y_predictions = y_current
y_predictions['return_preds'] = current_return_preds

In [74]:
y_predictions.drop(columns=['annu_return'])

Unnamed: 0,annu_return,prob_default,return_preds
0,-0.963885,1.000000,0.016165
1,-0.972364,1.000000,-0.027105
2,-0.859431,1.000000,0.258843
3,-0.980185,1.000000,-0.094744
4,-0.982816,1.000000,-0.454300
5,-0.963888,1.000000,-0.054143
6,-0.988391,1.000000,-0.583992
7,-0.983512,1.000000,-0.130022
8,-0.984809,1.000000,-0.273052
9,-0.978654,1.000000,-0.264439
