## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pdp
import matplotlib as mpl
mpl.rcParams['font.size'] = 15.0
%matplotlib inline
import imp
plt.style.use('seaborn-darkgrid')
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBRFClassifier,XGBClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
from numpy.linalg import svd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, NMF
from os import listdir
import warnings
from xgboost import XGBRFClassifier
import tensorflow as tf
warnings.filterwarnings('ignore')
import ast

In [None]:
import src.LC_Clean_Assist as LCC
import src.LC_Plotter as LCP
import src.LC_Transformer as LCT
import src.LC_Models as LCM
# Re-import the project helper modules so that edits made on disk are picked up
# without restarting the kernel. `imp` is deprecated and was removed in
# Python 3.12; `importlib.reload` is the supported equivalent.
import importlib

importlib.reload(LCP)
importlib.reload(LCC)
importlib.reload(LCT)
importlib.reload(LCM)

## Loading Data

In [None]:
# Load the compiled loan data. low_memory=False makes pandas infer column
# dtypes from the whole file instead of chunk by chunk.
df = pd.read_csv('data/LC_Compiled.csv', low_memory=False)
# df.shape is (rows, columns), i.e. (loans, features).
print('{} Loans and {} Features'.format(*df.shape))

## Data Exploration

In [None]:
# Project helper; presumably prepares the columns used by the plotting cells
# below (see src/LC_Clean_Assist). Note that it rebinds `df`.
df = LCC.clean_lc_for_plotting(df)

# Total loan amount per loan status. `loan_amnt` is selected *before*
# aggregating so that pandas does not try to sum every other column (object
# columns are concatenated or raise under pandas >= 2.0, and the result of that
# extra work was discarded anyway).
ls_df = df.groupby('loan_status')['loan_amnt'].sum().reset_index()
# Express the totals in millions, rounded to one decimal place.
ls_df['loan_amnt'] = (ls_df['loan_amnt'] / 1000000).round(1)
ls_df

In [None]:
LCP.plot_loan_breakdown_pie(df)
LCP.plot_grade_breakdown_pie(df)

# Per-grade totals of the principal-related columns. The columns are selected
# before `.sum()` so that only they are aggregated; summing the whole frame
# concatenates or fails on non-numeric columns under pandas >= 2.0.
amount_cols = ['loan_amnt', 'total_rec_prncp', 'out_prncp', 'delinq_amnt', 'charged_off_amnt']
grade_pie = df.groupby('grade')[amount_cols].sum().reset_index()

# "Bad debt" is the delinquent amount plus the charged-off amount.
grade_pie['bad_debt'] = grade_pie['delinq_amnt'] + grade_pie['charged_off_amnt']

# Scale the reported columns to millions and display them alongside the grade.
reported_cols = ['total_rec_prncp', 'out_prncp', 'bad_debt']
grade_pie[reported_cols] = grade_pie[reported_cols] / 1000000
grade_pie[['grade'] + reported_cols]

In [None]:
LCP.choro_debt_state(df)
LCP.choro_debt_state_count(df)

# Three states with the highest mean loan amount. Only `loan_amnt` is averaged:
# calling `.mean()` on the whole grouped frame raises on non-numeric columns
# under pandas >= 2.0 and computes means that are never used.
(
    df.groupby('addr_state')['loan_amnt']
    .mean()
    .reset_index()
    .sort_values(by='loan_amnt', ascending=False)
    .head(3)
)

In [None]:
LCP.lc_time_series(df)

# Totals per issue date. numeric_only=True keeps every numeric column (the
# behaviour older pandas had by default) while skipping object/datetime columns
# that cannot be summed meaningfully and raise or concatenate on pandas >= 2.0.
ts_group = df.groupby('issue_d').sum(numeric_only=True).reset_index()

# Row label of the issue date with the largest total loan amount. After
# reset_index the index is 0..n-1, so the label is also the row position.
max_idx = ts_group['loan_amnt'].idxmax()
ts_group.loc[[max_idx], ['issue_d', 'loan_amnt']]

In [None]:
# The year is the text before the first '-' in the string form of `issue_d`.
ts_group['year'] = ts_group['issue_d'].apply(lambda x: str(x).split('-')[0])
# Loan amount issued per year, in billions. Only `loan_amnt` is summed: summing
# the whole frame would also try to aggregate `issue_d`, which fails (or
# concatenates strings) on pandas >= 2.0.
ts_group.groupby('year')['loan_amnt'].sum() / 1000000000

In [None]:
# Project plotting helper applied to the plotting-cleaned frame; presumably
# draws per-borrower profile charts — see src/LC_Plotter for what it renders.
LCP.lc_individual_profile(df)

In [None]:
# Re-clean the frame for modelling. This rebinds `df`, so the plotting-cleaned
# version produced by the earlier cells is no longer available afterwards.
df = LCC.clean_lc_for_models(df)
# Returns table from the project plotting helper; the cells below filter it on
# its `term` column (values 36 and 60).
returns = LCP.lc_returns(df)
returns

In [None]:
# Split the returns table by loan term first, then chart each subset.
is_36_month = returns['term'].eq(36)
is_60_month = returns['term'].eq(60)
returns_36 = returns.loc[is_36_month]
returns_60 = returns.loc[is_60_month]

# Arguments: data subset, chart title, image path.
LCP.lc_plot_returns(
    returns_36,
    'Returns by Grade (36 Month Term)',
    'images/36_month_returns.png',
)
LCP.lc_plot_returns(
    returns_60,
    'Returns by Grade (60 Month Term)',
    'images/60_month_returns.png',
)

In [None]:
# Annualized-return charts for the two term subsets built in the previous cell.
# Arguments: data subset, chart title, image path.
LCP.lc_plot_annualized_returns(returns_36,'Annualized Returns by Grade (36 Month Term)','images/36_month_annualized.png')
LCP.lc_plot_annualized_returns(returns_60,'Annualized Returns by Grade (60 Month Term)','images/60_month_annualized.png')

In [None]:
# Number of loans per `purpose` value, most frequent first.
df['purpose'].value_counts()

In [None]:
# Count the loans whose total payment equals installment * term exactly.
payment_gap = df['total_pymnt'] - df['installment'] * df['term']
df.loc[payment_gap == 0.0].shape[0]

In [None]:
# Count the loans whose total payment exceeds installment * term by at most 100
# (loans that paid less than scheduled are included as well).
scheduled_total = df['installment'] * df['term']
payment_gap = df['total_pymnt'] - scheduled_total
df.loc[payment_gap <= 100].shape[0]

In [None]:
# Scheduled total repayment per loan: the installment amount multiplied by the
# `term` value (36 or 60 in the cells above).
df['expected'] = df['installment'] * df['term']

In [None]:
# Project plotting helper applied to the modelling frame (which now includes the
# `expected` column added above); see src/LC_Plotter for the chart it produces.
LCP.lc_proportions_time(df)

## Feature Engineering

In [None]:
# Transform the modelling frame with the project transformer; the result is
# presumably a scaled feature matrix (going by the name) — confirm in
# src/LC_Transformer.
scaled_df = LCT.lc_transform(df)
# Plot the PCA diagnostics for the transformed features.
LCP.pca_plotter(scaled_df)

In [None]:
# PCA-transformed frame that is passed to LCM.final_system in the final-model
# cell. The second argument (20) is presumably the number of components kept —
# TODO confirm against src/LC_Transformer.
pca_df = LCT.get_pca_df(scaled_df,20)

## Grid Search

### Results
- The grid search produces the following CSV files, which are loaded in the next cell:
    - 'models/logm_optimized.csv'
    - 'models/rfc_optimized.csv'
    - 'models/gbc_optimized.csv'

In [None]:
# Load the saved grid-search results (logistic regression, random forest and
# gradient boosting, in that order) and stack them into one table.
logm_optimized, rfc_optimized, gbc_df = [
    pd.read_csv(results_path)
    for results_path in (
        'models/logm_optimized.csv',
        'models/rfc_optimized.csv',
        'models/gbc_optimized.csv',
    )
]
combined = pd.concat([logm_optimized, rfc_optimized, gbc_df])

# Returns by term.
LCP.plot_36m_returns(combined)
LCP.plot_60m_returns(combined)

# Precision by proportion.
LCP.plot_prec_by_prop(combined)

# Deployed capital by term.
LCP.plot_36m_deployed(combined)
LCP.plot_60m_deployed(combined)

# Returns against accuracy and precision, then profits against deployed capital.
LCP.plot_rets_v_acc(combined)
LCP.plot_rets_v_prec(combined)
LCP.profits_v_deployed(combined)

## Sharpe Ratios

In [None]:
# Build the compiled-model matrix from the grid-search results. The return value
# is not used again in the visible cells.
# NOTE(review): confirm whether this call is kept for a side effect (e.g.
# producing the CSV files read below) before removing it.
sharpe_matrix = LCT.get_compiled_models(combined)
# Two saved Sharpe result tables, stacked row-wise below.
df_opt = pd.read_csv('models/sharpe_optimized.csv')
df_r = pd.read_csv('models/sharpe_large_deployed.csv')
# NOTE: this rebinds `df`, replacing the loan-level frame used by earlier cells;
# re-running those cells after this one will operate on the Sharpe table instead.
# pd.concat keeps each input's own row labels, so the index may contain duplicates.
df = pd.concat([df_opt,df_r])
df_sharpe = LCM.sharpe_calc_df(df)
# NOTE(review): this plots `df`, not `df_sharpe` — that only works if
# sharpe_calc_df adds its columns to `df` in place; confirm.
LCP.plot_sharpe(df)

In [None]:
# Row(s) with the highest 60-month Sharpe ratio. The maximum is taken from
# `df_sharpe` itself (not from the separate `df` frame), so the comparison is
# guaranteed to refer to the same column that is being filtered.
df_sharpe[df_sharpe['Sharpe_60'] == df_sharpe['Sharpe_60'].max()]

In [None]:
# Row(s) with the highest 36-month Sharpe ratio. The maximum is taken from
# `df_sharpe` itself (not from the separate `df` frame), so the comparison is
# guaranteed to refer to the same column that is being filtered.
df_sharpe[df_sharpe['Sharpe_36'] == df_sharpe['Sharpe_36'].max()]

## Final Models

In [None]:
# Thresholds passed through to LCM.final_system; their exact meaning is defined
# there (presumably one cut-off per term model — TODO confirm).
p1 = 0.085
p2 = 0.09


def _best_sharpe_params(sharpe_table, sharpe_col):
    """Return the parsed `Parameters` of the row with the highest *sharpe_col*.

    The maximum is computed on *sharpe_table* itself so the mask and the value
    it is compared against always come from the same frame. `Parameters` holds
    a Python-literal string, parsed with ast.literal_eval.
    """
    best_rows = sharpe_table[sharpe_table[sharpe_col] == sharpe_table[sharpe_col].max()]
    return ast.literal_eval(best_rows['Parameters'].values[0])


mod_36m_params = _best_sharpe_params(df_sharpe, 'Sharpe_36')
mod_60m_params = _best_sharpe_params(df_sharpe, 'Sharpe_60')

# NOTE(review): the estimator classes are hard-coded; this assumes the best
# 36-month row is a logistic regression and the best 60-month row is a random
# forest. Verify against the rows shown by the two cells above.
logr = LogisticRegression(**mod_36m_params)
rfc = RandomForestClassifier(**mod_60m_params)

# Results collected across the repeated runs: blended, 36-month and 60-month.
rets = []
rets_36 = []
rets_60 = []
deployed = []
deployed_36 = []
deployed_60 = []

n_iterations = 1000
for j in range(n_iterations):
    print('Running Iteration: {}'.format(j+1))
    (
        overall_return, deployed_capital, returned_capital,
        t_36_rets, t_36_deployed, t_36_pl,
        t_60_rets, t_60_deployed, t_60_pl,
    ) = LCM.final_system(pca_df, logr, rfc, p1, p2)
    rets.append(overall_return)
    rets_36.append(t_36_rets)
    rets_60.append(t_60_rets)
    deployed.append(deployed_capital)
    deployed_36.append(t_36_deployed)
    deployed_60.append(t_60_deployed)

In [None]:
# Three stacked histograms: blended, 36-month and 60-month simulated returns.
fig = plt.figure(figsize=(40, 40))
panels = (
    (rets, 'Distribution of Blended Returns'),
    (rets_36, 'Distribution of 36 Month Returns'),
    (rets_60, 'Distribution of 60 Month Returns'),
)
for row, (values, panel_title) in enumerate(panels, start=1):
    ax = fig.add_subplot(3, 1, row)
    ax.hist(values, color='darkblue')
    ax.tick_params('x', labelsize=40)
    # Hide the count labels on the y axis.
    ax.set_yticklabels([])
    ax.set_title(panel_title, fontsize=50, fontweight='bold')

In [None]:
# Scatter of simulated blended returns against deployed capital.
fig = plt.figure(figsize=(40,40))
ax = fig.add_subplot(1,1,1)
# Deployed capital is divided by 1e9, so the axis is in billions; the label
# below previously said "Millions", which did not match the plotted values.
deployed_billions = np.array(deployed) / 1000000000
ax.scatter(deployed_billions, rets, c='darkblue', alpha=0.5)
ax.set_title('Returns vs. Deployed Capital', fontsize=40, fontweight='bold')
ax.set_xlabel('Deployed Capital in Billions', fontsize=35)
ax.set_ylabel('Returns', fontsize=35)
ax.tick_params('x', labelsize=35)
ax.tick_params('y', labelsize=35);