In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

from imblearn.over_sampling import SMOTE

import pickle

In [2]:
# Read the two loan tables: data_train is used to build the training split,
# data_test the hold-out split.
# low_memory=False asks pandas to parse each file in one pass instead of in chunks.
data_train = pd.read_csv("cleaned_2014", low_memory=False)
data_test = pd.read_csv("cleaned_2015", low_memory=False)

In [3]:
# Show the first rows of the hold-out table as a quick sanity check of the load.
data_test.head()

Unnamed: 0,zip_code,funded_amnt,int_rate,total_pymnt,annual_inc,dti,loan_status,revol_util,term,emp_length_1 year,...,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,amnt,paid
0,644.0,0.264706,0.1288,11316.89019,0.00962,0.009449,Fully Paid,0.417,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,0
1,30.0,0.5,0.0749,19771.097253,0.007895,0.025676,Fully Paid,0.332,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18000.0,0
2,125.0,0.558824,0.0532,21631.724329,0.006842,0.018849,Fully Paid,0.03,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20000.0,0
3,604.0,0.484559,0.1399,13449.65,0.004184,0.023604,Charged Off,0.523,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17475.0,1
4,761.0,0.588235,0.1344,24473.227622,0.005753,0.012152,Fully Paid,0.268,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21000.0,0


In [4]:
# Element-wise check that both tables carry the same column labels in the same order.
# NOTE(review): this comparison raises when the two frames have a different number
# of columns; `data_test.columns.equals(data_train.columns)` would return a single
# boolean instead — consider switching if the inputs may diverge.
data_test.columns == data_train.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [5]:
def split_data(df, cols, target="paid"):
    """Split a loan table into a feature matrix and a label vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the feature columns and the label column.
    cols : list of str
        Column labels removed from ``df`` to obtain the features. The label
        column is not removed automatically, so it must be listed here too.
    target : str, default "paid"
        Name of the label column returned as ``y``.

    Returns
    -------
    x : pandas.DataFrame
        ``df`` without the columns named in ``cols`` (``df`` itself is left
        unchanged).
    y : pandas.Series
        The ``target`` column of ``df``.

    Raises
    ------
    KeyError
        If a label in ``cols`` or ``target`` is not a column of ``df``.
    """
    x = df.drop(labels=cols, axis=1)
    y = df[target]
    return x, y

# Columns kept out of the feature matrix; `paid` is the label that split_data
# returns as y. The same list is applied to both tables so that the train and
# test feature matrices end up with the same columns.
cols_to_drop_training = [
    'loan_status',
    'paid',
    'amnt',
    'total_pymnt',
    'zip_code',
]
(x_train_initial, y_train_initial) = split_data(data_train, cols_to_drop_training)
(x_test, y_test) = split_data(data_test, cols_to_drop_training)

In [6]:
# Oversample the minority class with SMOTE until the minority/majority ratio is 1.0.
# Only the training split is resampled; the test split is left untouched.
# `sampling_strategy` / `fit_resample` are the current names of the former
# `ratio` / `fit_sample`, which newer imbalanced-learn releases no longer accept.
sm = SMOTE(random_state=1, sampling_strategy=1.0)
x_train, y_train = sm.fit_resample(x_train_initial, y_train_initial)

In [7]:
# Restore a previously saved model object (presumably a fitted random-forest
# classifier, judging by the file name — predict_proba is called on it further down).
# NOTE(security): unpickling can execute arbitrary code embedded in the file, so
# only load model files that come from a trusted source.
with open('Random_Forest_Model_100features.pkl', 'rb') as file:  
    rf_complex = pickle.load(file)

In [12]:
def create_df(data_1, data_2, penal = 0.5, distress_return = None):
    """Build a per-loan table of expected and realised return figures.

    Parameters
    ----------
    data_1 : dict or pandas.DataFrame
        Per-loan scores. Must provide a ``proba`` entry, the probability that
        weights the distressed outcome in the expected-ROI formula.
    data_2 : pandas.DataFrame
        Loan table providing ``int_rate``, ``amnt``, ``total_pymnt``, ``term``
        and ``zip_code``. Its columns are attached by index label, so any row
        of ``data_1`` without a matching label in ``data_2`` receives NaN.
    penal : float, default 0.5
        Multiplier applied to the distressed return inside the expected ROI.
    distress_return : float, optional
        Return factor assumed for a distressed loan. When omitted, the
        module-level ``drr`` is used, which must exist by the time this
        function is called.

    Returns
    -------
    pandas.DataFrame
        The entries of ``data_1`` followed by ``int_rate``, ``amnt``,
        ``total_pymnt``, ``term``, ``ZIP``, ``ROI``, ``Real_ROI`` and
        ``annualized_amnt``, in that order.
    """
    # Fall back to the notebook-level value only when the caller gave none, so the
    # function can also be used (and tested) without relying on global state.
    if distress_return is None:
        distress_return = drr

    # Work on a copy so that a DataFrame passed as data_1 is never modified.
    df = pd.DataFrame(data_1).copy()
    df['int_rate'] = data_2['int_rate']
    df['amnt'] = data_2['amnt']
    df['total_pymnt'] = data_2['total_pymnt']
    df['term'] = data_2['term']
    df['ZIP'] = data_2['zip_code']

    # Expected ROI: (1 + rate) weighted by (1 - proba), plus the penalised
    # distressed return weighted by proba, minus the initial unit invested.
    df['ROI'] = (((1 + df['int_rate'])*(1-df['proba']))+(df['proba']*distress_return*penal))-1

    # Realised figure: amount times the per-term growth factor of
    # total_pymnt / amnt, minus one.
    df['Real_ROI'] = df['amnt']*(((df['total_pymnt']/df['amnt'])**(1/df['term']))-1)
    # Amount spread evenly over the loan term.
    df['annualized_amnt'] = df['amnt']*(1/df['term'])
    return df

In [13]:
# Training loans flagged paid == 1 form the "distress" subset.
distress = data_train.loc[data_train["paid"] == 1]
# Distressed return factor: summed total_pymnt over summed amnt for that subset,
# raised to 1 / (mean term) to express it per unit of term.
drr = (distress["total_pymnt"].sum() / distress["amnt"].sum()) ** (1 / distress["term"].mean())
print(drr)

0.8972755113514147


In [14]:
# Pair each row's predicted probability (column 1 of predict_proba, i.e. the second
# class known to the model — presumably paid == 1, TODO confirm) with its true label.
train_data = {'proba' : rf_complex.predict_proba(x_train)[:, 1], 'true_val' : y_train}
test_data = {'proba' : rf_complex.predict_proba(x_test)[:, 1], 'true_val' : y_test}

# Attach the loan-level columns and the derived ROI figures to the scores.
# NOTE(review): x_train / y_train come out of the SMOTE cell, so train_data probably
# holds more rows than data_train; create_df joins on the index, which leaves NaN in
# the loan columns of any unmatched row — confirm this is intended.
df_train = create_df(train_data, data_train)
df_test =  create_df(test_data, data_test)

In [15]:
# Baseline: mean expected ROI over all loans in each table, with no selection applied.
baseline_train = df_train["ROI"].mean()
baseline_test = df_test["ROI"].mean()
print(baseline_train, baseline_test)
# NOTE(review): the "high" bound starts at negative infinity while the "low" bound
# starts at 0 — presumably placeholders updated by a later search; confirm.
alpha_low, alpha_high = 0, -np.inf

-0.15573513135086037 -0.12289348717807039


In [25]:
# Discard every row that has a missing value in at least one column.
df_train = df_train.dropna(axis=0, how="any")
df_test = df_test.dropna(axis=0, how="any")

In [27]:
# Persist both tables as CSV (the target files carry no extension);
# index=False keeps the row index out of the written files.
df_train.to_csv(path_or_buf='df_train', index=False)
df_test.to_csv(path_or_buf='df_test', index=False)