# Preprocessing & Modeling

In [1]:
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

#Imputation
from sklearn.linear_model import LinearRegression

#PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#Model Pre Processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_score, recall_score, accuracy_score

#Test
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'


#Gridsearch
from sklearn.model_selection import GridSearchCV

#Pickle
import pickle

import shap

In [2]:
from functions import process_bureau_data
from functions import perform_PCA
from functions import scale_data
from functions import get_top_30_corr
from functions import model_data

In [50]:
df = pd.read_csv('Assets/application_train.csv')
bureau = pd.read_csv('Assets/bureau.csv')

### Process Data for Model

In [4]:
#Processing bureau data from 
bureau_data = process_bureau_data(bureau)

#Scale Data to perform PCA
scaled_data = scale_data(bureau_data)

#Perform PCA On Bureau Data
pca_b = perform_PCA(scaled_data , 2)

### Bureau / Get Dummies

In [5]:
bureau_all = pd.concat([bureau_data['SK_ID_CURR'] , pca_b], axis = 1)

In [6]:
#Merge

df['TARGET_X_AMT_CREDIT'] = df.AMT_CREDIT * df.TARGET
dummy_df = pd.get_dummies(df)

### Top 30 correlated features for model

In [7]:
df_30 = get_top_30_corr(dummy_df)

### Fill Missing Data & Merge All Data

In [9]:
cols_with_missing  = df_30.isnull().sum()[df_30.isnull().sum() > 0].index
    
for col in cols_with_missing:
    df_30[col].fillna(df_30[col].mean(), inplace = True)

#Merge application data & Bureau
df_30 = pd.concat([df[['SK_ID_CURR']], df_30], axis = 1)
data_for_model = pd.merge(df_30 , bureau_all , on = 'SK_ID_CURR', how = 'inner')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# for col in cols_with_missing[:1]:
    
#     lr = LinearRegression()
    
#     testdf = df_30[df_30[col].isnull()==True]
#     traindf = df_30[df_30[col].isnull()==False]
    
#     y = traindf[col]
#     traindf.drop(col,axis=1,inplace=True)
    
#     lr.fit(traindf,y)
#     testdf.drop(col,axis=1,inplace=True)
    
#     pred = lr.predict(testdf)
#     testdf[col]= pred

### Model Preperation

In [40]:
#Models with Credit scores
X = data_for_model.drop(columns=[ 'TARGET_X_AMT_CREDIT'])
y = data_for_model['TARGET_X_AMT_CREDIT']

X.dropna(inplace = True)
X = X.drop(columns = ['SK_ID_CURR'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


#Models w/o Credit

X_no_credit = data_for_model.drop(columns=[ 'TARGET_X_AMT_CREDIT' , 'EXT_SOURCE_3', 'EXT_SOURCE_2' , 'EXT_SOURCE_1'])
y_no_credit = data_for_model['TARGET_X_AMT_CREDIT']

X_train_nc, X_test_nc, y_train_nc, y_test_nc = train_test_split(X_no_credit, y_no_credit, test_size=0.2)

# Models

### Random Forests Regressor / GridsearchCV

In [22]:
# Model with Target Varibale - Target x Amt Credit
normal = model_data(X_train, X_test, y_train, y_test)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV 1/2] END max_depth=10, min_samples_split=5, n_estimators=100;, score=-30137447468.998 total time= 1.1min
[CV 2/2] END max_depth=10, min_samples_split=5, n_estimators=100;, score=-30405236852.371 total time= 1.1min
[CV 1/2] END max_depth=10, min_samples_split=10, n_estimators=100;, score=-30119862655.158 total time= 1.1min
[CV 2/2] END max_depth=10, min_samples_split=10, n_estimators=100;, score=-30426453693.966 total time= 1.1min
[CV 1/2] END max_depth=10, min_samples_split=15, n_estimators=100;, score=-30151996913.809 total time= 1.1min
[CV 2/2] END max_depth=10, min_samples_split=15, n_estimators=100;, score=-30392395554.624 total time= 1.2min
[CV 1/2] END max_depth=None, min_samples_split=5, n_estimators=100;, score=-31155517270.555 total time= 2.3min
[CV 2/2] END max_depth=None, min_samples_split=5, n_estimators=100;, score=-31367068160.123 total time= 2.3min
[CV 1/2] END max_depth=None, min_samples_split=10, n_estimat

In [23]:
# Model with Target Varibale - Target x Amt Credit * .1
ten_percent = model_data(X_train, X_test * .1, y_train, y_test * .1)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV 1/2] END max_depth=10, min_samples_split=5, n_estimators=100;, score=-30106036750.293 total time= 1.1min
[CV 2/2] END max_depth=10, min_samples_split=5, n_estimators=100;, score=-30398692721.742 total time= 1.1min
[CV 1/2] END max_depth=10, min_samples_split=10, n_estimators=100;, score=-30144394370.500 total time= 1.1min
[CV 2/2] END max_depth=10, min_samples_split=10, n_estimators=100;, score=-30422352709.233 total time= 1.1min
[CV 1/2] END max_depth=10, min_samples_split=15, n_estimators=100;, score=-30137422094.653 total time= 1.1min
[CV 2/2] END max_depth=10, min_samples_split=15, n_estimators=100;, score=-30434648693.278 total time= 1.1min
[CV 1/2] END max_depth=None, min_samples_split=5, n_estimators=100;, score=-31135918501.046 total time= 2.1min
[CV 2/2] END max_depth=None, min_samples_split=5, n_estimators=100;, score=-31410096577.948 total time= 2.1min
[CV 1/2] END max_depth=None, min_samples_split=10, n_estimat

In [31]:
# Model with Target Varibale - Target x Amt Credit Loss of 3% Fixed interest rate over compounded 30 years
fixed_interest = .03
years = 30


new_X_test = [principal * (1 + fixed_interest/ 12) ** (12*years) for principal in y_train]
new_y_test = [principal * (1 + fixed_interest / 12) ** (12*years) for principal in y_test]


compounded_model = model_data(X_train, new_X_test, y_train, new_y_test)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV 1/2] END max_depth=10, min_samples_split=5, n_estimators=100;, score=-30113961170.631 total time= 1.2min
[CV 2/2] END max_depth=10, min_samples_split=5, n_estimators=100;, score=-30411535699.534 total time= 1.2min
[CV 1/2] END max_depth=10, min_samples_split=10, n_estimators=100;, score=-30173214230.602 total time= 1.2min
[CV 2/2] END max_depth=10, min_samples_split=10, n_estimators=100;, score=-30400142234.735 total time= 1.2min
[CV 1/2] END max_depth=10, min_samples_split=15, n_estimators=100;, score=-30122388668.340 total time= 1.2min
[CV 2/2] END max_depth=10, min_samples_split=15, n_estimators=100;, score=-30390310566.180 total time= 1.2min
[CV 1/2] END max_depth=None, min_samples_split=5, n_estimators=100;, score=-31146499927.996 total time= 2.3min
[CV 2/2] END max_depth=None, min_samples_split=5, n_estimators=100;, score=-31393449947.169 total time= 2.2min
[CV 1/2] END max_depth=None, min_samples_split=10, n_estimat

### Model w/o Credit Scores

In [41]:
# Model w/o credit scores with Target Varibale - Target x Amt Credit
normal_no_creidt = model_data(X_train_nc, X_test_nc, y_train_nc, y_test_nc)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV 1/2] END max_depth=10, min_samples_split=5, n_estimators=100;, score=-30914677647.427 total time=  58.4s
[CV 2/2] END max_depth=10, min_samples_split=5, n_estimators=100;, score=-31379108777.310 total time= 1.0min
[CV 1/2] END max_depth=10, min_samples_split=10, n_estimators=100;, score=-30880810859.066 total time=  59.2s
[CV 2/2] END max_depth=10, min_samples_split=10, n_estimators=100;, score=-31394906204.456 total time= 1.0min
[CV 1/2] END max_depth=10, min_samples_split=15, n_estimators=100;, score=-30909752252.925 total time=  59.1s
[CV 2/2] END max_depth=10, min_samples_split=15, n_estimators=100;, score=-31392926925.107 total time=  57.6s
[CV 1/2] END max_depth=None, min_samples_split=5, n_estimators=100;, score=-31924205015.101 total time= 1.8min
[CV 2/2] END max_depth=None, min_samples_split=5, n_estimators=100;, score=-32417764733.126 total time= 1.9min
[CV 1/2] END max_depth=None, min_samples_split=10, n_estimat

In [48]:
pickle.dump(normal, open('Assets/original_model.pkl', 'wb'))
pickle.dump(ten_percent, open('Assets/ten_percent_model.pkl', 'wb'))
pickle.dump(compounded_model, open('Assets/compounded_model.pkl', 'wb'))

pickle.dump(normal_no_creidt , open('Assets/normal_no_creidt.pkl', 'wb'))

In [30]:
X.to_csv('Assets/X.csv')
y.to_csv('Assets/y.csv')