## Library Imports

In [9]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
pd.options.display.float_format = '{:.2f}'.format

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
# Load data
client_profile_data = pd.read_csv('data/client_profile.csv')
test_data = pd.read_csv('data/test.csv')
train_data = pd.read_csv('data/train.csv')

In [3]:
df_train = pd.merge(train_data, client_profile_data, how='left', on='APPLICATION_NUMBER')
df_test = pd.merge(test_data, client_profile_data, how='left', on='APPLICATION_NUMBER')

### Based Model

In [4]:
features = df_train.drop(columns=['TARGET'])
target = df_train['TARGET']
numerical_features = features.select_dtypes(include=[np.number])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(numerical_features.fillna(-9999), target, test_size=0.2, random_state=1234, stratify=target)

In [10]:
# Function for creating model pipelines
from sklearn.pipeline import make_pipeline

pipelines = {
    'l2': make_pipeline(StandardScaler(), LogisticRegression(random_state=123)),
    'rf': make_pipeline(StandardScaler(), RandomForestClassifier(random_state=123)),
    'gb': make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=123))
}

In [32]:
l2_hyperparameters = {
    'logisticregression__C': [0.001]
}

# random forest hyperparameters
rf_hyperparameters = {
    'randomforestclassifier__n_estimators': [20],
    'randomforestclassifier__max_depth': [10],
    'randomforestclassifier__max_features': [0.33],
    'randomforestclassifier__min_samples_leaf': [10]
}

# gradient boosting hyperparameters
gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators': [200],
    'gradientboostingclassifier__learning_rate': [0.05],
    'gradientboostingclassifier__max_depth': [3]
}

# Create hyperparameters dictionary
hyperparameters = {
    'l2': l2_hyperparameters,
    'rf': rf_hyperparameters, 
    'gb': gb_hyperparameters
}

In [33]:
# Helper for cross-validation
from sklearn.model_selection import GridSearchCV

# Create empty dictionary called fitted_models
fitted_models = {}

# Loop through model pipelines, tuning each one and saving it to fitted_models
for name, pipeline in pipelines.items():
    model = GridSearchCV(pipeline, hyperparameters[name], cv=10, n_jobs=-1)
    
    # Fit model on X_train, y_train
    model.fit(X_train, y_train)
    
    # Store model in fitted_models[name] 
    fitted_models[name] = model
    
    # Print '{name} has been fitted'
    print(name, 'has been fitted.')

l2 has been fitted.
rf has been fitted.
gb has been fitted.


In [34]:
for name, model in fitted_models.items():
    print( name, model.best_score_ )

l2 0.9191815980744404
rf 0.9191929526787327
gb 0.9193405573780332


In [35]:
from sklearn.metrics import mean_absolute_error, roc_auc_score

In [36]:
for name, model in fitted_models.items():
    pred = model.predict_proba(X_test)
    score = roc_auc_score(y_test, pred[:, 1])
    print(name, score)

l2 0.5917570380837482
rf 0.6945713550935708
gb 0.7020411053862919
