## Library Imports

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
pd.options.display.float_format = '{:.2f}'.format

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from typing import List, Tuple

from sklearn.pipeline import make_pipeline

In [2]:
"""# Load data
client_profile_data = pd.read_csv('data/client_profile.csv')
test_data = pd.read_csv('data/test.csv')
train_data = pd.read_csv('data/train.csv')

df_train = pd.merge(train_data, client_profile_data, how='left', on='APPLICATION_NUMBER')
df_test = pd.merge(test_data, client_profile_data, how='left', on='APPLICATION_NUMBER')
 """

"# Load data\nclient_profile_data = pd.read_csv('data/client_profile.csv')\ntest_data = pd.read_csv('data/test.csv')\ntrain_data = pd.read_csv('data/train.csv')\n\ndf_train = pd.merge(train_data, client_profile_data, how='left', on='APPLICATION_NUMBER')\ndf_test = pd.merge(test_data, client_profile_data, how='left', on='APPLICATION_NUMBER')\n "

In [3]:
df_train = pd.read_csv('data/df_train_missing_clean.csv')
df_test = pd.read_csv('data/df_test_missing_clean.csv')

In [4]:
test_data = pd.read_csv('data/test.csv')

### Based Model

In [5]:
features = df_train.drop(columns=['TARGET'])
features_numeric = features.select_dtypes(include='number')
target = df_train['TARGET']

In [32]:
X_train, X_valid, y_train, y_valid = train_test_split(
    features, target, test_size=0.2, random_state=1234, stratify=target)

X_valid, X_test, y_valid, y_test = train_test_split(
    X_valid, y_valid, test_size=0.2, random_state=1234, stratify=y_valid)

print("x_train.shape = {} rows, {} cols".format(*X_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*X_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*X_test.shape))

x_train.shape = 88074 rows, 104 cols
x_valid.shape = 17615 rows, 104 cols
x_test.shape = 4404 rows, 104 cols


In [7]:
# Function for creating model pipelines
pipelines = {
    'l2': make_pipeline(StandardScaler(), LogisticRegression(random_state=123)),
    'rf': make_pipeline(RandomForestClassifier(random_state=123)),
    'gb': make_pipeline(GradientBoostingClassifier(random_state=123))
}

In [8]:
l2_hyperparameters = {
    'logisticregression__C': [0.001]
}

# random forest hyperparameters
rf_hyperparameters = {
    'randomforestclassifier__n_estimators': [20],
    'randomforestclassifier__max_depth': [10],
    'randomforestclassifier__max_features': [0.33],
    'randomforestclassifier__min_samples_leaf': [10]
}

# gradient boosting hyperparameters
gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators': [200],
    'gradientboostingclassifier__learning_rate': [0.05],
    'gradientboostingclassifier__max_depth': [3]
}

# Create hyperparameters dictionary
hyperparameters = {
    'l2': l2_hyperparameters,
    'rf': rf_hyperparameters, 
    'gb': gb_hyperparameters
}

In [9]:
# Helper for cross-validation
from sklearn.model_selection import GridSearchCV

# Create empty dictionary called fitted_models
fitted_models = {}

# Loop through model pipelines, tuning each one and saving it to fitted_models
for name, pipeline in pipelines.items():
    model = GridSearchCV(pipeline, hyperparameters[name], cv=5, n_jobs=-1)
    
    # Fit model on X_train, y_train
    model.fit(X_train, y_train)
    
    # Store model in fitted_models[name] 
    fitted_models[name] = model
    
    # Print '{name} has been fitted'
    print(name, 'has been fitted.')

l2 has been fitted.
rf has been fitted.
gb has been fitted.


In [10]:
for name, model in fitted_models.items():
    print( name, model.best_score_ )

l2 0.9192270147662389
rf 0.9191816015058597
gb 0.919147538337582


In [11]:
from sklearn.metrics import mean_absolute_error, roc_auc_score, r2_score

In [12]:
for name, model in fitted_models.items():
    pred_train = model.predict_proba(X_train)
    pred_valid = model.predict_proba(X_valid)
    pred_test = model.predict_proba(X_test)
    train_score = roc_auc_score(y_train, pred_train[:, 1])
    valid_score = roc_auc_score(y_valid, pred_valid[:, 1])
    test_score = roc_auc_score(y_test, pred_test[:, 1])
    test_count = (model.predict(df_test)==1).sum()
    print(f"Train-score: {round(train_score, 3)}, Valid-score: {round(valid_score, 3)}, Test-score: {round(test_score, 3)}, Target count: {test_count}")

Train-score: 0.715, Valid-score: 0.709, Test-score: 0.7, Target count: 75
Train-score: 0.794, Valid-score: 0.703, Test-score: 0.686, Target count: 298
Train-score: 0.741, Valid-score: 0.724, Test-score: 0.706, Target count: 374


In [33]:
best_model = fitted_models['gb']
df_test_to_submit = test_data.drop(columns=['NAME_CONTRACT_TYPE']).copy()
df_test_to_submit['TARGET'] = best_model.predict(df_test)
df_test_to_submit.to_csv('data/data_to_submit.csv', index=None)

In [14]:
df_test_to_submit.shape

(165141, 2)

In [15]:
def calculate_feature_separating_ability(
    features: pd.DataFrame, target: pd.Series, fill_value: float = -9999) -> pd.DataFrame:
    scores = {}
    for feature in features:
        score = roc_auc_score(
            target, features[feature].fillna(fill_value)
        )
        scores[feature] = 2*score - 1

    scores = pd.Series(scores)
    scores = scores.sort_values(ascending=False)

    return scores

In [16]:
scores = calculate_feature_separating_ability(
    features, target
)
scores.head(n=40)

GENDER_M                                        0.07
EDUCATION_LEVEL_Secondary / secondary special   0.07
OWN_CAR_AGE                                     0.05
AMOUNT_ANNUITY_to_TOTAL_SALARY                  0.04
CHILDRENS                                       0.04
NAME_CONTRACT_TYPE_Cash                         0.03
MISSING_AMT_REQ_CREDIT_BUREAU_MON               0.03
MISSING_EXTERNAL_SCORING_RATING_3               0.03
FAMILY_SIZE_3+                                  0.03
FAMILY_STATUS_Single / not married              0.03
CHILDREN_1                                      0.03
NEW_and_BKI_ANNUITY_to_CREDIT                   0.03
MISSING_EXTERNAL_SCORING_RATING_1               0.02
AMT_REQ_CREDIT_BUREAU_YEAR                      0.02
FAMILY_STATUS_Civil marriage                    0.02
FAMILY_SIZE                                     0.02
NEW_and_BKI_ANNUITY_to_TOTAL_SALARY             0.01
CHILDREN_2                                      0.01
BKI_CLOSED_APPLICATION_NUMBER_COUNT           

In [17]:
"""RESULTS:
    BASED with fillna(-9999):
        Train-score: 0.59, Valid-score: 0.589, Test-score: 0.603
        Train-score: 0.798, Valid-score: 0.691, Test-score: 0.681
        Train-score: 0.731, Valid-score: 0.711, Test-score: 0.699
        Target: 0, 1, 83
        Score: 0.50283
"""

'RESULTS:\n    BASED with fillna(-9999):\n        Train-score: 0.59, Valid-score: 0.589, Test-score: 0.603\n        Train-score: 0.798, Valid-score: 0.691, Test-score: 0.681\n        Train-score: 0.731, Valid-score: 0.711, Test-score: 0.699\n        Target: 0, 1, 83\n        Score: 0.50283\n'

In [18]:
"""RESULTS:
    BASED with flagged missing features and filling missing:
        Train-score: 0.698, Valid-score: 0.692, Test-score: 0.677
        Train-score: 0.776, Valid-score: 0.698, Test-score: 0.682
        Train-score: 0.728, Valid-score: 0.709, Test-score: 0.699
        Target: 14, 162, 368
        Score: 0.50213
"""

'RESULTS:\n    BASED with flagged missing features and filling missing:\n        Train-score: 0.698, Valid-score: 0.692, Test-score: 0.677\n        Train-score: 0.776, Valid-score: 0.698, Test-score: 0.682\n        Train-score: 0.728, Valid-score: 0.709, Test-score: 0.699\n        Target: 14, 162, 368\n        Score: 0.50213\n'

In [19]:
"""RESULTS:
    BASED with flagged missing features, filling missing values, process categorical features and outliers:
        Train-score: 0.714, Valid-score: 0.711, Test-score: 0.704
        Train-score: 0.79, Valid-score: 0.702, Test-score: 0.687
        Train-score: 0.734, Valid-score: 0.719, Test-score: 0.707
        Target: 36, 100, 422
        Score: 0.50833
"""

'RESULTS:\n    BASED with flagged missing features, filling missing values, process categorical features and outliers:\n        Train-score: 0.714, Valid-score: 0.711, Test-score: 0.704\n        Train-score: 0.79, Valid-score: 0.702, Test-score: 0.687\n        Train-score: 0.734, Valid-score: 0.719, Test-score: 0.707\n        Target: 36, 100, 422\n        Score: 0.50833\n'

In [20]:
"""RESULTS:
    PREV + grouped family size and children:
        Train-score: 0.714, Valid-score: 0.711, Test-score: 0.703, Target count: 41
        Train-score: 0.791, Valid-score: 0.704, Test-score: 0.691, Target count: 166
        Train-score: 0.734, Valid-score: 0.718, Test-score: 0.708, Target count: 435
        Score: 
"""

'RESULTS:\n    PREV + grouped family size and children:\n        Train-score: 0.714, Valid-score: 0.711, Test-score: 0.703, Target count: 41\n        Train-score: 0.791, Valid-score: 0.704, Test-score: 0.691, Target count: 166\n        Train-score: 0.734, Valid-score: 0.718, Test-score: 0.708, Target count: 435\n        Score: \n'

In [21]:
"""RESULTS:
    PREV + Financial metrics:
    Train-score: 0.715, Valid-score: 0.711, Test-score: 0.704, Target count: 44
    Train-score: 0.792, Valid-score: 0.706, Test-score: 0.695, Target count: 141
    Train-score: 0.744, Valid-score: 0.726, Test-score: 0.712, Target count: 442
    Score: 0.50823
    GENDER_M                                        0.07
    EDUCATION_LEVEL_Secondary / secondary special   0.07
    OWN_CAR_AGE                                     0.05
    AMOUNT_ANNUITY_to_TOTAL_SALARY                  0.04
    CHILDRENS                                       0.04
    NAME_CONTRACT_TYPE_Cash                         0.03
    MISSING_AMT_REQ_CREDIT_BUREAU_MON               0.03
    MISSING_EXTERNAL_SCORING_RATING_3               0.03
    FAMILY_SIZE_3+                                  0.03
    FAMILY_STATUS_Single / not married              0.03
"""

'RESULTS:\n    PREV + Financial metrics:\n    Train-score: 0.715, Valid-score: 0.711, Test-score: 0.704, Target count: 44\n    Train-score: 0.792, Valid-score: 0.706, Test-score: 0.695, Target count: 141\n    Train-score: 0.744, Valid-score: 0.726, Test-score: 0.712, Target count: 442\n    Score: 0.50823\n    GENDER_M                                        0.07\n    EDUCATION_LEVEL_Secondary / secondary special   0.07\n    OWN_CAR_AGE                                     0.05\n    AMOUNT_ANNUITY_to_TOTAL_SALARY                  0.04\n    CHILDRENS                                       0.04\n    NAME_CONTRACT_TYPE_Cash                         0.03\n    MISSING_AMT_REQ_CREDIT_BUREAU_MON               0.03\n    MISSING_EXTERNAL_SCORING_RATING_3               0.03\n    FAMILY_SIZE_3+                                  0.03\n    FAMILY_STATUS_Single / not married              0.03\n'

In [22]:
"""RESULTS:
    PREV + Scoring metrics:
    Train-score: 0.715, Valid-score: 0.711, Test-score: 0.702, Target count: 84
    Train-score: 0.801, Valid-score: 0.702, Test-score: 0.691, Target count: 301
    Train-score: 0.74, Valid-score: 0.725, Test-score: 0.709, Target count: 377
    Score: 0.50758
"""

'RESULTS:\n    PREV + Scoring metrics:\n    Train-score: 0.715, Valid-score: 0.711, Test-score: 0.702, Target count: 84\n    Train-score: 0.801, Valid-score: 0.702, Test-score: 0.691, Target count: 301\n    Train-score: 0.74, Valid-score: 0.725, Test-score: 0.709, Target count: 377\n    Score: 0.50758\n'

In [23]:
"""RESULTS:
    PREV + Scoring metrics + removed min max:
    Train-score: 0.715, Valid-score: 0.711, Test-score: 0.701, Target count: 92
    Train-score: 0.795, Valid-score: 0.703, Test-score: 0.684, Target count: 311
    Train-score: 0.74, Valid-score: 0.726, Test-score: 0.709, Target count: 431
    GENDER_M                                        0.07
    EDUCATION_LEVEL_Secondary / secondary special   0.07
    OWN_CAR_AGE                                     0.05
    AMOUNT_ANNUITY_to_TOTAL_SALARY                  0.04
    CHILDRENS                                       0.04
    NAME_CONTRACT_TYPE_Cash                         0.03
    MISSING_AMT_REQ_CREDIT_BUREAU_MON               0.03
    MISSING_EXTERNAL_SCORING_RATING_3               0.03
    FAMILY_SIZE_3+                                  0.03
    FAMILY_STATUS_Single / not married              0.03
"""

'RESULTS:\n    PREV + Scoring metrics + removed min max:\n    Train-score: 0.715, Valid-score: 0.711, Test-score: 0.701, Target count: 92\n    Train-score: 0.795, Valid-score: 0.703, Test-score: 0.684, Target count: 311\n    Train-score: 0.74, Valid-score: 0.726, Test-score: 0.709, Target count: 431\n    GENDER_M                                        0.07\n    EDUCATION_LEVEL_Secondary / secondary special   0.07\n    OWN_CAR_AGE                                     0.05\n    AMOUNT_ANNUITY_to_TOTAL_SALARY                  0.04\n    CHILDRENS                                       0.04\n    NAME_CONTRACT_TYPE_Cash                         0.03\n    MISSING_AMT_REQ_CREDIT_BUREAU_MON               0.03\n    MISSING_EXTERNAL_SCORING_RATING_3               0.03\n    FAMILY_SIZE_3+                                  0.03\n    FAMILY_STATUS_Single / not married              0.03\n'

In [24]:
"""RESULTS:
    PREV + Prev refused applications
    Train-score: 0.715, Valid-score: 0.711, Test-score: 0.701, Target count: 93
    Train-score: 0.795, Valid-score: 0.705, Test-score: 0.692, Target count: 288
    Train-score: 0.739, Valid-score: 0.726, Test-score: 0.708, Target count: 443
    """

'RESULTS:\n    PREV + Prev refused applications\n    Train-score: 0.715, Valid-score: 0.711, Test-score: 0.701, Target count: 93\n    Train-score: 0.795, Valid-score: 0.705, Test-score: 0.692, Target count: 288\n    Train-score: 0.739, Valid-score: 0.726, Test-score: 0.708, Target count: 443\n    '

In [25]:
"""RESULTS:
    PREV + Prev refused AMT_APPLICATION
    Train-score: 0.715, Valid-score: 0.711, Test-score: 0.701, Target count: 93
    Train-score: 0.794, Valid-score: 0.708, Test-score: 0.694, Target count: 315
    Train-score: 0.739, Valid-score: 0.725, Test-score: 0.708, Target count: 415
"""

'RESULTS:\n    PREV + Prev refused AMT_APPLICATION\n    Train-score: 0.715, Valid-score: 0.711, Test-score: 0.701, Target count: 93\n    Train-score: 0.794, Valid-score: 0.708, Test-score: 0.694, Target count: 315\n    Train-score: 0.739, Valid-score: 0.725, Test-score: 0.708, Target count: 415\n'

In [26]:
"""RESULTS:
    PREV + Prev refused 
    Train-score: 0.715, Valid-score: 0.711, Test-score: 0.701, Target count: 93
    Train-score: 0.798, Valid-score: 0.705, Test-score: 0.684, Target count: 260
    Train-score: 0.739, Valid-score: 0.726, Test-score: 0.709, Target count: 424
"""

'RESULTS:\n    PREV + Prev refused \n    Train-score: 0.715, Valid-score: 0.711, Test-score: 0.701, Target count: 93\n    Train-score: 0.798, Valid-score: 0.705, Test-score: 0.684, Target count: 260\n    Train-score: 0.739, Valid-score: 0.726, Test-score: 0.709, Target count: 424\n'

In [27]:
"""RESULTS:
    PREV + PREV ACTIVE BKI
    Train-score: 0.715, Valid-score: 0.708, Test-score: 0.699, Target count: 74
    Train-score: 0.801, Valid-score: 0.7, Test-score: 0.689, Target count: 294
    Train-score: 0.741, Valid-score: 0.725, Test-score: 0.707, Target count: 374
    """

'RESULTS:\n    PREV + PREV ACTIVE BKI\n    Train-score: 0.715, Valid-score: 0.708, Test-score: 0.699, Target count: 74\n    Train-score: 0.801, Valid-score: 0.7, Test-score: 0.689, Target count: 294\n    Train-score: 0.741, Valid-score: 0.725, Test-score: 0.707, Target count: 374\n    '

In [28]:
"""RESULTS:
    PREV + PREV BKI + Active + Closed
    Train-score: 0.714, Valid-score: 0.708, Test-score: 0.7, Target count: 73
    Train-score: 0.803, Valid-score: 0.704, Test-score: 0.691, Target count: 303
    Train-score: 0.741, Valid-score: 0.726, Test-score: 0.707, Target count: 397
"""    

'RESULTS:\n    PREV + PREV BKI + Active + Closed\n    Train-score: 0.714, Valid-score: 0.708, Test-score: 0.7, Target count: 73\n    Train-score: 0.803, Valid-score: 0.704, Test-score: 0.691, Target count: 303\n    Train-score: 0.741, Valid-score: 0.726, Test-score: 0.707, Target count: 397\n'

In [29]:
"""RESULTS:
    PREV + PREV BKI + Active + Closed + new bki statistics + null 0
    Train-score: 0.715, Valid-score: 0.709, Test-score: 0.701, Target count: 77
    Train-score: 0.8, Valid-score: 0.7, Test-score: 0.677, Target count: 302
    Train-score: 0.741, Valid-score: 0.724, Test-score: 0.708, Target count: 403
"""   

'RESULTS:\n    PREV + PREV BKI + Active + Closed + new bki statistics\n    Train-score: 0.715, Valid-score: 0.709, Test-score: 0.701, Target count: 77\n    Train-score: 0.8, Valid-score: 0.7, Test-score: 0.677, Target count: 302\n    Train-score: 0.741, Valid-score: 0.724, Test-score: 0.708, Target count: 403\n'

In [None]:
"""RESULTS:
    PREV + PREV BKI + Active + Closed + new bki statistics + null median
    Train-score: 0.715, Valid-score: 0.709, Test-score: 0.7, Target count: 75
    Train-score: 0.794, Valid-score: 0.703, Test-score: 0.686, Target count: 298
    Train-score: 0.741, Valid-score: 0.724, Test-score: 0.706, Target count: 374
    Score: 0.50718
 """   