In [457]:
# Standard stack
import datetime
import pandas as pd
import numpy as np
import re

# Visualization
from pandas_profiling import ProfileReport
#import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
import tensorflow as tf
import tensorflow_decision_forests as tfdf
try:
    from wurlitzer import sys_pipes
except ImportError:
    # wurlitzer is unavailable: fall back to the colabtools log-capture helper,
    # exposed under the same name. Only ImportError is caught so that
    # KeyboardInterrupt / SystemExit are not silently swallowed.
    from colabtools.googlelog import CaptureLog as sys_pipes

# Scikit-learn packages
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# display
from IPython.core.magic import register_line_magic
from IPython.display import Javascript

In [458]:
#KAGGLE_EVAL_METRIC = 'logloss' # string name for loss function in xgboost

# Target column; passed as `label=` when DataFrames are converted to TF datasets.
label = "attrition_flag"

# Competition CSVs, given relative to the notebook's working directory.
TEST_PATH = "data/test.csv"
TRAIN_PATH = "data/train.csv"

In [459]:
# Read the training and test CSVs (in that order) into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [465]:
# Engineered feature, added in place to both frames with the same formula:
# (months_inactive_12_mon / 12) * total_ct_chng_q4_q1.
# NOTE(review): presumably "share of the year inactive" weighted by the
# Q4-vs-Q1 transaction-count change — confirm the intended meaning.
for frame in (train, test):
    frame['new_f_1'] = (frame['months_inactive_12_mon'] / 12) * frame['total_ct_chng_q4_q1']

In [468]:
# Show the first row of `train` to check the available columns.
train.head(1)

Unnamed: 0,id,attrition_flag,customer_age,gender,education_level,income_category,total_relationship_count,months_inactive_12_mon,credit_limit,total_revolving_bal,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio,clusters,new_f_1
0,8805,0,27,F,Post-Graduate,Less than $40K,3,2,1438.3,990,0.715,3855,73,1.147,0.688,1,0.191167


In [541]:
# Columns kept for modelling. The first entry is the target column
# ('attrition_flag'); the rest are model inputs.
features = [
    'attrition_flag',
    'customer_age',
    'gender',
    'education_level',
    'income_category',
    'total_relationship_count',
    'months_inactive_12_mon',
    'credit_limit',
    'total_revolving_bal',
    'total_amt_chng_q4_q1',
    'total_trans_amt',
    'total_trans_ct',
    'total_ct_chng_q4_q1',
    'avg_utilization_ratio',
]

In [470]:
# Restrict both frames to the modelling columns.
# The test frame has no label, so its columns are derived from `features`
# (minus the label) instead of being listed by hand. The hand-written list had
# drifted: it kept 'new_f_1', which `train[features]` drops, so the two frames
# no longer had the same input columns.
train = train[features]
test = test[[col for col in features if col != label]]

In [471]:
#X_train, X_test = train_test_split(train,test_size=0.2, random_state=2)
#train_tf = tfdf.keras.pd_dataframe_to_tf_dataset(X_train, label=label)
#test_tf = tfdf.keras.pd_dataframe_to_tf_dataset(X_test, label=label)

In [438]:
# Convert the pandas frames into TensorFlow datasets for TF-DF.
to_tf_dataset = tfdf.keras.pd_dataframe_to_tf_dataset

train_tf = to_tf_dataset(train, label=label)   # labelled training data
test_tf = to_tf_dataset(test)                  # unlabelled test data
# NOTE(review): despite its name this is another dataset built from `test`
# (with task=REGRESSION), not model output; no visible cell reads it — confirm
# whether it is still needed.
predictions = to_tf_dataset(test, task=tfdf.keras.Task.REGRESSION)

In [439]:
# A more complex, but possibly more accurate, model.
models = {
    'gbt_tune2': tfdf.keras.GradientBoostedTreesModel(
        hyperparameter_template="benchmark_rank1",
        sparse_oblique_normalization='MIN_MAX',
    ),
}

evaluation = {}
for key, candidate in models.items():
    print(key)
    # Track binary cross-entropy (log loss) as the reported metric.
    candidate.compile(metrics=["BinaryCrossentropy"])
    # Fit on the full training dataset; no evaluation is run in this cell.
    candidate.fit(x=train_tf)

gbt_tune2


In [425]:
# NOTE(review): in the visible notebook `train_df` is first assigned in the
# later k-fold cell, so this cell raises NameError on Restart & Run All —
# confirm whether it can be removed.
train_df

['attrition_flag',
 'customer_age',
 'gender',
 'education_level',
 'income_category',
 'total_relationship_count',
 'months_inactive_12_mon',
 'credit_limit',
 'total_revolving_bal',
 'total_amt_chng_q4_q1',
 'total_trans_amt',
 'total_trans_ct',
 'total_ct_chng_q4_q1',
 'avg_utilization_ratio',
 'clusters']

In [432]:
# NOTE(review): `train_indices` is the loop variable of the later k-fold cell,
# so this cell only works after that cell has run — confirm whether it can be
# removed.
train_indices

array([   0,    1,    2, ..., 7085, 7086, 7087])

In [473]:
from sklearn.model_selection import KFold

train_df = train[features]

# Factories rather than instances: every fold has to start from an untrained
# model, so a new one is built inside the fold loop. Re-fitting one shared
# instance across folds risks carrying trained state from one fold into the
# next, which would make the out-of-fold scores look better than they are.
model_builders = {
    'gbt_default': lambda: tfdf.keras.GradientBoostedTreesModel(),
    'gbt_tune1': lambda: tfdf.keras.GradientBoostedTreesModel(hyperparameter_template="benchmark_rank1"),
    'gbt_tune2': lambda: tfdf.keras.GradientBoostedTreesModel(
        hyperparameter_template="benchmark_rank1",
        sparse_oblique_normalization='MIN_MAX',
    ),
}

# One seeded, shuffled 10-fold splitter shared by all candidates, so every
# model is scored on exactly the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

models = {}             # last fitted model of each candidate, keyed by name
cv_loss_per_model = {}  # mean out-of-fold binary cross-entropy per candidate

# Run a 10-fold cross-validation for every candidate.
for key, build_model in model_builders.items():
    print(key)

    # Reset per candidate: the reported mean must only cover this model's folds.
    losses_per_fold = []

    for fold_idx, (train_indices, test_indices) in enumerate(kfold.split(train_df)):
        print(f"Running fold {fold_idx+1}")

        # Extract the training and held-out examples for this fold.
        sub_train_df = train_df.iloc[train_indices]
        sub_test_df = train_df.iloc[test_indices]

        # Convert the examples into TensorFlow datasets.
        sub_train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(sub_train_df, label=label)
        sub_test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(sub_test_df, label=label)

        # Train a fresh model on this fold's training split.
        # (Named `fold_model` so it cannot shadow the final `model` used for
        # the submission if cells are run out of order.)
        fold_model = build_model()
        fold_model.compile(metrics=["BinaryCrossentropy"])
        fold_model.fit(sub_train_ds, verbose=False)

        # Score the held-out split.
        evaluation = fold_model.evaluate(sub_test_ds, return_dict=True, verbose=False)
        print(f"Evaluation {evaluation}")

        losses_per_fold.append(evaluation["binary_crossentropy"])

    models[key] = fold_model
    cv_loss_per_model[key] = np.mean(losses_per_fold)
    # The metric is binary cross-entropy (lower is better), not accuracy.
    print(f"Cross-validated binary cross-entropy: {cv_loss_per_model[key]} for model: " + key)


gbt_default
Running fold 1
Evaluation {'loss': 0.0, 'binary_crossentropy': 0.07758036255836487}
Running fold 2
Evaluation {'loss': 0.0, 'binary_crossentropy': 0.023167598992586136}
Running fold 3
Evaluation {'loss': 0.0, 'binary_crossentropy': 0.025844544172286987}
Running fold 4
Evaluation {'loss': 0.0, 'binary_crossentropy': 0.02621324174106121}
Running fold 5
Evaluation {'loss': 0.0, 'binary_crossentropy': 0.03111068345606327}
Running fold 6
Evaluation {'loss': 0.0, 'binary_crossentropy': 0.02202218770980835}
Running fold 7
Evaluation {'loss': 0.0, 'binary_crossentropy': 0.03466670960187912}
Running fold 8
Evaluation {'loss': 0.0, 'binary_crossentropy': 0.025308076292276382}
Running fold 9
Evaluation {'loss': 0.0, 'binary_crossentropy': 0.028443390503525734}
Running fold 10
Evaluation {'loss': 0.0, 'binary_crossentropy': 0.03499487042427063}
Cross-validated accuracy: 0.032935166545212266 for model: gbt_default
gbt_tune1
Running fold 1
Evaluation {'loss': 0.0, 'binary_crossentropy': 

## Train Final Model on ALL data

In [493]:
# Display the current training frame as a sanity check before the final fit.
train

Unnamed: 0,attrition_flag,customer_age,gender,education_level,income_category,total_relationship_count,months_inactive_12_mon,credit_limit,total_revolving_bal,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio,new_f_1
0,0,27,F,Post-Graduate,Less than $40K,3,2,1438.3,990,0.715,3855,73,1.147,0.688,0.191167
1,0,42,F,College,Less than $40K,6,4,3050.0,1824,0.771,1973,50,1.381,0.598,0.460333
2,1,47,F,Unknown,Less than $40K,3,3,1561.0,0,0.502,1947,28,0.556,0.000,0.139000
3,0,44,M,Uneducated,$80K - $120K,1,3,25428.0,1528,0.725,13360,97,0.796,0.060,0.199000
4,0,54,M,Graduate,$60K - $80K,3,3,2947.0,2216,0.760,1744,53,0.606,0.752,0.151500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7083,0,37,M,Uneducated,$40K - $60K,3,3,2186.0,1185,0.998,2614,67,0.861,0.542,0.215250
7084,0,50,M,High School,$120K +,5,6,10747.0,1027,0.925,4538,63,0.658,0.096,0.329000
7085,0,62,F,Post-Graduate,$40K - $60K,6,1,2644.0,2207,0.866,1194,30,0.765,0.835,0.063750
7086,0,39,F,College,Less than $40K,1,2,12338.0,1277,0.828,15869,109,0.758,0.104,0.126333


In [545]:
# Model input columns: every entry of `features` except the label column.
sub_features = [name for name in features if name != label]

In [546]:
# Show the input-column list (label removed) that is applied to the test frame.
sub_features

['customer_age',
 'gender',
 'education_level',
 'income_category',
 'total_relationship_count',
 'months_inactive_12_mon',
 'credit_limit',
 'total_revolving_bal',
 'total_amt_chng_q4_q1',
 'total_trans_amt',
 'total_trans_ct',
 'total_ct_chng_q4_q1',
 'avg_utilization_ratio']

In [547]:
# Reload the raw CSVs so the final fit starts from the files on disk rather
# than from frames that earlier cells have already subset or modified.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
# Explicit copy: a prediction column is added to `submission` later, and
# writing into a bare slice of `test` triggers pandas' SettingWithCopyWarning.
submission = test[['id']].copy()

# Engineered feature. It only reaches the model if 'new_f_1' is listed in
# `features`; otherwise the column selection below drops it again.
train['new_f_1'] = (train['months_inactive_12_mon']/12) * train['total_ct_chng_q4_q1']
test['new_f_1'] = (test['months_inactive_12_mon']/12) * test['total_ct_chng_q4_q1']

# Same input columns for both frames; only `train` keeps the label.
train = train[features]
test = test[sub_features]

train_tf = tfdf.keras.pd_dataframe_to_tf_dataset(train, label=label)
test_tf = tfdf.keras.pd_dataframe_to_tf_dataset(test)
# A more complex, but possibly more accurate, model, fitted on all training rows.
model = tfdf.keras.GradientBoostedTreesModel(hyperparameter_template="benchmark_rank1",sparse_oblique_normalization='MIN_MAX')
model.compile(metrics=["BinaryCrossentropy"])
model.fit(x=train_tf)



<tensorflow.python.keras.callbacks.History at 0x7fd78848d490>

In [548]:
# gbt_tune2 - with the `clusters` feature: 0.03938592771689097
# gbt_tune2 - without the `clusters` feature: 0.03184122803310553

In [549]:
# Score the unlabelled test dataset with the final model.
scores = model.predict(test_tf)

In [550]:
# Inspect the raw model outputs before writing them to the submission.
scores

array([[0.00498693],
       [0.8221112 ],
       [0.00216914],
       ...,
       [0.01304496],
       [0.9569753 ],
       [0.9839114 ]], dtype=float32)

In [551]:
# Attach the predictions as a new column. `assign` returns a new frame, so
# nothing is written into a slice of `test` (avoids SettingWithCopyWarning),
# and `ravel()` flattens the single-column 2-D prediction array into the 1-D
# shape a DataFrame column expects.
submission = submission.assign(attrition_flag=scores.ravel())

In [552]:
# Check the submission frame (id + predicted attrition_flag) before export.
submission

Unnamed: 0,id,attrition_flag
0,3005,0.004987
1,143,0.822111
2,5508,0.002169
3,6474,0.001086
4,9784,0.003082
...,...,...
3034,6697,0.002406
3035,1310,0.003340
3036,4463,0.013045
3037,6944,0.956975


In [553]:
# Write the submission to 'submission.csv' in the working directory, without
# the DataFrame index column.
submission.to_csv('submission.csv', index=False)

In [554]:
# Side effect: uploads submission.csv to the Kaggle competition through the
# kaggle CLI. Re-running this cell creates another submission.
!kaggle competitions submit -c sliced-s01e07-HmPsw2 -f submission.csv -m "sub 2"

100%|██████████████████████████████████████| 50.6k/50.6k [00:00<00:00, 78.8kB/s]
Successfully submitted to SLICED s01e07