In [1]:
# Guide: https://www.tensorflow.org/decision_forests/tutorials/beginner_colab

In [30]:
# Standard stack
import datetime
import pandas as pd
import numpy as np
import re

# Visualization
from pandas_profiling import ProfileReport
#import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
import tensorflow as tf
import tensorflow_decision_forests as tfdf
try:
    from wurlitzer import sys_pipes
except:
    from colabtools.googlelog import CaptureLog as sys_pipes

# Scikit-learn packages
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold

# display
from IPython.core.magic import register_line_magic
from IPython.display import Javascript

# Constants

In [31]:
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"
label = "attrition_flag"

# Load Data

In [32]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
submission = pd.read_csv(TEST_PATH)

In [33]:
train.head(1)

Unnamed: 0,id,attrition_flag,customer_age,gender,education_level,income_category,total_relationship_count,months_inactive_12_mon,credit_limit,total_revolving_bal,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
0,8805,0,27,F,Post-Graduate,Less than $40K,3,2,1438.3,990,0.715,3855,73,1.147,0.688


In [34]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7088 entries, 0 to 7087
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        7088 non-null   int64  
 1   attrition_flag            7088 non-null   int64  
 2   customer_age              7088 non-null   int64  
 3   gender                    7088 non-null   object 
 4   education_level           7088 non-null   object 
 5   income_category           7088 non-null   object 
 6   total_relationship_count  7088 non-null   int64  
 7   months_inactive_12_mon    7088 non-null   int64  
 8   credit_limit              7088 non-null   float64
 9   total_revolving_bal       7088 non-null   int64  
 10  total_amt_chng_q4_q1      7088 non-null   float64
 11  total_trans_amt           7088 non-null   int64  
 12  total_trans_ct            7088 non-null   int64  
 13  total_ct_chng_q4_q1       7088 non-null   float64
 14  avg_util

# Preprocess

### Drop Columns

In [35]:
train.columns

Index(['id', 'attrition_flag', 'customer_age', 'gender', 'education_level',
       'income_category', 'total_relationship_count', 'months_inactive_12_mon',
       'credit_limit', 'total_revolving_bal', 'total_amt_chng_q4_q1',
       'total_trans_amt', 'total_trans_ct', 'total_ct_chng_q4_q1',
       'avg_utilization_ratio'],
      dtype='object')

In [36]:
to_drop = ['id']

In [37]:
train = train.drop(to_drop,axis=1)
test = test.drop(to_drop,axis=1)

### Missing Values

In [38]:
def inpute_missing(dataset):
    """ 
    Edit this to fix nulls. Default version replaces all int/float with 0
    """
    for col in dataset.columns:
        if dataset[col].dtype not in [str, object]:
            dataset[col] = dataset[col].fillna(0)
    return dataset

train = inpute_missing(train)
test = inpute_missing(test)

### NLP

In [39]:
def nlp_transforms(dataset):
    """ 
    NLP tranforms here. Default, None...
    """
    return dataset

train = nlp_transforms(train)
test = nlp_transforms(test)

### Computations

In [40]:
def computation_transforms(dataset):
    """ 
    NLP tranforms here. Default, None...
    """
    return dataset

train = computation_transforms(train)
test = computation_transforms(test)

# Split & Train
Not needed if doing cross validation

In [43]:
#train_df, test_df = train_test_split(train)

#train_tf = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label=label)
#test_tf = tfdf.keras.pd_dataframe_to_tf_dataset(test_df, label=label)
#predictions = tfdf.keras.pd_dataframe_to_tf_dataset(test)

# Cross Validate

In [76]:
# The hyper-parameter templates of the Gradient Boosted Tree model.
print(tfdf.keras.GradientBoostedTreesModel.predefined_hyperparameters())

[HyperParameterTemplate(name='better_default', version=1, parameters={'growing_strategy': 'BEST_FIRST_GLOBAL'}, description='A configuration that is generally better than the default parameters without being more expensive.'), HyperParameterTemplate(name='benchmark_rank1', version=1, parameters={'growing_strategy': 'BEST_FIRST_GLOBAL', 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}, description='Top ranking hyper-parameters on our benchmark slightly modified to run in reasonable time.')]


In [98]:
models = {
    #'rf_default': tfdf.keras.RandomForestModel(),
    #'gbt_default': tfdf.keras.GradientBoostedTreesModel(),
    'gbt_tune1': tfdf.keras.GradientBoostedTreesModel(hyperparameter_template="benchmark_rank1"),
    'gbt_tune2': tfdf.keras.GradientBoostedTreesModel(
                                                      num_trees=5000,
                                                      early_stopping="LOSS_INCREASE",
                                                      ),
    #'gbt_tune3': tfdf.keras.GradientBoostedTreesModel(num_trees=500,
    #                                                  growing_strategy="BEST_FIRST_GLOBAL",
    #                                                  max_depth=8,
    #                                                  split_axis="SPARSE_OBLIQUE",
    #                                                  categorical_algorithm="RANDOM",
    #                                                  early_stopping="LOSS_INCREASE"
    #                                                  ),
    }

# Run a 10-folds cross-validation.
accuraties_per_fold = []

for key in models:
    print(key)
    for  fold_idx, (train_indices, test_indices) in enumerate(KFold(n_splits=10, shuffle=True, random_state=42).split(train)):

        print(f"Running fold {fold_idx+1}")

        # Extract the training and testing examples.
        sub_train_df = train.iloc[train_indices]
        sub_test_df = train.iloc[test_indices]

        # Convert the examples into tensorflow datasets.
        sub_train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(sub_train_df, label=label)
        sub_test_df = tfdf.keras.pd_dataframe_to_tf_dataset(sub_test_df, label=label)

        # Train the model.
        models[key].compile(metrics=["BinaryCrossentropy"])
        models[key].fit(sub_train_ds, verbose=False)

        # Evaluate the model.
        evaluation = models[key].evaluate(sub_test_df, return_dict=True, verbose=False)
        #print(f"Evaluation {evaluation}")

        accuraties_per_fold.append(evaluation["binary_crossentropy"])

    print(f"Cross-validated Score: {np.mean(accuraties_per_fold)} for model: " + key)

gbt_tune1
Running fold 1
Running fold 2
Running fold 3
Running fold 4
Running fold 5
Running fold 6
Running fold 7
Running fold 8
Running fold 9
Running fold 10
Cross-validated Score: 0.03118262328207493 for model: gbt_tune1
gbt_tune2
Running fold 1
Running fold 2
Running fold 3
Running fold 4
Running fold 5
Running fold 6
Running fold 7
Running fold 8
Running fold 9
Running fold 10
Cross-validated Score: 0.032170530408620834 for model: gbt_tune2


# Test Best Models

In [86]:
train_df, test_df = train_test_split(train)
train_tf = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label=label)
test_tf = tfdf.keras.pd_dataframe_to_tf_dataset(test_df, label=label)

In [87]:
evaluation = {}
for key in models:
    print(key)
    #Evaluate: metric logsloss: BinaryCrossentropy
    models[key].compile(metrics=["BinaryCrossentropy"])
    
    #with sys_pipes():
    models[key].fit(x=train_tf)
    evaluation[key] = models[key].evaluate(test_tf, return_dict=True)

gbt_tune1
gbt_tune2
gbt_tune3


# Train Final Model

In [None]:
predictions = tfdf.keras.pd_dataframe_to_tf_dataset(test)

In [51]:
train_tf = tfdf.keras.pd_dataframe_to_tf_dataset(train, label=label)

In [53]:
# A more complex, but possibly, more accurate model.
model = tfdf.keras.GradientBoostedTreesModel()

model.compile(metrics=["accuracy","BinaryCrossentropy"])
model.fit(train_tf)



<tensorflow.python.keras.callbacks.History at 0x7f07230afb20>

# Predictions

In [54]:
scores = model.predict(predictions)

In [56]:
scores

array([[0.00608435],
       [0.7595951 ],
       [0.00147151],
       ...,
       [0.02006311],
       [0.92395747],
       [0.95160055]], dtype=float32)

In [59]:
submission[label] = scores