In [1]:
# Standard stack
import datetime
import pandas as pd
import numpy as np
import re

# Visualization
from pandas_profiling import ProfileReport

# Modeling
import tensorflow as tf
import tensorflow_decision_forests as tfdf
try:
    from wurlitzer import sys_pipes
except:
    from colabtools.googlelog import CaptureLog as sys_pipes

# processing
from sklearn.model_selection import train_test_split

# notebook
from IPython.core.magic import register_line_magic
from IPython.display import Javascript

In [2]:
KAGGLE_EVAL_METRIC = 'logloss' # string name for loss function in xgboost

TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"

label = "target"

In [3]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [4]:
to_drop = ['id']

In [5]:
train = train.drop(to_drop, axis=1)
test = test.drop(to_drop, axis=1)

In [6]:
train.target.nunique()

9

In [7]:
train.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
0,0,0,6,1,0,0,0,0,7,0,...,0,0,0,0,0,0,2,0,0,Class_6
1,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,Class_6
2,0,0,0,0,0,1,0,3,0,0,...,0,0,0,0,1,0,0,0,0,Class_2
3,0,0,7,0,1,5,2,2,0,1,...,0,4,0,2,2,0,4,3,0,Class_8
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Class_2


In [8]:
train.shape

(200000, 76)

In [9]:
train_ds_pd, test_ds_pd = train_test_split(train, test_size=0.2, random_state=42)

In [10]:
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd, label=label)
score_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test)

In [11]:
train_ds_pd.head(1)

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
153248,0,0,0,2,0,1,0,0,0,0,...,0,5,0,0,0,4,2,0,0,Class_6


In [12]:
features = ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
       'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9',
       'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14',
       'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19',
       'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24',
       'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29',
       'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34',
       'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39',
       'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44',
       'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49',
       'feature_50', 'feature_51', 'feature_52', 'feature_53', 'feature_54',
       'feature_55', 'feature_56', 'feature_57', 'feature_58', 'feature_59',
       'feature_60', 'feature_61', 'feature_62', 'feature_63', 'feature_64',
       'feature_65', 'feature_66', 'feature_67', 'feature_68', 'feature_69',
       'feature_70', 'feature_71', 'feature_72', 'feature_73', 'feature_74']

In [13]:
def create_nn_input(features):
    nn_raw_inputs = []
    for feature in features:
        nn_raw_inputs.append(tf.keras.Input(shape=(1,), name=feature, dtype="float"))
    return nn_raw_inputs

In [14]:
nn_raw_inputs = create_nn_input(features)

In [15]:
#nn_raw_inputs

In [16]:
# normalize
Normalization = tf.keras.layers.experimental.preprocessing.Normalization

def normalize_num_input(dataframe,raw_inputs):
    nn_processed_inputs = []
    for raw_input in raw_inputs:
        values = dataframe[raw_input.name].values
        input_normalizer = Normalization()
        input_normalizer.adapt(values)
        normalized_input = input_normalizer(raw_input)
        nn_processed_inputs.append(normalized_input)
    return nn_processed_inputs

In [17]:
nn_processed_inputs = normalize_num_input(train_ds_pd,nn_raw_inputs)

## Create Models

In [18]:
y = tf.keras.layers.Concatenate()(nn_processed_inputs)
y = tf.keras.layers.Dense(16, activation=tf.nn.relu6)(y)
last_layer = tf.keras.layers.Dense(8, activation=tf.nn.relu, name="last")(y)

# "3" for the three label classes. If it were a binary classification, the
# output dim would be 1.
classification_output = tf.keras.layers.Dense(9)(y)

nn_model = tf.keras.models.Model(nn_raw_inputs, classification_output)

# To reduce the risk of mistakes, group both the decision forest and the
# neural network in a single keras model.
nn_without_head = tf.keras.models.Model(inputs=nn_model.inputs, outputs=last_layer)
df_and_nn_model = tfdf.keras.GradientBoostedTreesModel(
    num_trees=500,
    growing_strategy="BEST_FIRST_GLOBAL",
    hyperparameter_template="better_default@v1",
    max_depth=8,
    split_axis="SPARSE_OBLIQUE",
    preprocessing=nn_without_head)

In [19]:
nn_model.compile(
  optimizer=tf.keras.optimizers.Adam(),
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=["accuracy"])

nn_model.fit(x=train_ds, validation_data=test_ds, epochs=10, verbose=False)
print("Evaluation:", nn_model.evaluate(test_ds))

Evaluation: [1.7765271663665771, 0.35324999690055847]


In [None]:
df_and_nn_model.compile(metrics=["accuracy"])
df_and_nn_model.fit(x=train_ds)
print("Evaluation:", df_and_nn_model.evaluate(test_ds))



# Submit

In [24]:
nn_model.predict(score_ds)

array([[-0.28142238,  1.6285971 ,  0.7621293 , ..., -0.7531957 ,
        -0.01658785,  0.583485  ],
       [-0.15659511,  0.45305225,  0.12436393, ...,  0.56598884,
         1.7222207 ,  0.9323979 ],
       [-0.54029775, -0.26007187, -0.6321474 , ..., -0.12406699,
         1.211106  ,  0.26631206],
       ...,
       [-0.17781752,  1.581102  ,  0.7540624 , ..., -0.38159484,
         0.44428742,  0.79500735],
       [-0.5016818 , -0.8102361 , -0.82603717, ...,  0.7245041 ,
         2.282336  ,  0.6098359 ],
       [-0.29859027,  0.5207304 ,  0.21241978, ...,  0.07104929,
         1.0848229 ,  0.8045405 ]], dtype=float32)