In [None]:
# https://www.kaggle.com/c/tabular-playground-series-jun-2021/data

In [1]:
# Standard stack
import datetime
import pandas as pd
import numpy as np
import re

# Visualization
from pandas_profiling import ProfileReport

# Modeling
import tensorflow as tf
import tensorflow_decision_forests as tfdf
try:
    from wurlitzer import sys_pipes
except:
    from colabtools.googlelog import CaptureLog as sys_pipes

# processing
from sklearn.model_selection import train_test_split

# notebook
from IPython.core.magic import register_line_magic
from IPython.display import Javascript

In [2]:
KAGGLE_EVAL_METRIC = 'logloss' # string name for loss function in xgboost

TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"

label = "target"

In [3]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [4]:
to_drop = ['id']

In [5]:
train = train.drop(to_drop, axis=1)
test = test

In [6]:
train.target.nunique()

9

In [7]:
train.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
0,0,0,6,1,0,0,0,0,7,0,...,0,0,0,0,0,0,2,0,0,Class_6
1,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,Class_6
2,0,0,0,0,0,1,0,3,0,0,...,0,0,0,0,1,0,0,0,0,Class_2
3,0,0,7,0,1,5,2,2,0,1,...,0,4,0,2,2,0,4,3,0,Class_8
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Class_2


In [8]:
train.shape

(200000, 76)

In [9]:
train_ds_pd, test_ds_pd = train_test_split(train, test_size=0.2, random_state=42)

In [10]:
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd, label=label)
score_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test)

In [11]:
train_ds_pd.head(1)

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
153248,0,0,0,2,0,1,0,0,0,0,...,0,5,0,0,0,4,2,0,0,Class_6


In [12]:
features = ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
       'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9',
       'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14',
       'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19',
       'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24',
       'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29',
       'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34',
       'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39',
       'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44',
       'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49',
       'feature_50', 'feature_51', 'feature_52', 'feature_53', 'feature_54',
       'feature_55', 'feature_56', 'feature_57', 'feature_58', 'feature_59',
       'feature_60', 'feature_61', 'feature_62', 'feature_63', 'feature_64',
       'feature_65', 'feature_66', 'feature_67', 'feature_68', 'feature_69',
       'feature_70', 'feature_71', 'feature_72', 'feature_73', 'feature_74']

In [13]:
def create_nn_input(features):
    nn_raw_inputs = []
    for feature in features:
        nn_raw_inputs.append(tf.keras.Input(shape=(1,), name=feature, dtype="float"))
    return nn_raw_inputs

In [14]:
nn_raw_inputs = create_nn_input(features)

In [15]:
#nn_raw_inputs

In [16]:
# normalize
Normalization = tf.keras.layers.experimental.preprocessing.Normalization

def normalize_num_input(dataframe,raw_inputs):
    nn_processed_inputs = []
    for raw_input in raw_inputs:
        values = dataframe[raw_input.name].values
        input_normalizer = Normalization()
        input_normalizer.adapt(values)
        normalized_input = input_normalizer(raw_input)
        nn_processed_inputs.append(normalized_input)
    return nn_processed_inputs

In [17]:
nn_processed_inputs = normalize_num_input(train_ds_pd,nn_raw_inputs)

## Create Models

In [28]:
y = tf.keras.layers.Concatenate()(nn_processed_inputs)
y = tf.keras.layers.Dense(16, activation=tf.nn.relu6)(y)
last_layer = tf.keras.layers.Dense(8, activation=tf.nn.relu, name="last")(y)

# "3" for the three label classes. If it were a binary classification, the
# output dim would be 1.
classification_output = tf.keras.layers.Dense(9, activation='softmax')(y)

nn_model = tf.keras.models.Model(nn_raw_inputs, classification_output)

# To reduce the risk of mistakes, group both the decision forest and the
# neural network in a single keras model.
nn_without_head = tf.keras.models.Model(inputs=nn_model.inputs, outputs=last_layer)
df_and_nn_model = tfdf.keras.GradientBoostedTreesModel(
    num_trees=500,
    growing_strategy="BEST_FIRST_GLOBAL",
    hyperparameter_template="better_default@v1",
    max_depth=8,
    split_axis="SPARSE_OBLIQUE",
    preprocessing=nn_without_head)

In [29]:
nn_model.compile(
  optimizer=tf.keras.optimizers.Adam(),
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=["accuracy"])

nn_model.fit(x=train_ds, validation_data=test_ds, epochs=10, verbose=False)
print("Evaluation:", nn_model.evaluate(test_ds))



Evaluation: [1.777343511581421, 0.3544999957084656]


In [42]:
df_and_nn_model.compile(metrics=["accuracy"])
df_and_nn_model.fit(x=train_ds)
print("Evaluation:", df_and_nn_model.evaluate(test_ds))

Evaluation: [0.0, 0.32647499442100525]


In [None]:
1 + 1

In [None]:
df_and_nn_model.summary()

In [None]:
## Only Boosted Trees

In [57]:
gbt_model = tfdf.keras.GradientBoostedTreesModel(
    num_trees=500,
    growing_strategy="BEST_FIRST_GLOBAL",
    hyperparameter_template="better_default@v1",
    max_depth=8,
    split_axis="SPARSE_OBLIQUE")

In [None]:
gbt_model.compile(metrics=["accuracy"])
gbt_model.fit(x=train_ds)
print("Evaluation:", gbt_model.evaluate(test_ds))

In [None]:
nn_processed_inputs

In [None]:
gbt_model.GradientBoostedTreesModel

# Submit

In [43]:
scores = df_and_nn_model.predict(score_ds)



In [44]:
scores[0]

array([0.05123195, 0.3139285 , 0.10311405, 0.02255395, 0.01415913,
       0.23455738, 0.04244225, 0.11815401, 0.09985875], dtype=float32)

In [30]:
scores = nn_model.predict(score_ds)

In [31]:
scores[0]

array([0.05057526, 0.3466419 , 0.13916528, 0.02596324, 0.01158005,
       0.19335993, 0.03330079, 0.07270449, 0.12670901], dtype=float32)

In [45]:
scores[0].sum()

1.0

In [48]:
submission = df_and_nn_model.predict(score_ds)

In [49]:
submission

array([[0.05123195, 0.3139285 , 0.10311405, ..., 0.04244225, 0.11815401,
        0.09985875],
       [0.05292083, 0.19752948, 0.09992266, ..., 0.06918231, 0.23192424,
        0.1386662 ],
       [0.0427525 , 0.0806853 , 0.05078527, ..., 0.04061898, 0.16725162,
        0.09383605],
       ...,
       [0.0592821 , 0.16076756, 0.1134045 , ..., 0.05798278, 0.19935758,
        0.14670154],
       [0.04319179, 0.13510779, 0.07627949, ..., 0.04807479, 0.20024964,
        0.11071635],
       [0.04554993, 0.09694237, 0.06279515, ..., 0.0865614 , 0.24397598,
        0.13778292]], dtype=float32)

In [50]:
submit = pd.DataFrame(submission, columns=['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9'])

In [51]:
submit.head()

Unnamed: 0,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,0.051232,0.313929,0.103114,0.022554,0.014159,0.234557,0.042442,0.118154,0.099859
1,0.052921,0.197529,0.099923,0.02614,0.016954,0.16676,0.069182,0.231924,0.138666
2,0.042752,0.080685,0.050785,0.01976,0.013847,0.490464,0.040619,0.167252,0.093836
3,0.041876,0.17599,0.102015,0.024907,0.016299,0.145524,0.074987,0.263153,0.155249
4,0.041057,0.164555,0.08278,0.020824,0.014714,0.279067,0.051768,0.222663,0.122573


In [52]:
submit['id'] = test['id']

In [53]:
submit = submit[['id','Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9']]

In [54]:
submit.to_csv('submission.csv', index=False)

In [55]:
pd.read_csv('submission.csv')

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,200000,0.051232,0.313928,0.103114,0.022554,0.014159,0.234557,0.042442,0.118154,0.099859
1,200001,0.052921,0.197529,0.099923,0.026140,0.016954,0.166760,0.069182,0.231924,0.138666
2,200002,0.042752,0.080685,0.050785,0.019760,0.013847,0.490464,0.040619,0.167252,0.093836
3,200003,0.041876,0.175990,0.102015,0.024907,0.016299,0.145524,0.074987,0.263153,0.155249
4,200004,0.041057,0.164555,0.082780,0.020824,0.014714,0.279067,0.051768,0.222663,0.122573
...,...,...,...,...,...,...,...,...,...,...
99995,299995,0.048629,0.207479,0.094437,0.026961,0.015818,0.214119,0.059243,0.205237,0.128076
99996,299996,0.037546,0.109343,0.072015,0.020469,0.014286,0.387404,0.046645,0.219495,0.092798
99997,299997,0.059282,0.160768,0.113405,0.026677,0.016689,0.219137,0.057983,0.199358,0.146702
99998,299998,0.043192,0.135108,0.076279,0.021890,0.013879,0.350612,0.048075,0.200250,0.110716


In [56]:
! kaggle competitions submit -c tabular-playground-series-jun-2021 -f submission.csv -m "Baseline"

100%|██████████████████████████████████████| 10.3M/10.3M [00:08<00:00, 1.35MB/s]
Successfully submitted to Tabular Playground Series - Jun 2021