<a href="https://colab.research.google.com/github/ForThisPlayer10/neural/blob/main/keras/k-titanic/k_titanic_pred_tfdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install tensorflow_decision_forests

Collecting tensorflow_decision_forests
  Downloading tensorflow_decision_forests-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.0 kB)
Collecting tensorflow==2.16.2 (from tensorflow_decision_forests)
  Downloading tensorflow-2.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting wurlitzer (from tensorflow_decision_forests)
  Downloading wurlitzer-3.1.1-py3-none-any.whl.metadata (2.5 kB)
Collecting ydf (from tensorflow_decision_forests)
  Downloading ydf-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting ml-dtypes~=0.3.1 (from tensorflow==2.16.2->tensorflow_decision_forests)
  Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting tensorboard<2.17,>=2.16 (from tensorflow==2.16.2->tensorflow_decision_forests)
  Downloading tensorboard-2.16.2-py3-none-any.whl.metadata (1.6 kB)
INFO: pip is looking at multiple versions o

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_decision_forests as tfdf

In [3]:
print(f'Found tdfd version: {tfdf.__version__}')

Found tdfd version: 1.9.2


In [4]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
def preprocess(df):
    df = df.copy()

    def normalize_name(name):
        return " ".join([v.strip(",()[].\"'") for v in name.split(" ")])

    def ticket_num(x):
        return x.split(" ")[-1]

    def ticket_item(x):
        items = x.split(" ")
        return "NONE" if len(items) == 1 else "_".join(items[0:-1])

    df["Name"] = df["Name"].apply(normalize_name)
    df["Ticket_num"] = df["Ticket"].apply(ticket_num)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)
    return df

In [8]:
train_df = preprocess(train_df)
test_df = preprocess(test_df)

train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_num,Ticket_item
0,1,0,3,Braund Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,21171,A/5
1,2,1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,PC 17599,71.2833,C85,C,17599,PC
2,3,1,3,Heikkinen Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3101282,STON/O2.
3,4,1,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,113803,53.1,C123,S,113803,NONE
4,5,0,3,Allen Mr William Henry,male,35.0,0,0,373450,8.05,,S,373450,NONE


In [9]:
# remove unwanted columns
input_features = list(train_df.columns)
input_features.remove("Ticket")
input_features.remove("PassengerId")
input_features.remove("Survived")

print(f"{input_features=}")

input_features=['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Ticket_num', 'Ticket_item']


In [10]:
def tokenize_name(features, labels=None):
    features["Name"] = tf.strings.split(features["Name"])
    return features, labels

train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label="Survived").map(tokenize_name)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df).map(tokenize_name)

In [24]:
# training
model = tfdf.keras.GradientBoostedTreesModel(
    features=[tfdf.keras.FeatureUsage(name=n) for n in input_features],  # input features as defined above
    exclude_non_specified_features=True, # only use the defined features
    verbose=1,

    min_examples = 1, # min amount of examples in node
    categorical_algorithm="RANDOM", # learn splits on categorical attributes
    # max_depth = 4, # max depth of tree
    shrinkage=0.05, # shrinking the input
    # num_candidate_attributes_ratio=0.2, # ratio of attributes tested at each node
    split_axis="SPARSE_OBLIQUE", # structure of split for numerical values
    sparse_oblique_normalization="MIN_MAX", # normalization by range
    sparse_oblique_num_projections_exponent=2.0, # number of random projections
    num_trees=2000,
    # validation_ratio=0.0,
)

Use /tmp/tmpembqyp0t as temporary training directory


In [25]:
model_history = model.fit(train_ds)
self_eval = model.make_inspector().evaluation()
print(f'Accuracy: {self_eval.accuracy}, loss: {self_eval.loss}')

Reading training dataset...
Training dataset read in 0:00:00.344369. Found 891 examples.
Training model...
Model trained in 0:00:01.223694
Compiling model...
Model compiled.
Accuracy: 0.8630136847496033, loss: 0.7091237902641296


In [26]:
# model.summary()

In [42]:
test_survived = model.predict(test_ds, verbose=0)[:, 0]
test_survived_df = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": (test_survived >= 0.5) #.astype(int)
})

In [48]:
result = pd.concat([test_df, test_survived_df], axis=1, join="inner")
result

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_num,Ticket_item,PassengerId.1,Survived
0,892,3,Kelly Mr James,male,34.5,0,0,330911,7.8292,,Q,330911,NONE,892,False
1,893,3,Wilkes Mrs James Ellen Needs,female,47.0,1,0,363272,7.0000,,S,363272,NONE,893,False
2,894,2,Myles Mr Thomas Francis,male,62.0,0,0,240276,9.6875,,Q,240276,NONE,894,False
3,895,3,Wirz Mr Albert,male,27.0,0,0,315154,8.6625,,S,315154,NONE,895,False
4,896,3,Hirvonen Mrs Alexander Helga E Lindqvist,female,22.0,1,1,3101298,12.2875,,S,3101298,NONE,896,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,Spector Mr Woolf,male,,0,0,A.5. 3236,8.0500,,S,3236,A.5.,1305,False
414,1306,1,Oliva y Ocana Dona Fermina,female,39.0,0,0,PC 17758,108.9000,C105,C,17758,PC,1306,True
415,1307,3,Saether Mr Simon Sivertsen,male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,3101262,SOTON/O.Q.,1307,False
416,1308,3,Ware Mr Frederick,male,,0,0,359309,8.0500,,S,359309,NONE,1308,False


In [50]:
# hyperparameter tuning
tuner = tfdf.tuner.RandomSearch(num_trials=1000) # number of random hyperparameter values

tuner.choice('min_examples', [1, 2, 5, 7, 10])
tuner.choice('categorical_algorithm', ["RANDOM", 'CART'])

local_search_space = tuner.choice("growing_strategy", ["LOCAL"])
local_search_space.choice("max_depth", [3,4,5,6,7,8])

global_search_space = tuner.choice("growing_strategy", ["BEST_FIRST_GLOBAL"], merge=True)
global_search_space.choice("max_num_nodes", [16, 32, 64, 128, 256])

tuner.choice('shrinkage', [0.02, 0.05, 0.10, 0.15])
tuner.choice('num_candidate_attributes_ratio', [0.2, 0.5, 0.9, 1.0])
tuner.choice('split_axis', ["AXIS_ALIGNED"])

oblique_space = tuner.choice("split_axis", ["SPARSE_OBLIQUE"], merge=True)
oblique_space.choice("sparse_oblique_normalization", ['MIN_MAX', 'NONE', 'STANDARD_DEVIATION'])
oblique_space.choice("sparse_oblique_weights", ["BINARY", "CONTINUOUS"])
oblique_space.choice("sparse_oblique_num_projections_exponent", [1.0, 1.5, 2.0])

tuned_model = tfdf.keras.GradientBoostedTreesModel(tuner=tuner)
tuned_model_history = tuned_model.fit(train_ds)
tuned_self_eval = tuned_model.make_inspector().evaluation()
print(f'Accuracy: {tuned_self_eval.accuracy}, loss: {tuned_self_eval.loss}')

Use /tmp/tmpg8f570nj as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.538791. Found 891 examples.
Training model...
Model trained in 0:09:11.075278
Compiling model...
Model compiled.
Accuracy: 0.8767123222351074, loss: 0.6746237277984619


In [57]:
test_survived_tuned = tuned_model.predict(test_ds, verbose=0)[:, 0]
test_survived_df_tuned = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": (test_survived_tuned >= 0.5) #.astype(int)
})
print(f"Two dataframes: {test_survived_df.equals(test_survived_df_tuned)}")
result = pd.concat([test_survived_df, test_survived_df_tuned], axis=1, join="inner")
test_survived_df.eq(test_survived_df_tuned, axis=1)['Survived'] == False

Two dataframes: False


Unnamed: 0,Survived
0,False
1,False
2,False
3,False
4,True
...,...
413,False
414,False
415,False
416,False


In [None]:
predictions = None
num_predictions = 0

for i in range(100):
    print(f"i:{i}")
    # Possible models: GradientBoostedTreesModel or RandomForestModel
    model = tfdf.keras.GradientBoostedTreesModel(
        verbose=1,
        features=[tfdf.keras.FeatureUsage(name=n) for n in input_features],
        exclude_non_specified_features=True,
        random_seed=i,
        honest=True,
    )
    model.fit(train_ds)

    sub_predictions = model.predict(test_ds, verbose=0)[:,0]
    if predictions is None:
        predictions = sub_predictions
    else:
        predictions += sub_predictions
    num_predictions += 1

predictions/=num_predictions

kaggle_predictions = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": (predictions >= 0.5).astype(int)
    })

