In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [6]:
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
def preprocess(df):
    df = df.copy()

    def normalize_name(name):
        return " ".join([v.strip(",()[].\"'") for v in name.split(" ")])
    
    def ticket_num(x):
        return x.split(" ")[-1]

    def ticket_item(x):
        items = x.split(" ")[-1]
        return "NONE" if len(items) == 1 else "_".join(items[0:-1])

    df["Name"] = df["Name"].apply(normalize_name)
    df["Ticket_num"] = df["Ticket"].apply(ticket_num)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)
    return df

In [10]:
train_df = preprocess(train_df)
test_df = preprocess(test_df)

train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_num,Ticket_item
0,1,0,3,Braund Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,21171,2_1_1_7
1,2,1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,PC 17599,71.2833,C85,C,17599,1_7_5_9
2,3,1,3,Heikkinen Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3101282,3_1_0_1_2_8
3,4,1,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,113803,53.1,C123,S,113803,1_1_3_8_0
4,5,0,3,Allen Mr William Henry,male,35.0,0,0,373450,8.05,,S,373450,3_7_3_4_5


In [12]:
# remove unwanted columns
input_features = list(train_df.columns)
input_features.remove("Ticket")
input_features.remove("PassengerId")
input_features.remove("Survived")

print(f"{input_features=}")

input_features=['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Ticket_num', 'Ticket_item']


In [34]:
def tokenize_name(features, labels=None):
    features["Name"] = tf.strings.split(features["Name"])
    return features, labels

train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label="Survived").map(tokenize_name)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df).map(tokenize_name)

In [None]:
# training
model = tfdf.keras.GradientBoostedTreesModel(
    features=[tfdf.keras.FeatureUsage(name=n) for n in input_features],  # input features as defined above
    exclude_non_specified_features=True, # only use the defined features
    verbose=1,

    min_examples = 1, # min amount of examples in node
    categorical_algorithm="RANDOM", # learn splits on categorical attributes
    max_depth = 4, # max depth of tree
    shrinkage=0.05, # shrinking the input
    num_candidate_attributes_ratio=0.2, # ratio of attributes tested at each node
    split_axis="SPARSE_OBLIQUE", # structure of split for numerical values
    sparse_oblique_normalization="MIN_MAX", # normalization by range
    sparse_oblique_num_projections_exponent=2.0, # number of random projections
    num_trees=2000,
    validation_ratio=0.0,
)

model.compile(optimizer='adam', loss="BINOMIAL_LOG_LIKELIHOOD")
model.summary()

In [None]:
model.fit(train_ds)
self_eval = model.make_inspector().evaluation()
print(f'Accuracy: {self_eval.accuracy}, loss: {self_eval.loss}')