# Titanic competition
Ресурсы:
* https://www.kaggle.com/code/alexisbcook/titanic-tutorial 
* https://proglib.io/p/vvedenie-v-mashinnoe-obuchenie-reshaem-titanic-na-platforme-kaggle-2021-08-17 

In [None]:
import numpy as np
import pandas as pd
import os

import tensorflow as tf
import tensorflow_decision_forests as tfdf

# Report which TensorFlow Decision Forests version is installed.
print(f"Found TF-DF {tfdf.__version__}")

In [None]:
# Load the Titanic training and test CSV files from the Kaggle input directory.
train_dataset = pd.read_csv('/kaggle/input/titanic/train.csv')
test_dataset = pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
# Preview the first rows of the raw training data.
train_dataset.head()

In [None]:
# Show the (rows, columns) size of the raw training data.
train_dataset.shape

In [None]:
def preprocess(df):
    """Return a copy of ``df`` with a cleaned ``Name`` and two ticket columns.

    The input frame is left untouched. In the returned frame:

    * ``Name`` has the characters ``, ( ) [ ] . " '`` stripped from both ends
      of every space-separated token.
    * ``Ticket_number`` is the part of ``Ticket`` after its last space (the
      whole value when it contains no space).
    * ``Ticket_item`` is everything before that last space, with the remaining
      spaces turned into ``_``, or ``"NONE"`` when the ticket has no space.

    Args:
        df: DataFrame with string-valued ``Name`` and ``Ticket`` columns.

    Returns:
        A new DataFrame with ``Name`` rewritten and the ``Ticket_number`` and
        ``Ticket_item`` columns appended, in that order.
    """
    stripped_chars = ",()[].\"'"

    def _clean_name(full_name):
        tokens = full_name.split(" ")
        return " ".join(token.strip(stripped_chars) for token in tokens)

    def _number_part(ticket):
        # rpartition yields the whole string as the tail when no space exists.
        return ticket.rpartition(" ")[2]

    def _prefix_part(ticket):
        *leading, _ = ticket.split(" ")
        return "_".join(leading) if leading else "NONE"

    tickets = df["Ticket"]
    # assign() works on a copy, so the caller's frame is never modified.
    return df.assign(
        Name=df["Name"].apply(_clean_name),
        Ticket_number=tickets.apply(_number_part),
        Ticket_item=tickets.apply(_prefix_part),
    )
    
# Run the same preprocessing on both splits so they get the same derived columns.
preprocessed_train_data = preprocess(train_dataset)
preprocessed_test_data = preprocess(test_dataset)

# Preview the first rows of the preprocessed training data.
preprocessed_train_data.head()

In [None]:
# Columns that are dropped from the model inputs. "Ticket_number" is not
# listed (its removal was commented out), so it stays in the feature list.
non_feature_columns = ("Ticket", "PassengerId", "Survived")

input_features = list(preprocessed_train_data.columns)
for non_feature in non_feature_columns:
    # list.remove raises ValueError if an expected column is missing.
    input_features.remove(non_feature)

print(f"Input features: {input_features}")

In [None]:
def tokenize_names(features, labels=None):
    """Split the names into tokens. TF-DF can consume text tokens natively.

    The ``"Name"`` entry of ``features`` is replaced in place with the result
    of ``tf.strings.split``; ``labels`` is passed through unchanged.

    Args:
        features: Mapping of feature name to tensor; must contain ``"Name"``.
        labels: Optional labels accompanying the features.

    Returns:
        The ``(features, labels)`` pair, with ``features["Name"]`` tokenized.
    """
    name_tokens = tf.strings.split(features["Name"])
    features["Name"] = name_tokens
    return features, labels

# Convert each DataFrame to a TensorFlow dataset, then tokenize its "Name"
# feature. Only the training data is given the "Survived" label.
train_data = tfdf.keras.pd_dataframe_to_tf_dataset(
    preprocessed_train_data, label="Survived"
)
train_data = train_data.map(tokenize_names)

test_data = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_test_data)
test_data = test_data.map(tokenize_names)

In [None]:
# Declare exactly which columns the model may use as inputs.
feature_usages = [
    tfdf.keras.FeatureUsage(name=feature_name) for feature_name in input_features
]

model = tfdf.keras.GradientBoostedTreesModel(
    features=feature_usages,
    exclude_non_specified_features=True,  # Only use the features in "features"
    random_seed=1234,
    verbose=0,  # Very few logs
)
model.fit(train_data)

# Print the accuracy and loss from the evaluation the model inspector exposes.
inspector = model.make_inspector()
self_evaluation = inspector.evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

In [None]:
def prediction_to_kaggle_format(model, threshold=0.5, *, dataset=None, passenger_ids=None):
    """Turn the model's predicted probabilities into a Kaggle submission frame.

    Args:
        model: Object whose ``predict(dataset, verbose=0)`` returns a 2-D
            array; column 0 is used as the survival probability.
        threshold: Probabilities greater than or equal to this value are
            labelled ``1``, all others ``0``.
        dataset: Data to predict on. Defaults to the module-level
            ``test_data``.
        passenger_ids: Ids to pair with the predictions, in the same order as
            the rows of ``dataset``. Defaults to
            ``test_dataset["PassengerId"]``.

    Returns:
        DataFrame with the columns ``PassengerId`` and ``Survived`` (0/1
        integers).
    """
    # Fall back to the notebook globals so existing one-argument calls keep
    # working; explicit arguments make the function reusable on other splits.
    if dataset is None:
        dataset = test_data
    if passenger_ids is None:
        passenger_ids = test_dataset["PassengerId"]

    proba_survive = model.predict(dataset, verbose=0)[:, 0]
    return pd.DataFrame({
        "PassengerId": passenger_ids,
        "Survived": (proba_survive >= threshold).astype(int),
    })

def make_submission(kaggle_predictions, path="/kaggle/working/submission.csv"):
    """Write the predictions to a CSV file and print where it was saved.

    Args:
        kaggle_predictions: DataFrame to export; it is written without its
            index.
        path: Destination file. Defaults to ``/kaggle/working/submission.csv``.
    """
    kaggle_predictions.to_csv(path, index=False)
    print(f"Submission exported to {path}")
    
# Convert the trained model's predictions to the submission layout and export them.
kaggle_predictions = prediction_to_kaggle_format(model)
make_submission(kaggle_predictions)