In [140]:
#! pip install wurlitzer

In [735]:
# Standard stack
import datetime
import pandas as pd
import numpy as np
import re

# Visualization
from pandas_profiling import ProfileReport
#import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
import tensorflow as tf
import tensorflow_decision_forests as tfdf
try:
    from wurlitzer import sys_pipes
except:
    from colabtools.googlelog import CaptureLog as sys_pipes

# Scikit-learn packages
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer

## Import helper functions
from ipynb.fs.defs.utils import *

# display
from IPython.core.magic import register_line_magic
from IPython.display import Javascript

In [736]:
KAGGLE_EVAL_METRIC = 'logloss' # string name for loss function in xgboost

In [737]:
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"

In [738]:
label = "Survived"

# Load Data

In [739]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [740]:
# Inputation
# Replaces numerical NaN (representing missing values in Pandas Dataframe) with 0s.
# ...Neural Nets don't work well with numerical NaNs.
def impute_numbers(dataset_df):
    for col in dataset_df.columns:
        if dataset_df[col].dtype not in [str, object]:
            dataset_df[col] = dataset_df[col].fillna(0)
    return dataset_df

# Split the dataset into a training and testing dataset.
def split_dataset(dataset, test_ratio=0.30):
    """Splits a panda dataframe in two."""
    test_indices = np.random.rand(len(dataset)) < test_ratio
    return dataset[~test_indices], dataset[test_indices]

def prepare_dataset(dataset):
    name = {"Ticket" : tf.strings.split(dataset["Ticket"]).numpy()}
    return name

In [741]:
data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])

In [742]:
data

<tf.Tensor: shape=(2, 3), dtype=string, numpy=
array([[b'a', b'c', b'd'],
       [b'd', b'z', b'b']], dtype=object)>

In [743]:
nlp_transforms(train)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,First Name,Title,Nickname,Last Name,Junior,Middle Names
0,1,0,3,"braund, mr. owen harris",male,22.0,1,0,A/5 21171,7.2500,,S,braund,mr.,none,owen,False,harris
1,2,1,1,"cumings, mrs. john bradley (florence briggs th...",female,38.0,1,0,PC 17599,71.2833,C85,C,cumings,mrs.,(florence briggs thayer),john,False,bradley
2,3,1,3,"heikkinen, miss. laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,heikkinen,miss.,none,laina,False,
3,4,1,1,"futrelle, mrs. jacques heath (lily may peel)",female,35.0,1,0,113803,53.1000,C123,S,futrelle,mrs.,(lily may peel),jacques,False,heath
4,5,0,3,"allen, mr. william henry",male,35.0,0,0,373450,8.0500,,S,allen,mr.,none,william,False,henry
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"montvila, rev. juozas",male,27.0,0,0,211536,13.0000,,S,montvila,rev.,none,juozas,False,
887,888,1,1,"graham, miss. margaret edith",female,19.0,0,0,112053,30.0000,B42,S,graham,miss.,none,margaret,False,edith
888,889,0,3,"johnston, miss. catherine helen ""carrie""",female,,1,2,W./C. 6607,23.4500,,S,johnston,miss.,"""carrie""",catherine,False,helen
889,890,1,1,"behr, mr. karl howell",male,26.0,0,0,111369,30.0000,C148,C,behr,mr.,none,karl,False,howell


In [744]:
train = impute_numbers(train)
train_ds_pd, test_ds_pd = split_dataset(train)
print("{} examples in training, {} examples for testing.".format(len(train_ds_pd), len(test_ds_pd)))

620 examples in training, 271 examples for testing.


In [745]:
# Convert the datasets into tensorflow datasets
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd, label=label)



In [746]:
train_ds

<BatchDataset shapes: ({PassengerId: (None,), Pclass: (None,), Name: (None,), Sex: (None,), Age: (None,), SibSp: (None,), Parch: (None,), Ticket: (None,), Fare: (None,), Cabin: (None,), Embarked: (None,), First_Name: (None,), Title: (None,), Nickname: (None,), Last_Name: (None,), Junior: (None,), Middle_Names: (None,)}, (None,)), types: ({PassengerId: tf.int64, Pclass: tf.int64, Name: tf.string, Sex: tf.string, Age: tf.float64, SibSp: tf.int64, Parch: tf.int64, Ticket: tf.string, Fare: tf.float64, Cabin: tf.string, Embarked: tf.string, First_Name: tf.string, Title: tf.string, Nickname: tf.string, Last_Name: tf.string, Junior: tf.bool, Middle_Names: tf.string}, tf.int64)>

# Build Model

In [747]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,First Name,Title,Nickname,Last Name,Junior,Middle Names
0,1,0,3,"braund, mr. owen harris",male,22.0,1,0,A/5 21171,7.25,,S,braund,mr.,none,owen,False,harris
1,2,1,1,"cumings, mrs. john bradley (florence briggs th...",female,38.0,1,0,PC 17599,71.2833,C85,C,cumings,mrs.,(florence briggs thayer),john,False,bradley
2,3,1,3,"heikkinen, miss. laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,heikkinen,miss.,none,laina,False,
3,4,1,1,"futrelle, mrs. jacques heath (lily may peel)",female,35.0,1,0,113803,53.1,C123,S,futrelle,mrs.,(lily may peel),jacques,False,heath
4,5,0,3,"allen, mr. william henry",male,35.0,0,0,373450,8.05,,S,allen,mr.,none,william,False,henry


In [748]:
train_ds_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 620 entries, 0 to 890
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   620 non-null    int64  
 1   Survived      620 non-null    int64  
 2   Pclass        620 non-null    int64  
 3   Name          620 non-null    object 
 4   Sex           620 non-null    object 
 5   Age           620 non-null    float64
 6   SibSp         620 non-null    int64  
 7   Parch         620 non-null    int64  
 8   Ticket        620 non-null    object 
 9   Fare          620 non-null    float64
 10  Cabin         134 non-null    object 
 11  Embarked      618 non-null    object 
 12  First Name    620 non-null    object 
 13  Title         620 non-null    object 
 14  Nickname      620 non-null    object 
 15  Last Name     620 non-null    object 
 16  Junior        620 non-null    bool   
 17  Middle Names  620 non-null    object 
dtypes: bool(1), float64(2), int64(

#### Define Featrures

In [749]:
#https://www.tensorflow.org/decision_forests/tutorials/intermediate_colab

In [750]:
input_1 = tf.keras.Input(shape=(1,), name="Age", dtype="float")
input_2 = tf.keras.Input(shape=(1,), name="Title", dtype="string")
input_3 = tf.keras.Input(shape=(1,), name="Pclass", dtype="float")
input_4 = tf.keras.Input(shape=(1,), name="Sex", dtype="string")
input_5 = tf.keras.Input(shape=(1,), name="SibSp", dtype="float")
input_6 = tf.keras.Input(shape=(1,), name="Parch", dtype="float")
input_7 = tf.keras.Input(shape=(1,), name="Fare", dtype="float")
input_8 = tf.keras.Input(shape=(1,), name="Cabin", dtype="string")
input_9 = tf.keras.Input(shape=(1,), name="Embarked", dtype="string")
input_10 = tf.keras.Input(shape=(1,), name="Title", dtype="string")

nn_raw_inputs = [input_1, input_2,input_3,input_4,input_5,input_6,input_7,input_8,input_9,input_10]
nn_raw_inputs = [input_1,input_7,input_10]

#### Normalize Features

In [751]:
Normalization = tf.keras.layers.experimental.preprocessing.Normalization
CategoryEncoding = tf.keras.layers.experimental.preprocessing.CategoryEncoding
StringLookup = tf.keras.layers.experimental.preprocessing.StringLookup

# Numerical
values = train_ds_pd["Age"].values
input_1_normalizer = Normalization()
input_1_normalizer.adapt(values)

values = train_ds_pd["Fare"].values
input_2_normalizer = Normalization()
input_2_normalizer.adapt(values)

# String (https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/StringLookup)
values = train_ds_pd["Title"].values
#vocab =  ['asdasd','sadasd'] #list(words)
input_3_indexer = StringLookup(max_tokens=4,vocabulary=vocab)
input_3_indexer.adapt(values)

input_3_onehot = CategoryEncoding(output_mode="binary", max_tokens=4)

normalized_input_1 = input_1_normalizer(input_1)
normalized_input_2 = input_2_normalizer(input_7)
normalized_input_3 = input_3_onehot(input_3_indexer(input_10))

nn_processed_inputs = [normalized_input_1, normalized_input_2,normalized_input_3]





In [752]:
#train_ds_pd["Name"].values

In [753]:
input_10

<KerasTensor: shape=(None, 1) dtype=string (created by layer 'Title')>

In [754]:
#values[0:3]

In [755]:
#input_3_indexer.get_vocabulary()

#### Build the body of the neural network:

In [756]:
y = tf.keras.layers.Concatenate()(nn_processed_inputs)
y = tf.keras.layers.Dense(16, activation=tf.nn.relu6)(y)
last_layer = tf.keras.layers.Dense(8, activation=tf.nn.relu, name="last")(y)

# "3" for the three label classes. If it were a binary classification, the
# output dim would be 1.
classification_output = tf.keras.layers.Dense(2)(y)

nn_model = tf.keras.models.Model(nn_raw_inputs, classification_output)

# To reduce the risk of mistakes, group both the decision forest and the
# neural network in a single keras model.
nn_without_head = tf.keras.models.Model(inputs=nn_model.inputs, outputs=last_layer)
df_and_nn_model = tfdf.keras.RandomForestModel(preprocessing=nn_without_head)

In [764]:
nn_model.inputs

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'Age')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'Fare')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Title')>]

#### Traim amd evaluate the models

In [762]:
nn_model.compile(
  optimizer=tf.keras.optimizers.Adam(),
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=["accuracy"])

nn_model.fit(x=train_ds, validation_data=test_ds, epochs=10, verbose=False)
print("Evaluation:", nn_model.evaluate(test_ds))

Evaluation: [0.5189318060874939, 0.7527675032615662]


In [763]:
df_and_nn_model.compile(metrics=["accuracy"])
df_and_nn_model.fit(x=train_ds)
print("Evaluation:", df_and_nn_model.evaluate(test_ds))

Evaluation: [0.0, 0.7490774989128113]
