In [3]:
# ! pip install plotly-express

In [137]:
# Standard stack
import datetime
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Modeling
import tensorflow as tf
import tensorflow_decision_forests as tfdf
try:
    from wurlitzer import sys_pipes
except:
    from colabtools.googlelog import CaptureLog as sys_pipes

# Scikit-learn packages
from sklearn import metrics
from sklearn.model_selection import train_test_split

## Import helper functions
from ipynb.fs.defs.utils import *

# display
from IPython.core.magic import register_line_magic
from IPython.display import Javascript

In [138]:
KAGGLE_EVAL_METRIC = 'logloss' # string name for loss function in xgboost

In [139]:
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"

In [140]:
TARGET = ["Survived"]

# Load Data

In [141]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [142]:
## Bring in Age Predictions

In [143]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# Pre Process

In [145]:
train = nlp_transforms(train) # name features
train = ticket_transforms(train) # ticket transforms
train.Nickname = np.where(train.Nickname == 'none', 0,1)
train.Junior = np.where(train.Junior == 'none', 0,1)
train.Cabin = train.Cabin.astype(str)
train.Cabin = train.Cabin.apply(lambda x: x[0])
train['family_size'] = train.SibSp + train.Parch + 1


test = nlp_transforms(test) # name features
test = ticket_transforms(test) # ticket transforms
test.Nickname = np.where(test.Nickname == 'none', 0,1)
test.Junior = np.where(test.Junior == 'none', 0,1)
test.Cabin = test.Cabin.astype(str)
test.Cabin = test.Cabin.apply(lambda x: x[0])
test['family_size'] = test.SibSp + test.Parch + 1

# Tensorflow RandomForest

In [147]:
train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,First Name,Title,Nickname,Last Name,Junior,Middle Names,Ticket Number,Ticket Type,family_size
0,1,0,3,"braund, mr. owen harris",male,22.0,1,0,a/5 21171,7.25,...,S,braund,mr.,0,owen,1,harris,21171,a/5,2


In [212]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'First Name', 'Title',
       'Nickname', 'Last Name', 'Junior', 'Middle Names', 'Ticket Number',
       'Ticket Type', 'family_size'],
      dtype='object')

In [213]:
X_train, X_test = train_test_split(train, test_size=0.2, random_state=40)

In [214]:
columns = ['Survived','Sex','Age', 'Name','Pclass','Ticket','Fare','Ticket Number','Ticket Type']
dataframe = X_train[columns]
temp = dataframe.to_dict('list')
train_dataset = tf.data.Dataset.from_tensor_slices(temp)

dataframe = X_test[columns]
temp = dataframe.to_dict('list')
test_dataset = tf.data.Dataset.from_tensor_slices(temp)
#for example in temp.take(3):
#    print({attr_name: attr_tensor.numpy() for attr_name, attr_tensor in example.items()})

In [215]:
def prepare_dataset(example):
    label = example["Survived"]
    features = {"name" : tf.strings.split(example["Name"]),
                #"ticket_type" : tf.strings.split(example["Ticket Type"]),
                #"ticket" : tf.strings.split(example["Ticket"]),
                #"sex" : example["Sex"],
                #"age" : example["Age"],
                #"ticket" : example["Ticket"],
                #"pclass" : example["Pclass"], 
                #"name" : example["Name"]
               }
    return features, label



In [216]:
train_ds = train_dataset.batch(64).map(prepare_dataset)
test_ds = test_dataset.batch(64).map(prepare_dataset)

In [217]:
train_ds

<MapDataset shapes: ({name: (None, None)}, (None,)), types: ({name: tf.string}, tf.int32)>

In [218]:
# Specify the model.
model_1 = tfdf.keras.RandomForestModel(num_trees=30)
model_1.compile(metrics=["accuracy"])
model_1.fit(x=train_ds)



<tensorflow.python.keras.callbacks.History at 0x7fb6baf94130>

In [219]:
evaluation = model_1.evaluate(test_ds)

print(f"BinaryCrossentropyloss: {evaluation[0]}")
print(f"Accuracy: {evaluation[1]}")

BinaryCrossentropyloss: 0.0
Accuracy: 0.7988826632499695


In [220]:
evaluation = model_1.evaluate(test_ds)

print(f"BinaryCrossentropyloss: {evaluation[0]}")
print(f"Accuracy: {evaluation[1]}")

BinaryCrossentropyloss: 0.0
Accuracy: 0.7988826632499695


# Train on full Dataset

In [221]:
columns = ['Survived','Sex','Age', 'Name','Pclass','Ticket','Fare','Ticket Number','Ticket Type']
dataframe = train[columns]
temp = dataframe.to_dict('list')
train_dataset = tf.data.Dataset.from_tensor_slices(temp)

train_ds = train_dataset.batch(64).map(prepare_dataset)

# Specify the model.
model_1 = tfdf.keras.RandomForestModel(num_trees=30)
model_1.compile(metrics=["accuracy"])
model_1.fit(x=train_ds)



<tensorflow.python.keras.callbacks.History at 0x7fb6bae6bd00>

# Make Predicitons

In [222]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,First Name,Title,Nickname,Last Name,Junior,Middle Names,Ticket Number,Ticket Type,family_size,Survived
0,892,3,"kelly, mr. james",male,34.5,0,0,330911,7.8292,n,...,kelly,mr.,0,james,1,,330911,,1,0.0
1,893,3,"wilkes, mrs. james (ellen needs)",female,47.0,1,0,363272,7.0,n,...,wilkes,mrs.,1,james,1,,363272,,2,0.4
2,894,2,"myles, mr. thomas francis",male,62.0,0,0,240276,9.6875,n,...,myles,mr.,0,thomas,1,francis,240276,,1,0.066667
3,895,3,"wirz, mr. albert",male,27.0,0,0,315154,8.6625,n,...,wirz,mr.,0,albert,1,,315154,,1,0.333333
4,896,3,"hirvonen, mrs. alexander (helga e lindqvist)",female,22.0,1,1,3101298,12.2875,n,...,hirvonen,mrs.,1,alexander,1,,3101298,,3,0.7


In [223]:
columns = ['Sex','Age', 'Name','Pclass','Ticket','Fare','Ticket Number','Ticket Type']
dataframe = test[columns]
temp = dataframe.to_dict('list')
test_dataset = tf.data.Dataset.from_tensor_slices(temp)

def prepare_dataset(example):
    features = {"name" : tf.strings.split(example["Name"]),
                #"ticket_type" : tf.strings.split(example["Ticket Type"]),
                #"ticket" : tf.strings.split(example["Ticket"]),
                #"sex" : example["Sex"],
                #"age" : example["Age"],
                #"ticket" : example["Ticket"],
                #"pclass" : example["Pclass"], 
                #"name" : example["Name"]
               }
    return features

test_ds = test_dataset.batch(64).map(prepare_dataset)


In [224]:
y_pred = model_1.predict(test_ds)

In [225]:
#y_pred

In [226]:
test['Survived'] = y_pred

In [227]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,First Name,Title,Nickname,Last Name,Junior,Middle Names,Ticket Number,Ticket Type,family_size,Survived
0,892,3,"kelly, mr. james",male,34.5,0,0,330911,7.8292,n,...,kelly,mr.,0,james,1,,330911,,1,0.0
1,893,3,"wilkes, mrs. james (ellen needs)",female,47.0,1,0,363272,7.0,n,...,wilkes,mrs.,1,james,1,,363272,,2,0.966667
2,894,2,"myles, mr. thomas francis",male,62.0,0,0,240276,9.6875,n,...,myles,mr.,0,thomas,1,francis,240276,,1,0.066667
3,895,3,"wirz, mr. albert",male,27.0,0,0,315154,8.6625,n,...,wirz,mr.,0,albert,1,,315154,,1,0.2
4,896,3,"hirvonen, mrs. alexander (helga e lindqvist)",female,22.0,1,1,3101298,12.2875,n,...,hirvonen,mrs.,1,alexander,1,,3101298,,3,0.866667


In [228]:
sub = test[['PassengerId','Survived']]

In [229]:
sub['Survived'] = np.where(sub['Survived'] >= .5, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['Survived'] = np.where(sub['Survived'] >= .5, 1, 0)


In [230]:
sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [231]:
sub.to_csv('submission.csv',index=False)

In [232]:
!kaggle competitions submit -c titanic -f submission.csv -m "Only Name"

100%|██████████████████████████████████████| 2.77k/2.77k [00:00<00:00, 3.95kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster