In [3]:
# ! pip install plotly-express

In [25]:
# Standard stack
import datetime
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Modeling
import tensorflow as tf
import tensorflow_decision_forests as tfdf
try:
    from wurlitzer import sys_pipes
except:
    from colabtools.googlelog import CaptureLog as sys_pipes

# Scikit-learn packages
from sklearn import metrics
from sklearn.model_selection import train_test_split

## Import helper functions
from ipynb.fs.defs.utils import *

# display
from IPython.core.magic import register_line_magic
from IPython.display import Javascript

In [26]:
KAGGLE_EVAL_METRIC = 'logloss' # string name for loss function in xgboost

In [27]:
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"

In [28]:
TARGET = ["Survived"]

# Load Data

In [112]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [113]:
## Bring in Age Predictions

In [114]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# Pre Process

In [221]:
train = nlp_transforms(train) # name features
train = ticket_transforms(train) # ticket transforms
train.Nickname = np.where(train.Nickname == 'none', 0,1)
train.Junior = np.where(train.Junior == 'none', 0,1)
train.Cabin = train.Cabin.astype(str)
train.Cabin = train.Cabin.apply(lambda x: x[0])
train['family_size'] = train.SibSp + train.Parch + 1


test = nlp_transforms(test) # name features
test = ticket_transforms(test) # ticket transforms
test.Nickname = np.where(test.Nickname == 'none', 0,1)
test.Junior = np.where(test.Junior == 'none', 0,1)
test.Cabin = test.Cabin.astype(str)
test.Cabin = test.Cabin.apply(lambda x: x[0])
test['family_size'] = test.SibSp + test.Parch + 1

# Tensorflow RandomForest

In [222]:
train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,First Name,Title,Nickname,Last Name,Junior,Middle Names,Ticket Number,Ticket Type,family_size
0,1,0,3,"braund, mr. owen harris",male,22.0,1,0,a/5 21171,7.25,...,S,braund,mr.,0,owen,1,harris,21171,a/5,2


In [223]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'First Name', 'Title',
       'Nickname', 'Last Name', 'Junior', 'Middle Names', 'Ticket Number',
       'Ticket Type', 'family_size'],
      dtype='object')

In [224]:
X_train, X_test = train_test_split(train, test_size=0.2, random_state=40)

In [275]:
train.columns.tolist()

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked',
 'First Name',
 'Title',
 'Nickname',
 'Last Name',
 'Junior',
 'Middle Names',
 'Ticket Number',
 'Ticket Type',
 'family_size']

In [489]:
columns = ['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'First Name',
 'Title',
 'Nickname',
 'Last Name',
 'Junior',
 'Middle Names',
 'Ticket Number',
 'Ticket Type',
 'family_size']

dataframe = X_train[columns]
temp = dataframe.to_dict('list')
train_dataset = tf.data.Dataset.from_tensor_slices(temp)

dataframe = X_test[columns]
temp = dataframe.to_dict('list')
test_dataset = tf.data.Dataset.from_tensor_slices(temp)
#for example in temp.take(3):
#    print({attr_name: attr_tensor.numpy() for attr_name, attr_tensor in example.items()})

In [490]:
train_dataset

<TensorSliceDataset shapes: {PassengerId: (), Survived: (), Pclass: (), Name: (), Sex: (), Age: (), SibSp: (), Parch: (), Ticket: (), Fare: (), Cabin: (), First Name: (), Title: (), Nickname: (), Last Name: (), Junior: (), Middle Names: (), Ticket Number: (), Ticket Type: (), family_size: ()}, types: {PassengerId: tf.int32, Survived: tf.int32, Pclass: tf.int32, Name: tf.string, Sex: tf.string, Age: tf.float32, SibSp: tf.int32, Parch: tf.int32, Ticket: tf.string, Fare: tf.float32, Cabin: tf.string, First Name: tf.string, Title: tf.string, Nickname: tf.int32, Last Name: tf.string, Junior: tf.int32, Middle Names: tf.string, Ticket Number: tf.int32, Ticket Type: tf.string, family_size: tf.int32}>

In [491]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,First Name,Title,Nickname,Last Name,Junior,Middle Names,Ticket Number,Ticket Type,family_size
0,1,0,3,"braund, mr. owen harris",male,22.0,1,0,a/5 21171,7.25,...,S,braund,mr.,0,owen,1,harris,21171,a/5,2
1,2,1,1,"cumings, mrs. john bradley (florence briggs th...",female,38.0,1,0,pc 17599,71.2833,...,C,cumings,mrs.,1,john,1,bradley,17599,pc,2
2,3,1,3,"heikkinen, miss. laina",female,26.0,0,0,ston/o2. 3101282,7.925,...,S,heikkinen,miss.,0,laina,1,,3101282,ston/o2.,1
3,4,1,1,"futrelle, mrs. jacques heath (lily may peel)",female,35.0,1,0,113803,53.1,...,S,futrelle,mrs.,1,jacques,1,heath,113803,,2
4,5,0,3,"allen, mr. william henry",male,35.0,0,0,373450,8.05,...,S,allen,mr.,0,william,1,henry,373450,,1


In [492]:
def prepare_dataset(data):
    
    features = {#"name" : tf.strings.split(data["Name"]),
                #"ticket" : tf.strings.split(data["Ticket Type"]),
                "title" : tf.strings.split(data["Title"]),
                "sex" : data["Sex"],
                "age" : data["Age"],
                "pclass" : data["Pclass"], 
                "Fare" : data["Fare"],
                #"Nickname" : data["Nickname"],
                #"Junior" : data["Junior"],
                #"SibSp" : data["SibSp"],
                #"Parch" : data["Parch"],
                #"name" : data["Name"]
               }
    
    try:
        label = data['Survived']
        output = features,label
    except:
        output = features
    
    return output



In [493]:
train_ds = train_dataset.batch(64).map(prepare_dataset)
test_ds = test_dataset.batch(64).map(prepare_dataset)

In [494]:
train_ds

<MapDataset shapes: ({title: (None, None), sex: (None,), age: (None,), pclass: (None,), Fare: (None,)}, (None,)), types: ({title: tf.string, sex: tf.string, age: tf.float32, pclass: tf.int32, Fare: tf.float32}, tf.int32)>

In [495]:
# Specify the model.
model_1 = tfdf.keras.GradientBoostedTreesModel(hyperparameter_template="benchmark_rank1")
model_1.compile(metrics=["accuracy"])
model_1.fit(x=train_ds)



<tensorflow.python.keras.callbacks.History at 0x7fbf9c8ce6a0>

In [496]:
evaluation = model_1.evaluate(test_ds)

print(f"BinaryCrossentropyloss: {evaluation[0]}")
print(f"Accuracy: {evaluation[1]}")

BinaryCrossentropyloss: 0.0
Accuracy: 0.7932960987091064


In [497]:
model_1.summary()

Model: "gradient_boosted_trees_model_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (5):
	Fare
	age
	pclass
	sex
	title

No weights

Variable Importance: NUM_NODES:
    1.   "Fare" 517.000000 ################
    2.    "age" 181.000000 #####
    3.  "title" 45.000000 #
    4.    "sex" 38.000000 
    5. "pclass"  9.000000 

Variable Importance: NUM_AS_ROOT:
    1.  "Fare" 12.000000 ################
    2.   "sex"  9.000000 ##########
    3.   "age"  4.000000 #
    4. "title"  3.000000 

Variable Importance: SUM_SCORE:
    1.   "Fare" 269.296718 ################
    2.    "sex" 185.353636 ##########
    3.    "age" 81.454014 ####
    4.  "title" 37.012791 ##
    5. "pclass"  2.535684 

Variabl

# Train on full Dataset

In [498]:
dataframe = train[columns]
temp = dataframe.to_dict('list')
train_dataset = tf.data.Dataset.from_tensor_slices(temp)
train_ds = train_dataset.batch(64).map(prepare_dataset)

# Specify the model.
model_1 = tfdf.keras.RandomForestModel(num_trees=30)
model_1.compile(metrics=["accuracy"])
model_1.fit(x=train_ds)



<tensorflow.python.keras.callbacks.History at 0x7fbde436cf70>

# Make Predicitons

In [499]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,First Name,Title,Nickname,Last Name,Junior,Middle Names,Ticket Number,Ticket Type,family_size,Survived
0,892,3,"kelly, mr. james",male,34.5,0,0,330911,7.8292,n,...,kelly,mr.,0,james,1,,330911,,1,0.033333
1,893,3,"wilkes, mrs. james (ellen needs)",female,47.0,1,0,363272,7.0,n,...,wilkes,mrs.,1,james,1,,363272,,2,0.666667
2,894,2,"myles, mr. thomas francis",male,62.0,0,0,240276,9.6875,n,...,myles,mr.,0,thomas,1,francis,240276,,1,0.066667
3,895,3,"wirz, mr. albert",male,27.0,0,0,315154,8.6625,n,...,wirz,mr.,0,albert,1,,315154,,1,0.166667
4,896,3,"hirvonen, mrs. alexander (helga e lindqvist)",female,22.0,1,1,3101298,12.2875,n,...,hirvonen,mrs.,1,alexander,1,,3101298,,3,0.733334


In [500]:
test_columns = columns
test_columns.remove('Survived')

dataframe = test[test_columns]
temp = dataframe.to_dict('list')
test_dataset = tf.data.Dataset.from_tensor_slices(temp)
test_ds = test_dataset.batch(64).map(prepare_dataset)


In [501]:
y_pred = model_1.predict(test_ds)

In [502]:
#y_pred

In [503]:
test['Survived'] = y_pred

In [504]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,First Name,Title,Nickname,Last Name,Junior,Middle Names,Ticket Number,Ticket Type,family_size,Survived
0,892,3,"kelly, mr. james",male,34.5,0,0,330911,7.8292,n,...,kelly,mr.,0,james,1,,330911,,1,0.066667
1,893,3,"wilkes, mrs. james (ellen needs)",female,47.0,1,0,363272,7.0,n,...,wilkes,mrs.,1,james,1,,363272,,2,0.6
2,894,2,"myles, mr. thomas francis",male,62.0,0,0,240276,9.6875,n,...,myles,mr.,0,thomas,1,francis,240276,,1,0.1
3,895,3,"wirz, mr. albert",male,27.0,0,0,315154,8.6625,n,...,wirz,mr.,0,albert,1,,315154,,1,0.033333
4,896,3,"hirvonen, mrs. alexander (helga e lindqvist)",female,22.0,1,1,3101298,12.2875,n,...,hirvonen,mrs.,1,alexander,1,,3101298,,3,0.8


In [505]:
sub = test[['PassengerId','Survived']]

In [506]:
sub['Survived'] = np.where(sub['Survived'] >= .5, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['Survived'] = np.where(sub['Survived'] >= .5, 1, 0)


In [507]:
sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [508]:
sub.to_csv('submission.csv',index=False)

In [509]:
!kaggle competitions submit -c titanic -f submission.csv -m "no name"

100%|██████████████████████████████████████| 2.77k/2.77k [00:00<00:00, 3.47kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster