In [32]:
import pandas as pd
import numpy as np

In [33]:
# Load the raw training CSV from the sibling input directory.
train_data = pd.read_csv(filepath_or_buffer="../input/train.csv")

In [34]:
# Keep only the rows that have a non-null 'Age', a 'Fare' different from 0.0
# and a non-null 'Embarked', then discard the 'Cabin' column.
# Note: a missing 'Fare' compares as "not equal to 0.0", so such rows are kept.
has_age = train_data['Age'].notna()
has_nonzero_fare = train_data['Fare'].ne(0.0)
has_embarked = train_data['Embarked'].notna()

# .loc[...] followed by .drop(...) builds a new frame; train_data is untouched.
clean_training_data = (
    train_data
    .loc[has_age & has_nonzero_fare & has_embarked]
    .drop(columns=['Cabin'])
)


In [35]:
# Drop the 'Name' column. Rationale: names are unique to each data entry, so
# the column is not expected to give us any predictive power.
clean_training_data_v1 = clean_training_data.drop('Name', axis='columns')

In [36]:
# Drop the 'Ticket' column as well, then preview the first five rows.
clean_training_data_v2 = clean_training_data_v1.drop(labels=['Ticket'], axis=1)
clean_training_data_v2.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [37]:
# Drop 'Fare' to obtain the frame that is later handed to H2O.
clean_training_data_v3 = clean_training_data_v2.drop(columns='Fare')

In [38]:
# Use H2O for modelling: attach to a local H2O instance (the recorded output
# shows an already-running one at localhost:54321 being reused).
import h2o

h2o.init(
    nthreads=-1,     # -1: let H2O use all available cores
    max_mem_size=8,  # NOTE(review): presumably read as 8 GB -- confirm for the installed h2o version
)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,32 mins 23 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.2
H2O cluster version age:,3 months and 5 days
H2O cluster name:,H2O_from_python_babs4JESUS_g9c4k2
H2O cluster total nodes:,1
H2O cluster free memory:,6.818 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [39]:
# Convert the cleaned pandas frame into an H2OFrame for use with H2O models.
clean_h2o_data = h2o.H2OFrame(python_obj=clean_training_data_v3)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [40]:
# 'Survived' is the response (target) column, not a predictor. Convert it to a
# factor (categorical) column and write it back into the frame.
# NOTE(review): presumably this is what makes H2O treat the task as
# classification rather than regression -- confirm.
clean_h2o_data['Survived'] = clean_h2o_data['Survived'].asfactor()

In [41]:
# Hold out part of the data for testing. The ratio passed below is 0.74, so
# roughly 74% goes to training/validation and the remaining ~26% to test.
# NOTE(review): if an exact 75/25 split was intended, the ratio should be 0.75.
# The seed is fixed so that the split is reproducible.
splits = clean_h2o_data.split_frame(ratios=[0.74], seed=1)
train_validation, test = splits

In [42]:
# Response column and the single predictor handed to the model.
# Note that x_columns is one column name (a plain string), not a list.
y_column, x_columns = 'Survived', 'Sex'

In [43]:
# Import H2O RF:
from h2o.estimators.random_forest import H2ORandomForestEstimator


In [44]:
# Random forest estimator; only the model id and the seed are set explicitly,
# every other hyper-parameter is left at its default.
rf_fit6 = H2ORandomForestEstimator(
    seed=1,
    model_id='rf_fit6',
)

In [45]:
# Fit the forest with x_columns as predictor(s) and y_column as the response.
# No validation_frame is supplied in this call.
rf_fit6.train(
    x=x_columns,
    y=y_column,
    training_frame=train_validation,
)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [46]:
# AUC as reported by the trained model itself (no frame is passed in).
rf_fit6.auc()

0.7505208333333333

In [47]:
# model_performance() without a frame: the recorded value is identical to
# rf_fit6.auc(), so both appear to read the same stored metrics.
rf_fit6.model_performance().auc()

0.7505208333333333

In [48]:
# Evaluate the model on train_validation explicitly and take the AUC of that
# evaluation; the recorded value differs slightly from the one stored on the model.
rf_fit6.model_performance(train_validation).auc()

0.7567708333333333

### Why does the auc() value differ when I pass in the training data?

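I am not sure why this happens; I have logged https://github.com/h2oai/h2o-tutorials/issues/67 to find out. A likely explanation: for H2O's random forest, the training metrics stored on the model are computed on out-of-bag samples, whereas `model_performance(train_validation)` scores every row with the full forest, so the two AUC values need not match.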