In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are expected in the current working directory ("./").
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in that directory

import os
print(os.listdir("./"))

# Any results you write to the current directory are saved as output.

#data cleaning and feature engineering 
def get_name_prefix(data):
    """
    Add a numeric 'Prefix' column derived from the title found in 'Name'.

    The frame is modified in place and nothing is returned. Codes are
    1.0 by default, 2 for 'Miss.', 3 for 'Mrs.' and 4 for 'Mr.'; when a name
    contains several of these titles the one checked last wins.

    @param data pandas DataFrame with a string 'Name' column
    """
    # Every passenger starts with the default code (float column of ones).
    data['Prefix'] = np.ones(len(data))
    # Titles are checked in this fixed order so later ones override earlier ones.
    for code, title in enumerate(('Miss.', 'Mrs.', 'Mr.'), start=2):
        has_title = data['Name'].str.contains(title, regex=False)
        data.loc[has_title, 'Prefix'] = code
    
# https://stackoverflow.com/a/42523230
def one_hot(df, cols):
    """
    Replace the given columns with their one-hot (dummy) encoding.

    Each column in cols is removed and its dummy columns, named
    '<column>_<value>', are appended on the right in the order of cols.
    The input frame is not modified; a new frame is returned.

    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a DataFrame with one-hot encoding
    """
    encoded = df
    for col in cols:
        dummies = pd.get_dummies(encoded[col], prefix=col, drop_first=False)
        # drop() returns a new frame, so the caller's DataFrame keeps its
        # columns (a `del` here would remove the column from the argument).
        encoded = pd.concat([encoded.drop(col, axis=1), dummies], axis=1)
    return encoded

def normalize(df, mean, std):
    """
    Standardize the columns named in mean.index, in place.

    Each listed column becomes (value - mean) / std using that column's own
    statistics. Nothing is returned; df is modified directly.

    @param df pandas DataFrame
    @param mean pandas Series of column values mean, indexed by column name
    @param std pandas Series of column values standard deviation, indexed by
               the same column names as mean
    """
    for column in mean.index:
        # Look the statistics up by column label so every column is scaled
        # with its own mean/std rather than those of the first column.
        df[column] = (df[column] - mean[column]) / std[column]

def process_data(data):
    """
    Turn a raw Titanic passenger frame into an all-float feature matrix.

    Steps: derive the title code ('Prefix') from Name, drop Name and Ticket,
    encode Sex / Cabin deck / Embarked as integer codes, fill every remaining
    missing value with -1, one-hot encode the categorical columns and cast
    the result to float.

    The input frame is not modified: the work is done on a copy, so the same
    raw frame can be processed again (e.g. when the calling cell is re-run).

    @param data pandas DataFrame with Name, Ticket, Sex, Cabin, Embarked and
                Pclass columns
    @return a new DataFrame of floats with Pclass, Sex, Cabin, Embarked and
            Prefix replaced by one-hot columns
    """
    # Deck letter (first character of the cabin id) -> code; 0 is reserved
    # for a missing cabin. Embarkation port -> code; 0 means missing.
    deck_codes = {'A': 1, 'B': 2, 'C': 3, 'D': 4,
                  'E': 5, 'F': 6, 'G': 7, 'T': 8}
    port_codes = {'C': 1, 'Q': 2, 'S': 3}

    # Work on a private copy; get_name_prefix and the assignments below
    # would otherwise alter the caller's raw frame.
    data = data.copy()

    # get prefix data (needs Name, so it runs before Name is dropped)
    get_name_prefix(data)
    # remove name and ticket
    data = data.drop(['Ticket', 'Name'], axis=1)

    # sex: 1 for 'male', 0 for anything else (including missing)
    data['Sex'] = (data['Sex'] == 'male').astype(int)

    # cabin: keep only the deck letter; missing cabins and letters outside
    # deck_codes both end up as 0
    deck = data['Cabin'].fillna('0').astype(str).str[0]
    data['Cabin'] = deck.map(deck_codes).fillna(0).astype(int)

    # embarked: missing values and ports outside port_codes become 0
    data['Embarked'] = data['Embarked'].map(port_codes).fillna(0).astype(int)

    # Any remaining gaps (e.g. in the numeric columns) are flagged with -1.
    data = data.fillna(-1)

    data = one_hot(data, ('Pclass', 'Sex', 'Cabin', 'Embarked', 'Prefix'))
    return data.astype(float)

['Help.desktop', 'C_Users_ALFRED~1.DEL_ONEDRI~1_MASTER_2019-2~1_TFM_PORTOS~1 (sshfs-disk)', 'train.csv', 'test.csv', 'a-quick-try-of-h2o-automl-on-titanic-dataset.py', 'SQuirrelLSQL.desktop', 'Azure Storage Explorer.desktop', 'Untitled.ipynb', 'drill-embedded-server.desktop', 'Jupyter.desktop', 'drill-web-console.desktop', 'DSVM tools', '.ipynb_checkpoints', 'RStudio.desktop', 'DSVM Forums.desktop', 'Evince.desktop']


In [2]:
# Read the training and test splits from the working directory.
train_raw, test_raw = (pd.read_csv(path) for path in ('./train.csv', './test.csv'))

# Rich-display the raw training frame for a first look at its columns.
display(train_raw)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Run both raw splits through the shared cleaning / encoding pipeline.
train, test = (process_data(raw) for raw in (train_raw, test_raw))

# Scaling statistics are taken from the training split only and then
# applied to both splits.
data_mean = train[['Age', 'Fare', 'SibSp', 'Parch']].mean(axis=0)
data_std = train[data_mean.index].std(axis=0)

normalize(train, data_mean, data_std)
normalize(test, data_mean, data_std)

# Give both frames the same set of columns; a column present in only one
# of them is created in the other and filled with 0.
test, train = test.align(train, axis=1, fill_value=0)

#start H2O 
import h2o
from h2o.automl import H2OAutoML

# Connect to a local H2O instance (one is started if none is running).
h2o.init()

# Convert both pandas frames to H2O frames.
train, test = h2o.H2OFrame(train), h2o.H2OFrame(test)

# Keep the test ids for the submission file, then remove the id column
# from both frames so it is not used as a predictor.
passId = test['PassengerId']
train, test = (frame.drop('PassengerId', axis=1) for frame in (train, test))

# Label column, and predictors = every remaining column except the label.
y = 'Survived'
x = train.columns
x.remove(y)

# For binary classification, the label must be a factor (categorical).
train[y] = train[y].asfactor()

# Run AutoML with a fixed seed, 10-fold cross-validation, and a budget of
# at most 10 models or 120 seconds.
aml_ti = H2OAutoML(max_runtime_secs=120, max_models=10, seed=7, nfolds=10)
aml_ti.train(x=x, y=y, training_frame=train)

# Check the leaderboard. A bare expression is only rendered when it is the
# last statement of a cell, so the frame is displayed explicitly here.
lb_ti = aml_ti.leaderboard
display(lb_ti)

# Score the test frame with the leader model of the AutoML run.
pred = aml_ti.leader.predict(test)

# Pull predictions and passenger ids back into pandas.
pred_df = pred.as_data_frame()
passId_df = passId.as_data_frame()
pred_res = pred_df['predict']

# Build the two-column submission table and write it to disk.
res_ti = pd.concat(
    [passId_df, pred_res],
    axis=1,
    ignore_index=True,
)
res_ti.columns = ['PassengerId', 'Survived']
res_ti.to_csv('./Resultados-h2o.csv', index=False)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.7" 2020-04-14; OpenJDK Runtime Environment (build 11.0.7+10-post-Ubuntu-2ubuntu218.04); OpenJDK 64-Bit Server VM (build 11.0.7+10-post-Ubuntu-2ubuntu218.04, mixed mode, sharing)
  Starting server from /anaconda/envs/py37_default/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpolg15fid
  JVM stdout: /tmp/tmpolg15fid/h2o_adelsors_started_from_python.out
  JVM stderr: /tmp/tmpolg15fid/h2o_adelsors_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,09 secs
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.9
H2O cluster version age:,6 months and 24 days !!!
H2O cluster name:,H2O_from_python_adelsors_ocwb3n
H2O cluster total nodes:,1
H2O cluster free memory:,27.51 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |████████████████████████████████████████████████████████| 100%
xgboost prediction progress: |████████████████████████████████████████████| 100%
