In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

## Introduction
AutoML is a function in H2O that automates the process of building a large number of models, with the goal of finding the “best” model without any prior knowledge. In this article, we will look into AutoML from H2O.ai.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

### Start H2O
Import the h2o Python module and H2OAutoML class and initialize a local H2O cluster.

In [None]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.automl import H2OAutoML
# Initialize a local H2O cluster before any frames are imported.
h2o.init()

### Load Data

We load the train and test data into H2O.

In [None]:
# Import the competition CSVs as H2O frames: training data first, then test data.
train_df, test_df = [
    h2o.import_file(csv_path)
    for csv_path in (
        '../input/learn-together/train.csv',
        '../input/learn-together/test.csv',
    )
]

In [None]:
# Quick sanity check of the test frame's dimensions.
test_df.shape

For classification, the response should be encoded as categorical (a.k.a. "factor" or "enum"). Let's take a look at the `Cover_Type` column.

In [None]:
# Summarize the target column before it is converted to a factor.
train_df["Cover_Type"].describe()

In [None]:
# Drop the row identifier from the training frame so it is not used as a
# predictor. The guard keeps this cell re-runnable: without it, a second run
# would try to drop a column that has already been removed.
if 'Id' in train_df.columns:
    train_df = train_df.drop('Id', axis = 1)
# test_df deliberately keeps its 'Id' column: the submission cell reads
# test_df["Id"] when building the output file.

# Make the target categorical so the task is treated as classification.
train_df['Cover_Type'] = train_df['Cover_Type'].asfactor()

# Target and predictor columns.
# NOTE: list.remove() mutates in place and returns None, so the earlier
# `x_col = x_col.remove('Cover_Type')` left x_col set to None. Build the
# predictor list explicitly instead.
y_col = 'Cover_Type'
x_col = [col for col in train_df.columns if col != y_col]

# Split the data into training and validation parts (ratios=[0.8] -> 80% for
# training, the remainder for validation); the seed makes the split repeatable.
hf_train, hf_valid = train_df.split_frame(ratios = [0.8], seed = 42)

### Run AutoML
For the AutoML function, we just specify how long we want to train for and we’re set. For this example, we limit the run to at most 10 models and 1800 seconds (30 minutes) of training.

In [None]:
# Configure the AutoML run: at most 10 models, a 1800-second runtime budget,
# a fixed seed, and 'info'-level logging.
aml = H2OAutoML(
    max_models=10,
    max_runtime_secs=1800,
    seed=42,
    verbosity='info',
)

# Fit on the training split and pass the hold-out split as the validation frame.
aml.train(
    x=x_col,
    y=y_col,
    training_frame=hf_train,
    validation_frame=hf_valid,
)

### Leaderboard

Once the model is trained, you can access the leaderboard. The leader model is stored at `aml.leader` and the leaderboard is stored at `aml.leaderboard`. The leaderboard stores a snapshot of the top models.

In [None]:
# Print the AutoML leaderboard (the snapshot of the top models).
print(aml.leaderboard)

In [None]:
# Print the leader model from the AutoML run.
print(aml.leader)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Score the leader model on the held-out validation frame.
preds = aml.leader.predict(hf_valid)

# scikit-learn's signature is accuracy_score(y_true, y_pred): pass the actual
# labels first and the predictions second. Accuracy itself is symmetric, but
# keeping the documented order avoids wrong results if an asymmetric metric is
# substituted later.
y_true = hf_valid['Cover_Type'].as_data_frame()
y_pred = preds['predict'].as_data_frame()
accuracy_score(y_true, y_pred)

### Save Leader Model

In [None]:
# Save the AutoML leader model via h2o.save_model (no explicit path is passed).
h2o.save_model(aml.leader)

### Prediction (Output)

In [None]:
# Predict cover types for the test frame with the leader model.
preds = aml.leader.predict(test_df)

# Convert the two submission columns from H2O frames to pandas objects.
test_ids = test_df["Id"].as_data_frame().squeeze()
predicted_cover = preds['predict'].as_data_frame().squeeze()

# Assemble the submission table and write it out without the pandas index.
output = pd.DataFrame({'Id': test_ids, 'Cover_Type': predicted_cover})
output.to_csv('submission.csv', index=False)