## Notebook supporting the unsupervised-model tests for the H2O engine, based on the credit card dataset

### Description
Trains unsupervised H2O models (autoencoder, Isolation Forest, PCA and GLRM) and exports each one as a MOJO artifact.

# H2O.ai

In [1]:
import os
# Output directory (relative to this notebook) for the exported model artifacts.
ARTIFACTS_PATH = '../../artifacts/h2o/'
# exist_ok=True keeps this cell re-runnable when the directory is already there.
os.makedirs(name=ARTIFACTS_PATH, exist_ok=True)

In [2]:
import h2o
# Connect to an H2O instance; a local server is started when none is already running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.15" 2022-04-19; OpenJDK Runtime Environment (build 11.0.15+10-Ubuntu-0ubuntu0.20.04.1); OpenJDK 64-Bit Server VM (build 11.0.15+10-Ubuntu-0ubuntu0.20.04.1, mixed mode, sharing)
  Starting server from /home/elizagonzalez/Documents/repos/mlx/libs/pytonisa/venv/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp62wqvc8p
  JVM stdout: /tmp/tmp62wqvc8p/h2o_elizagonzalez_started_from_python.out
  JVM stderr: /tmp/tmp62wqvc8p/h2o_elizagonzalez_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,00 secs
H2O_cluster_timezone:,Europe/Madrid
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.2
H2O_cluster_version_age:,3 months and 28 days !!!
H2O_cluster_name:,H2O_from_python_elizagonzalez_nl05i2
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.787 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [None]:
# Load the credit card training CSV through H2O.
data = h2o.import_file(path="../../data/creditcard/training.csv")

In [None]:
# Show the first row as a quick check of the parsed columns.
data.head(rows=1)

## Train / validation / test split

In [None]:
# Split into roughly 60% train / 20% validation / 20% test; the seed keeps the split repeatable.
train, valid, test = data.split_frame(ratios=[0.6, 0.2], seed=1234)

# Predictor names: every column except the last one.
# NOTE(review): the last column is presumably the label of the credit card data - confirm.
data_X = data.col_names[:-1]

## Creating Models

### Autoencoders

In [None]:
# Identifier passed as model_id to the autoencoder built in the next cell.
model_name = "autoencoder_h2o"

In [None]:
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator

# Autoencoder with hidden layers of 14, 7, 7 and 14 Tanh units, trained on the
# predictor columns of the training split only.
autoencoder_model = H2OAutoEncoderEstimator(
    model_id=model_name,
    activation="Tanh",
    hidden=[14, 7, 7, 14],
    epochs=100,
    standardize=True,
    stopping_metric="MSE",  # MSE for autoencoders
    train_samples_per_iteration=32,
    shuffle_training_data=True,
    autoencoder=True,
    l1=1e-4,  # same value as the previous literal 10e-5, written unambiguously
    # Same seed as the data split and the Isolation Forest. Multi-threaded training
    # may still vary between runs; H2O's `reproducible=True` option trades speed
    # for exact repeatability if that is ever needed.
    seed=1234,
)
autoencoder_model.train(x=data_X, training_frame=train)

# Export the trained model as a MOJO into ARTIFACTS_PATH. The constant already
# ends with a path separator, so it is passed as-is (appending "/" produced "//").
model_file = autoencoder_model.download_mojo(path=ARTIFACTS_PATH,
                                             get_genmodel_jar=False)

In [None]:
# Anomaly score for each test row, shown next to the original columns.
autoencoder_model.anomaly(test_data=test).cbind(test).head(rows=10)

In [None]:
# Run the autoencoder on the test split and view its output beside the inputs.
pred = autoencoder_model.predict(test_data=test)
pred.cbind(test).head(rows=10)

### Isolation Forest

In [None]:
# Identifier passed as model_id to the Isolation Forest built in the next cell.
model_name = "isolationforest_h2o"

In [None]:
from h2o.estimators import H2OIsolationForestEstimator

# Isolation Forest with library defaults; only the model id and the seed are set.
isolation_model = H2OIsolationForestEstimator(
    model_id=model_name,
    seed=1234,
)
isolation_model.train(x=data_X, training_frame=train)

# Export the trained model as a MOJO into ARTIFACTS_PATH. The constant already
# ends with a path separator, so it is passed as-is (appending "/" produced "//").
model_file = isolation_model.download_mojo(path=ARTIFACTS_PATH,
                                           get_genmodel_jar=False)

In [None]:
# Score the test split with the Isolation Forest and view the result beside the inputs.
pred = isolation_model.predict(test_data=test)
pred.cbind(test).head(rows=10)

### PCA

In [None]:
# Identifier passed as model_id to the PCA model built in the next cell.
model_name = "pca_h2o"

In [None]:
from h2o.estimators import H2OPrincipalComponentAnalysisEstimator

# PCA with 10 components, computed through the GLRM method on standardized data.
pca_model = H2OPrincipalComponentAnalysisEstimator(
    k=10,
    model_id=model_name,
    use_all_factor_levels=True,
    pca_method="glrm",
    transform="standardize",
    impute_missing=True,
    seed=1234,  # same seed as the data split and the Isolation Forest
)
# Train on the same predictor columns as the autoencoder and the Isolation
# Forest; without `x` every column of `train`, including the last one that
# `data_X` deliberately drops, would be fed to the model.
pca_model.train(x=data_X, training_frame=train)

# Export the trained model as a MOJO into ARTIFACTS_PATH. The constant already
# ends with a path separator, so it is passed as-is (appending "/" produced "//").
model_file = pca_model.download_mojo(path=ARTIFACTS_PATH,
                                     get_genmodel_jar=False)

### GLRM

In [None]:
# Identifier passed as model_id to the GLRM model built in the next cell.
model_name = "glrm_h2o"

In [None]:
from h2o.estimators import H2OGeneralizedLowRankEstimator

# Rank-4 generalized low-rank model with quadratic loss on standardized data.
glrm_model = H2OGeneralizedLowRankEstimator(
    k=4,
    model_id=model_name,
    loss="quadratic",
    gamma_x=0.5,
    gamma_y=0.5,
    max_iterations=700,
    recover_svd=True,
    init="SVD",
    transform="standardize",
    seed=1234,  # same seed as the data split and the Isolation Forest
)
# Train on the same predictor columns as the autoencoder and the Isolation
# Forest; without `x` every column of `train`, including the last one that
# `data_X` deliberately drops, would be fed to the model.
glrm_model.train(x=data_X, training_frame=train)

# Export the trained model as a MOJO into ARTIFACTS_PATH. The constant already
# ends with a path separator, so it is passed as-is (appending "/" produced "//").
model_file = glrm_model.download_mojo(path=ARTIFACTS_PATH,
                                      get_genmodel_jar=False)