# H2O.ai

H2O permite aplicar metodologías de machine learning y analítica predictiva mediante un proceso de inducción del conocimiento en grandes volúmenes de información.

<img src="../img/Structure.JPG" alt="Structure" style="width:599px;height:670px;">

# Iniciar H2O

In [2]:
import h2o

In [3]:
# Connect to the local H2O cluster (the recorded output shows an instance at
# http://localhost:54321); nthreads=-1 requests all available cores.
h2o.init(nthreads=-1)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,2 hours 31 mins
H2O cluster version:,3.10.4.8
H2O cluster version age:,4 months and 13 days !!!
H2O cluster name:,H2O_from_python_SPULIDO_d49bcz
H2O cluster total nodes:,1
H2O cluster free memory:,15.75 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


# Abrir Plataforma

<hr>
<b>Luego de iniciar H2O puede abrir la plataforma: http://localhost:54321 o http://server_XYZ:54321</b>
<hr>

# Documentación

<b>Documentación H2O: http://h2o-release.s3.amazonaws.com/h2o/master/3904/docs-website/h2o-py/docs/index.html </b>

<b>GitHub: https://github.com/h2oai </b>

<b>Preguntas: https://stackoverflow.com/questions/tagged/h2o </b>

<b>Video motivacional: https://www.youtube.com/watch?v=przqXS2ioZQ </b>

<hr>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
# Apply seaborn's default styling to every plot in the notebook.
sns.set()
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

# Cargar datos en H2O

In [4]:
import os

import pymysql
# Register PyMySQL under the MySQLdb name so the "mysql+mysqldb" dialect used
# in the connection URL can load it.
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine, types

# Connection URL for the source MySQL database. Real credentials should not be
# written into the notebook: export H2O_DEMO_DB_URL with a full SQLAlchemy URL
# instead. When the variable is unset the original placeholder URL is used, so
# existing behavior is unchanged.
DB_URL = os.environ.get("H2O_DEMO_DB_URL", "mysql+mysqldb://user:pwd@ip/squema")
engine = create_engine(DB_URL)

In [5]:
# Query that pulls every row and column of tabla1.
sql = """
select *
from tabla1
"""

# Run the query through the SQLAlchemy engine and load the result into pandas.
datos = pd.read_sql(sql=sql, con=engine)

In [6]:
# (rows, columns) of the table loaded from the database.
datos.shape

(5000000, 7)

In [None]:
# Upload the pandas data to the H2O cluster as an H2OFrame, passing the rows
# as a list of lists and re-attaching the original column names.
datos_h2o = h2o.H2OFrame(
    python_obj=datos.values.tolist(),
    column_names=datos.columns.tolist(),
)

<hr>
<hr>

# Modelos No Supervisados

In [None]:
from h2o.estimators.kmeans import H2OKMeansEstimator
from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator

## K-means

<img src="../img/k_means.gif" alt="Structure" style="width:599px;height:599px;">

In [None]:
# Columns used as inputs for the k-means model.
X = [
    'vble1',
    'vble2',
    'vble3',
    'vble4',
]

In [None]:
# Fit a k-means model with 3 clusters on the standardized input columns;
# the fixed seed keeps the result reproducible.
clusters = H2OKMeansEstimator(k=3, standardize=True, seed=12345)
clusters.train(training_frame=datos_h2o, x=X)
print(clusters)

In [None]:
# Preview the H2O frame before the cluster assignment is added.
datos_h2o.head()

In [None]:
# Score every row with the fitted k-means model and store the assignment
# in a 'cluster' column of the same frame.
datos_h2o["cluster"] = clusters.predict(test_data=datos_h2o)

In [None]:
# Preview the frame again; it now carries the 'cluster' column.
datos_h2o.head()

## PCA

In [None]:
# Columns fed to the PCA model.
X = [
    'vble1', 'vble2',
    'vble3', 'vble4',
]

In [None]:
# Fit a PCA model that keeps the first two principal components.
pca = H2OPrincipalComponentAnalysisEstimator(k=2)
pca.train(training_frame=datos_h2o, x=X)

In [None]:
# Project only the PCA input columns onto the fitted components.
features = pca.predict(test_data=datos_h2o[X])

In [None]:
# Preview the projected rows.
features.head()

In [None]:
# (rows, columns) of the projection; the model was built with k=2.
features.shape

In [None]:
# Project every row onto the principal components and bring the result into
# pandas so it can be plotted with seaborn.
features = pca.predict(datos_h2o)
features_pandas = features.as_data_frame()
# Attach the k-means assignment by column name. Selecting the last column by
# position (datos_h2o[-1]) only yields the cluster while 'cluster' happens to
# be the most recently added column, so it breaks silently if cells are
# reordered or another column is appended.
features_pandas['label'] = datos_h2o['cluster'].as_data_frame()['cluster']
# A bare last expression gives the notebook's rich table display.
features_pandas.head()

In [None]:
# Scatter of the first two principal components with their marginal
# distributions.
sns.jointplot(data=features_pandas, x="PC1", y="PC2")

# Modelos Supervisados

In [None]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator 

In [None]:
# Binary response: 1 when vble1 equals 'tipo_falla', 0 otherwise.
datos['vble_rta'] = 0
# Name the target column explicitly: .loc with only a row mask would
# overwrite EVERY column of the matching rows with 1, corrupting the
# predictors.
datos.loc[datos.vble1 == 'tipo_falla', 'vble_rta'] = 1
# NOTE(review): vble_rta is derived from vble1, and vble1 is also listed in X
# for the supervised models, so they can read the answer straight from that
# predictor. Confirm whether vble1 should be dropped from X.

In [None]:
# Preview the pandas frame with the new vble_rta response column.
datos.head()

In [None]:
# Predictor columns and response column shared by the supervised models.
X = [
    'vble1',
    'vble2',
    'vble3',
    'vble4',
]
y = 'vble_rta'

In [None]:
# Stratified train/test split done in pandas: each class is split on its own
# so both partitions keep roughly the original class proportions. Every row
# goes to the training set with probability 0.80.
# A dedicated, seeded generator makes the split reproducible across
# Restart & Run All; the unseeded np.random.rand calls gave a different
# partition on every run.
rng = np.random.RandomState(12345)

x0 = datos[datos.vble_rta == 0]
x1 = datos[datos.vble_rta == 1]
msk1 = rng.rand(x0.shape[0]) <= 0.80
msk2 = rng.rand(x1.shape[0]) <= 0.80
x0_train = x0[msk1]
x1_train = x1[msk2]
x0_test = x0[~msk1]
x1_test = x1[~msk2]

# Recombine the two classes and replace missing values with 0.
x_train = pd.concat([x0_train, x1_train]).fillna(0)
x_test = pd.concat([x0_test, x1_test]).fillna(0)

In [None]:
# Upload both partitions to the H2O cluster, keeping the pandas column names.
train_h2o = h2o.H2OFrame(
    python_obj=x_train.values.tolist(),
    column_names=x_train.columns.tolist(),
)
test_h2o = h2o.H2OFrame(
    python_obj=x_test.values.tolist(),
    column_names=x_test.columns.tolist(),
)

In [None]:
# Convert the response column to a categorical (factor) in both frames.
for frame in (train_h2o, test_h2o):
    frame[y] = frame[y].asfactor()

In [None]:
# Preview the training frame after the response was converted to a factor.
train_h2o.head()

## GLM

<img src="../img/glm.png" alt="Structure">

<img src="../img/glm_h2o.JPG" alt="Structure">

In [None]:
# Binomial GLM on standardized inputs, solved with IRLSM and with lambda
# search enabled.
model_glm = H2OGeneralizedLinearEstimator(
    model_id='glm_model',
    family='binomial',
    solver='IRLSM',
    standardize=True,
    lambda_search=True,
)

In [None]:
# Fit the GLM on the training frame, using the test frame for validation.
model_glm.train(
    x=X,
    y=y,
    training_frame=train_h2o,
    validation_frame=test_h2o,
)

## GBM

<img src="../img/gbm.png" alt="Structure">

In [None]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [None]:
# Gradient boosting machine: 400 trees of depth 3, with row and column
# subsampling and a fixed seed.
model_gbm = H2OGradientBoostingEstimator(
    model_id='gbm_model',
    ntrees=400,
    max_depth=3,
    sample_rate=0.3,
    col_sample_rate=0.05,
    seed=1234,
)

In [None]:
# Fit the GBM on the training frame, using the test frame for validation.
model_gbm.train(
    training_frame=train_h2o,
    validation_frame=test_h2o,
    x=X,
    y=y,
)

## ANN

<img src="../img/ann.png" alt="Structure">

In [None]:
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

In [None]:
# Feed-forward neural network with two hidden layers of 20 units each and
# L1/L2 regularization.
model_ann = H2ODeepLearningEstimator(
    model_id='nn_model',
    hidden=[20, 20],
    l1=0.009,
    l2=0.01,
)

In [None]:
# Fit the network on the training frame, using the test frame for validation.
model_ann.train(
    x=X,
    y=y,
    training_frame=train_h2o,
    validation_frame=test_h2o,
)