# 3.2 - H2O

In [1]:
# librerias

import h2o

from h2o.automl import H2OAutoML

import pandas as pd
pd.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split as tts 

In [None]:
# initialize the H2O server (the cell output shows it first checks for an instance already running on localhost:54321)

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 .....

## a) Transformado con selección de características

In [None]:
# Columns read from the transformed dataset: the selected features, with the
# target ('price') listed last. Order matches the original definition.
COLS = [
    # capacity, amenities and availability
    'accommodates',
    'air_conditioning',
    'availability_30',
    'availability_365',
    'availability_60',
    'availability_90',
    'bathrooms',
    'bedrooms',
    'beds',
    # host listing counts
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    # fees, guests, location and stay limits
    'cleaning_fee',
    'dishwasher',
    'extra_people',
    'guests_included',
    'latitude',
    'longitude',
    'maximum_nights',
    'minimum_nights',
    # reviews, room type and deposit
    'number_of_reviews',
    'number_of_reviews_ltm',
    'room_type_private_room',
    'room_type_shared_room',
    'security_deposit',
    # target
    'price',
]

# Show how many columns will be loaded.
len(COLS)

In [None]:
# Load only the columns listed in COLS (selected features plus the target).
listings=pd.read_csv('../data/transform_data/listings_normal.csv', usecols=COLS)

# Keep rows whose price lies in [10, 196]. The explicit .copy() makes the
# filtered frame independent of the one read from disk, so the column
# assignments below write to a real frame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
listings=listings[(listings.price>=10) & (listings.price<=196)].copy()

# Downcast numeric columns to the smallest dtype that can hold their values.
for c in listings.select_dtypes(include='int'):
    listings[c]=pd.to_numeric(listings[c], downcast='integer')

for c in listings.select_dtypes(include='float'):
    listings[c]=pd.to_numeric(listings[c], downcast='float')

In [None]:
# Split the pandas frame into train and hold-out sets. random_state pins the
# shuffle so the split, and every metric computed on it, is the same after
# Restart & Run All (42 matches the seed used for AutoML in this notebook).
train, test=tts(listings, random_state=42)

# Convert both pandas frames to H2OFrames so AutoML can consume them.
train=h2o.H2OFrame(train)
test=h2o.H2OFrame(test)

In [None]:
# Target column name, and the predictor names: every training column except the target.
y = 'price'

X = [col for col in train.columns if col != y]

In [None]:
# Configure the AutoML run: capped at 20 models and 300 seconds of runtime,
# fixed seed, leaderboard sorted by RMSE.

automl = H2OAutoML(
    max_models=20,
    max_runtime_secs=300,
    seed=42,
    sort_metric='RMSE',
)

In [None]:
%%time

# Train AutoML to predict column `y` from the `X` columns of the training frame.
# NOTE(review): the held-out `test` frame is passed as validation_frame. Per the H2O
# AutoML docs this argument is presumably ignored while cross-validation is enabled
# (the default) -- confirm, and consider leaderboard_frame=test if the leaderboard
# is meant to be scored on held-out data.
automl.train(x=X, y=y, training_frame=train, validation_frame=test)

In [None]:
# Print a heading, then display the first rows of the AutoML leaderboard.
print('Leaderboard:')

leader = automl.leaderboard
leader.head()

In [None]:
# Evaluate the best model of this section on its own hold-out frame.
# `test` is the hold-out created for section a; `normal_test` belongs to
# section b and does not exist yet when this cell runs on a fresh kernel.

automl.leader.model_performance(test)

## b) Transformado sin selección de características

In [None]:
%%time

df_normal=h2o.import_file(path='../data/transform_data/listings_normal.csv')

normal_train, normal_test=df_normal.split_frame(ratios=[.8])

X=[c for c in normal_train.columns if c!='price']
y='price'

normal_automl=H2OAutoML(max_models=20, seed=42, max_runtime_secs=300, sort_metric='RMSE')

normal_automl.train(x=X, y=y, training_frame=normal_train, validation_frame=normal_test)

normal_leader=normal_automl.leaderboard

normal_leader.head()

In [None]:
# Score the leader model of section b on that section's hold-out frame.
normal_automl.leader.model_performance(normal_test)

## c) En crudo

In [None]:
%%time

df_raw=h2o.import_file(path='../data/clean_data/listings.csv')

raw_train, raw_test=df_raw.split_frame(ratios=[.8])

X=[c for c in raw_train.columns if c!='price']
y='price'

raw_automl=H2OAutoML(max_models=20, seed=42, max_runtime_secs=300, sort_metric='RMSE')

raw_automl.train(x=X, y=y, training_frame=raw_train, validation_frame=raw_test)

raw_leader=raw_automl.leaderboard

raw_leader.head()

In [None]:
# Score the leader model of section c on that section's hold-out frame.
raw_automl.leader.model_performance(raw_test)