# $h2o$

In [1]:
import pandas as pd
import numpy as np

import h2o
from h2o.automl import H2OAutoML
from sklearn import preprocessing

from sklearn.model_selection import train_test_split

In [2]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_271"; Java(TM) SE Runtime Environment (build 1.8.0_271-b09); Java HotSpot(TM) 64-Bit Server VM (build 25.271-b09, mixed mode)
  Starting server from /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/25/3rfj5zyn0cx18kcqkgd4nn4m0000gn/T/tmps7zdsd4_
  JVM stdout: /var/folders/25/3rfj5zyn0cx18kcqkgd4nn4m0000gn/T/tmps7zdsd4_/h2o_Angela_started_from_python.out
  JVM stderr: /var/folders/25/3rfj5zyn0cx18kcqkgd4nn4m0000gn/T/tmps7zdsd4_/h2o_Angela_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Madrid
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.2
H2O_cluster_version_age:,29 days
H2O_cluster_name:,H2O_from_python_Angela_rop8j5
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.556 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


### Loading dataset

In [28]:
print('Loading dataset...')
train = h2o.import_file("../data/train_dummy.csv")
test = h2o.import_file("../data/test_dummy.csv")

Loading dataset...
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [31]:
x = train.columns # X contains the list of columns used for training
y = "Rating" # Specifies the name of the column we want to predict
x.remove(y) # We delete the column "Rating" from x

### Let's start optimizing and searching for algorithms

In [32]:
automl = H2OAutoML(max_runtime_secs=1200)
automl.train(x=x, y=y, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [33]:
predictions = automl.predict(test)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [38]:
model = predictions.as_data_frame()
columns = ['Rating_Predicted']
model.columns = columns

In [39]:
model

Unnamed: 0,Rating_Predicted
0,4.177819
1,3.934817
2,3.657539
3,3.726881
4,3.964698
...,...
13829,3.860228
13830,3.959815
13831,4.029163
13832,3.538528


### Adding Categories Sommelier (see 2_data-modeling)

In [49]:
def categorySomm(rating):
    if 4.9 <= rating:
        return "Extraordinary"
    if (3.9 <= rating) & (rating <= 4.8):
        return "Outstanding"
    if (3.0 <= rating) & (rating <= 3.8):
        return "Above Average to Very Good"
    if (2.0 <= rating) & (rating <= 2.9):
        return "Average"
    if (1.0 <= rating) & (rating <= 1.9):
        return "Below Average"
    if (0.0 <= rating) & (rating <= 0.9):
        return "Unacceptable"
    return "other"

In [50]:
model["Rating_Somm"] = model["Rating_Predicted"].apply(categorySomm)

In [51]:
model

Unnamed: 0,Rating_Predicted,Rating_Somm
0,4.177819,Outstanding
1,3.934817,Outstanding
2,3.657539,Above Average to Very Good
3,3.726881,Above Average to Very Good
4,3.964698,Outstanding
...,...,...
13829,3.860228,other
13830,3.959815,Outstanding
13831,4.029163,Outstanding
13832,3.538528,Above Average to Very Good


### Comparing with original dataset

In [47]:
wines = pd.read_csv("../data/wines_somm.csv")
wines.drop("Unnamed: 0", axis=1, inplace=True)

In [48]:
wines.head()

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,WineStyle,Full_Name,Rating_Somm
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011,red,Pomerol 2011 Château La Providence red,Outstanding
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017,red,Lirac 2017 Château Mont-Redon red,Outstanding
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015,red,Erta e China Rosso di Toscana 2015 Renzo Masi red,Outstanding
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019,red,Bardolino 2019 Cavalchina red,Above Average to Very Good
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016,red,Ried Scheibner Pinot Noir 2016 Markowitsch red,Outstanding


In [52]:
wines["Rating_Somm"].value_counts()

Outstanding                   6938
Above Average to Very Good    6871
Average                         24
Extraordinary                    1
Name: Rating_Somm, dtype: int64

In [53]:
model["Rating_Somm"].value_counts()

Above Average to Very Good    6006
Outstanding                   5449
other                         2377
Average                          2
Name: Rating_Somm, dtype: int64

#### Key Takeaway:
- 