# Predicting Wine Rating with $h2o$

### Importing libraries

In [1]:
import pandas as pd
import numpy as np

import h2o
from h2o.automl import H2OAutoML
from sklearn import preprocessing

from sklearn.model_selection import train_test_split

### Initializing

In [2]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_271"; Java(TM) SE Runtime Environment (build 1.8.0_271-b09); Java HotSpot(TM) 64-Bit Server VM (build 25.271-b09, mixed mode)
  Starting server from /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/25/3rfj5zyn0cx18kcqkgd4nn4m0000gn/T/tmp1t_4j1b3
  JVM stdout: /var/folders/25/3rfj5zyn0cx18kcqkgd4nn4m0000gn/T/tmp1t_4j1b3/h2o_Angela_started_from_python.out
  JVM stderr: /var/folders/25/3rfj5zyn0cx18kcqkgd4nn4m0000gn/T/tmp1t_4j1b3/h2o_Angela_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Madrid
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.2
H2O_cluster_version_age:,1 month
H2O_cluster_name:,H2O_from_python_Angela_5viemw
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.556 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


### Loading dataset

In [3]:
print('Loading dataset...')
train = h2o.import_file("../data/train_dummy.csv")
test = h2o.import_file("../data/test_dummy.csv")

Loading dataset...
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
x = train.columns # X contains the list of columns used for training
y = "Rating" # Specifies the name of the column we want to predict
x.remove(y) # We delete the column "Rating" from x

### Let's start optimizing and searching for algorithms

In [5]:
automl = H2OAutoML(max_runtime_secs=1200)
automl.train(x=x, y=y, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [6]:
predictions = automl.predict(test)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [7]:
model = predictions.as_data_frame()
columns = ['Rating_Predicted']
model.columns = columns

In [9]:
model = model.round(2)
model

Unnamed: 0,Rating_Predicted
0,4.21
1,4.21
2,4.06
3,3.78
4,3.82
...,...
2208,3.85
2209,3.69
2210,4.00
2211,3.83


### Checking model performance and metrics

In [10]:
lb = automl.leaderboard
lb

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
GBM_grid__1_AutoML_20201217_215808_model_2,0.0342985,0.185198,0.0342985,0.143751,0.0388036
GBM_grid__1_AutoML_20201217_215808_model_14,0.0343635,0.185374,0.0343635,0.144096,0.0388476
GBM_2_AutoML_20201217_215808,0.0343748,0.185404,0.0343748,0.144102,0.0388445
GBM_grid__1_AutoML_20201217_215808_model_26,0.0343863,0.185435,0.0343863,0.144162,0.0388531
GBM_1_AutoML_20201217_215808,0.0343873,0.185438,0.0343873,0.143989,0.038845
XGBoost_grid__1_AutoML_20201217_215808_model_10,0.0344287,0.18555,0.0344287,0.144335,0.0388647
GBM_grid__1_AutoML_20201217_215808_model_4,0.0344866,0.185706,0.0344866,0.144704,0.0389219
GBM_grid__1_AutoML_20201217_215808_model_16,0.0345498,0.185876,0.0345498,0.144592,0.0389559
XGBoost_3_AutoML_20201217_215808,0.0346008,0.186013,0.0346008,0.14472,0.0389519
GBM_3_AutoML_20201217_215808,0.0346175,0.186058,0.0346175,0.144542,0.0389687




In [11]:
ld = automl.leader

In [None]:
m = h2o.get_model(ld)

### Adding Categories Sommelier (see "*2_data-modeling*")

In [13]:
def categorySomm(rating):
    if 4.90 <= rating:
        return "Extraordinary"
    if (3.90 <= rating) & (rating <= 4.80):
        return "Outstanding"
    if (3.00 <= rating) & (rating <= 3.89):
        return "Above Average to Very Good"
    if (2.00 <= rating) & (rating <= 2.99):
        return "Average"
    if (1.00 <= rating) & (rating <= 1.99):
        return "Below Average"
    if (0.00 <= rating) & (rating <= 0.99):
        return "Unacceptable"
    return "other"

In [14]:
model["Rating_Somm"] = model["Rating_Predicted"].apply(categorySomm)

In [15]:
model

Unnamed: 0,Rating_Predicted,Rating_Somm
0,4.21,Outstanding
1,4.21,Outstanding
2,4.06,Outstanding
3,3.78,Above Average to Very Good
4,3.82,Above Average to Very Good
...,...,...
2208,3.85,Above Average to Very Good
2209,3.69,Above Average to Very Good
2210,4.00,Outstanding
2211,3.83,Above Average to Very Good


In [16]:
model["Rating_Somm"].value_counts()

Above Average to Very Good    1302
Outstanding                    910
Average                          1
Name: Rating_Somm, dtype: int64

### Comparing with original dataset

In [17]:
wines = pd.read_csv("../data/wines_somm.csv")
wines.drop("Unnamed: 0", axis=1, inplace=True)

In [18]:
wines.head()

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,WineStyle,Full_Name,Rating_Somm
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011,red,Pomerol 2011 Château La Providence red,Outstanding
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017,red,Lirac 2017 Château Mont-Redon red,Outstanding
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015,red,Erta e China Rosso di Toscana 2015 Renzo Masi red,Outstanding
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019,red,Bardolino 2019 Cavalchina red,Above Average to Very Good
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016,red,Ried Scheibner Pinot Noir 2016 Markowitsch red,Outstanding


#### Key Takeaway:
- The model has potential to be optimized since the rating doesn't match perfectly the one from Vivino

---

# Demo

In [19]:
data = {"Country_Luxembourg": [1],
        "Price": [19.19],
        "Year": [2005],
        "WineStyle": "white"
        }

In [20]:
demo = pd.DataFrame(data)
demo

Unnamed: 0,Country_Luxembourg,Price,Year,WineStyle
0,1,19.19,2005,white


#### Function to convert ``WineStyle`` to numerical

In [21]:
def convertCat(df):
    style = {"red":1, "white":2, "sparkling":3, "rose":4}
    df["WineStyle"] = df["WineStyle"].map(style)
    return df

In [22]:
demo = convertCat(demo)

#### Converting df to h2o instance

In [23]:
hf = h2o.H2OFrame(demo)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [24]:
rating_pred = automl.predict(hf)

gbm prediction progress: |████████████████████████████████████████████████| 100%




In [25]:
pred = rating_pred.as_data_frame()
demo['Rating_Predicted'] = pred
demo["Rating_Somm"] = model["Rating_Predicted"].apply(categorySomm)
demo

Unnamed: 0,Country_Luxembourg,Price,Year,WineStyle,Rating_Predicted,Rating_Somm
0,1,19.19,2005,2,3.848919,Outstanding
