# Task - Predicting the price of wine

### We need to predict the price of a bottle of wine from its description and variety

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow import keras
# Short alias for the Keras layers module, used when building the models
layers = keras.layers

print(f"TensorFlow version {tf.__version__}")

TensorFlow version 2.6.0


In [None]:
# Download the data and convert it to a Pandas DataFrame

!wget -q https://storage.googleapis.com/sara-cloud-ml/wine_data.csv

data_df_original = pd.read_csv("wine_data.csv")
data_df_original.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [None]:
# Keep only the two features and the target.
# .copy() makes data_df an independent frame, so later in-place operations on it
# do not raise SettingWithCopyWarning or depend on data_df_original.
data_df = data_df_original[['variety', 'description', 'price']].copy()
data_df.head()

Unnamed: 0,variety,description,price
0,Cabernet Sauvignon,This tremendous 100% varietal wine hails from ...,235.0
1,Tinta de Toro,"Ripe aromas of fig, blackberry and cassis are ...",110.0
2,Sauvignon Blanc,Mac Watson honors the memory of a wine once ma...,90.0
3,Pinot Noir,"This spent 20 months in 30% new French oak, an...",65.0
4,Provence red blend,"This is the top wine from La Bégude, named aft...",66.0


In [None]:
# Column dtypes and non-null counts of the working frame
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150929 entries, 0 to 150928
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   variety      150929 non-null  object 
 1   description  150929 non-null  object 
 2   price        137234 non-null  float64
dtypes: float64(1), object(2)
memory usage: 3.5+ MB


In [None]:
# Frequency of each price; dropna=False also counts the missing prices (NaN)
data_df.price.value_counts(dropna=False)

NaN       13695
20.0       7860
15.0       7056
18.0       5988
25.0       5955
          ...  
1200.0        1
162.0         1
588.0         1
1900.0        1
172.0         1
Name: price, Length: 358, dtype: int64

In [None]:
# Drop every row that has a missing value in any column.
# Rebinding to the result (instead of inplace=True) avoids mutating a slice of
# data_df_original, which is what triggers pandas' SettingWithCopyWarning.
data_df = data_df.dropna()
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137234 entries, 0 to 150928
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   variety      137234 non-null  object 
 1   description  137234 non-null  object 
 2   price        137234 non-null  float64
dtypes: float64(1), object(2)
memory usage: 4.2+ MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# We have 632 varietals of wine in the original dataset
# (the number of distinct values is the "Length" reported by value_counts)

data_df_original.variety.value_counts()

Chardonnay                  14482
Pinot Noir                  14291
Cabernet Sauvignon          12800
Red Blend                   10062
Bordeaux-style Red Blend     7347
                            ...  
Rabigato                        1
Moscatel Graúdo                 1
Grenache Gris                   1
Chardonnay-Pinot Grigio         1
Syrah-Carignan                  1
Name: variety, Length: 632, dtype: int64

In [None]:
# And we have 619 varietals of wine in the restricted dataset
# (data_df at this point, i.e. after the rows with missing values were dropped)

data_df.variety.value_counts()

Chardonnay               13775
Pinot Noir               13628
Cabernet Sauvignon       12671
Red Blend                 9378
Sauvignon Blanc           6054
                         ...  
Magliocco                    1
Malvazija                    1
Silvaner-Traminer            1
Bombino Bianco               1
Cabernet Franc-Malbec        1
Name: variety, Length: 619, dtype: int64

In [None]:
# To make it easier for our models to extract patterns,
# let's keep only records with the top 40 varietals

top_varietals = 40
# value_counts() is sorted by descending frequency and indexed by variety name
value_counts = data_df['variety'].value_counts()
# Number of records of the 40th most frequent variety.
# .iloc makes the positional lookup explicit: plain [] with an integer on a
# label-indexed Series relies on a deprecated positional fallback.
value_counts.iloc[top_varietals-1]

515

In [None]:
# Varieties with fewer records than the 40th most frequent one are removed
# (varieties tied with the 40th one are kept).
variety_threshold = value_counts.iloc[top_varietals-1] - 1
to_remove = value_counts[value_counts <= variety_threshold].index

# Filter on the 'variety' column only. The previous frame-wide
# replace(to_remove, np.nan, inplace=True) scanned every column (including the
# free-text descriptions) for each rare variety name and mutated a slice in place.
is_rare_variety = data_df['variety'].isin(to_remove)
# dropna() keeps the original guarantee that no missing values remain
data_df = data_df.loc[~is_rare_variety].dropna()
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119562 entries, 0 to 150928
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   variety      119562 non-null  object 
 1   description  119562 non-null  object 
 2   price        119562 non-null  float64
dtypes: float64(1), object(2)
memory usage: 3.6+ MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
# Record counts per variety in the current data_df
data_df.variety.value_counts()

Chardonnay                       13775
Pinot Noir                       13628
Cabernet Sauvignon               12671
Red Blend                         9378
Sauvignon Blanc                   6054
Syrah                             5667
Riesling                          5212
Merlot                            4987
Bordeaux-style Red Blend          4545
Zinfandel                         3794
Malbec                            3085
Sangiovese                        2879
White Blend                       2554
Tempranillo                       2525
Rosé                              2461
Shiraz                            1945
Sparkling Blend                   1820
Portuguese Red                    1812
Nebbiolo                          1529
Rhône-style Red Blend             1455
Cabernet Franc                    1310
Corvina, Rondinella, Molinara     1292
Pinot Gris                        1275
Pinot Grigio                      1270
Viognier                          1254
Champagne Blend          

In [None]:
# To use less RAM, keep only the first `max_rows` rows of the dataset.
# NOTE(review): this is a head slice, not a random sample; if the CSV is ordered
# (e.g. by rating), the subset is not representative; confirm that this is intended.

max_rows = 30000
data_df = data_df.iloc[:max_rows]

In [None]:
random_state = 42

# Seed TensorFlow too: `random_state` only makes the sklearn splits repeatable.
# Without a TensorFlow seed, the random operations used when the Keras models
# below are built and trained differ on every fresh run.
tf.random.set_seed(random_state)

# 80 % train+validation / 20 % test
train_full_df, test_df = train_test_split(data_df, random_state=random_state, test_size=0.2)
y_test_df = test_df['price']
X_test_df = test_df.drop(columns=['price'])

# Split the remaining 80 % again into 80 % train / 20 % validation
X_train_df, X_val_df, y_train_df, y_val_df = train_test_split(train_full_df[['variety', 'description']],
                                                              train_full_df['price'], random_state=random_state, test_size=0.2)

In [None]:
# Fit the Tokenizer on feature 'description'.
# Only the training descriptions are used, so the vocabulary is not influenced
# by the validation or test texts.

# Passed to the Tokenizer as num_words; also the width of the bag-of-words input
vocab_size = 12000
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(X_train_df['description'])

In [None]:
# Bag-of-words encoding of the descriptions: one Numpy matrix per split,
# each element 0 or 1. The same fitted tokenizer is applied to all three splits.
description_bow_train, description_bow_val, description_bow_test = (
    tokenize.texts_to_matrix(split_df['description'])
    for split_df in (X_train_df, X_val_df, X_test_df)
)

In [None]:
# Shape of the training bag-of-words matrix
description_bow_train.shape

(19200, 12000)

In [None]:
# Bag-of-words vector of the first training description
description_bow_train[0]

array([0., 1., 1., ..., 0., 0., 0.])

In [None]:
# Convert feature 'variety' to classes (ints)

encoder = LabelEncoder()
# Fit on all rows so that every variety present in any split gets a class id
encoder.fit(data_df['variety'])
variety_train = encoder.transform(X_train_df['variety'])
variety_val = encoder.transform(X_val_df['variety'])
variety_test = encoder.transform(X_test_df['variety'])
# Take the class count from the encoder, not from the largest id seen in the
# training split: if the training split lacks the highest class id,
# np.max(variety_train) + 1 is too small and the one-hot encoding of the
# validation/test varieties fails.
num_classes = len(encoder.classes_)
num_classes

40

In [None]:
# Integer class ids of the first three training varieties
variety_train[:3]

array([20, 34, 27])

In [None]:
# One-hot encode the integer class ids of every split (num_classes columns each)

variety_train, variety_val, variety_test = (
    keras.utils.to_categorical(class_ids, num_classes)
    for class_ids in (variety_train, variety_val, variety_test)
)
variety_train[:1]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [None]:
# Create the wide model using Functional API

# Two inputs: the bag-of-words vector of the description (vocab_size wide)
# and the one-hot encoded variety (num_classes wide)
bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
# Both inputs are joined into a single feature vector
merged_input = layers.concatenate([bow_inputs, variety_inputs])
dense_layer = layers.Dense(256, activation='relu')(merged_input)
# One linear unit: the model outputs a single unbounded number (regression)
output = layers.Dense(1, activation='linear')(dense_layer)
wide_model = keras.Model(inputs=(bow_inputs, variety_inputs), outputs=output)

In [None]:
# Train on mean squared error with the Nadam optimizer; also report mean absolute error
wide_model.compile(loss='mse', optimizer='nadam', metrics=['mae'])
wide_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 12000)]      0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12040)        0           input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 256)          3082496     concatenate_1[0][0]        

In [None]:
# Train the wide model for 3 epochs, monitoring the validation split after each epoch
wide_train_inputs = (description_bow_train, variety_train)
wide_val_inputs = (description_bow_val, variety_val)
history = wide_model.fit(
    x=wide_train_inputs,
    y=y_train_df,
    batch_size=32,
    epochs=3,
    validation_data=(wide_val_inputs, y_val_df),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Evaluate the wide model on the test split
wide_model.evaluate(x=(description_bow_test, variety_test), y=y_test_df)



[2828.34326171875, 19.127315521240234]

In [None]:
# Represent feature 'description' as lists of ints (indices of words), one list
# per description, using the same fitted tokenizer for all three splits

description_embed_train, description_embed_val, description_embed_test = (
    tokenize.texts_to_sequences(split_df['description'])
    for split_df in (X_train_df, X_val_df, X_test_df)
)

In [None]:
# Example: a single text is turned into one list of word indices
tokenize.texts_to_sequences(['Some string'])

[[78, 5849]]

In [None]:
# Example with two texts: the result contains one index list per input text
tokenize.texts_to_sequences(['Some string', 'String with four words'])

[[78, 5849], [5849, 5, 793, 3931]]

In [None]:
# Word indices of the first training description
description_embed_train[0]

[121,
 564,
 41,
 18,
 270,
 83,
 12,
 15,
 2,
 247,
 700,
 5,
 3,
 347,
 631,
 988,
 3330,
 88,
 609,
 2093,
 6,
 7,
 2,
 544,
 202,
 87,
 9,
 74,
 39,
 9,
 151,
 31,
 346,
 1,
 478,
 1746,
 110,
 1,
 272,
 9,
 39,
 228,
 32]

In [None]:
# Number of elements in the first training sequence
len(description_embed_train[0])

43

In [None]:
# Length of every training sequence; the largest one is displayed
lengths_of_lists = list(map(len, description_embed_train))
max(lengths_of_lists)

128

In [None]:
# The model needs inputs of one fixed length: use the length of the longest
# training sequence and bring every split to that length.

max_seq_length = max(lengths_of_lists)
pad_sequences = keras.preprocessing.sequence.pad_sequences
description_embed_train = pad_sequences(description_embed_train, maxlen=max_seq_length)
description_embed_val = pad_sequences(description_embed_val, maxlen=max_seq_length)
description_embed_test = pad_sequences(description_embed_test, maxlen=max_seq_length)

In [None]:
# Length of the first training sequence in the current (padded) array
len(description_embed_train[0])

128

In [None]:
# Create the model with embedding using Functional API

# Size of the vector that the Embedding layer produces for each word index
output_embedding_dim = 12

# Input: one padded sequence of word indices per description
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, output_embedding_dim, input_length=max_seq_length)(deep_inputs)
# Flatten the per-word vectors into one vector so a Dense layer can consume them
flatten = layers.Flatten()(embedding)
# One linear unit: the model outputs a single unbounded number (regression)
output = layers.Dense(1, activation='linear')(flatten)
embedding_model = keras.Model(inputs=deep_inputs, outputs=output)

In [None]:
# Train on mean squared error with the Nadam optimizer; also report mean absolute error
embedding_model.compile(loss='mse', optimizer='nadam', metrics=['mae'])
embedding_model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 128, 12)           144000    
_________________________________________________________________
flatten (Flatten)            (None, 1536)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 1537      
Total params: 145,537
Trainable params: 145,537
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the embedding model for 3 epochs, monitoring the validation split after each epoch
embedding_val_data = (description_embed_val, y_val_df)
history = embedding_model.fit(
    x=description_embed_train,
    y=y_train_df,
    batch_size=32,
    epochs=3,
    validation_data=embedding_val_data,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Evaluate the embedding model on the test split
embedding_model.evaluate(x=description_embed_test, y=y_test_df)



[3021.0947265625, 17.7925968170166]

In [None]:
# Let's combine two models

# The combined graph is built directly on the output tensors of wide_model and
# embedding_model, so it contains those models' layers rather than new copies.
merged_out = layers.concatenate([wide_model.output, embedding_model.output])
# One linear unit on top of the two single-value outputs
combined_out = layers.Dense(1, activation='linear')(merged_out)
# Inputs: the wide model's inputs followed by the embedding model's input
combined_model = keras.Model(list(wide_model.input) + [embedding_model.input], combined_out)
combined_model.compile(loss='mse', optimizer='nadam', metrics=['mae'])
combined_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 12000)]      0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12040)        0           input_3[0][0]                    
                                                                 input_4[0][0]              

In [None]:
# Train the combined model for 5 epochs. The inputs are given in the order the
# model was built with: bag of words, one-hot variety, padded word-index sequences.
combined_train_inputs = [description_bow_train, variety_train, description_embed_train]
combined_val_inputs = [description_bow_val, variety_val, description_embed_val]
history = combined_model.fit(
    combined_train_inputs,
    y=y_train_df,
    batch_size=32,
    epochs=5,
    validation_data=(combined_val_inputs, y_val_df),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluate the combined model on the test split
combined_model.evaluate(x=[description_bow_test, variety_test] + [description_embed_test], y=y_test_df)



[2687.8251953125, 19.22991943359375]

In [None]:
# Predictions of the combined model for the test split (same row order as the test inputs)
predictions = combined_model.predict([description_bow_test, variety_test] + [description_embed_test])

In [None]:
# Show the first five test descriptions next to their predicted and actual prices.
# The description must come from X_test_df: `predictions` and `y_test_df` are both
# aligned with the test split, so printing X_train_df rows paired each price
# with the text of an unrelated wine.
for i in range(5):
  val = predictions[i]
  print(X_test_df.description.iloc[i])
  print('Predicted: ', val[0], '; Actual: ', y_test_df.iloc[i], '\n')

So luscious now, it's hard not to finish the bottle, especially with a grilled steak, lamb chop or mushroom risotto. This is the winery's best Pinot in years, rich in cola, red currants and sandalwood, smoothly tannic, and aged in rich, toasty oak.
Predicted:  43.63249 ; Actual:  36.0 

A stalwart, sturdy rendering, this smells like ripe plums and dark spices. It tastes full of fruit but is shaded nicely by slightly smoky, peppery notes. The body is full and the texture firm with tannins and acidity. Best drink this after 2018.
Predicted:  13.643045 ; Actual:  27.0 

Smoke and violet perfume are seductive on this elegant off-dry Riesling. Fresh green apple and pear flavors are tart but concentrated, wrapping up tightly with a squeaky-clean, lemon-lime finish.
Predicted:  18.499884 ; Actual:  30.0 

Waves of purple floral fragrance meld with soy, blackberry and roast lamb on the nose of this complex and compelling bottling. It's quite soft on the palate, with black plum and hoisin sauce