In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow import keras
layers = keras.layers

print("TensorFlow version", tf.__version__)

TensorFlow version 1.8.0


In [2]:
data = pd.read_csv("wine_data.csv")

In [3]:
# Do some preprocessing to limit the # of wine varities in the dataset
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]
data = data.drop(data.columns[0], axis=1) 

variety_threshold = 500 # Anything that occurs less than this will be removed.
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index
data.replace(to_remove, np.nan, inplace=True)
data = data[pd.notnull(data['variety'])]

In [4]:
X = data[['description','variety']] # features
y = data['price']                   # labels
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)



In [5]:
# create bag of words vocabulary for description
vocab_size = 12000
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)
tokenize.fit_on_texts(X_train['description']) # only fit on train

In [6]:
# create bag of words vectors for train and test
description_bow_train = tokenize.texts_to_matrix(X_train['description'])
description_bow_test = tokenize.texts_to_matrix(X_test['description'])

In [7]:
# convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(X_train['variety']) # only fit on train
variety_train = encoder.transform(X_train['variety'])
variety_test = encoder.transform(X_test['variety'])
num_classes = np.max(variety_train) + 1

# Convert index to one hot
variety_train = keras.utils.to_categorical(variety_train, num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes)

In [8]:
# wide model
bow_inputs = layers.Input(shape=(vocab_size,)) 
variety_inputs = layers.Input(shape=(num_classes,))
wide_inputs = layers.concatenate([bow_inputs, variety_inputs])
wide_inputs = layers.Dense(256, activation='relu')(wide_inputs)
predictions = layers.Dense(1)(wide_inputs)
wide_model = keras.Model(inputs=[bow_inputs, variety_inputs], outputs=predictions)
print(wide_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12040)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 256)          3082496     concatenate_1[0][0]              
__________

In [9]:
wide_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [10]:
# create word embeddings for train and test
description_embed_train = tokenize.texts_to_sequences(X_train['description'])
description_embed_test = tokenize.texts_to_sequences(X_test['description'])

max_seq_length = 170
description_embed_train = keras.preprocessing.sequence.pad_sequences(description_embed_train,
                                                                     maxlen=max_seq_length, padding="post")
    
description_embed_test = keras.preprocessing.sequence.pad_sequences(description_embed_test, 
                                                                    maxlen=max_seq_length, padding="post")

In [11]:
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
print(deep_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 170)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 170, 8)            96000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 1360)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 1361      
Total params: 97,361
Trainable params: 97,361
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
deep_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [13]:
# Combine wide and deep into one model
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)
print(combined_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 170)          0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12040)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________

In [14]:
combined_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [15]:
combined_model.fit([description_bow_train, variety_train] + [description_embed_train], y_train, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras._impl.keras.callbacks.History at 0x7fd3687913c8>

In [16]:
combined_model.evaluate([description_bow_test, variety_test] + [description_embed_test], y_test, batch_size=128)



[631.6691498384703, 0.05206590833428433]

In [17]:
predictions = combined_model.predict([description_bow_test, variety_test] + [description_embed_test])

In [18]:
num_predictions = 40
diff = 0

for i in range(num_predictions):
    val = predictions[i]
    prediction = val[0]
    actual = y_test.iloc[i]
    print(X_test['description'].iloc[i])
    print('Predicted: ', prediction, 'Actual: ', actual, '\n')
    diff += abs(prediction - actual)

print('Average prediction difference: ', diff / num_predictions)

For the price, this is not a good effort. It's overoaked and horsey smelling, with a heavy, overripe palate that tastes herbal and like hickory-infused blackberry. Flat and stewy on the finish.
Predicted:  22.975567 Actual:  29.0 

Here's a 100% Cab that shows the elegance of Napa and the heat of the vintage. The tannins are especially wonderful, being soft, sweet and complicated, giving the wine great structure. But the blackberry, cherry, cassis and chocolate fruit is so overwhelming, it robs the wine of the subtlety desired.
Predicted:  92.36188 Actual:  75.0 

Always dark in color and flavor profile, Craggy Range stays true to those traits with an excellent 2008 from its own Te Muna Road Vineyard. Hints of cocoa, dried herb and leather accent vibrant black cherry and plum flavors, while the tannins are silky and refined. Drink now–2015.
Predicted:  42.84418 Actual:  39.0 

A good Cabernet, rough and tannic, with blackberry and blueberry fruit flavors that finish a bit sweet and syr