<a href="https://colab.research.google.com/github/antonio-flores-tlacuahuac/Machine-Learning/blob/master/wine_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [0]:
import itertools
import os
import math

In [0]:
import numpy as np

In [0]:
import pandas as pd

In [0]:
import tensorflow as tf

In [0]:
from sklearn.preprocessing import LabelEncoder

In [0]:
from tensorflow import keras

In [0]:
layers = keras.layers

In [9]:
print("You have tensorflow version: ",tf.__version__)

You have tensorflow version:  2.2.0-rc4


In [0]:
URL = "https://storage.googleapis.com/sara-cloud-ml/wine_data.csv"

In [0]:
path = tf.keras.utils.get_file(URL.split('/')[-1],URL)

In [0]:
data = pd.read_csv(path)

In [0]:
data = data.sample(frac=1)

In [14]:
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
26278,26278,US,A solid wine. At the equivalent of $5 for a re...,Proprietor's Reserve,85,10.0,California,California,California Other,Cabernet Sauvignon,Glen Ellen
108319,108319,France,90-92 Barrel sample. Good balance between rich...,Barrel sample,91,,Bordeaux,Pessac-Léognan,,Bordeaux-style White Blend,Château Picque Caillou
92267,92267,France,This wine makes you work harder than many of i...,,85,15.0,Alsace,Alsace,,Riesling,Dopff & Irion
57720,57720,US,Shows classic Muscat flavors of oranges and ta...,,86,24.0,California,Paso Robles,Central Coast,Muscat Canelli,Orchid Hill
35204,35204,US,"With moderate alcohol, this Pinot is silky and...",Winery Block Estate,91,38.0,California,Russian River Valley,Sonoma,Pinot Noir,Balletto


In [0]:
data = data[pd.notnull(data['country'])]

In [0]:
data = data[pd.notnull(data['price'])]

In [0]:
data = data.drop(data.columns[0], axis=1)

In [0]:
variety_threshold = 500

In [0]:
value_counts = data['variety'].value_counts()

In [0]:
to_remove = value_counts[value_counts <= variety_threshold].index

In [0]:
data.replace(to_remove, np.nan, inplace = True)

In [0]:
data = data[pd.notnull(data['variety'])]

In [0]:
#train_size = int(len(data)*0.8)
train_size = int(len(data)*0.6)

In [24]:
print("Train size : %d" % train_size )

Train size : 71734


In [25]:
print("Test size : %d" % (len(data) - train_size))

Test size : 47824


In [0]:
description_train = data['description'][:train_size]

In [0]:
variety_train = data['variety'][:train_size]

In [0]:
labels_train = data['price'][:train_size]

In [0]:
description_test = data['description'][:train_size]

In [0]:
variety_test = data['variety'][:train_size]

In [0]:
labels_test = data['price'][:train_size]

In [0]:
vocab_size = 12000

In [0]:
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level = False)

In [0]:
tokenize.fit_on_texts(description_train)

In [0]:
description_bow_train = tokenize.texts_to_matrix(description_train)

In [0]:
description_bow_test = tokenize.texts_to_matrix(description_test)

In [0]:
encoder = LabelEncoder()

In [38]:
encoder.fit(variety_train)

LabelEncoder()

In [0]:
variety_train = encoder.transform(variety_train)

In [0]:
variety_test = encoder.transform(variety_test)

In [0]:
num_classes = np.max(variety_train)+1

In [0]:
variety_train = keras.utils.to_categorical(variety_train, num_classes)

In [0]:
variety_test = keras.utils.to_categorical(variety_test, num_classes)

In [0]:
bow_inputs = layers.Input(shape=(vocab_size,))

In [0]:
variety_inputs = layers.Input(shape=(num_classes,))

In [0]:
merged_layer = layers.concatenate([bow_inputs, variety_inputs])

In [0]:
merged_layer = layers.Dense(256, activation='relu')(merged_layer)

In [0]:
predictions = layers.Dense(1)(merged_layer)

In [0]:
wide_model = keras.Model(inputs=[bow_inputs, variety_inputs], outputs=predictions)

In [0]:
wide_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [51]:
print(wide_model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 12000)]      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 12040)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 256)          3082496     concatenate[0][0]            

In [0]:
train_embed = tokenize.texts_to_sequences(description_train)

In [0]:
test_embed = tokenize.texts_to_sequences(description_test)

In [0]:
max_seq_length = 170

In [0]:
train_embed = keras.preprocessing.sequence.pad_sequences(train_embed,maxlen=max_seq_length, padding="post")

In [0]:
test_embed = keras.preprocessing.sequence.pad_sequences(test_embed,maxlen=max_seq_length, padding="post")

In [0]:
deep_inputs = layers.Input(shape=(max_seq_length,))

In [0]:
embedding = layers.Embedding(vocab_size,8,input_length=max_seq_length)(deep_inputs)

In [0]:
embedding = layers.Flatten()(embedding)

In [0]:
embed_out = layers.Dense(1)(embedding)

In [0]:
deep_model = keras.Model(inputs=deep_inputs,outputs=embed_out)

In [62]:
print(deep_model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 170)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 170, 8)            96000     
_________________________________________________________________
flatten (Flatten)            (None, 1360)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1361      
Total params: 97,361
Trainable params: 97,361
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
deep_model.compile(loss='mse',optimizer='adam',metrics=['accuracy'])

In [0]:
merged_out = layers.concatenate([wide_model.output,deep_model.output])

In [0]:
merged_out = layers.Dense(1)(merged_out)

In [0]:
combined_model = keras.Model(wide_model.input+[deep_model.input], merged_out)

In [67]:
print(combined_model.summary())

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 12000)]      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 170)]        0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 12040)        0           input_1[0][0]                    
                                                                 input_2[0][0]              

In [0]:
combined_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [69]:
combined_model.fit([description_bow_train,variety_train]+[train_embed],labels_train, epochs=30, batch_size=128)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7ff92d00e668>

In [70]:
combined_model.evaluate([description_bow_test,variety_test]+[test_embed], labels_test, batch_size=128)



[13.256608009338379, 0.0]

In [0]:
predictions = combined_model.predict([description_bow_test, variety_test]+ [test_embed])

In [72]:
num_predictions = 40
diff = 0

for i in range(num_predictions):
    val = predictions[i]
    print(description_test.iloc[i])
    print('Predicted : ', val[0], 'Actual : ',labels_test.iloc[i], '\n')
    diff += abs(val[0]-labels_test.iloc[i])


A solid wine. At the equivalent of $5 for a regular bottle, it offers plenty of Cabernet flavor and texture, with smooth tannins framing flavors of blackberries, currants and spices.
Predicted :  9.141994 Actual :  10.0 

This wine makes you work harder than many of its neighbors to find the flavor but it's worth the effort when you uncover an intriguing mélange of hay, apricot and early summer fruits. The crisp, minerally finish puts one in mind of a platter of choucroute garnie. Drink now to 2013.
Predicted :  13.606025 Actual :  15.0 

With moderate alcohol, this Pinot is silky and delicate in the mouth, with excellent acidity. Yet it's intense in sunshiney-ripe raspberries and cherries, as well as a hint of mushrooms that will become more pronounced with age. Defines the lighter style of California Pinot, in a gracious, elegant style.
Predicted :  38.635174 Actual :  38.0 

Light nuances of baking spice and forest floor grace the notes of lively black cherry and raspberry. The velv

In [73]:
print('Average prediction difference : ', diff/num_predictions)

Average prediction difference :  1.713943362236023
