<a href="https://colab.research.google.com/github/breaker2l/ML/blob/master/keras_wide_deep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [0]:
#Install the TensorFlow version this notebook was tested with (pinned to 1.7.0, not the latest release)
!pip install -q -U tensorflow==1.7.0

[K     |████████████████████████████████| 48.0MB 1.6MB/s 
[K     |████████████████████████████████| 3.1MB 30.2MB/s 
[K     |████████████████████████████████| 890kB 46.1MB/s 
[?25h  Building wheel for html5lib (setup.py) ... [?25l[?25hdone
[31mERROR: magenta 0.3.19 has requirement tensorflow>=1.12.0, but you'll have tensorflow 1.7.0 which is incompatible.[0m


In [0]:
import itertools
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
# Short alias so the model-building cells can write layers.Dense, layers.Input, ...
layers = keras.layers

# This code was tested with TensorFlow v1.7
print("You have Tensorflow version",tf.__version__)

You have Tensorflow version 1.7.0


In [0]:
# Fetch the wine-review CSV and remember where it was saved locally.
# Original source of the data: https://www.kaggle.com/zynicide/wine-reviews/data
URL = "https://storage.googleapis.com/sara-cloud-ml/wine_data.csv"
csv_name = URL.rsplit('/', 1)[-1]  # last path component, used as the local file name
path = tf.keras.utils.get_file(csv_name, URL)

In [0]:
# Load the downloaded CSV file into a pandas DataFrame.
data = pd.read_csv(filepath_or_buffer=path)

In [0]:
# Shuffle the rows so the positional train/test split made later is random.
# A fixed random_state makes the shuffle (and therefore the split and the
# reported results) reproducible under Restart Kernel -> Run All.
data = data.sample(frac=1, random_state=42)

# Show the first 5 rows of the shuffled frame
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
40005,40005,Italy,Here's a simple and bright Moscato d'Asti that...,Antica Casa,85,,Piedmont,Moscato d'Asti,,Moscato,Ricossa
41119,41119,Italy,"Dried black cherry, pressed violet, tilled soi...",,92,67.0,Veneto,Amarone della Valpolicella,,Red Blend,I Saltari
93002,93002,France,"A round wine, the flavors initially created by...",Sec,89,20.0,Southwest France,Jurançon Sec,,White Blend,Château de Jurque
82582,82582,US,"A really wonderful Cabernet, clearly grown wel...",Papa's Knoll,93,45.0,California,Napa Valley,Napa,Cabernet Sauvignon,Buehler
8944,8944,US,While smoke and char notes are prominent on th...,Blackbird,86,22.0,New York,Finger Lakes,Finger Lakes,Bordeaux-style Red Blend,Silver Thread


In [0]:
# Preprocessing: drop incomplete rows and limit the number of wine varieties.

# Keep only rows that have both a country and a price (price is the target).
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]
# Drop the first column. NOTE: this is positional, so re-running this cell
# without reloading `data` would drop another column.
data = data.drop(data.columns[0],axis=1)

variety_threshold = 500 # Varieties occurring this many times or fewer are removed
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index
# Filter on the 'variety' column only. The previous DataFrame-wide
# replace(to_remove, NaN, inplace=True) also blanked matching strings in
# unrelated columns and mutated a filtered slice in place.
data = data[pd.notnull(data['variety']) & ~data['variety'].isin(to_remove)]

In [0]:
# Work out how many rows go to training (80%) and how many are left for testing.
n_rows = len(data)
train_size = int(n_rows * .8)
n_test_rows = n_rows - train_size
print("Train size: %d" % train_size)
print("Test size:%d" % n_test_rows)

Train size: 95646
Test size:23912


In [0]:
# Positional split: the first `train_size` rows are the training set,
# everything after them is the test set.
descriptions = data['description']
varieties = data['variety']
prices = data['price']

# Train features
description_train = descriptions.iloc[:train_size]
variety_train = varieties.iloc[:train_size]

# Train labels (the wine price is the regression target)
labels_train = prices.iloc[:train_size]

# Test features
description_test = descriptions.iloc[train_size:]
variety_test = varieties.iloc[train_size:]

# Test labels
labels_test = prices.iloc[train_size:]


In [0]:
# Build a word-level tokenizer for the wine descriptions.
# vocab_size is a hyperparameter; experiment with different values for your dataset.
vocab_size = 12000
tokenize = keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    char_level=False,
)
# Fit the vocabulary on the training descriptions only, never on the test split.
tokenize.fit_on_texts(description_train)


In [0]:
# Wide feature 1: bag-of-words (bow) matrix of the descriptions, built with
# the tokenizer fitted above; the same transform is applied to both splits.
description_bow_train, description_bow_test = (
    tokenize.texts_to_matrix(texts)
    for texts in (description_train, description_test)
)

In [0]:
# Wide feature 2: one-hot vector of the wine variety.

# Map variety strings to integer ids with sklearn's LabelEncoder; the
# encoder is fitted on the training split and reused for the test split.
encoder = LabelEncoder().fit(variety_train)
variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)
# Ids start at 0, so the class count is the largest training id plus one.
num_classes = variety_train.max() + 1

# Expand the integer ids into one-hot rows of width num_classes.
variety_train, variety_test = (
    keras.utils.to_categorical(ids, num_classes)
    for ids in (variety_train, variety_test)
)


In [0]:
# Define the wide model with the Keras functional API.
# It takes two inputs (the bag-of-words vector and the one-hot variety vector),
# concatenates them, and regresses a single value (the price).

# `shape` must be a tuple: (vocab_size) is just an int, so the trailing comma matters.
bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
merged_layer = layers.concatenate([bow_inputs,variety_inputs])
merged_layer = layers.Dense(256, activation='relu')(merged_layer)
predictions = layers.Dense(1)(merged_layer)
# The keyword is `outputs` (plural), matching `inputs`.
wide_model = keras.Model(inputs=[bow_inputs,variety_inputs],outputs=predictions)

In [0]:
# Compile the wide model for regression on price.
# Mean absolute error is tracked instead of 'accuracy': accuracy compares
# predictions to labels for exact matches, which is meaningless for a
# continuous target such as price.
wide_model.compile(loss='mse', optimizer='adam', metrics=['mae'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
wide_model.summary()

NameError: ignored

In [0]:
# Deep model feature: the descriptions as sequences of word indices,
# to be fed to an embedding layer.
train_embed = tokenize.texts_to_sequences(description_train)
test_embed = tokenize.texts_to_sequences(description_test)

# Pad (or truncate) every sequence to the same length so they form a dense
# 2-D array; padding="post" appends the padding after the real tokens.
# The Keras helper is `pad_sequences` (plural); `pad_sequence` does not exist.
max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(
    train_embed,maxlen=max_seq_length,padding="post")
test_embed = keras.preprocessing.sequence.pad_sequences(
    test_embed,maxlen=max_seq_length,padding="post")



In [0]:
# Define the deep model with the Keras functional API:
# padded word-index sequence -> 8-dimensional embedding per token -> flatten -> single output.
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8 , input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
# summary() prints the table itself and returns None, so it is not wrapped in print().
deep_model.summary()

In [0]:
# Compile the deep model for regression on price.
# `metrics` must be passed as a keyword argument (`metrics=[...]`); the previous
# `metrics['accuracy']` after keyword arguments was a SyntaxError.
# Mean absolute error is tracked because accuracy is meaningless for a continuous target.
deep_model.compile(loss='mse',optimizer='adam',metrics=['mae'])

In [0]:
# Combine the wide and deep models into one model: concatenate their outputs
# and put a final single-unit Dense layer on top to predict the price.
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
# wide_model.input is a list (two inputs); the deep model's single input is appended to it.
combined_model = keras.Model(wide_model.input + [deep_model.input],merged_out)
# summary() prints the table itself and returns None, so it is not wrapped in print().
combined_model.summary()

# Regression on price: track mean absolute error rather than 'accuracy',
# which is meaningless for a continuous target.
combined_model.compile(loss='mse',optimizer='adam',metrics=['mae'])

In [0]:
# Train the combined model on the training split.
# Input order matches the model's inputs: bag-of-words, variety one-hot, padded sequences.
train_inputs = [description_bow_train, variety_train, train_embed]
combined_model.fit(train_inputs, labels_train, epochs=10, batch_size=128)


In [0]:
combined_model.fit([description_bow_test,variety_test] + [test_embed],labels_test,epochs=10, batch_size=128)
