In [21]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, LSTM, Dropout, Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'

data = pd.read_csv('winemag-data_first150k.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [3]:
#word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

Preprocessing

In [4]:
data = data.groupby('variety').filter(lambda x: len(x) >200)

X = data.drop(['Unnamed: 0','country','designation','points','province','region_1','region_2','variety','winery'], axis = 1)
y = data.variety.to_frame()

In [5]:
extras = ['.', ',', '\"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', 'cab',"%"]
stop_words = set(stopwords.words('english'))
stop_words.update(extras)

def clean_text(text):

    text = str(text)
    
    word_tokens = word_tokenize(text)

    filtered_sentence  = [word for word in word_tokens if word.lower() not in stop_words]
    text = ' '.join(filtered_sentence)
    
    return text

clean_text(data['description'][0])

'tremendous 100 varietal wine hails Oakville aged three years oak Juicy red-cherry fruit compelling hint caramel greet palate framed elegant fine tannins subtle minty tone background Balanced rewarding start finish years ahead develop nuance Enjoy 2022–2030'

In [6]:
X.head()

Unnamed: 0,description,price
0,This tremendous 100% varietal wine hails from ...,235.0
1,"Ripe aromas of fig, blackberry and cassis are ...",110.0
2,Mac Watson honors the memory of a wine once ma...,90.0
3,"This spent 20 months in 30% new French oak, an...",65.0
5,"Deep, dense and pure from the opening bell, th...",73.0


In [7]:
encoder =  LabelEncoder()
encoded_y = encoder.fit_transform(y.variety.as_matrix())
dummy_y = pd.get_dummies(encoded_y).values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, dummy_y, random_state=1)

X_train = [clean_text(x) for x in X_train['description']]
X_labels = y_train
print('Loaded Training Data')

X_test = [clean_text(x) for x in X_test['description']]
y_labels = y_test
print('Loaded Testing Data')

Loaded Training Data
Loaded Testing Data


In [9]:
tokenizer = Tokenizer(num_words=None, lower=True, split=' ')

In [10]:
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(X_train)
print('Finished Building Tokenizer')

Finished Building Tokenizer


In [11]:
train_sequences = tokenizer.texts_to_sequences(X_train)
print('Finished Tokenizing Training')

test_sequences = tokenizer.texts_to_sequences(X_test)
print('Finished Tokenizing Testing')

Finished Tokenizing Training
Finished Tokenizing Testing


In [12]:
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

Found 30374 unique tokens


In [13]:
train_data = pad_sequences(train_sequences, maxlen=35)
train_labels = np.array(X_labels)
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', X_labels.shape)
print('Finished Padding Training')

test_data = pad_sequences(test_sequences, maxlen=35)
test_labels = np.array(y_labels)
print('Finished Padding Testing')

Shape of data tensor: (104583, 35)
Shape of label tensor: (104583, 64)
Finished Padding Training
Finished Padding Testing


In [None]:
kVECTORLEN = 50

model = Sequential()
model.add(Embedding(40000, 50, input_length=35))
model.add(Dropout(0.2))
model.add(LSTM(200))
model.add(Dropout(0.2))
model.add(Dense(64, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())
early_stop = EarlyStopping(monitor='val_loss', patience=3)
model.fit(train_data, train_labels, validation_data=(test_data, test_labels), epochs=20, batch_size=64, shuffle=True, callbacks=[early_stop])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 35, 50)            2000000   
_________________________________________________________________
dropout_21 (Dropout)         (None, 35, 50)            0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 200)               200800    
_________________________________________________________________
dropout_22 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 64)                12864     
Total params: 2,213,664
Trainable params: 2,213,664
Non-trainable params: 0
_________________________________________________________________
None
Train on 104583 samples, validate on 34862 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

In [None]:
scores = model.evaluate(test_data, test_labels, verbose=0)
accuracy = (scores[1]*100)

print("Accuracy: {:.2f}%".format(scores[1]*100))