In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.callbacks import TensorBoard
import numpy as np
import random
import sys
from imblearn.under_sampling import RandomUnderSampler

Using TensorFlow backend.


In [3]:
wine_df = pd.read_csv('winemag_data_first150k.csv')

### Preprocessing

In [4]:
wine_df = wine_df.drop_duplicates(subset='description', keep='last')

wine_df = wine_df.drop(['Unnamed: 0', 'country', 'designation', 'points', 'price', 'province', 
                        'region_1', 'region_2', 'winery'], axis=1)

greater = wine_df.variety.value_counts() >= 50
greater_tru = greater[greater==True]
greaterthan50 = list(greater_tru.index)

wine_df.variety = wine_df.variety.apply(lambda x:x if x in greaterthan50 else 'Other')

wine_50_df = wine_df[wine_df.variety!='Other']
wine_50_df['fake_X'] = wine_50_df.index
fake_X = np.array(wine_50_df['fake_X'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [5]:
target = wine_50_df['variety']
y = pd.get_dummies(target)
_, y_scalar = np.where(y>0)

# variety_text = wine_df.variety + ": " + wine_df.description

# variety_text.shapefake_X.shape, y_scalar.shape

In [6]:
rus2 = RandomUnderSampler(replacement=False)
fake_X = fake_X.reshape(len(fake_X), 1)
X_res, y_res = rus2.fit_sample(fake_X, y_scalar)

X_res.shape, y_res.shape

((5508, 1), (5508,))

In [7]:
X_res[:,0].shape

(5508,)

In [8]:
balanced_lst = X_res[:, 0]
balanced_lst

array([ 28914,  29199,  33708, ..., 105073, 124703, 148365])

In [9]:
wine_50_df.fake_X = wine_50_df.fake_X.apply(lambda x:x if x in balanced_lst else 'out')
wine_50_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Unnamed: 0,description,variety,fake_X
0,This tremendous 100% varietal wine hails from ...,Cabernet Sauvignon,out
1,"Ripe aromas of fig, blackberry and cassis are ...",Tinta de Toro,1
2,Mac Watson honors the memory of a wine once ma...,Sauvignon Blanc,out
3,"This spent 20 months in 30% new French oak, an...",Pinot Noir,out
5,"Deep, dense and pure from the opening bell, th...",Tinta de Toro,5


In [10]:
balanced_df = wine_50_df[wine_50_df.fake_X !='out']
balanced_df.head()

Unnamed: 0,description,variety,fake_X
1,"Ripe aromas of fig, blackberry and cassis are ...",Tinta de Toro,1
5,"Deep, dense and pure from the opening bell, th...",Tinta de Toro,5
7,Lush cedary black-fruit aromas are luxe and of...,Tinta de Toro,7
10,"Elegance, complexity and structure come togeth...",Friulano,10
13,This wine is in peak condition. The tannins an...,Tannat,13


In [11]:
balanced_df.shape

(5508, 3)

In [12]:
variety_text = balanced_df.variety + ": " + balanced_df.description

variety_text.shape

(5508,)

In [13]:
variety_text.head()

1     Tinta de Toro: Ripe aromas of fig, blackberry ...
5     Tinta de Toro: Deep, dense and pure from the o...
7     Tinta de Toro: Lush cedary black-fruit aromas ...
10    Friulano: Elegance, complexity and structure c...
13    Tannat: This wine is in peak condition. The ta...
dtype: object

In [14]:
text = ""
for description in variety_text:
    text += description + " "
    
print('corpus length:', len(text))

corpus length: 1389453


In [15]:
text[:1000]

'Tinta de Toro: Ripe aromas of fig, blackberry and cassis are softened and sweetened by a slathering of oaky chocolate and vanilla. This is full, layered, intense and cushioned on the palate, with rich flavors of chocolaty black fruits and baking spices. A toasty, everlasting finish is heady but ideally balanced. Drink through 2023. Tinta de Toro: Deep, dense and pure from the opening bell, this Toro is a winner. Aromas of dark ripe black fruits are cool and moderately oaked. This feels massive on the palate but sensationally balanced. Flavors of blackberry, coffee, mocha and toasty oak finish spicy, smooth and heady. Drink this exemplary Toro through 2023. Tinta de Toro: Lush cedary black-fruit aromas are luxe and offer notes of marzipan and vanilla. This bruiser is massive and tannic on the palate, but still lush and friendly. Chocolate is a key flavor, while baked berry and cassis flavors are hardly wallflowers. On the finish, this is tannic and deep as a sea trench. Drink this satu

In [16]:
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 111


In [17]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

nb sequences: 463138


In [18]:
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
    
print("...Vectorization Finished")

Vectorization...
...Vectorization Finished


In [19]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
print("...Model Built")

Build model...
...Model Built


In [20]:
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               122880    
_________________________________________________________________
dense_1 (Dense)              (None, 111)               14319     
_________________________________________________________________
activation_1 (Activation)    (None, 111)               0         
Total params: 137,199.0
Trainable params: 137,199.0
Non-trainable params: 0.0
_________________________________________________________________


In [21]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [22]:
seed_lst = []
for seed in list(variety_text):
    seed_lst.append(seed[:maxlen])
print(len(seed_lst))
seed_lst[:5]

5508


['Tinta de Toro: Ripe aromas of fig, black',
 'Tinta de Toro: Deep, dense and pure from',
 'Tinta de Toro: Lush cedary black-fruit a',
 'Friulano: Elegance, complexity and struc',
 'Tannat: This wine is in peak condition. ']

In [23]:
rand_generator = np.random.randint((len(seed_lst)-1))
seed_lst[rand_generator]

'Cabernet Sauvignon-Shiraz: This blend of'

In [24]:
board = TensorBoard()

In [28]:
# train the model, output generated text after each iteration
for iteration in range(10):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y,
              batch_size=128,
              epochs=1,
              callbacks=[board])
    
    rand_generator = np.random.randint((len(seed_lst)-1))

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        generated = seed_lst[rand_generator]
        print('----- Generating with seed: "' + generated + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


--------------------------------------------------
Iteration 0
Epoch 1/1

----- diversity: 0.2
----- Generating with seed: "Nero d'Avola: From the Alcamo area of we"
Nero d'Avola: From the Alcamo area of wes is a structure and a soft and spice and spice and cherry and spice and spice and spice and chewy and spice and spice and cherry and spice and cherry and cherry and citrus and spice. The wine is supple and spice and chewy and spice and chewy and spice and like and spice and spice and spice and spice and cherry flavors. The bright and chewy and chewy and chewy and chewy and spice and a bit the fin

----- diversity: 0.5
----- Generating with seed: "Nero d'Avola: From the Alcamo area of we"
Nero d'Avola: From the Alcamo area of weish is crushed berry and a wine of and a soft notes of cherry, cherry and berry flavors of apple fruit and dry and cherry and subtle flavors of cherry and tannins, with a tannins. The black fruit fruit and a delicious and texture is flavor, this is a dark win

  after removing the cwd from sys.path.


y and a bit the wine and a soft, but it is a bit the finish. Pinot Grigi: The wine is a soft, and a soft, and a soft and cherry and cherry and chocolate and cherry, blackberry and cherry and citrus and chocolate. The wine is a bit the nose of leather and apricots and cherry and cher

----- diversity: 0.5
----- Generating with seed: "Petite Sirah: Shows the dry astringency,"
Petite Sirah: Shows the dry astringency,ry, fruit and spice and cherry and cherry, chocolate flavors are its and grapefruit and blackberry, chocolated wine, with a touch of chocolate, peach and red blackberry, chocolate and cherry, peach and cherry flavors. The black currants, blackberry, blackberry and blackberry tones and eleganced, it has a clean richness aromas of cherry, blackberry, blackberry and chocolate, smoke, with a densely b

----- diversity: 1.0
----- Generating with seed: "Petite Sirah: Shows the dry astringency,"
Petite Sirah: Shows the dry astringency,ut like it's a surerry or vineyards of herbs. Dri

Portuguese Red: A fresh, ripe red wine, wine for a wine that is a bright and concentrated fruit and a structured wine with chocolate and coconut and concentrated or perfumed by a bit the wine is structured wine with a soft and fresh and a full of chocolate and candied fruit and cherry and caramel. The palate is perfumed and tannins of berry and caramel. The palate is full and earthy and ripe and complex, with a bit the wine is full of p

----- diversity: 0.5
----- Generating with seed: "Portuguese Red: A fresh, ripe red wine, "
Portuguese Red: A fresh, ripe red wine, each and chocolate. The palate is full and up the nose, it's a based by fresh acidity and a beautiful and the wine offers a bit complex, with fruit and mature and crisp and pure with concentral that that is a bright and full of raspberry and fruit. The wine is it to the close. The finish is perfect the form flavors of caramely and cherry and honey and lemon and green apple and caramel. The wine is

----- diversity: 1.0
---

In [None]:
# train the model, output generated text after each iteration
for iteration in range(10):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y,
              batch_size=128,
              epochs=1,
              callbacks=[board])
    
    rand_generator = np.random.randint((len(seed_lst)-1))

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        generated = seed_lst[rand_generator]
        print('----- Generating with seed: "' + generated + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


--------------------------------------------------
Iteration 0
Epoch 1/1

----- diversity: 0.2
----- Generating with seed: "Port: A delicious, easy wine that floats"
Port: A delicious, easy wine that floats of red fruit and chocolate. The finish is refreshing and st

  after removing the cwd from sys.path.


yle wine with a blend of the Toro't will this is a bit lively but with a structured wine with a blend of the Torrose is a beautiful and style of the Torrose, with a structured wine is a little style of the Torrose is a bit beautifully and structure. The backbone and black fruit and a clean and black fruit and blackberry flavors and stone 

----- diversity: 0.5
----- Generating with seed: "Port: A delicious, easy wine that floats"
Port: A delicious, easy wine that floatsfruit, lemon lime kiph. Grenache: Panisa and Sauvignon and style, this is a touch of this wine with orange and a good color and intense and chocolate. It has an abonession to a forcess, soft style. Give thefreco is a clean, and refreshed black fruit, lemon and peach, blackberry fruit and black fruit. It's a der a beautiful wine shows a vanilla and fruity and blossom, a fine style what is the poten

----- diversity: 1.0
----- Generating with seed: "Port: A delicious, easy wine that floats"
Port: A delicious, easy wine tha

Monastrell: Tight aromas of cranberry an from the style, with a slight to the finish. The palate is a delicious wine is the wine is the palate of tannins and the finish the wine is a delicious wine is a good Garnach. The finish is a structured and soft, this is a strong and a the tart and has a berry from Cabernet Sauvignon Blanc: This is a del, the finish is the fruit is a structured and a light acidity and a berry from fruit and the p

----- diversity: 0.5
----- Generating with seed: "Monastrell: Tight aromas of cranberry an"
Monastrell: Tight aromas of cranberry analate offers and wood, the wine is tannic touch of tannins and ripe and light acidity and a good Bought to savory blackberry acidity. The wine has a del the spicy, this is a balanced tannins and pear and to the sweet tannins into the San at a politter the vineyards with the palate of cherry spice alongside berry, crisp acidity. It is a fine wine. It has a two a lot and bright tanning and on a fres

----- diversity: 1.0
---

In [None]:
model.save('balRNN_upTboard_model.h5')

--------------------------------------------------
Iteration 0
Epoch 1/1
463138/463138 [==============================] - 343s - loss: 1.5875   

----- diversity: 0.2
----- Generating with seed: "Malbec: Black currant, cherry, plum and "
Malbec: Black currant, cherry, plum and  of concentrate of fruit and soft and black cherry and sweet and soft, ripe and soft and on the finish. The finish. The finish. The finish. The finish. The finish. Cabernet Sangioves: This is a thin the finds of thin in the finish. The fruit and the finish. The finish. The finish. The fruit and fruit and soft and a soft and a bit of complex that is a bit of complex and soft and a bit of complex an

----- diversity: 0.5
----- Generating with seed: "Malbec: Black currant, cherry, plum and "
Malbec: Black currant, cherry, plum and d balanced by Petit also offers flavors, this wine is a soft and rich, pair structure and soft, but the flavors of pineapple, fruit and chocolate and honeyed by white flavors of chocolate and rich and flavors of corn with a times of crisp fruit of drink ersess of citrus and its a pretty, acidity, with complex than of some struiget than the cherry flavors of pear and pear and tention of blackberry 

----- diversity: 1.0
----- Generating with seed: "Malbec: Black currant, cherry, plum and "
Malbec: Black currant, cherry, plum and aromaty and flavor grap enjoy, , firl, herbened. in:y, firm, up fruits, earthrizes, Brancedina: it has elegance  on soft, ylingly Gromas: Crest and aromas of cocoa and over  a full of charactering aging lend on the nose of nose. A most age some sweet itso of white hearthlia is a bottle firm firm, well-limot and manbor. Ceathed notity compritends unant. Franccoriss with delicate lens over and tyn r

----- diversity: 1.2
----- Generating with seed: "Malbec: Black currant, cherry, plum and "
Malbec: Black currant, cherry, plum and edarcaarond fruit and rmeoness. Fintumell: For a luss, soft creaminess of heagefulhy ampreteys moke soriscannes with pilfoned opens smoky, fruit only are pucoas. Its delicious flavors this golde not-soft well wothed throun lemgting clenite to th nose of Cavatbery modest of chocolate sweet nose : and darky acidity, payde. The poa. It's mitd medium with interested wine noads of GoCden-Merlotsen: Thi