In [1]:
import numpy as np
import pandas as pd
import csv
from scipy import stats
import os
import datetime
import sys
import time
import random
import json
import re
import pickle
from collections import Counter
# import torch
# import torch.nn.functional as F
# import torchtext
# from torchtext.data import get_tokenizer
import spacy
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
# import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline

pd.set_option("display.max_columns", 30)

In [2]:
'''
USEFUL DOCUMENTATION
https://debuggercafe.com/getting-started-with-variational-autoencoder-using-pytorch/
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
charon.me posts
https://nicgian.github.io/text-generation-vae/ -- not successful version by someone using tf, used VAE?
https://medium.com/dataseries/variational-autoencoder-with-pytorch-2d359cbf027b
https://debuggercafe.com/sparse-autoencoders-using-kl-divergence-with-pytorch/
https://avandekleut.github.io/vae/
https://analyticsindiamag.com/hands-on-guide-to-implement-deep-autoencoder-in-pytorch-for-image-reconstruction/ -- pretty hands on, for an AE and for MNIST
https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/ -- combine into one corpus, then read?
https://stackoverflow.com/questions/45375488/how-to-filter-tokens-from-spacy-document -- used this to remove extra tokens
https://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html
'''

'\nUSEFUL DOCUMENTATION\nhttps://debuggercafe.com/getting-started-with-variational-autoencoder-using-pytorch/\nhttps://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html\ncharon.me posts\nhttps://nicgian.github.io/text-generation-vae/ -- not successful version by someone using tf, used VAE?\nhttps://medium.com/dataseries/variational-autoencoder-with-pytorch-2d359cbf027b\nhttps://debuggercafe.com/sparse-autoencoders-using-kl-divergence-with-pytorch/\nhttps://avandekleut.github.io/vae/\nhttps://analyticsindiamag.com/hands-on-guide-to-implement-deep-autoencoder-in-pytorch-for-image-reconstruction/ -- pretty hands on, for an AE and for MNIST\nhttps://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/ -- combine into one corpus, then read?\nhttps://stackoverflow.com/questions/45375488/how-to-filter-tokens-from-spacy-document -- used this to remove extra tokens\nhttps://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html\

In [3]:
# !python -m spacy download en_core_web_sm

In [4]:
# Load spaCy's small English pipeline; `nlp` is used further down to tokenize the review text.
nlp = spacy.load('en_core_web_sm')

In [5]:
################ Read the file, check it out ################

# Directory holding the input CSV, relative to the notebook's working directory.
file_dir = './' 

# Load the combined whiskey-review table and preview its first rows.
df_ratings_all = pd.read_csv(file_dir + 'whisk_reviews_combined.csv')
df_ratings_all.head()

Unnamed: 0,whiskey_type,whiskey_name,reviewer_name,review_date,rev_rating,rev_notes
0,american_single_malt,STRANAHAN'S COLORADO WHISKEY,elbucko,"Tasted December 16, 2021",3.75,"Tastes like whiskey, maybe some pear? Great on..."
1,american_single_malt,STRANAHAN'S COLORADO WHISKEY,gmrocks,"Tasted December 8, 2021",3.75,This one proved quite popular with group of fr...
2,american_single_malt,STRANAHAN'S COLORADO WHISKEY,Mark-Willis,"Tasted November 27, 2021",4.5,Surprise of the flight consisting of itself Gl...
3,american_single_malt,STRANAHAN'S COLORADO WHISKEY,Dan-Cordial,"Tasted November 13, 2021",3.75,Floral notes
4,american_single_malt,STRANAHAN'S COLORADO WHISKEY,MoparRocker74,"Tasted November 11, 2021",3.75,Really good American single malt. Oaky and Cok...


## MAIN SETTINGS

In [6]:
# Global hyper-parameter constants.
# NOTE(review): none of these names is referenced by the cells visible in this part of the
# notebook -- the tokenizer, model, optimizer and fit calls use their own literal values.
# Confirm whether later cells use them before relying on these settings.
VOCABULARY_SIZE = 50000
BATCH_SIZE = 128
LEARNING_RATE = 0.005
NUM_EPOCHS = 10
DEVICE = 'cpu'

EMBEDDING_DIM = 256
HIDDEN_DIM = 64
NUM_CLASSES = 2

## INITIAL TRANSFORMS FOR DF

In [7]:
################ INITIAL NEW/TRANSFORMED COLS FOR DF ################

# Clean the free-text notes.
# Missing notes are mapped to '' first: str(NaN) would otherwise produce the literal
# text 'nan' (length 3), which slips past the empty-review filter below.
rev_notes = df_ratings_all['rev_notes'].fillna('')
rev_notes = [str(elem).encode("ascii", "ignore").decode('utf-8') for elem in rev_notes]  # drop non-ASCII characters
rev_notes = [re.sub('[\n\r\t\f]', ' ', elem) for elem in rev_notes]  # newline/CR/tab/form-feed -> space
df_ratings_all['rev_notes'] = rev_notes  # store the cleaned notes

# Character length of each cleaned review.
len_review = [len(review) for review in rev_notes]
df_ratings_all['rev_char_len'] = len_review

# Drop reviews whose cleaned text is empty.
df_ratings_all = df_ratings_all.loc[df_ratings_all['rev_char_len'] > 0, :].reset_index(drop = True)

# Sentiment flags derived from the numeric rating (0/1 integers).
df_ratings_all['review_flg_pos'] = 1*(df_ratings_all['rev_rating'] >= 4)  # positive review: rating >= 4
df_ratings_all['review_flg_neg'] = 1*(df_ratings_all['rev_rating'] < 3)   # negative review: rating < 3

In [8]:
# Summary statistics of review length (in characters) at a range of percentiles.
df_ratings_all['rev_char_len'].describe([0.05, 0.1, 0.25, 0.35, .5, 0.65, .75, 0.9, 0.95, 0.975])

count    114483.000000
mean        221.219858
std         385.680698
min           1.000000
5%           11.000000
10%          19.000000
25%          41.000000
35%          58.000000
50%          93.000000
65%         149.000000
75%         216.000000
90%         528.000000
95%         945.900000
97.5%      1403.000000
max        7686.000000
Name: rev_char_len, dtype: float64

In [9]:
# Spot-check one review text among those rated exactly 4 (the 201st by position).
# Selecting the column by label and the row with .iloc avoids the integer `[0]` lookup on a
# label-indexed Series, whose positional fallback is deprecated in recent pandas.
df_ratings_all.loc[df_ratings_all['rev_rating'] == 4, 'rev_notes'].iloc[200]

'Light peat on the nose, light citrus. Medium body, lovely peat, spice, medium finish'

## GET A SUBSET OF DATASET TO GENERATE REVIEWS

In [10]:
# Keep mid-length reviews (200 to 500 characters, both ends inclusive) that have a rating,
# lower-case their text, then split off the positive ones (rating >= 4).
len_mask = df_ratings_all['rev_char_len'].between(200, 500)
df_ratings_subset_full = df_ratings_all.loc[len_mask, :].reset_index(drop = True)

has_rating = df_ratings_subset_full['rev_rating'].notna()
df_ratings_subset_full = df_ratings_subset_full.loc[has_rating].reset_index(drop = True)

df_ratings_subset_full['rev_notes'] = df_ratings_subset_full['rev_notes'].str.lower()

pos_mask = df_ratings_subset_full['rev_rating'] >= 4
df_ratings_subset_pos = df_ratings_subset_full.loc[pos_mask].reset_index(drop = True)

In [11]:
# The 'en_core_web_sm' pipeline is already loaded into `nlp` earlier in the notebook;
# alias it instead of loading the same model from disk a second time.
tokenize_en = nlp

In [12]:
# Number of positive reviews in the length-restricted subset.
print(len(df_ratings_subset_pos))

10178


In [13]:
#### PLAY AROUND WITH A FEW NUMBERS HERE ####
# https://machinelearningknowledge.ai/complete-guide-to-spacy-tokenizer-with-examples/#:~:text=In%20Spacy%2C%20the%20process%20of,matches%20the%20tokenizer%20exception%20rules.
# Run a single positive review through the spaCy pipeline and display the text of the resulting Doc.
test_text = df_ratings_subset_pos['rev_notes'][10]
test_doc = nlp(test_text)
test_doc.text

'nose: barley, oak, lots of honey, tiny amount of cinnamon, love this nose  taste: serious cinnamon notes, oak spice, thick honey texture, balanced sweetness, slightly fruity   balance: 84/100 texture: 87/100 enjoyment: 85/100 overall: 85/100'

In [14]:
# Tally how often each spaCy token text occurs across all positive reviews.
all_words = Counter(
    token.text
    for doc in nlp.pipe(df_ratings_subset_pos['rev_notes'])
    for token in doc
)

In [15]:
# Collect the tokens that occur at most once; they are removed from the reviews later
# to shrink the vocabulary. Then report total vs. rare vocabulary sizes.
small_words = {}
for word, count in all_words.items():
    if count <= 1:
        small_words[word] = count
print(len(all_words))
print(len(small_words))

16816
8307


In [16]:
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc

In [17]:
# Accumulator for the review texts after rare-token removal (filled by the loop in the next cell).
ls_new_revs = []

In [18]:
time_st = time.time()

# Start from an empty list inside this cell so that re-running it rebuilds the result
# instead of appending a second copy of every review to an already-filled list.
ls_new_revs = []

for doc in nlp.pipe(df_ratings_subset_pos['rev_notes']):
    # Positions of the tokens whose text is in `small_words`; a set gives O(1) membership
    # tests in the comprehension that rebuilds the word list below.
    indexes = {index for index, token in enumerate(doc) if token.text in small_words}

    # Export the token attributes, drop the rows of the removed tokens, and load the
    # remaining rows into a new Doc built from the kept words only.
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    np_array = np.delete(np_array, sorted(indexes), axis = 0)
    doc2 = Doc(doc.vocab, words=[t.text for i, t in enumerate(doc) if i not in indexes])
    doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
    ls_new_revs.append(doc2.text)

time_end = time.time()

# Elapsed wall-clock seconds for the filtering pass.
time_end - time_st

78.21132016181946

In [19]:
# Attach the filtered review texts to a copy of the positive subset and recompute lengths.
df_ratings_final_pos = df_ratings_subset_pos.copy()
df_ratings_final_pos['new_rev_notes'] = ls_new_revs
len_review = [len(str(review)) for review in ls_new_revs]
df_ratings_final_pos['rev_char_len'] = len_review

# Token removal shortens reviews, so re-apply the 200-character minimum.
long_enough = df_ratings_final_pos['rev_char_len'] >= 200
df_ratings_final_pos = df_ratings_final_pos.loc[long_enough, :].reset_index(drop = True)

# Terminate every review with sentence punctuation when the second-to-last character is
# not one of ? . !, then append an <eos> marker so review boundaries stay visible in the
# concatenated corpus.
# https://stackoverflow.com/questions/70346894/how-to-add-sos-token-to-keras-tokenizer
fin_rev_notes = []
for elem in df_ratings_final_pos['new_rev_notes']:
    if re.match('[?.!]', elem[len(elem)-2]) is None:
        elem = elem + '. '
    fin_rev_notes.append(elem + '<eos> ')

df_ratings_final_pos['new_rev_notes'] = fin_rev_notes

In [20]:
# Concatenate all processed reviews into one long corpus string.
df_ratings_text = df_ratings_final_pos['new_rev_notes'].str.cat(sep = ' ')

# Collapse every run of whitespace into a single space; repeated separators would
# otherwise yield empty tokens and sequences of mismatched length.
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
pattern = r"\s+"
df_ratings_text = re.sub(pattern, " ", df_ratings_text)

# Split on single spaces into the flat token list consumed by the windowing step.
# (split(' ') is kept deliberately: it preserves the empty trailing element that
# follows the final '<eos> ' marker.)
df_ratings_text = df_ratings_text.split(' ')

In [21]:
# Slide a fixed-width window over the token stream: 40 context tokens + 1 target token.
# Each window is re-joined into one space-separated line.
param_length_seq = 40 + 1
sequences = [
    ' '.join(df_ratings_text[end - param_length_seq:end])
    for end in range(param_length_seq, len(df_ratings_text))
]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 659578


In [22]:
# # Save the sequence of text doc
# def save_doc(lines, filename):
#     data = '\n'.join(lines)
#     file = open(filename, 'w')
#     file.write(data)
#     file.close()

# save_doc(sequences, file_dir + 'whisk_reviews_sequence_pos.txt')

In [23]:
# Create the Keras Tokenizer.
# The filter set is reduced to tab and newline so punctuation tokens are kept in the vocabulary.
tokenizer = Tokenizer(filters='\t\n') # we want to KEEP all punctuation

In [24]:
# Draw 300,000 sequence indices (with replacement) from the first 500,000 sequences,
# or from all of them if fewer exist.
# A fixed seed makes the sampled training set -- and therefore the fitted vocabulary and
# the trained model -- reproducible across kernel restarts.
rng = np.random.default_rng(42)
rand_array = rng.integers(0, min(len(sequences), 500000), size=300000)

In [25]:
# Select the sampled sequences and fit the tokenizer on them.
print(type(sequences))
print(type(sequences[1000]))

# Plain list lookup: converting all of `sequences` to a NumPy string array first would
# allocate a fixed-width copy of every sequence just to index into it. The result is the
# same list of Python strings.
seq_small = [sequences[i] for i in rand_array]

# .fit_on_texts fits on a list of texts -- https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer
tokenizer.fit_on_texts(seq_small)

<class 'list'>
<class 'str'>


In [26]:
# Transforms each text in texts to a sequence of integers.
# One integer list per sampled sequence; the printed count should equal the sample size.

sequence_ints = tokenizer.texts_to_sequences(seq_small)
print(len(sequence_ints))

300000


In [27]:
# Vocabulary size used for the Embedding input and the softmax output.
# The + 1 is needed because the Keras Tokenizer numbers words starting at 1 (index 0 is
# reserved), so the largest word index equals len(word_index).
vocab_size = len(tokenizer.word_index) + 1 
print(vocab_size)

8020


In [28]:
# Spot-check: number of space-separated tokens in one arbitrary sequence.
len(sequences[68262].split(' '))

41

In [29]:
# Spot-check: display one arbitrary sequence line.
sequences[29732]

'flavor forward . is it worth $ 50 yes , is it worth twice the special reserve , i m not sure but i am on the hunt for a bottle . <eos> non - store pick . sweet caramel and'

In [30]:
# Sanity check: every encoded sequence must contain exactly `param_length_seq` ids.
# Prints the position of each offender; no output means all lengths match.
for idx, encoded_seq in enumerate(sequence_ints):
    if len(encoded_seq) != param_length_seq:
        print(idx)

In [31]:
# Convert ALL sequences into nparray format
# Display the shape to confirm the array is rectangular (rows = sequences, columns = token ids).
sequence_array = np.asarray(sequence_ints)
sequence_array.shape

(300000, 41)

In [32]:
# Split every row into model input and target: all ids but the last form the context (X),
# the last id is the word to predict (y).
X, y = sequence_array[:, :-1], sequence_array[:, -1]

# One-hot encode the target over the whole vocabulary.
y = to_categorical(y, num_classes=vocab_size)

# Number of context tokens per training example.
seq_length = X.shape[1]

In [33]:
# Context length fed to the model (number of columns in X).
print(seq_length)

40


<br>

## Model

In [34]:
# https://stackoverflow.com/questions/48479915/what-is-the-preferred-ratio-between-the-vocabulary-size-and-embedding-dimension -- maybe try lower embedding!!!
# https://medium.com/deep-learning-with-keras/lstm-understanding-the-number-of-parameters-c4e087575756

# Next-word language model: token ids -> 64-d embeddings -> single LSTM layer ->
# dense ReLU layer -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(input_dim = vocab_size, output_dim = 64, input_length=seq_length))
model.add(LSTM(64, activation="tanh", recurrent_activation="sigmoid", use_bias=True))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(vocab_size, activation='softmax'))

# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 64)            513280    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                33024     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 8020)              521300    
Total params: 1,071,764
Trainable params: 1,071,764
Non-trainable params: 0
_________________________________________________________________
None


In [35]:
# specifies what to run for loss and optimization
# Previously an Adam instance was created here but never used: compile() received the
# string 'adam', so training silently ran with Keras' default Adam settings. Pass the
# optimizer object explicitly. learning_rate=0.001 is the Keras default for Adam, i.e.
# the rate that string form actually used -- change it here to tune.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=opt,
              metrics=['accuracy', tf.keras.metrics.CategoricalCrossentropy()])

In [36]:
# fit model
# Trains on the full (X, y) arrays with no validation data passed: batch size 128,
# up to 200 epochs, shuffling enabled, one log line per epoch (verbose = 2).
model.fit(X, y, batch_size=128, epochs=200, shuffle=True, verbose = 2)

Epoch 1/200
2344/2344 - 106s - loss: 5.6451 - accuracy: 0.1098 - categorical_crossentropy: 5.6451
Epoch 2/200
2344/2344 - 106s - loss: 4.9423 - accuracy: 0.1812 - categorical_crossentropy: 4.9423
Epoch 3/200
2344/2344 - 107s - loss: 4.5770 - accuracy: 0.2127 - categorical_crossentropy: 4.5770
Epoch 4/200
2344/2344 - 107s - loss: 4.3412 - accuracy: 0.2308 - categorical_crossentropy: 4.3412
Epoch 5/200
2344/2344 - 106s - loss: 4.1678 - accuracy: 0.2444 - categorical_crossentropy: 4.1678
Epoch 6/200
2344/2344 - 106s - loss: 4.0305 - accuracy: 0.2546 - categorical_crossentropy: 4.0305
Epoch 7/200
2344/2344 - 107s - loss: 3.9157 - accuracy: 0.2643 - categorical_crossentropy: 3.9157
Epoch 8/200
2344/2344 - 107s - loss: 3.8163 - accuracy: 0.2728 - categorical_crossentropy: 3.8163
Epoch 9/200
2344/2344 - 106s - loss: 3.7275 - accuracy: 0.2809 - categorical_crossentropy: 3.7275
Epoch 10/200
2344/2344 - 106s - loss: 3.6459 - accuracy: 0.2892 - categorical_crossentropy: 3.6459
Epoch 11/200
2344/2

Epoch 84/200
2344/2344 - 106s - loss: 2.1517 - accuracy: 0.5268 - categorical_crossentropy: 2.1517
Epoch 85/200
2344/2344 - 105s - loss: 2.1468 - accuracy: 0.5276 - categorical_crossentropy: 2.1468
Epoch 86/200
2344/2344 - 106s - loss: 2.1439 - accuracy: 0.5285 - categorical_crossentropy: 2.1439
Epoch 87/200
2344/2344 - 106s - loss: 2.1389 - accuracy: 0.5293 - categorical_crossentropy: 2.1389
Epoch 88/200
2344/2344 - 105s - loss: 2.1292 - accuracy: 0.5318 - categorical_crossentropy: 2.1292
Epoch 89/200
2344/2344 - 106s - loss: 2.1303 - accuracy: 0.5311 - categorical_crossentropy: 2.1303
Epoch 90/200
2344/2344 - 106s - loss: 2.1219 - accuracy: 0.5328 - categorical_crossentropy: 2.1219
Epoch 91/200
2344/2344 - 105s - loss: 2.1195 - accuracy: 0.5332 - categorical_crossentropy: 2.1195
Epoch 92/200
2344/2344 - 106s - loss: 2.1126 - accuracy: 0.5352 - categorical_crossentropy: 2.1126
Epoch 93/200
2344/2344 - 106s - loss: 2.1101 - accuracy: 0.5347 - categorical_crossentropy: 2.1101
Epoch 94/2

2344/2344 - 105s - loss: 1.9187 - accuracy: 0.5687 - categorical_crossentropy: 1.9187
Epoch 167/200
2344/2344 - 106s - loss: 1.9264 - accuracy: 0.5669 - categorical_crossentropy: 1.9264
Epoch 168/200
2344/2344 - 105s - loss: 1.9155 - accuracy: 0.5694 - categorical_crossentropy: 1.9155
Epoch 169/200
2344/2344 - 105s - loss: 1.9137 - accuracy: 0.5698 - categorical_crossentropy: 1.9137
Epoch 170/200
2344/2344 - 105s - loss: 1.9121 - accuracy: 0.5706 - categorical_crossentropy: 1.9121
Epoch 171/200
2344/2344 - 105s - loss: 1.9190 - accuracy: 0.5674 - categorical_crossentropy: 1.9190
Epoch 172/200
2344/2344 - 105s - loss: 1.9082 - accuracy: 0.5706 - categorical_crossentropy: 1.9082
Epoch 173/200
2344/2344 - 105s - loss: 1.9165 - accuracy: 0.5684 - categorical_crossentropy: 1.9165
Epoch 174/200
2344/2344 - 105s - loss: 1.9112 - accuracy: 0.5695 - categorical_crossentropy: 1.9112
Epoch 175/200
2344/2344 - 106s - loss: 1.9104 - accuracy: 0.5697 - categorical_crossentropy: 1.9104
Epoch 176/200


<tensorflow.python.keras.callbacks.History at 0x7fda387e5e20>

In [37]:
# https://machinelearningknowledge.ai/complete-guide-to-spacy-tokenizer-with-examples/#:~:text=In%20Spacy%2C%20the%20process%20of,matches%20the%20tokenizer%20exception%20rules.
# https://spacy.io/usage/processing-pipelines
# https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/ -- word-level RNN in Keras!
# https://towardsdatascience.com/text-generation-with-bi-lstm-in-pytorch-5fda6e7cc22c --> character-level text generation with Pytorch
# https://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html --> IS THIS WHAT I WANT? 

<br>

## Save Model and Tokenizer

In [38]:
# save the model to file
# Written to the current working directory; the .h5 extension selects the HDF5 format.
model.save('./model_pos_v7.h5')

In [39]:
# save the tokenizer
# The context manager guarantees the file is flushed and closed; the previous inline
# open() left the handle to be closed whenever the garbage collector got to it.
with open('./tokenizer_pos_v7.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

<br>

## Predict / Generate New Reviews

In [None]:
# load the model
# NOTE(review): this reads './model_pos_v1.h5', whereas the save cell in this notebook
# writes './model_pos_v7.h5' -- confirm which version is intended.
model = load_model('./model_pos_v1.h5')

In [None]:
# load the tokenizer
# `load` is not defined anywhere in the visible notebook, so the bare call raised
# NameError; use pickle.load. The context manager closes the file once it has been read.
# NOTE: unpickling can execute arbitrary code -- only load tokenizer files you created yourself.
with open('tokenizer_pos_v1.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [116]:
# Pick one stored sequence as the seed text and show it with its space-separated token count.
seed_text = sequences[120395]
print(seed_text)
print(len(seed_text.split(' '))) # this shows there are 51 elements!

reminds me a lot of buff trace but more readily available . sweet vanilla / leathery nose . after adding a few drops of water , i can definitely see what they mean by smooth . vanilla , butterscotch , carmel on the palate with some spice . nice sweet ,
51


In [115]:
# Encode the seed text as a list of vocabulary indices (one inner list per input text).
encoded = tokenizer.texts_to_sequences([seed_text])

len(encoded[0]) # HOW COME THIS IS 50, NOT 51?
# NOTE(review): presumably one seed token is absent from the tokenizer's vocabulary; the
# Tokenizer created in this notebook has no oov_token, in which case texts_to_sequences
# drops unknown words -- verify against tokenizer.word_index.
# encoded = encoded[:-1]

50

In [119]:
# Make the prediction as follows
# Greedy next-word prediction: index of the highest-probability class.
# `Sequential.predict_classes` was removed in TensorFlow 2.6; taking the argmax over
# `predict` is the documented replacement and returns the same array of class indices.
yhat = np.argmax(model.predict(np.asarray(encoded), verbose=0), axis=-1)

In [132]:
# Map the predicted index back to its word through the tokenizer's reverse vocabulary.
# A direct dictionary lookup replaces the linear scan over word_index. An index with no
# word now yields '' instead of leaving `out_word` undefined or holding a stale value
# from an earlier cell.
predicted_index = int(np.ravel(yhat)[0])
out_word = tokenizer.index_word.get(predicted_index, '')
print(out_word)

cinnamon


In [133]:
# generate a sequence from a language model

def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Greedily extend `seed_text` by `n_words` words using the language model.

    Each step encodes the text generated so far, keeps the most recent
    `seq_length` token ids (left-padding with zeros when the text is shorter),
    asks the model for the next-word distribution and appends the single most
    probable word.

    Parameters
    ----------
    model : object exposing ``predict(x, verbose=0)`` that returns one row of
        per-class scores for each input row.
    tokenizer : fitted tokenizer exposing ``texts_to_sequences`` and ``index_word``.
    seq_length : int
        Number of context tokens the model takes as input.
    seed_text : str
        Text used to start generation.
    n_words : int
        Number of words to generate.

    Returns
    -------
    str
        The generated words joined by single spaces (the seed text is not included).
    """
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]

        # keep only the last `seq_length` ids (zero-padded on the left if shorter)
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')

        # `predict_classes` was removed in TensorFlow 2.6; the argmax over the predicted
        # distribution is the equivalent greedy choice.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])

        # Direct reverse-vocabulary lookup instead of scanning word_index on every step;
        # an index with no word (e.g. the padding id 0) maps to '' as before.
        out_word = tokenizer.index_word.get(predicted_index, '')

        # append to input so the next step sees the newly generated word
        in_text += ' ' + out_word
        result.append(out_word)

    return ' '.join(result)

In [138]:
# Seed generation with one stored sequence, print the seed, then display 50 generated words.
seed_text = sequences[527121]
print(seed_text)
generate_seq(model, tokenizer, seq_length, seed_text = seed_text, n_words = 50)

. on the palate , the toasted coconut transitions to a sweeter honeysuckle . from there , the flavor smoothly slides into a deeper , richer , caramelized banana custard , with flairs of smoke and oak . floral , grassy , followed by honey and dark fruit aroma . the


"same mouth feel , the same mouth feel , the highland apparently and made it were out or or sticking 2 dilution this for the price . i do nt know what i have some other higher proof bourbon . i do n't love . nice , this one is"

## Get Data, Build Vocab