In [1]:
import numpy as np
import pandas as pd
import csv
from scipy import stats
import os
import datetime
import sys
import time
import random
import json
import re
import pickle
from collections import Counter
import torch
import torch.nn.functional as F
import torchtext
from torchtext.data import get_tokenizer
import spacy
# import tensorflow as tf
# import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline

pd.set_option("display.max_columns", 30)

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
################ Read the file, check it out ################

file_dir = './' 

df_ratings_all = pd.read_csv(file_dir + 'whisk_reviews_combined.csv')
df_ratings_all.head()

Unnamed: 0,whiskey_type,whiskey_name,reviewer_name,review_date,rev_rating,rev_notes
0,american_single_malt,STRANAHAN'S COLORADO WHISKEY,elbucko,"Tasted December 16, 2021",3.75,"Tastes like whiskey, maybe some pear? Great on..."
1,american_single_malt,STRANAHAN'S COLORADO WHISKEY,gmrocks,"Tasted December 8, 2021",3.75,This one proved quite popular with group of fr...
2,american_single_malt,STRANAHAN'S COLORADO WHISKEY,Mark-Willis,"Tasted November 27, 2021",4.5,Surprise of the flight consisting of itself Gl...
3,american_single_malt,STRANAHAN'S COLORADO WHISKEY,Dan-Cordial,"Tasted November 13, 2021",3.75,Floral notes
4,american_single_malt,STRANAHAN'S COLORADO WHISKEY,MoparRocker74,"Tasted November 11, 2021",3.75,Really good American single malt. Oaky and Cok...


## MAIN SETTINGS

In [4]:
VOCABULARY_SIZE = 50000
BATCH_SIZE = 128
LEARNING_RATE = 0.005
NUM_EPOCHS = 15
DEVICE = 'cpu'

EMBEDDING_DIM = 256
HIDDEN_DIM = 128
NUM_CLASSES = 2

## INITIAL TRANSFORMS FOR DF

In [5]:
################ INITIAL NEW/TRANSFORMED COLS FOR DF ################

rev_notes = df_ratings_all['rev_notes']
rev_notes = [str(elem).encode("ascii", "ignore").decode('utf-8') for elem in rev_notes] 
rev_notes = [re.sub('[\n\r\t\f]', '. ', elem) for elem in rev_notes] # INSTEAD OF SPACE, PERIOD + SPACE
df_ratings_all['rev_notes'] = rev_notes # get the NEW rev notes

len_review = [len(str(review)) for review in df_ratings_all['rev_notes']]
df_ratings_all['rev_char_len'] = len_review

df_ratings_all = df_ratings_all.loc[df_ratings_all['rev_char_len'] > 0, :].reset_index(drop = True) # aka get rid of empty reviews

# CREATE a LOW/HIGH flag
df_ratings_all['review_flg'] = 1*(df_ratings_all['rev_rating'] >= 3.75) # DEFINE A POSITIVE REVIEW! READ SOME ABAOVE 

# OR CREATE a LOW/MED/HIGH flag. TRY TO FIGURE THIS OUT? 
# revivew_flg_vec = np.array([0]*len(df_ratings_all))
# revivew_flg_vec[np.where(df_ratings_all['rev_rating'] >= 3)] = 1
# revivew_flg_vec[np.where(df_ratings_all['rev_rating'] >= 4)] = 2
# df_ratings_all['review_flg'] = revivew_flg_vec

In [6]:
df_ratings_all['review_flg'].describe([0.05, 0.1, 0.25, 0.35, .5, 0.65, .75, 0.9, 0.95, 0.975])

count    114483.000000
mean          0.625910
std           0.483889
min           0.000000
5%            0.000000
10%           0.000000
25%           0.000000
35%           0.000000
50%           1.000000
65%           1.000000
75%           1.000000
90%           1.000000
95%           1.000000
97.5%         1.000000
max           1.000000
Name: review_flg, dtype: float64

In [7]:
df_ratings_all.loc[(df_ratings_all['rev_rating'] == 3.25),['rev_notes']].iloc[282][0]

'Smelling hints of cherry, vanilla, a backside of berry hints. Taste is a decent amount of butterscotch, light oak and a tad bit of spice. Still new to bourbon though so I could be all wrong'

## BREAK UP INTO LONG / SHORT RATINGS

In [8]:
# SHORT -- >=40, <200 (around 30th to 70th percentile)
# LONG -- >=200 (cap at 2500 characters) (70th to 100th percentile)

df_ratings_short_full = df_ratings_all.loc[(df_ratings_all['rev_char_len'] >= 50) & (df_ratings_all['rev_char_len'] < 500), :].\
    reset_index(drop = True)
# THIS gets rid of reviews with no rating
df_ratings_short_full = df_ratings_short_full.loc[~np.isnan(df_ratings_short_full['rev_rating'])].reset_index(drop = True)
df_ratings_short = df_ratings_short_full.loc[:,['rev_notes', 'review_flg']]
# df_ratings_short = df_ratings_short.rename(columns = {'rev_rating': 'review_flg'})

df_ratings_long_full = df_ratings_all.loc[(df_ratings_all['rev_char_len'] >= 500), :].\
    reset_index(drop = True)
rev_capped = [elem[0:2500] for elem in df_ratings_long_full['rev_notes']]
df_ratings_long_full['rev_notes'] = rev_capped # THIS IS TO CAP LONG REVIEWS
# THIS gets rid of reviews with no rating
df_ratings_long_full = df_ratings_long_full.loc[~np.isnan(df_ratings_long_full['rev_rating'])].reset_index(drop = True)
df_ratings_long = df_ratings_long_full.loc[:,['rev_notes', 'review_flg']]
# df_ratings_long = df_ratings_long.rename(columns = {'rev_rating': 'review_flg'})

In [9]:
print(len(df_ratings_short))
print(len(df_ratings_long))

66473
12150


In [10]:
df_ratings_short.head()

Unnamed: 0,rev_notes,review_flg
0,"Tastes like whiskey, maybe some pear? Great on...",1
1,This one proved quite popular with group of fr...,1
2,Surprise of the flight consisting of itself Gl...,1
3,Really good American single malt. Oaky and Cok...,1
4,"Cask strength. Strong ethanol nose, slight che...",0


In [11]:
df_ratings_short.to_csv('df_ratings_short.csv', index = False)
df_ratings_long.to_csv('df_ratings_long.csv', index = False)

## Build Vocab for Short Reviews

In [12]:
TEXT = torchtext.legacy.data.Field(
    tokenize = 'spacy', 
    tokenizer_language = 'en_core_web_sm'
    )

LABEL = torchtext.legacy.data.LabelField(dtype = torch.long)
# HOW TO CONVERT TO NUMERICAL?
# LABEL = torchtext.legacy.data.LabelField(dtype = torch.long, use_vocab = False) 

In [13]:
fields = [('rev_notes', TEXT), ('review_flg', LABEL)]

dataset = torchtext.legacy.data.TabularDataset(
    path = 'df_ratings_short.csv', format = 'csv', 
    skip_header = True, fields = fields)

In [14]:
train_data, valid_data, test_data = dataset.split(split_ratio = [0.60, 0.20, 0.20], 
                                        random_state = random.seed(72033))

In [15]:
print(vars(train_data.examples[2])['rev_notes'])

['Nose', ':', 'cherry', ',', 'vanilla', ',', 'oak', '.', 'Proof', 'coming', 'through', '.', 'Palate', ':', 'classic', 'BT', 'cherry', 'palate', 'but', 'with', 'a', 'kick', '.', 'More', 'oak', ',', 'leather', ',', 'and', 'caramels', '.', 'Finish', ':', 'very', 'oily', 'and', 'long', ',', 'almost', 'creme', 'brle', '-', 'like', 'at', 'the', 'very', 'end', '.', '.', 'Delicious', '.', 'Enough', 'said', '.']


In [16]:
TEXT.build_vocab(train_data, max_size=VOCABULARY_SIZE)
LABEL.build_vocab(train_data)

print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

print(TEXT.vocab.freqs.most_common(50))

Vocabulary size: 30801
Number of classes: 2
[('.', 124326), (',', 77999), ('and', 42868), ('the', 40276), ('a', 37785), ('of', 25616), ('with', 18220), ('is', 16952), ('I', 16719), ('but', 14423), ('to', 14208), ('on', 13305), ('it', 13141), (':', 10796), ('finish', 10700), ('this', 10030), ('for', 9298), ('in', 8856), ('nose', 8808), ('sweet', 8778), ('vanilla', 8681), ('not', 7086), ('-', 6498), ('that', 6237), ('spice', 6032), ('taste', 5973), ('oak', 5822), ('like', 5712), ('The', 5637), ('smooth', 5613), ('caramel', 5175), ('..', 5086), ('as', 5083), ('very', 5076), ('good', 5049), ('notes', 5040), ('!', 4782), ('Nose', 4749), ('some', 4730), ('at', 4653), ('more', 4541), ('Very', 4508), ('A', 4455), ('bourbon', 4405), ('little', 4394), ('bit', 4157), ('flavor', 4143), ('you', 4121), ('my', 4032), ('palate', 3813)]


In [17]:
print(TEXT.vocab.itos[0:10]) # itos = integer-to-string # FIRST, SECOND are UNKNOWN AND PAD!

['<unk>', '<pad>', '.', ',', 'and', 'the', 'a', 'of', 'with', 'is']


In [18]:
# Convert string to integer!
print(TEXT.vocab.stoi['salty']) # stoi = string-to-integer
print(LABEL.vocab.stoi)

312
defaultdict(None, {'1': 0, '0': 1})


In [19]:
# NOTE THAT TARGETS ARE FLIPPED IN DICTIONARY!
print(LABEL.vocab.stoi)
print(LABEL.vocab.freqs)

defaultdict(None, {'1': 0, '0': 1})
Counter({'1': 25636, '0': 14248})


## Data Loaders

In [20]:
train_loader, valid_loader, test_loader = \
    torchtext.legacy.data.BucketIterator.splits(
        (train_data, valid_data, test_data),
         batch_size=BATCH_SIZE,
         sort_within_batch=True, # FOR PACKED LSTM, but why not always?
         sort_key=lambda x: len(x.rev_notes),
         device='cpu'
    )

In [21]:
print('Train') 
for batch in train_loader: # WHY NO SENTENCE LENGTH OF 8? 
    print(f'Text matrix size: {batch.rev_notes.size()}')
    print(f'Target vector size: {batch.review_flg.size()}')
    break

print('\nTest:') 
for batch in test_loader:
    print(f'Text matrix size: {batch.rev_notes.size()}')
    print(f'Target vector size: {batch.review_flg.size()}')
    break

Train
Text matrix size: torch.Size([29, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([10, 128])
Target vector size: torch.Size([128])


## Model
This one uses the Normal LSTM

In [22]:
# REDEFINE THE SETTINGS AS NEEDED
LEARNING_RATE = 0.0025
NUM_EPOCHS = 15
DEVICE = 'cpu'
EMBEDDING_DIM = 16 # PERHAPS DO NOT NEED THAT BIG EMBEDDING, PREVENT OVERFIT. n-input x embedding ~ 5k * DIM >>> 500k
HIDDEN_DIM = 32
NUM_CLASSES = 2
N_LAYERS = 1
DROPOUT = 0

In [23]:
class RNN(torch.nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        
        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        # THIS WAS FOR AN RNN
        #self.rnn = torch.nn.RNN(embedding_dim,
        #                        hidden_dim,
        #                        nonlinearity='relu')
        self.rnn = torch.nn.LSTM(embedding_dim,
                                 hidden_dim,
                                 num_layers = n_layers,
                                 dropout = dropout)        
        
        self.fc = torch.nn.Linear(hidden_dim, output_dim) # these are RAW non-activated values, but maybe okay
#         self.final_act = torch.nn.Sigmoid() # now it gives me probabilities. Also, NO PARAMS NEEDED --> will take "self"
        

    def forward(self, text):
        # text dim: [sentence length, batch size]
        
        embedded = self.embedding(text)
        # embedded dim: [sentence length, batch size, embedding dim]
        
        ## FOR PACKED LSTM
#         packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, text_length.to('cpu'))
        
        output, (hidden, cell) = self.rnn(embedded) # NOTICE THIS EITHER packed OR embedded
        # output dim: [sentence length, batch size, hidden dim]
        # hidden dim: [1, batch size, hidden dim]

        hidden.squeeze_(0) # --> squeeze the sentence length since always just 1?
        # hidden dim: [batch size, hidden dim]
        
        output = self.fc(hidden)
        
#         output = self.final_act(outraw) # LETS NOT ACTIVATE FINAL LAYER
        
        return output

In [24]:
torch.manual_seed(164809) # 20937, 293674, 457943
model = RNN(input_dim = len(TEXT.vocab),
            embedding_dim = EMBEDDING_DIM,
            hidden_dim = HIDDEN_DIM,
            output_dim = NUM_CLASSES, # could use 1 for binary classification, but this for generalization
            n_layers = N_LAYERS, 
            dropout = DROPOUT
)

model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

## Training

In [25]:
def compute_accuracy(model, data_loader, device):

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for i, (features, targets) in enumerate(data_loader):

            features = features.to(device)
            targets = targets.float().to(device)

            logits = model(features)
            _, predicted_labels = torch.max(logits, 1)

            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()
    return correct_pred.float()/num_examples * 100

In [26]:
# TESTING CODE
# model.train()
# for batch_idx, batch_data in enumerate(train_loader):
#     # FOR PACKED LSTM
#     features = batch_data.rev_notes
#     features
#     break

In [27]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):
        
        features = batch_data.rev_notes.to(DEVICE)
        labels = batch_data.review_flg.to(DEVICE)
        
        ### FORWARD AND BACK PROP
#         logits = model(text) # FOR NORMAL LSTM
        logits = model(features) # FOR PACKED LSTM
        loss = F.cross_entropy(logits, labels)
        optimizer.zero_grad()
        
        loss.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        ### LOGGING
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Loss: {loss:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 001/015 | Batch 000/312 | Loss: 0.6859
Epoch: 001/015 | Batch 050/312 | Loss: 0.6649
Epoch: 001/015 | Batch 100/312 | Loss: 0.6032
Epoch: 001/015 | Batch 150/312 | Loss: 0.6337
Epoch: 001/015 | Batch 200/312 | Loss: 0.6751
Epoch: 001/015 | Batch 250/312 | Loss: 0.5878
Epoch: 001/015 | Batch 300/312 | Loss: 0.6022
training accuracy: 70.40%
valid accuracy: 69.66%
Time elapsed: 0.15 min
Epoch: 002/015 | Batch 000/312 | Loss: 0.5254
Epoch: 002/015 | Batch 050/312 | Loss: 0.5554
Epoch: 002/015 | Batch 100/312 | Loss: 0.5480
Epoch: 002/015 | Batch 150/312 | Loss: 0.5816
Epoch: 002/015 | Batch 200/312 | Loss: 0.4983
Epoch: 002/015 | Batch 250/312 | Loss: 0.5694
Epoch: 002/015 | Batch 300/312 | Loss: 0.5700
training accuracy: 74.95%
valid accuracy: 70.29%
Time elapsed: 0.27 min
Epoch: 003/015 | Batch 000/312 | Loss: 0.5300
Epoch: 003/015 | Batch 050/312 | Loss: 0.4825
Epoch: 003/015 | Batch 100/312 | Loss: 0.4668
Epoch: 003/015 | Batch 150/312 | Loss: 0.5146
Epoch: 003/015 | Batch 200/3

In [28]:
# SAVE THE MODEL
torch.save(model.state_dict(), 'model_short_reviews_2class.pt')

In [29]:
model_short = model

In [30]:
RNN_model = model_short

## Predict New Ratings

In [31]:
import spacy


nlp = spacy.blank("en")

def predict_sentiment(model, sentence):

    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(DEVICE)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.nn.functional.softmax(model(tensor), dim=1)
    return prediction[0][0].item()

# print('Probability positive:')
# predict_sentiment(model, "This is a pretty good whiskey, not that sweet")

In [32]:
df_short_pos = df_ratings_short_full[df_ratings_short_full['rev_rating'] >= 4].reset_index(drop = True)
df_short_neg = df_ratings_short_full[df_ratings_short_full['rev_rating'] <= 2.5].reset_index(drop = True)
df_short_med = df_ratings_short_full[(df_ratings_short_full['rev_rating'] > 2.5) & \
                                     (df_ratings_short_full['rev_rating'] <= 3.5)].reset_index(drop = True)

In [33]:
print('\nPositive Reviews: ')
random.seed(20369)
rand_sample = random.sample(list(range(len(df_short_pos))), 3)
for samp_idx in rand_sample:
    samp_review = df_short_pos['rev_notes'][samp_idx]
    samp_rating = df_short_pos['rev_rating'][samp_idx]
    print(samp_review)
    pred_pos = predict_sentiment(model_short, samp_review)
    print('Positive Rating Prediction - ' + str(round(pred_pos, 4)))
    print('Actual Rating - ' + str(samp_rating))

print('\nNegative Reviews: ')
random.seed(36283)
rand_sample = random.sample(list(range(len(df_short_neg))), 5)
for samp_idx in rand_sample:
    samp_review = df_short_neg['rev_notes'][samp_idx]
    samp_rating = df_short_neg['rev_rating'][samp_idx]
    print(samp_review)
    pred_pos = predict_sentiment(model_short, samp_review)
    print('Positive Rating Prediction - ' + str(round(pred_pos, 4)))
    print('Actual Rating - ' + str(samp_rating))


Positive Reviews: 
Dumped on 3/28/19 Barrel # 372 Proof 125.6. . AMAZING! Butter and barrel on the nose with hints of brown sugar. On the palate it is incredibly viscous, like eating buttery corn. Notes of barrel and some spice leads to a strong finish that seems to keep going. One of the best pours Ive ever had!
Positive Rating Prediction - 0.999
Actual Rating - 4.5
Favorite bourbon of mine... Hard to find and retails in my area (Atlanta)for $50. If you see it don't buy it... Call me first.
Positive Rating Prediction - 0.9938
Actual Rating - 5.0
This is batch 007. Bottled Jan. 2015. Lots of smoky, briny, oaky, and even meaty aromas. That seaside campfire evoked by Laphroaig 10 is all there. This is like someone took a log off that fire and hit me in the face with it. I'm not complaining, mind you. There are a lot of good vanilla and coffee flavors underneath it all too.
Positive Rating Prediction - 0.9987
Actual Rating - 5.0

Negative Reviews: 
Nose is mushy apples almost a Smokey sm

In [34]:
print('\nMiddle Reviews: ')
random.seed(34047)
rand_sample = random.sample(list(range(len(df_short_med))), 5)
for samp_idx in rand_sample:
    samp_review = df_short_med['rev_notes'][samp_idx]
    samp_rating = df_short_med['rev_rating'][samp_idx]
    print(samp_review)
    pred_pos = predict_sentiment(model_short, samp_review)
    print('Positive Rating Prediction - ' + str(round(pred_pos, 4)))
    print('Actual Rating - ' + str(samp_rating))


Middle Reviews: 
Drinking at work :). A little nutty but heavy in the liquor. Good to mix being bottle in bond
Positive Rating Prediction - 0.1099
Actual Rating - 3.0
Interesting taste overall. I was expecting only a subtle rum taste in the bourbon but it tastes closer to a rum/bourbon hybrid. Its hard to say what else I can taste because of the overwhelming rum taste. I think it is very good but not my particular taste.
Positive Rating Prediction - 0.0258
Actual Rating - 2.75
By Heaven Hill. An 80 proof, younger version of Evan Williams. Solid, inexpensive bourbon.
Positive Rating Prediction - 0.0453
Actual Rating - 3.0
Very supple, sweet and smooth. Tasty but lacks bite and character.
Positive Rating Prediction - 0.96
Actual Rating - 3.0
Simple, easy profile with not much to get excited about. . Chalet
Positive Rating Prediction - 0.1658
Actual Rating - 3.0


In [35]:
pos_review = \
    'Best mouthfeel Ive had on a bourbon. Felt and tasted buttery. Great barrel presence and heat followed by caramel and spices. Very long finish as well. Landed a great bottle!'
print('\nPositive Review: ' + pos_review)
pred_pos = predict_sentiment(RNN_model, pos_review)
print('Model Positive Rating Prediction - ' + str(round(pred_pos, 4)))
print('')

neg_review = \
    'I tried to like it. Knowing taste buds can be different at times I kept at it. Finished the bottle. At times there was a musty funk taste and then there was the taste without the mustiness. Its not for me.'
print('\nNegative Review: ' + neg_review)
pred_pos = predict_sentiment(RNN_model, neg_review)
print('Model Positive Rating Prediction - ' + str(round(pred_pos, 4)))
print('')

med_review = \
    'Nose of Red Apple and nutmeg hint of lemon. Palate of apple rye spice and clove. Finish has some heat medium'
print('\nMiddle Review: ' + med_review)
pred_pos = predict_sentiment(RNN_model, med_review)
print('Model Positive Rating Prediction - ' + str(round(pred_pos, 4)))
print('')


Positive Review: Best mouthfeel Ive had on a bourbon. Felt and tasted buttery. Great barrel presence and heat followed by caramel and spices. Very long finish as well. Landed a great bottle!
Model Positive Rating Prediction - 0.9946


Negative Review: I tried to like it. Knowing taste buds can be different at times I kept at it. Finished the bottle. At times there was a musty funk taste and then there was the taste without the mustiness. Its not for me.
Model Positive Rating Prediction - 0.0235


Middle Review: Nose of Red Apple and nutmeg hint of lemon. Palate of apple rye spice and clove. Finish has some heat medium
Model Positive Rating Prediction - 0.559



In [36]:
df_ratings_short_miss = df_ratings_all.loc[(df_ratings_all['rev_char_len'] >= 40) & (df_ratings_all['rev_char_len'] < 200), :].\
    reset_index(drop = True)
df_ratings_short_miss = df_ratings_short_miss.loc[np.isnan(df_ratings_short_miss['rev_rating'])].reset_index(drop = True)

In [37]:
print('\nReviews without Rating: ')
random.seed(97623)
rand_sample = random.sample(list(range(len(df_ratings_short_miss))), 10)
for samp_idx in rand_sample:
    samp_review = df_ratings_short_miss['rev_notes'][samp_idx]
    print(samp_review)
    print('Predicted Rating - ' + str(round(predict_sentiment(model_short, samp_review), 4)))


Reviews without Rating: 
10/22/16 Harpeth Liquors & Wine. $34.99 + tax
Predicted Rating - 0.0072
Sweet Maple and citrus nose, spicy wood palate, nice finish
Predicted Rating - 0.3714
I'm not sure what I taste with this one. It was awful from start to finish. I strongly encourage you to get a sample bottle to try before you waste your money.
Predicted Rating - 0.012
Heerlijk. Boozy, overrijpe banaan, sinaasappel likeur. Boterig.
Predicted Rating - 0.0006
Fall 2021 Pick by Rishi's International Beverage
Predicted Rating - 0.0048
GlenFiddich Distillery - Dufftown 9/30/2019
Predicted Rating - 0.0362
Finally starting to see this in NC, $12.40/750ml
Predicted Rating - 0.0301
Good Old Fashioned made with brown sugar simple syrup.
Predicted Rating - 0.8194
Gibbys & The Packie pick. Barrel F220881. Filled 11/2/15. Infinity 3/13/21
Predicted Rating - 0.0025
A strong paint thinner aroma persuaded me not to fully enjoy this particular spirit.
Predicted Rating - 0.0191


In [38]:
unk_review = \
    'Citrus and cinnamon notes. Very clean'
print('\nPositive Review: ' + unk_review)
pred_pos = predict_sentiment(RNN_model, unk_review)
print('Model Positive Rating Prediction - ' + str(round(pred_pos, 4)))
print('')

unk_review = \
    'Heinous. Serious contender for worst "bourbon" Ive ever tasted. Although at one year Im not sure how they can legally call this bourbon.'
print('\nPositive Review: ' + unk_review)
pred_pos = predict_sentiment(RNN_model, unk_review)
print('Model Positive Rating Prediction - ' + str(round(pred_pos, 4)))
print('')

unk_review = \
    'Smells syrupy and fruity. In the mouth it is very light and bright. Slightly warms the back of the mouth and throat. A sweet wheat flavor linger well after.'
print('\nPositive Review: ' + unk_review)
pred_pos = predict_sentiment(RNN_model, unk_review)
print('Model Positive Rating Prediction - ' + str(round(pred_pos, 4)))
print('')


Positive Review: Citrus and cinnamon notes. Very clean
Model Positive Rating Prediction - 0.8602


Positive Review: Heinous. Serious contender for worst "bourbon" Ive ever tasted. Although at one year Im not sure how they can legally call this bourbon.
Model Positive Rating Prediction - 0.0231


Positive Review: Smells syrupy and fruity. In the mouth it is very light and bright. Slightly warms the back of the mouth and throat. A sweet wheat flavor linger well after.
Model Positive Rating Prediction - 0.5589



In [39]:
### POTENTIAL PROBLEMS WITH OVERFITTING AND BAD DATA

sentence_1 = "Easily a top contender for a bottle NOT to purchase!! This is literal trash"
pred_pos = predict_sentiment(RNN_model, sentence_1)
print('Probability of a Positive Rating - ' + str(round(pred_pos, 4)))

Probability of a Positive Rating - 0.5715


In [40]:
sentence_2 = "Easily a top contender for a bottle NOT to buy... Nose: moldy. Palate: dirt. Finish: sewer."
pred_pos = predict_sentiment(RNN_model, sentence_2)
print('Probability of a Positive Rating - ' + str(round(pred_pos, 4)))

Probability of a Positive Rating - 0.0034


In [68]:
sentence_3 = "Easily a top contender for a bottle to dump out.. This is literal trash.."
pred_pos = predict_sentiment(RNN_model, sentence_3)
print('Probability of a Positive Rating - ' + str(round(pred_pos, 4)))

Probability of a Positive Rating - 0.2381


In [70]:
sentence_3 = "Easily a top contender for a bottle to dump out!! This is literal trash!!"
pred_pos = predict_sentiment(RNN_model, sentence_3)
print('Probability of a Positive Rating - ' + str(round(pred_pos, 4)))

Probability of a Positive Rating - 0.4864


In [None]:
### DOES IT PREDICT LONG REVIEWS AT ALL?
df_long_pos = df_ratings_long_full[df_ratings_long_full['rev_rating'] >= 4].reset_index(drop = True)
df_long_neg = df_ratings_long_full[df_ratings_long_full['rev_rating'] <= 2.5].reset_index(drop = True)

print('\nPositive LONG Reviews: ')
rand_sample = random.sample(list(range(len(df_long_pos))), 3)
for samp_idx in rand_sample:
    samp_review = df_long_pos['rev_notes'][samp_idx]
    samp_rating = df_long_pos['rev_rating'][samp_idx]
    print(samp_review)
    print('Predicted Rating - ' + str(round(predict_sentiment(model_short, samp_review), 4)))
    print('Actual Rating - ' + str(samp_rating))

print('\nNegative LONG Reviews: ')
rand_sample = random.sample(list(range(len(df_long_neg))), 3)
for samp_idx in rand_sample:
    samp_review = df_long_neg['rev_notes'][samp_idx]
    samp_rating = df_long_neg['rev_rating'][samp_idx]
    print(samp_review)
    print('Predicted Rating - ' + str(round(predict_sentiment(model_short, samp_review), 4)))
    print('Actual Rating - ' + str(samp_rating))

## Build Vocab for Long Reviews

We will try using packed LSTM here

In [386]:
TEXT = torchtext.legacy.data.Field(
    tokenize = 'spacy', 
    tokenizer_language = 'en_core_web_sm',
    include_lengths=True # FOR PACKED LSTM
)

LABEL = torchtext.legacy.data.LabelField(dtype = torch.long)

In [387]:
fields = [('rev_notes', TEXT), ('review_flg', LABEL)]

dataset = torchtext.legacy.data.TabularDataset(
    path = 'df_ratings_long.csv', format = 'csv', 
    skip_header = True, fields = fields)

In [388]:
train_data, test_data = dataset.split(split_ratio = [0.8, 0.2], 
                                        random_state = random.seed(72033))
train_data, valid_data = train_data.split(split_ratio = [0.8, 0.2],
                                            random_state = random.seed(20373))

In [393]:
# PRINT OUT A SAMPLE REVIEW -- first 100 words
print(vars(train_data.examples[379])['rev_notes'][0:250])

['Very', 'young', '.', 'If', 'this', 'was', '8', 'years', 'old', 'toc10', 'years', 'it', 'would', 'be', 'superb', '.', 'As', 'it', 'stands', ',', 'it', "'s", 'just', 'cuvee', 'sweet', 'young', 'corn', 'whiskey', 'with', 'that', 'alcohol', 'burn', 'and', 'green', 'taste', '...', 'Disappointed', 'for', 'the', 'price', '.', 'Hope', 'future', 'revisions', 'of', 'this', 'are', 'older']


In [394]:
TEXT.build_vocab(train_data, max_size=VOCABULARY_SIZE)
LABEL.build_vocab(train_data)

print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

print(TEXT.vocab.freqs.most_common(25))

Vocabulary size: 43678
Number of classes: 2
[('.', 153316), (',', 137008), ('the', 79145), ('and', 74628), ('a', 71470), ('of', 47626), ('I', 43060), ('is', 41821), ('to', 32587), ('it', 30320), ('with', 26846), ('this', 24287), ('but', 23538), (' ', 22492), ('in', 20303), ('that', 18768), ('on', 16758), ('-', 16421), (':', 16386), ('for', 16165), ('The', 16075), ("'s", 12214), ('not', 11751), ('as', 10932), ('nose', 10797)]


In [395]:
# Convert string to integer!
print(TEXT.vocab.stoi['subtle']) # stoi = string-to-integer

330


In [396]:
# NOTE THAT THEY ARE FLIPPED IN DICTIONARY!
print(LABEL.vocab.stoi) 
print(LABEL.vocab.freqs) # note also longer reviews tend to be positive, ~ 2 to 1
df_ratings_long.groupby(['review_flg']).size()

defaultdict(None, {'1': 0, '0': 1})
Counter({'1': 12641, '0': 6945})


review_flg
0    10784
1    19818
dtype: int64

## Define Data Loaders

In [397]:
train_loader, valid_loader, test_loader = \
    torchtext.legacy.data.BucketIterator.splits(
        (train_data, valid_data, test_data),
         batch_size=BATCH_SIZE,
         sort_within_batch=True, # NEW. necessary for packed_padded_sequence
         sort_key=lambda x: len(x.rev_notes),
         device='cpu'
    )

In [399]:
print('Train') 
for batch in train_loader: # WHY NO SENTENCE LENGTH OF 42 WORDS?!?! PACKED LSTM DOES NOT FIX IT 
    print(f'Text matrix size: {batch.rev_notes[0].size()}')
    print(f'Target vector size: {batch.review_flg.size()}')
    break
    
print('\nValid:')
for batch in valid_loader:
    print(f'Text matrix size: {batch.rev_notes[0].size()}')
    print(f'Target vector size: {batch.review_flg.size()}')
    break
    
print('\nTest:') 
for batch in test_loader:
    print(f'Text matrix size: {batch.rev_notes[0].size()}')
    print(f'Target vector size: {batch.review_flg.size()}')
    break

Train
Text matrix size: torch.Size([415, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([42, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([42, 128])
Target vector size: torch.Size([128])


## Model
This one uses the PACKED LSTM

In [422]:
# REDEFINE THE SETTINGS
LEARNING_RATE = 0.005
NUM_EPOCHS = 10 # TAKES MUCH LONGER TO TRAIN -- SO 10 epochs is okay. ALSO AFTER THE FACT, RESULTS NO CHANGE after 10 epochs
DEVICE = 'cpu'
EMBEDDING_DIM = 128 # PERHAPS DO NOT NEED THAT BIG EMBEDDING, PREVENT OVERFIT
HIDDEN_DIM = 64
NUM_CLASSES = 2

In [423]:
class RNN(torch.nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        # THIS WAS FOR AN RNN
        #self.rnn = torch.nn.RNN(embedding_dim,
        #                        hidden_dim,
        #                        nonlinearity='relu')
        self.rnn = torch.nn.LSTM(embedding_dim,
                                 hidden_dim)        
        
        self.fc = torch.nn.Linear(hidden_dim, output_dim) # now it's 2, but these are RAW non-activated values
        self.final_act = torch.nn.Sigmoid() # now it gives me probabilities. Also, NO PARAMS NEEDED --> will take "self"
        

    def forward(self, text, text_length):
        # text dim: [sentence length, batch size]
        
        embedded = self.embedding(text)
        # ebedded dim: [sentence length, batch size, embedding dim]
        
        ## NEW
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, text_length.to('cpu'))
        
        packed_output, (hidden, cell) = self.rnn(packed)
        # output dim: [sentence length, batch size, hidden dim]
        # hidden dim: [1, batch size, hidden dim]

        hidden.squeeze_(0) # --> squeeze the sentence length since always just 1?
        # hidden dim: [batch size, hidden dim]
        
        outraw = self.fc(hidden)
        
        output = self.final_act(outraw)
        
        return output

In [424]:
torch.manual_seed(63853)
model = RNN(input_dim=len(TEXT.vocab),
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=NUM_CLASSES # could use 1 for binary classification, but this for generalization
)

model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

## Training

In [425]:
def compute_accuracy(model, data_loader, device):

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for batch_idx, batch_data in enumerate(data_loader):

            # NEW
            features, text_length = batch_data.rev_notes
            targets = batch_data.review_flg.to(DEVICE)
            
            logits = model(features, text_length)
            _, predicted_labels = torch.max(logits, 1)

            num_examples += targets.size(0)

            correct_pred += (predicted_labels == targets).sum()
    return correct_pred.float()/num_examples * 100

In [426]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):

        # NEW
        features, text_length = batch_data.rev_notes
        labels = batch_data.review_flg.to(DEVICE)

        ### FORWARD AND BACK PROP
        logits = model(features, text_length)
        loss = F.cross_entropy(logits, labels)
        optimizer.zero_grad()
        
        loss.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        ### LOGGING
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Loss: {loss:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 001/015 | Batch 000/154 | Loss: 0.6889
Epoch: 001/015 | Batch 050/154 | Loss: 0.6487
Epoch: 001/015 | Batch 100/154 | Loss: 0.6171
Epoch: 001/015 | Batch 150/154 | Loss: 0.5613
training accuracy: 72.88%
valid accuracy: 69.36%
Time elapsed: 3.95 min
Epoch: 002/015 | Batch 000/154 | Loss: 0.6308
Epoch: 002/015 | Batch 050/154 | Loss: 0.6306
Epoch: 002/015 | Batch 100/154 | Loss: 0.5527
Epoch: 002/015 | Batch 150/154 | Loss: 0.5577
training accuracy: 81.50%
valid accuracy: 73.73%
Time elapsed: 7.57 min
Epoch: 003/015 | Batch 000/154 | Loss: 0.5354
Epoch: 003/015 | Batch 050/154 | Loss: 0.4731
Epoch: 003/015 | Batch 100/154 | Loss: 0.5003
Epoch: 003/015 | Batch 150/154 | Loss: 0.4906
training accuracy: 86.65%
valid accuracy: 74.26%
Time elapsed: 11.08 min
Epoch: 004/015 | Batch 000/154 | Loss: 0.4100
Epoch: 004/015 | Batch 050/154 | Loss: 0.4411
Epoch: 004/015 | Batch 100/154 | Loss: 0.4021
Epoch: 004/015 | Batch 150/154 | Loss: 0.4249
training accuracy: 88.32%
valid accuracy: 74.41

In [429]:
model_long_revs = model

## Predict New Sentiments

In [470]:
import spacy


nlp = spacy.blank("en")

def predict(model, sentence):

    model.eval()
    
    with torch.no_grad():
        tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
        indexed = [TEXT.vocab.stoi[t] for t in tokenized]
        length = [len(indexed)]
        tensor = torch.LongTensor(indexed).to(DEVICE)
        tensor = tensor.unsqueeze(1)
        length_tensor = torch.LongTensor(length)
        predict_probas = torch.nn.functional.softmax(model(tensor, length_tensor), dim=1)
        predicted_label_index = torch.argmax(predict_probas)
        predicted_label_proba = torch.max(predict_probas)
        return predicted_label_index.item(), predicted_label_proba.item()

In [471]:
df_long_pos = df_ratings_long_full[df_ratings_long_full['rev_rating'] >= 4].reset_index(drop = True)
df_long_neg = df_ratings_long_full[df_ratings_long_full['rev_rating'] <= 2.5].reset_index(drop = True)
df_long_med = df_ratings_long_full[(df_ratings_long_full['rev_rating'] > 2.5) & \
                                   (df_ratings_long_full['rev_rating'] < 4)].reset_index(drop = True)

In [486]:
sentence = df_ratings_long_full['rev_notes'][203]
sentence

'Peated holds up best to ice, and Id say wakes up with the cold. Color suffers, but thats about it. Everything else is as it should be, if anything more sweetness shows up than I previously noticed... alls well that ends well.'

In [499]:
tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
indexed = [TEXT.vocab.stoi[t] for t in tokenized]
length = [len(indexed)]
tensor = torch.LongTensor(indexed).to(DEVICE)
tensor = tensor.unsqueeze(1)
length_tensor = torch.LongTensor(length)
predict_probas = torch.nn.functional.softmax(model_long_revs(tensor, length_tensor), dim=1)
predict_probas

tensor([[0.7311, 0.2689]], grad_fn=<SoftmaxBackward0>)

In [482]:
for idx in range(1000):
    samp_review = df_ratings_long_full['rev_notes'][idx]
    predicted_label_index, predicted_label_proba = predict(model_long_revs, samp_review)
    ls_lbl_index.append(predicted_label_index)
    ls_all_scores.append(predicted_label_proba)

In [481]:
print('\nPositive Reviews: ')
random.seed(20369)
rand_sample = random.sample(list(range(len(df_long_pos))), 3)
for samp_idx in rand_sample:
    samp_review = df_long_pos['rev_notes'][samp_idx]
    samp_rating = df_long_pos['rev_rating'][samp_idx]
    print(samp_review)
    predicted_label_index, predicted_label_proba = predict(model_long_revs, samp_review)
    print('Predicted Rating - ' + str(round(predicted_label_proba, 4)))
    print('Actual Rating - ' + str(samp_rating))

print('\nNegative Reviews: ')
random.seed(36283)
rand_sample = random.sample(list(range(len(df_long_neg))), 3)
for samp_idx in rand_sample:
    samp_review = df_long_neg['rev_notes'][samp_idx]
    samp_rating = df_long_neg['rev_rating'][samp_idx]
    print(samp_review)
    predicted_label_index, predicted_label_proba = predict(model_long_revs, samp_review)
    print('Predicted Rating - ' + str(round(predicted_label_proba, 4)))
    print('Actual Rating - ' + str(samp_rating))

print('\nMiddling Reviews: ')
rand_sample = random.sample(list(range(len(df_long_med))), 3)
for samp_idx in rand_sample:
    samp_review = df_long_med['rev_notes'][samp_idx]
    samp_rating = df_long_med['rev_rating'][samp_idx]
    print(samp_review)
    predicted_label_index, predicted_label_proba = predict(model_long_revs, samp_review)
    print('Predicted Rating - ' + str(round(predicted_label_proba, 4)))
    print('Actual Rating - ' + str(samp_rating))


Positive Reviews: 
The nose is to die for. I'd want this in a candle in my house burning 24/7. Strong oak and burnt caramel on the nose. The palette is surprisingly lacking compared to the nose with the same oak and caramel with very strong cinnamon spice accompanying it through the finish. Super long and spicy finish leaves you warm and comfy.
Predicted Rating - 0.731
Actual Rating - 4.25
Nose: slight nuttiness, coconut, vanilla, quite sweet aside from that HH peanut thing  Palate: vanilla, sweet oak, almost a tea flavor, spice in the background, quite rich for a $10 bottle of bourbon  Finish: spicy and medium, but a crispy almost cola taste at the start  My favorite bottom shelf bourbon.
Predicted Rating - 0.7311
Actual Rating - 4.0
Nov 2019. Clear medium gold. Medium small tulip. Aromatic oatmeal, cereal, sweet spicy, milo beverage, sweet maltose, brine, buckwheat soba noodles in nose. Medium full body with sweet malt beverage, farmyard, oatmeal, multigrain bread and gentle sweet s

In [442]:
### DOES IT PREDICT SHORT REVIEWS WELL?

print('\nPositive SHORT Reviews: ')
rand_sample = random.sample(list(range(len(df_short_pos))), 3)
for samp_idx in rand_sample:
    samp_review = df_short_pos['rev_notes'][samp_idx]
    samp_rating = df_short_pos['rev_rating'][samp_idx]
    print(samp_review)
    predicted_label_index, predicted_label_proba = predict(model_long_revs, samp_review)
    print('Predicted Rating - ' + str(round(predicted_label_proba, 4)))
    print('Actual Rating - ' + str(samp_rating))
    
print('\nNegative SHORT Reviews: ')
rand_sample = random.sample(list(range(len(df_short_neg))), 3)
for samp_idx in rand_sample:
    samp_review = df_short_neg['rev_notes'][samp_idx]
    samp_rating = df_short_neg['rev_rating'][samp_idx]
    print(samp_review)
    predicted_label_index, predicted_label_proba = predict(model_long_revs, samp_review)
    print('Predicted Rating - ' + str(round(predicted_label_proba, 4)))
    print('Actual Rating - ' + str(samp_rating))


Positive SHORT Reviews: 
Smell of Granny Smith Apple and Caramel...taste of Apple and cinnamon...my favorite
Predicted Rating - 0.6837
Actual Rating - 5.0
Smoky bacon fat and cigar, turns into blackberries
Predicted Rating - 0.7311
Actual Rating - 4.5
Layered, sweet baked goods, enriched with rye spiciness. If only Bulleit could finally do sth about their constantly chipping corks!
Predicted Rating - 0.7311
Actual Rating - 4.75

Negative SHORT Reviews: 
Thin delicate , sweetness high high pep, berry, 1.5  Thin berry blackcurrent sweetness, thin, delicate 1.5  Mid sustained thin 1 1.25
Predicted Rating - 0.7311
Actual Rating - 1.25
Ryes just are not for me. Did not enjoy, had a very odd turn at the end that did not please me.  Brooke did not like at all.
Predicted Rating - 0.731
Actual Rating - 0.75
Nicely balanced but perhaps on the bland side.
Predicted Rating - 0.7193
Actual Rating - 2.5


In [None]:
str_rand = df_ratings_all['rev_notes'].loc[df_ratings_all['rev_char_len'] == 40].iloc[80]
str_rand

In [None]:
df_ratings_all['rev_notes'][1025]