In [15]:
import numpy as np
import pandas as pd
import csv
from scipy import stats
import os
import datetime
import sys
import time
import random
import json
import re
import pickle
from collections import Counter
import torch
import torchtext
from torchtext.data import get_tokenizer
import spacy
# import tensorflow as tf
# import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline

# Render up to 30 columns when a DataFrame is displayed in the notebook.
pd.options.display.max_columns = 30

In [2]:
# Load the spaCy pipeline named "en_core_web_sm" and keep it as `nlp`.
# NOTE(review): `nlp` is not referenced by the cells visible here (torchtext is
# given the model name directly) -- confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [3]:
################ Read the file, check it out ################

# Directory that holds the review CSV. Set the WHISKEY_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original author's local path is used, so existing behaviour is unchanged.
file_dir = os.environ.get(
    'WHISKEY_DATA_DIR',
    'C:/Users/aclou/Documents/Projects/Whiskey_Ratings/Datasets/')

# os.path.join copes with `file_dir` both with and without a trailing separator.
df_ratings_all = pd.read_csv(os.path.join(file_dir, 'whisk_reviews_combined.csv'))
df_ratings_all.head()

Unnamed: 0,whiskey_type,whiskey_name,reviewer_name,review_date,rev_rating,rev_notes
0,american_single_malt,STRANAHAN'S COLORADO WHISKEY,elbucko,"Tasted December 16, 2021",3.75,"Tastes like whiskey, maybe some pear? Great on..."
1,american_single_malt,STRANAHAN'S COLORADO WHISKEY,gmrocks,"Tasted December 8, 2021",3.75,This one proved quite popular with group of fr...
2,american_single_malt,STRANAHAN'S COLORADO WHISKEY,Mark-Willis,"Tasted November 27, 2021",4.5,Surprise of the flight consisting of itself Gl...
3,american_single_malt,STRANAHAN'S COLORADO WHISKEY,Dan-Cordial,"Tasted November 13, 2021",3.75,Floral notes
4,american_single_malt,STRANAHAN'S COLORADO WHISKEY,MoparRocker74,"Tasted November 11, 2021",3.75,Really good American single malt. Oaky and Cok...


In [4]:
################ INITIAL NEW/TRANSFORMED COLS FOR DF ################

# Normalise the review text:
#   * a missing review becomes '' -- str(NaN) would otherwise produce the literal
#     text 'nan' (length 3), which slips past the empty-review filter below
#   * characters outside ASCII are dropped
#   * every newline, carriage return, tab and form feed is replaced by a space
rev_notes = (
    df_ratings_all['rev_notes']
    .fillna('')
    .astype(str)
    .str.encode('ascii', errors = 'ignore')
    .str.decode('utf-8')
    .str.replace(r'[\n\r\t\f]', ' ', regex = True)
)
df_ratings_all['rev_notes'] = rev_notes # get the NEW rev notes

# Character count of each cleaned review.
len_review = rev_notes.str.len()
df_ratings_all['rev_char_len'] = len_review

df_ratings_all = df_ratings_all.loc[df_ratings_all['rev_char_len'] > 0, :].reset_index(drop = True) # aka get rid of empty reviews

In [5]:
# Summary statistics of review length (characters) at a custom set of percentiles.
len_percentiles = [0.05, 0.1, 0.25, 0.35, .5, 0.65, .75, 0.9, 0.95, 0.975]
df_ratings_all['rev_char_len'].describe(percentiles = len_percentiles)

count    114483.000000
mean        221.219858
std         385.680698
min           1.000000
5%           11.000000
10%          19.000000
25%          41.000000
35%          58.000000
50%          93.000000
65%         149.000000
75%         216.000000
90%         528.000000
95%         945.900000
97.5%      1403.000000
max        7686.000000
Name: rev_char_len, dtype: float64

In [21]:
################ BREAK UP INTO LONG / SHORT RATINGS AND PREPROCESS ################
# SHORT -- >=40, <200 characters (roughly the middle of the length distribution)
# LONG -- >=200 characters (text capped at 2500 characters)

is_short = (df_ratings_all['rev_char_len'] >= 40) & (df_ratings_all['rev_char_len'] < 200)
# Drop unrated reviews BEFORE labelling: `NaN >= 3.75` evaluates to False, so an
# unrated review would otherwise be silently labelled negative, and a NaN check
# on the resulting 0/1 flag can never remove it.
has_rating = df_ratings_all['rev_rating'].notna()

df_ratings_short = df_ratings_all.loc[is_short & has_rating, :].reset_index(drop = True)
df_ratings_short['review_flg'] = (df_ratings_short['rev_rating'] >= 3.75).astype('int64') # DEFINE A POSITIVE REVIEW
# Keep only the model inputs: the review text and its 0/1 label.
df_ratings_short = df_ratings_short.loc[:,['rev_notes', 'review_flg']]

df_ratings_long = df_ratings_all.loc[(df_ratings_all['rev_char_len'] >= 200), :].\
    reset_index(drop = True)
# Truncate long reviews to their first 2500 characters.
rev_capped = [elem[0:2500] for elem in df_ratings_long['rev_notes']]
df_ratings_long['rev_notes'] = rev_capped

In [22]:
# Preview the short-review frame: review text plus its 0/1 `review_flg` label.
df_ratings_short.head()

Unnamed: 0,rev_notes,review_flg
0,"Tastes like whiskey, maybe some pear? Great on...",1
1,Really good American single malt. Oaky and Cok...,1
2,"Cask strength. Strong ethanol nose, slight che...",0
3,Pre-Distiller tasting. Bar pour with a big roc...,0
4,"Not bad at all, a little on the harsh side tho...",1


In [24]:
# Write the short reviews to a CSV (no index column) using a relative path;
# the TabularDataset cell below reads this same 'df_ratings_short.csv' path.
df_ratings_short.to_csv('df_ratings_short.csv', index = False)

In [29]:
# torchtext (legacy API) field definitions.
legacy_data = torchtext.legacy.data

# Review text, tokenised by spaCy with the "en_core_web_sm" model.
TEXT = legacy_data.Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm')

# Target label field, typed as torch.long.
LABEL = legacy_data.LabelField(dtype = torch.long)

In [33]:
# (column name, field) pairs, listed in the same order as the CSV columns.
fields = [
    ('rev_notes', TEXT),
    ('review_flg', LABEL),
]

# Build the dataset from the CSV of short reviews; the header row is skipped.
dataset = torchtext.legacy.data.TabularDataset(
    path = 'df_ratings_short.csv',
    format = 'csv',
    skip_header = True,
    fields = fields,
)

In [34]:
# Seed Python's RNG and pass its state to split() so the partition is the same
# on every run; without `random_state` the shuffle depends on whatever the
# global RNG state happens to be.
random.seed(42)

# 80/20 into (train+val)/test, then 80/20 again into train/val.
train_data, test_data = dataset.split(split_ratio = [0.8, 0.2], random_state = random.getstate())
train_data, val_data = train_data.split(split_ratio = [0.8, 0.2], random_state = random.getstate())

In [42]:
# Show the attributes of one training example (tokens and label);
# index 120 appears to be an arbitrary pick.
vars(train_data.examples[120])

{'rev_notes': ['This',
  'was',
  'terrible',
  '.',
  'I',
  'wished',
  'it',
  'was',
  'sweet',
  ',',
  'instead',
  'it',
  'was',
  'a',
  'headache',
  'in',
  'a',
  'bottle',
  '.',
  'What',
  'the',
  'he',
  "'ll",
  'was',
  'Michter',
  "'s",
  'thinking',
  '.'],
 'review_flg': '0'}

In [38]:
# Number of examples in the test split.
print(len(test_data))

11148


In [8]:
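# NOTE: disabled cell -- counts the distinct tokens in the short reviews using
# torchtext's "basic_english" tokenizer. The number shown below this cell is
# from an earlier run of this code.
# token_counts = Counter()
# tokenizer = get_tokenizer("basic_english")

# for example in df_ratings_short['rev_notes']:
#     tokens = tokenizer(example) # string to tokenize
#     token_counts.update(tokens)

# print(len(token_counts))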

24498


In [None]:
from sklearn.model_selection import train_test_split

# df_ratings_short only keeps ['rev_notes', 'review_flg'], so indexing it with
# 'rev_rating' raises KeyError. Rebuild the short-review subset
# (40 <= length < 200) from df_ratings_all, restricted to rows that have a rating.
short_rated_mask = (
    (df_ratings_all['rev_char_len'] >= 40)
    & (df_ratings_all['rev_char_len'] < 200)
    & df_ratings_all['rev_rating'].notna()
)
df_ratings_short_trainval = df_ratings_all.loc[short_rated_mask, :].reset_index(drop = True)

# 70/30 train/validation split with a fixed seed; the target is the numeric rating.
df_ratings_short_trn, df_ratings_short_val, ratings_short_trn,  ratings_short_val = \
    train_test_split(df_ratings_short_trainval, df_ratings_short_trainval['rev_rating'], test_size=0.30, random_state=42)

In [None]:
# Take the review at position 80 among those that are exactly 40 characters long.
len40_notes = df_ratings_all.loc[df_ratings_all['rev_char_len'] == 40, 'rev_notes']
str_rand = len40_notes.iloc[80]
str_rand

In [None]:
# Show the review text stored under index label 1025.
df_ratings_all.loc[1025, 'rev_notes']