# Appendix 
A. Creating a custom list of stop words.

In [8]:
# Basic data science packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

import joblib
from sklearn.feature_extraction import text


In [9]:
# Read the pickled review table from disk and preview its first rows.
# NOTE: pickle files can execute code on load; only read files you trust.
cleaned_data_path = 'data/cleaned_data.pkl'
data = pd.read_pickle(cleaned_data_path)
data.head()

Unnamed: 0,name,review.point,Blended Malt Scotch Whisky,Blended Scotch Whisky,Grain Scotch Whisky,Single Grain Whisky,Single Malt Scotch,price_string,cleaned_reviews
0,"Johnnie Walker Blue Label, 40%",97,0.0,1.0,0.0,0.0,0.0,225.0,magnificently powerful and intense caramels dr...
1,"Black Bowmore, 1964 vintage, 42 year old, 40.5%",97,0.0,0.0,0.0,0.0,1.0,4500.0,what impresses me most is how this whisky evol...
2,"Bowmore 46 year old (distilled 1964), 42.9%",97,0.0,0.0,0.0,0.0,1.0,13500.0,there have been some legendary bowmores from t...
3,"Compass Box The General, 53.4%",96,1.0,0.0,0.0,0.0,0.0,325.0,with a name inspired by a 1926 buster keaton m...
4,"Chivas Regal Ultis, 40%",96,1.0,0.0,0.0,0.0,0.0,160.0,captivating enticing and wonderfully charming ...


***
## Adding adjectives and proper nouns to list of stop words
Some adjectives are positive or negative by definition. These words are probably highly predictive of a high or low rating, but they do not tell us anything else about the whisky itself. Therefore, to learn more about whisky-specific language, these words should be ignored when tokenizing. Likewise, the name of the distiller in a review does not tell us anything about the whisky, so distiller names will also be added to the list of stop words. We will start by creating a list of adjectives.

In [10]:
# Sentiment-laden words to drop when tokenizing: they point at a high or low
# rating without describing the whisky itself.
adj_to_remove = [
    # positive sentiment
    'best', 'good', 'love', 'incredible', 'remarkable', 'excellent',
    'stunning', 'great', 'fantastic', 'wonderful', 'outstanding', 'superb',
    'magnificent', 'exceptional', 'marvelous', 'superior', 'awesome',
] + [
    # negative sentiment
    'bad', 'terrible', 'worst', 'poor', 'unpleasant', 'inferior',
    'unsatisfactory', 'inadequate', 'lousy', 'atrocious', 'deficient',
    'awful',
]

We can inspect the names of the whiskies to see the name of each distiller, and then manually add them to a list.

In [11]:
# Print every whisky name in sorted order so the distiller names can be read
# off by hand; the display limits are lifted only inside this block.
names_sorted = data['name'].sort_values()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(names_sorted)

935                Big Peat Christmas Edition 2017, 54.1%
1418                             Dewar’s White Label, 40%
740      Exclusive Malts 2002 (distilled at Miltonduff...
1058           GlenDronach Cask Strength (batch 5), 55.3%
1409           J. Mossman Platinum Crown 15 year old, 40%
69       Lagavulin 12 year old (Diageo Special Release...
718                           Port Askaig 110° Proof, 55%
722      Teaninich 17 year old (Diageo Special Release...
538      That Boutique-y Whisky Company Blended Whisky...
421     A. D. Rattray (distilled at Bowmore), 18 year ...
669     A.D. Rattray (distilled at Highland Park) 19 y...
1297    A.D. Rattray (distilled at Littlemill) 22 year...
43               Aberfeldy 14 year old Single Cask, 58.1%
1614                           Aberfeldy 16 year old, 40%
1197                           Aberfeldy 18 year old, 40%
181     Aberfeldy Single Cask (Cask No. 5) 16 year old...
1924                          Aberfeldy, 12 year old, 40%
780           

In [12]:
# Proper nouns (distillers, bottlers, brands, retailers) to remove from the
# reviews. Compiled by hand from the proper-noun list and the whisky names.
# All entries are lowercase. Variants such as plurals and run-together tokens
# (e.g. 'bruichladdichit', 'kilchomanfree') are kept as originally entered.
proper_nouns_list = [
    'aberfeldy', 'aberlour', 'alltabhainne', 'ardbeg', 'auchentoshan', 'auchentoshans', 'asyla', 'alexander', 'ardmore',
    'arran', 'auchroisk', 'aultmore', 'askaig', 'antiquary', 'adelphi', 
    'ballechin', 'balvenie', 'benachie', 'benriachs', 'bladnoch', 'bladnocha', 'brora', 'broras', 'bowmore', 'bull',
    'bruichladdich', 'bruichladdichit', 'bruichladdichs', 'bunnahabhain', 'balblair', 'ballantine', 'nevis', 'benriach', 
    'benrinnes', 'benromach', 'balmenach', 'blackadder', 'blair', 'boutique', 'box', 'binnys', "binny's",
    'cardhu', 'chivas', 'clynelishs', 'clynelish', 'craigellachie', 'cragganmore', 'cadenhead', 'caol', 'ila', 
    'chieftain', 'compass', 'cuatro', 'cutty', 'collection',
    'deanston', 'dailuaine', 'dalmore', 'dalwhinnie', 'dewars', 'dewar', 'deveron', 'douglas', 'duncan', 'dufftown', 
    'edradour', 'edradours', 'ellen', 'editors', 
    'farclas', 
    'garioch', 'gariochs', 'glenallechie','glenburgie', 'glencadam', 'glencraig', 'glendronach', 'glendronachs', 
    'glenfiddich', 'glenfiddichs', 'glengoyne', 'glenisla', 'glenkeir', 'glenrothes', 'glenlivet', 'glenturret', 
    'glenfarclas', 'glenglassaugh', 'glenkinchie', 'glenmorangie', 'glenugie', 'gordon',
    'hart', 'hazelburn', 'highland', 'hazelwood', 'hunter', 
    'inchmurrin', 
    'johnnie', 'jura', 'juras',
    'keith', 'kensington', 'kilchomanfree', 'kilchomans', 'kildalton', 'kinchie', 'kininvie', 'kirkland', 
    'lochnager', 'lochranza', 'lagavulin', 'littlemill', 'linkwood', 'longmorn', 'linlithgow', 'laphroig', 'ledaig',
    'lomand', 'lombard', 'lonach', 'longrow', 
    'macduff', 'macmillans', 'magdelene', 'macallan', 'mortlach', 'monnochmore', 'macdougall', 'mossman', 'mackillops',
    'mackinlays', 'master', 'murray', 'mcdavid', 
    'oban', 
    'park', 'pulteney', 'peerless', 
    'scapa', 'shackleton', 'shieldaig', 'skye', 'springbank', 'springbanks', 'strathclyde', 'strathisla', 'scotia',
    'signatory', 'scotts', 'singleton', 'speyburn', 'strathmill', 'sovereign', 
    'talisker', 'tomintoul', 'turasmara', 'teaninich', 'taylor', 'tobermory', 'tomatin', 'tormore', 'tullibardine', 
    'uigeadail', 'usquaebach',  
    'valinch', 
    'walker', 'wemyss',
    # Correctly spelled / base forms of names that appear above only as a
    # misspelling or inflected variant ('benachie', 'glenallechie',
    # 'kilchomans', 'laphroig', 'lochnager', 'lomand', 'magdelene',
    # 'monnochmore'). Without these the real distiller tokens would not be
    # filtered. The original spellings are kept in case they occur in the
    # review text.
    'bennachie', 'glenallachie', 'kilchoman', 'laphroaig', 'lochnagar', 'lomond', 'magdalene', 'mannochmore',
    ]

In [14]:
# Extend scikit-learn's built-in English stop words with the hand-made
# proper-noun and adjective lists; union() returns a new frozenset and leaves
# the built-in set untouched.
extra_stop_word_lists = (proper_nouns_list, adj_to_remove)
my_stop_words = text.ENGLISH_STOP_WORDS.union(*extra_stop_word_lists)

# Pickle the combined stop-word set to disk with joblib
stop_words_path = 'data/my_stop_words.pkl'
joblib.dump(my_stop_words, stop_words_path)

['data/my_stop_words.pkl']