In [171]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
import re
import seaborn as sns

import spacy
from spacy.tokens import Doc, Span, Token # for creating global objects 
from spacy.matcher import Matcher # for rule-based matching
from spacy.matcher import DependencyMatcher
from spacy.language import Language # for building custom pipeline components
from spacy.pipeline import EntityRuler 

from copy import deepcopy

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.matutils import corpus2dense
from gensim.models import Phrases
from gensim.models.coherencemodel import CoherenceModel
from gensim.similarities import MatrixSimilarity

In [172]:
        # Rule for the entity ruler: any token whose lemma is one of these
        # container words gets the custom entity label "CSK".
        pattern_csk = [{"LEMMA": {"IN": ["cask", "octave", "pipe", "puncheon", "butt", "barrel", "hogshead"]}}]
        patterns = [{"label": "CSK", "pattern": pattern_csk}]

        # Load the large English spaCy pipeline used by the rest of the notebook.
        nlp = spacy.load('en_core_web_lg')

        # Extend the pipeline's default stop-word set in place with generic
        # tasting-note filler words.
        nlp.Defaults.stop_words.update({
            "hint", "Hint", "hints",
            "touch", "touches", "Touch",
            "note", "Note", "notes", "Notes",
            "little", "end", "thing",
            "palate", "Palate",
            "nose", "Nose",
            "whisper", "whispers",
        })

        # Append an entity ruler component to the pipeline and register the rule.
        ruler = nlp.add_pipe("entity_ruler")
        ruler.add_patterns(patterns)

In [173]:
class Scotch_Recommender:
    """Recommend whiskies by similarity in LDA topic space.

    The recommender combines a token dictionary, a phrase (bigram) model, a
    bag-of-words corpus, an LDA topic model and a similarity index. Any
    component that is not passed to the constructor is loaded from its
    default file path, relative to the current working directory.

    ``tokenize_text`` (and therefore ``recommend_from_text``) relies on a
    module-level spaCy pipeline called ``nlp`` being defined before it is
    called.
    """

    # Default artefact locations. Forward slashes are used because they
    # resolve on Windows as well as on POSIX systems, whereas the previous
    # backslash-separated literals only resolved on Windows.
    DEFAULT_DICTIONARY_PATH = "dictionaries/reduced_gemsimdict_unified.pkl"
    DEFAULT_PHRASER_PATH = "models/phrase_mod.pkl"
    DEFAULT_CORPUS_PATH = "data/final/descriptor_corpus.pkl"
    DEFAULT_LDA_PATH = "models/Scotch_LDA.pkl"
    DEFAULT_INDEX_SIM_PATH = "data/final/index_sim.pkl"
    DEFAULT_FULL_DATA_PATH = "data/interim/whisk_unified_tokenized.csv"

    # Suffixes that are rewritten to a plain "-y" ending.
    _TO_Y_ENDING = re.compile("|".join([r'iness\b', r'ied\b', r'iful\b', r'ifull\b', r'ifully\b']))
    # Suffixes that are simply chopped off.
    _DROPPED_ENDING = re.compile("|".join([r'ness\b', r'ful\b', r'full\b']))
    # A trailing "y" at the end of a token.
    _TRAILING_Y = re.compile(r"y\b")
    # A stem that ends in non-vowel, vowel, non-vowel (e.g. "smok").
    _CVC_ENDING = re.compile(r"\w+[^aeiou][aeiou][^aeiou]\b")

    def __init__(self, corpus = None, dictionary = None, phrase_model = None, full_data = None, lda_mod = None, index_sim = None):
        """Store the supplied components, loading defaults for the missing ones.

        Parameters
        ----------
        corpus : optional
            Bag-of-words corpus; it is transformed with the topic model to
            obtain the per-document topic breakdown.
        dictionary : optional
            Token dictionary providing ``itervalues()`` and ``doc2bow()``.
        phrase_model : optional
            Phrase model; indexed with a token list to merge phrases.
        full_data : pandas.DataFrame, optional
            Table with a ``name`` column whose row order matches ``corpus``.
        lda_mod : optional
            Topic model; indexed with a bag-of-words vector or a corpus.
        index_sim : optional
            Similarity index; indexed with a topic vector it yields one
            similarity score per document.

        Every argument is tested with ``is None``. A DataFrame compared with
        ``== None`` is compared elementwise and cannot be used in an ``if``.
        """
        self.dictionary = dictionary if dictionary is not None else self._load_pickle(self.DEFAULT_DICTIONARY_PATH)
        self.phrase_model = phrase_model if phrase_model is not None else self._load_pickle(self.DEFAULT_PHRASER_PATH)
        self.corpus = corpus if corpus is not None else self._load_pickle(self.DEFAULT_CORPUS_PATH)
        self.scotch_topic_model = lda_mod if lda_mod is not None else self._load_pickle(self.DEFAULT_LDA_PATH)
        self.index_sim = index_sim if index_sim is not None else self._load_pickle(self.DEFAULT_INDEX_SIM_PATH)

        # breakdown of scotch by topic
        self.scotch_lda_decomp = self.scotch_topic_model[self.corpus]

        if full_data is None:
            full_data = pd.read_csv(self.DEFAULT_FULL_DATA_PATH).drop(columns=["Unnamed: 0"])
        self.full_data = full_data

    @staticmethod
    def _load_pickle(path):
        """Unpickle and return the object stored at ``path``, closing the file."""
        # NOTE(security): unpickling can execute arbitrary code. Only point
        # this at artefacts produced by this project.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def get_recommendations(self, name, num_rec = 10):
        """Return the whiskies most similar to the whisky called ``name``.

        Parameters
        ----------
        name : str
            Exact value of the ``name`` column identifying the query whisky.
            If several rows share the name, the first one is used.
        num_rec : int
            Maximum number of recommendations to return.

        Returns
        -------
        pandas.Series
            Names ordered from most to least similar, indexed by row position
            in ``full_data``. The query row is excluded, as is every whisky
            whose name contains ``name`` (case-insensitively).

        Raises
        ------
        IndexError
            If no row of ``full_data`` has that name.
        """
        # Work with the row *position*: the corpus and the similarity scores
        # are positional, whatever index labels ``full_data`` carries.
        hits = (self.full_data['name'] == name).to_numpy().nonzero()[0]
        if len(hits) == 0:
            raise IndexError("no whisky named {!r} in full_data".format(name))
        doc_num = int(hits[0])

        sims = self.index_sim[self.scotch_lda_decomp[doc_num]]
        ranked = pd.Series(sims).sort_values(ascending = False).drop(index=doc_num).to_frame(name = "similarity")
        ranked['name'] = self.full_data['name'].iloc[ranked.index].to_numpy()

        # drop whiskies that have the same name and age expression.
        # The name is escaped so that characters such as "(", ")", "&" or "+"
        # are matched literally instead of being read as regex syntax, and
        # missing names count as "no match".
        same_expression = ranked['name'].str.contains(re.escape(name), flags=re.IGNORECASE, regex=True, na=False)
        ranked = ranked[~same_expression]

        return ranked['name'].iloc[:num_rec]

    #------the rest of this class is for taking in a custom whisky description and getting recommendations -------------

    def _manual_lemmatize(self, token_list):
        """Apply the hand-written suffix rules to a list of lower-cased lemmas.

        The rules run in this order: rewrite "-iness/-ied/-iful(l)(y)" to
        "-y", drop "-ness/-ful/-full", replace "tannic" with "tannin", then
        handle the remaining "-y" endings against the dictionary vocabulary.
        Returns a new list of tokens.
        """
        vocabulary = set(self.dictionary.itervalues())

        joined = " ".join(token_list)
        joined = self._TO_Y_ENDING.sub('y', joined)
        joined = self._DROPPED_ENDING.sub('', joined)
        # specific word replacement rule
        joined = joined.replace('tannic', 'tannin')

        normalised = []
        for token in joined.split():
            stem = self._TRAILING_Y.sub("", token)
            if stem in vocabulary:
                normalised.append(stem)
            elif self._CVC_ENDING.search(stem):
                # Stem is not in the dictionary and ends consonant-vowel-
                # consonant, so add "-e" (smok -> smoke). This branch also
                # fires for unknown tokens that never ended in "y"; that is
                # kept as-is so query tokens keep the same form as before.
                normalised.append(stem + 'e')
            else:
                normalised.append(token)
        return normalised

    # now we tokenize the incoming text
    def tokenize_text(self, text, *args):
        """Turn free text into the token list expected by the dictionary.

        The text is parsed with the module-level ``nlp`` pipeline. Stop
        words, punctuation, verbs, prepositional dependents and
        non-alphabetic tokens are removed, and the remaining lemmas are
        lower-cased. The manual suffix rules and the phrase model are then
        applied. Extra positional arguments are accepted and ignored.

        Returns whatever indexing ``phrase_model`` with the token list yields.
        """
        doc = nlp(text)
        token_list = [
            token.lemma_.lower()
            for token in doc
            if not token.is_stop
            and not token.is_punct
            and token.pos_ != 'VERB'
            and token.dep_ != 'prep'
            and token.is_alpha
        ]

        spac_doc_list = self._manual_lemmatize(token_list)

        # apply trained gensim phrase object on token list
        return self.phrase_model[spac_doc_list]

    def recommend_from_text(self, text, num_rec = 10):
        """Return the whiskies whose topic mix is closest to a free-text description.

        Parameters
        ----------
        text : str
            Tasting note or any other description to use as the query.
        num_rec : int
            Maximum number of recommendations to return.

        Returns
        -------
        pandas.Series
            Names ordered from most to least similar, indexed by row position
            in ``full_data``.
        """
        tokenized = self.tokenize_text(text)
        bow_vec = self.dictionary.doc2bow(tokenized)
        sims = self.index_sim[self.scotch_topic_model[bow_vec]]

        ranked = pd.Series(sims).sort_values(ascending = False).to_frame(name = "similarity")
        ranked['name'] = self.full_data['name'].iloc[ranked.index].to_numpy()

        return ranked['name'].iloc[:num_rec]
        

        

In [174]:
# Build the recommender with every constructor argument left at its default,
# so the dictionary, phrase model, corpus, LDA model, similarity index and
# tokenised CSV are all read from their default on-disk paths.
reco_engine = Scotch_Recommender()

In [175]:
# Free-text tasting note used as the query document.
txt ="Behind the obvious peatiness on the nose lurks a beautiful array of scents: smoke, decay, iodine, leather, seawater, charcoal, and wet stone. After opening up, taffy, peanut butter fudge, and sweetly viscous gumminess are present, herbal ocean tones underlying still. On the palate, the first impression is the interplay between sweet and salty. Malted barley and sea salt. Savory, with marked slate and driftwood notes. Match sticks, sulphur, hay, and smoked salt blend together with the ripe sugar elements that define the spirit. It is clear the play between very sweet oak and very smoky peat is what makes this spirit the seminal one that it is today. Sweeter on the palate than expected considering the heft peat brings to whisky, leaves the impression of sea spray and hot breakfast cereal on the finish, fading into just the smoky peat we know it for."
# Ask for the 30 whiskies most similar to the description; as the last
# expression of the cell, the returned Series of names is displayed.
reco_engine.recommend_from_text(txt, 30)

3682    Jura 26 Year Old 1989 (casks 30739 & 30740) - ...
303                                           SPEY Fumare
2166    Caol Ila 8 Year Old 2010 (casks 318710 & 31871...
3528    Lagavulin 2000 (bottled 2016) Distillers Editi...
2921    Lagavulin 2000 (bottled 2016) Pedro Ximénez Ca...
1776                           Kilchoman STR Cask Matured
7934            Ledaig 21 Year Old Manzanilla Cask Finish
6305    Ledaig 22 Year Old 1972 - Lost Bottlings Serie...
2219    Caol Ila 12 Year Old 2008 (cask 14251) - Old P...
4254    Laphroaig 10 Year Old 2005 (cask 80080) - Dime...
6964    Tomintoul 48 Year Old 1967 (casks 150031 & 150...
7384          Laphroaig Cairdeas Port Wood Edition (2013)
3133                                Ailsa Bay Single Malt
6358    Highland Park 21 Year Old 1991 (cask 9200) - D...
4709     Bunnahabhain Dràm An Stiùreadair - Feis Ile 2014
4049    Springbank 26 Year Old 1990 (cask 096) (Rest &...
7131    Caol Ila 7 Year Old 2009 (casks 318823 & 31882...
936           