In [2]:
import abc


class Dialog(abc.ABC):
    """
    A dialog listens for utterances, parses and interprets them, then updates
    its internal state. It can then formulate a response on demand.
    """

    def listen(self, text, response=True, **kwargs):
        """
        A text utterance is passed in and parsed. It is then passed to the
        interpret method to determine how to respond. If a response is
        requested, the respond method is used to generate a text response
        based on the most recent input and the current Dialog state.
        """
        # Parse the input
        sents = self.parse(text)

        # Interpret the input
        sents, confidence, kwargs = self.interpret(sents, **kwargs)

        # Determine the response
        if response:
            reply = self.respond(sents, confidence, **kwargs)
        else:
            reply = None

        # Return initiative
        return reply, confidence

    @abc.abstractmethod
    def parse(self, text):
        """
        Every dialog may need its own parsing strategy, some dialogs may need
        dependency vs. constituency parses, others may simply require regular
        expressions or chunkers.
        """
        return []

    @abc.abstractmethod
    def interpret(self, sents, **kwargs):
        """
        Interprets the utterance passed in as a list of parsed sentences,
        updates the internal state of the dialog, computes a confidence of the
        interpretation. May also return arguments specific to the response
        mechanism.
        """
        return sents, 0.0, kwargs

    @abc.abstractmethod
    def respond(self, sents, confidence, **kwargs):
        """
        Creates a response given the input utterances and the current state of
        the dialog, along with any arguments passed in from the listen or the
        interpret methods.
        """
        return None

In [4]:
from collections.abc import Sequence
from operator import itemgetter


class SimpleConversation(Dialog, Sequence):
    """
    This is the most simple version of a conversation.
    """

    def __init__(self, dialogs):
        self._dialogs = dialogs

    def __getitem__(self, idx):
        return self._dialogs[idx]

    def __len__(self):
        return len(self._dialogs)

    def listen(self, text, response=True, **kwargs):
        """
        Simply return the best confidence response
        """
        responses = [
            dialog.listen(text, response, **kwargs)
            for dialog in self._dialogs
        ]

        # Responses is a list of (response, confidence) pairs
        return max(responses, key=itemgetter(1))

    def parse(self, text):
        """
        Returns parses for all internal dialogs for debugging
        """
        return [dialog.parse(text) for dialog in self._dialogs]

    def interpret(self, sents, **kwargs):
        """
        Returns interpretations for all internal dialogs for debugging
        """
        return [dialog.interpret(sents, **kwargs) for dialog in self._dialogs]

    def respond(self, sents, confidence, **kwargs):
        """
        Returns responses for all internal dialogs for debugging
        """
        return [
            dialog.respond(sents, confidence, **kwargs)
            for dialog in self._dialogs
        ]

In [5]:
import re

class Greeting(Dialog):
    """
    Keeps track of the participants entering or leaving the conversation and
    responds with appropriate salutations. This is an example of a rules based
    system that keeps track of state and uses regular expressions and logic to
    handle the dialog.
    """

    PATTERNS = {
        'greeting': r'hello|hi|hey|good morning|good evening',
        'introduction': r'my name is ([a-z\-\s]+)',
        'goodbye': r'goodbye|bye|ttyl',
        'rollcall': r'roll call|who\'s here?',
    }

    def __init__(self, participants=None):
        # Participants is a map of user name to real name
        self.participants = {}

        if participants is not None:
            for participant in participants:
                self.participants[participant] = None

        # Compile regular expressions
        self._patterns = {
            key: re.compile(pattern, re.I)
            for key, pattern in self.PATTERNS.items()
        }

    def parse(self, text):
        """
        Applies all regular expressions to the text to find matches.
        """
        matches = {}
        for key, pattern in self._patterns.items():
            match = pattern.match(text)
            if match is not None:
                matches[key] = match
        return matches

    def interpret(self, sents, **kwargs):
        """
        Takes in parsed matches and determines if the message is an enter,
        exit, or name change.
        """
        # Can't do anything with no matches
        if len(sents) == 0:
            return sents, 0.0, kwargs

        # Get username from the participants
        user = kwargs.get('user', None)

        # Determine if an introduction has been made
        if 'introduction' in sents:
            # Get the name from the utterance
            name = sents['introduction'].groups()[0]
            user = user or name.lower()

            # Determine if name has changed
            if user not in self.participants or self.participants[user] != name:
                kwargs['name_changed'] = True

            # Update the participants
            self.participants[user] = name
            kwargs['user'] = user

        # Determine if a greeting has been made
        if 'greeting' in sents:
            # If we don't have a name for the user
            if not self.participants.get(user, None):
                kwargs['request_introduction'] = True

        # Determine if goodbye has been made
        if 'goodbye' in sents and user is not None:
            # Remove participant
            self.participants.pop(user)
            kwargs.pop('user', None)

        # If we've seen anything we're looking for, we're pretty confident
        return sents, 1.0, kwargs

    def respond(self, sents, confidence, **kwargs):
        """
        Gives a greeting or a goodbye depending on what's appropriate.
        """
        if confidence == 0:
            return None

        name = self.participants.get(kwargs.get('user', None), None)
        name_changed = kwargs.get('name_changed', False)
        request_introduction = kwargs.get('request_introduction', False)

        if 'greeting' in sents or 'introduction' in sents:
            if request_introduction:
                return "Hello, what is your name?"
            else:
                return "Hello, {}!".format(name)

        if 'goodbye' in sents:
            return "Talk to you later!"

        if 'rollcall' in sents:
            people = list(self.participants.values())

            if len(people) > 1:
                roster = ", ".join(people[:-1])
                roster += " and {}.".format(people[-1])
                return "Currently in the conversation are " + roster

            elif len(people) == 1:
                return "It's just you and me right now, {}.".format(name)
            else:
                return "So lonely in here by myself ... wait who is that?"

        raise Exception(
            "expected response to be returned, but could not find rule"
        )


In [6]:
if __name__ == '__main__':
    dialog = Greeting()
    # `listen` returns (response, confidence) tuples; just print the response
    print(dialog.listen("Hello!", user="jakevp321")[0])
    print(dialog.listen("my name is Jake", user="jakevp321")[0])
    print(dialog.listen("Roll call!", user="jakevp321")[0])
    print(dialog.listen("Have to go, goodbye!", user="jakevp321")[0])

Hello, what is your name?
Hello, Jake!
It's just you and me right now, Jake.
None


In [7]:
if __name__ == '__main__':
    dialog = Greeting()
    print(dialog.listen("hey", user="jillmonger")[0])
    print(dialog.listen("my name is Jill.", user="jillmonger")[0])
    print(dialog.listen("who's here?")[0])

Hello, what is your name?
Hello, Jill!
It's just you and me right now, None.


In [8]:
import pytest


class TestBaseClasses(object):
    """
    Tests for the Dialog class
    """
    @pytest.mark.parametrize("text", [
        "Gobbledeguk", "Gibberish", "Wingdings"
    ])
    def test_dialog_abc(self, text):
        """
        Test the Dialog ABC and the listen method
        """
        class SampleDialog(Dialog):
            def parse(self, text):
                return []

            def interpret(self, sents):
                return sents, 0.0, {}

            def respond(self, sents, confidence):
                return None

        sample = SampleDialog()
        reply, confidence = sample.listen(text)
        assert confidence == 0.0
        assert reply is None

In [9]:
class TestGreetingDialog(object):
    """
    Test expected input and responses for the Greeting dialog
    """

    @pytest.mark.parametrize("text", ["Hello!", "hello", 'hey', 'hi'])
    @pytest.mark.parametrize("user", [None, "jay"], ids=["w/ user", "w/o user"])
    def test_greeting_intro(self, user, text):
        """
        Test that an initial greeting requests an introduction
        """
        g = Greeting()
        reply, confidence = g.listen(text, user=user)
        assert confidence == 1.0
        assert reply is not None
        assert reply == "Hello, what is your name?"

In [10]:
    @pytest.mark.xfail(reason="a case that must be handled")
    @pytest.mark.parametrize("text", ["My name is Jake", "Hello, I'm Jake."])
    @pytest.mark.parametrize("user", [None, "jkm"], ids=["w/ user", "w/o user"])
    def test_initial_intro(self, user, text):
        """
        Test an initial introduction without greeting
        """
        g = Greeting()
        reply, confidence = g.listen(text, user=user)

        assert confidence == 1.0
        assert reply is not None
        assert reply == "Hello, Jake!"

        if user is None:
            user = 'jake'

        assert user in g.participants
        assert g.participants[user] == 'Jake'

In [11]:
import spacy
from spacy import displacy

# Required first: python -m spacy download en

spacy_nlp = spacy.load("en")

def plot_displacy_tree(sent):
    doc = spacy_nlp(sent)
    displacy.serve(doc, style='dep')

In [12]:
from nltk.parse.stanford import StanfordParser

stanford_parser = StanfordParser(
    model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
)
def print_stanford_tree(sent):
    """
    Use Stanford pretrained model to extract dependency tree
    for use by other methods
    Returns a list of trees
    """
    parse = stanford_parser.raw_parse(sent)
    return list(parse)

Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.
  stanford_parser = StanfordParser(


LookupError: 

===========================================================================
  NLTK was unable to find stanford-parser\.jar! Set the CLASSPATH
  environment variable.

  For more information, on stanford-parser\.jar, see:
    <https://nlp.stanford.edu/software/lex-parser.shtml>
===========================================================================

In [13]:
def plot_stanford_tree(sent):
    """
    Visually inspect the Stanford dependency tree as an image
    """
    parse = stanford_parser.raw_parse(sent)
    tree = list(parse)
    tree[0].draw()

In [14]:
tree = print_stanford_tree("How many teaspoons are in a tablespoon?")
root = tree[0] # The root is the first item in the parsed sents tree
print(root)
print(root.pos())

NameError: name 'print_stanford_tree' is not defined

In [15]:
import os
import json
import inflect
import humanize

from nltk.stem.snowball import SnowballStemmer
from nltk.parse.stanford import StanfordParser
from nltk.tree import Tree
from nltk.util import breadth_first


class Converter(Dialog):
    """
    Answers questions about converting units
    """

    def __init__(self, conversion_path=CONVERSION_PATH):
        with open(conversion_path, 'r') as f:
            self.metrics = json.load(f)

        self.inflect = inflect.engine()
        self.stemmer = SnowballStemmer('english')
        self.parser = StanfordParser(model_path=STANFORD_PATH)

    def parse(self, text):
        parse = self.parser.raw_parse(text)
        return list(parse)

    def interpret(self, sents, **kwargs):
        measures = []
        confidence = 0
        results = dict()

        # The root is the first item in the parsed sents tree
        root = sents[0]

        # Make sure there are wh-adverb phrases
        if "WRB" in [tag for word, tag in root.pos()]:
            # If so, increment confidence & traverse parse tree
            confidence += .2
            # Set the maxdepth to limit recursion
            for clause in breadth_first(root, maxdepth=8):
                #find the simple declarative clauses (+S+)
                if isinstance(clause, Tree):
                    if clause.label() in ["S", "SQ", "WHNP"]:
                        for token,tag in clause.pos():
                            # Store nouns as target measures
                            if tag in ["NN", "NNS"]:
                                measures.append(token)
                            # Store numbers as target quantities
                            elif tag in ["CD"]:
                                results["quantity"] = token

            # Handle duplication for very nested trees
            measures = list(set([self.stemmer.stem(mnt) for mnt in measures]))

            # If both source and destination measures are provided...
            if len(measures) == 2:
                confidence += .4
                results["src"] = measures[0]
                results["dst"] = measures[1]

                # Check to see if they correspond to our lookup table
                if results["src"] in self.metrics.keys():
                    confidence += .2
                    if results["dst"] in self.metrics[results["src"]]):
                        confidence += .2

        return results, confidence, kwargs

    def convert(self, src, dst, quantity=1.0):
        """
        Converts from the source unit to the dest unit for the given quantity
        of the source unit.
        """
        # Stem source and dest to remove pluralization
        src, dst = tuple(map(self.stemmer.stem, (src,dst)))

        # Check that we can convert
        if dst not in self.metrics:
            raise KeyError("cannot convert to '{}' units".format(src))
        if src not in self.metrics[dst]:
            raise KeyError("cannot convert from {} to '{}'".format(src, dst))

        return self.metrics[dst][src] * float(quantity), src, dst

    def round(self, num):
        num = round(float(num), 4)
        if num.is_integer():
            return int(num)
        return num

    def pluralize(self, noun, num):
        return self.inflect.plural_noun(noun, num)

    def numericalize(self, amt):
        if amt > 100.0 and amt < 1e6:
            return humanize.intcomma(int(amt))
        if amt >= 1e6:
            return humanize.intword(int(amt))
        elif isinstance(amt, int) or amt.is_integer():
            return humanize.apnumber(int(amt))
        else:
            return humanize.fractional(amt)

    def respond(self, sents, confidence, **kwargs):
        """
        Response makes use of the humanize and inflect libraries to produce
        much more human understandable results.
        """
        if confidence < .5:
            return "I'm sorry, I don't know that one."

        try:
            quantity = sents.get('quantity', 1)
            amount, source, target = self.convert(**sents)

            # Perform numeric rounding
            amount = self.round(amount)
            quantity = self.round(quantity)

            # Pluralize
            source = self.pluralize(source, quantity)
            target = self.pluralize(target, amount)
            verb = self.inflect.plural_verb("is", amount)

            # Numericalize
            quantity = self.numericalize(quantity)
            amount = self.numericalize(amount)


            return "There {} {} {} in {} {}.".format(
                verb, amount, target, quantity, source
            )

        except KeyError as e:
            return "I'm sorry I {}".format(str(e))

SyntaxError: unmatched ')' (<ipython-input-15-e8973bd372c2>, line 66)

In [16]:
if __name__ == "__main__":
    dialog = Converter()
    print(dialog.listen("How many cups are in a gallon?"))
    print(dialog.listen("How many gallons are in 2 cups?"))
    print(dialog.listen("How many tablespoons are in a cup?"))
    print(dialog.listen("How many tablespoons are in 10 cups?"))
    print(dialog.listen("How many tablespoons are in a teaspoon?"))

NameError: name 'Converter' is not defined

In [17]:
import bs4

def titles(self, fileids=None, categories=None):
    """
    Parse HTML to identify titles from the head tag.
    """
    for doc in self.docs(fileids, categories):
        soup = bs4.BeautifulSoup(doc, 'lxml')
        try:
            yield soup.title.text
            soup.decompose()
        except AttributeError as e:
            continue

In [19]:
import pickle

import joblib
from sklearn.pipeline import Pipeline
from sklearn.neighbors import BallTree
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer


class BallTreeRecommender(BaseEstimator, TransformerMixin):
    """
    Given input terms, provide k recipe recommendations
    """
    def __init__(self, k=3, **kwargs):
        self.k = k
        self.trans_path = "svd.pkl"
        self.tree_path = "tree.pkl"
        self.transformer = False
        self.tree = None
        self.load()

    def load(self):
        """
        Load a pickled transformer and tree from disk,
        if they exist.
        """
        if os.path.exists(self.trans_path):
            self.transformer = joblib.load(open(self.trans_path, 'rb'))
            self.tree = joblib.load(open(self.tree_path, 'rb'))
        else:
            self.transformer = False
            self.tree = None

    def save(self):
        """
        It takes a long time to fit, so just do it once!
        """
        joblib.dump(self.transformer, open(self.trans_path, 'wb'))
        joblib.dump(self.tree, open(self.tree_path, 'wb'))

    def fit_transform(self, documents):
        if self.transformer == False:
            self.transformer = Pipeline([
                ('norm', TextNormalizer(minimum=50, maximum=200)),
                ('transform', Pipeline([
                    ('tfidf', TfidfVectorizer()),
                    ('svd', TruncatedSVD(n_components=200))
                ])
                 )
            ])
            self.lexicon = self.transformer.fit_transform(documents)
            self.tree = BallTree(self.lexicon)
            self.save()

    def query(self, terms):
        """
        Given input list of ingredient terms, return k closest matching recipes.
        """
        vect_doc = self.transformer.named_steps['transform'].fit_transform(terms)
        dists, inds = self.tree.query(vect_doc, k=self.k)
        return inds[0]

NameError: name 'BaseEstimator' is not defined

In [20]:
class RecipeRecommender(Dialog):
    """
    Recipe recommender dialog
    """

    def __init__(self, recipes, recommender=BallTreeRecommender(k=3)):
        self.recipes = list(corpus.titles())
        self.recommender = recommender
        # Fit the recommender model with the corpus
        self.recommender.fit_transform(list(corpus.docs()))

    def parse(self, text):
        """
        Extract ingredients from the text
        """
        return pos_tag(wordpunct_tokenize(text))

    def interpret(self, sents, **kwargs):
        # If feedback detected, update the model
        if 'feedback' in kwargs:
            self.recommender.update(kwargs['feedback'])

        n_nouns = sum(1 for pos, tag in sents if pos.startswith("N"))
        confidence = n_nouns/len(sents)

        terms = [tag for pos, tag in sents if pos.startswith("N")]
        return terms, confidence, kwargs

    def respond(self, terms, confidence, **kwargs):
        """
        Returns a recommendation if the confidence is > 0.15 otherwise None.
        """
        if confidence < 0.15:
            return None

        output = [
            "Here are some recipes related to {}".format(", ".join(terms))
        ]
        output += [
            "- {}".format(self.recipes[idx])
            for idx in self.recommender.query(terms)
        ]

        return "\n".join(output)

NameError: name 'BallTreeRecommender' is not defined

In [21]:
if __name__ == '__main__':
    corpus = HTMLPickledCorpusReader('../food_corpus_proc')
    recommender = RecipeRecommender(corpus)
    question = "What can I make with brie, tomatoes, capers, and pancetta?"
    print(recommender.listen(question))

NameError: name 'HTMLPickledCorpusReader' is not defined