In [43]:
import spacy
from __future__ import print_function, unicode_literals
import re

In [3]:
# Load the spaCy English pipeline once; later cells call `nlp(...)` on raw strings.
nlp = spacy.load('en')

In [4]:
# Sample ingredient lines used to exercise the parsing cells below.
# Spacing and capitalisation are intentionally irregular (double spaces,
# parenthesised asides, brand names) to mimic raw recipe text.
texts = [
      "1 cup (2 sticks) butter, softened",
      "3/4 cup granulated sugar",
      "3/4 cup firmly packed brown sugar",
      "2 eggs",
      "1 tsp vanilla",
      "2 1/4 cups flour",
      "1 tsp baking soda",
      "1/4  tsp salt",
      "1  pkg (12 oz) BAKER'S Real Chocolate Chips",
      "1 cup chopped nuts (optional)",
      "1 lb Fettuccini",
      "1 lb Shrimp, peeled and cleaned",
      "4  Zucchini",
      "1 lb Sliced mushrooms",
      "8  Whole, fresh tomatoes",
      "2 cloves Minced garlic",
      "2 tsp Oregano",
      "1 tsp Salt",
      "1 tsp Pepper",
      "1 tsp Virgin Olive Oil",
      "1-1/2 lbs ground beef",
      "1/2  cup A.1. Thick Hearty Steak Sauce",
      "2 Tbs chopped oil-packed sun dried tomatoes",
      "3 Tbs chopped fresh basil",
      "1 Tbs toasted pine nuts",
      "6  Sourdough rolls, split",
      "3 oz goat cheese"
    ]

In [70]:
# Lower-case unit-of-measure tokens. A token whose text appears here is
# treated as a measure rather than as part of the ingredient name.
# The original entries keep their order; the trailing entries add the plural
# forms that were missing ('lbs' occurs in the sample data, e.g.
# "1-1/2 lbs ground beef", and would otherwise land in the ingredient name).
MEASURE_WORDS = [
    'tsp',
    'tbs',
    'tsps',
    'pkg',
    'cup',
    'pint',
    'quart',
    'package',
    'packet',
    'gallon',
    'ounce',
    'oz',
    'lb',
    'cups',
    'pints',
    'quarts',
    'gallons',
    'liter',
    'liters',
    'lbs',
    'ounces',
    'packages',
    'packets'
]

In [75]:

def clean_text(text):
    '''
    Normalise one raw ingredient line before it is parsed.

    :text - raw ingredient string
    :return: - the text with every parenthesised aside removed, lower-cased,
               and with whitespace runs collapsed to single spaces
    '''
    # Match each "(...)" group on its own. The earlier greedy pattern
    # (open paren, ".+", close paren) ran from the first "(" to the LAST ")",
    # so text sitting between two asides was deleted as well.
    text = re.sub(r'\([^)]*\)', '', text)
    # split() with no argument drops empty tokens and treats spaces, tabs and
    # newlines alike, so no separate clean-up passes are needed.
    return u' '.join(text.lower().split())

# One quantity token: an integer optionally followed by a fraction part
# ("3/4"), a mixed-number part ("2 1/4", "1-1/2") or a decimal part ("1.5").
# The lookbehind rejects digits embedded in a word or abbreviation such as
# the "1" in "a.1.", which the earlier catch-all pattern swallowed together
# with every character in front of it.
_QUANTITY_RE = re.compile(r'(?<![\w.])\d+(?:[\s-]+\d+/\d+|/\d+|\.\d+)?')


def find_and_store_digit(text):
    '''
    :text - text to take digit
    :return: - the first quantity found in the text (e.g. "2", "3/4",
               "2 1/4", "1-1/2"), or None when the text holds no quantity
    '''
    match = _QUANTITY_RE.search(text)
    return match.group(0) if match else None


def remove_digits(text):
    '''
    :text - text to strip quantities from
    :return: - the text with every quantity token removed; surrounding
               whitespace and all non-quantity words are left untouched
    '''
    return _QUANTITY_RE.sub('', text)


# For every sample line: normalise it, pull out the quantity, split the
# remaining tokens into descriptors / measures / core words by part of speech,
# then re-parse the core words alone and print their dependency information.
for t in texts:
    # Store the cleaned string under its own name. The earlier version called
    # an undefined `cleanText` and then rebound `clean_text` to a string,
    # which shadowed the function and broke every later call and re-run.
    cleaned = clean_text(t)
    digit = find_and_store_digit(cleaned)
    cleaned = remove_digits(cleaned)

    doc = nlp(cleaned)
    desc = []
    core = []
    measure = []
    for word in doc:
        if word.pos_ in ('ADJ', 'VERB'):
            # adjectives and verbs describe the ingredient
            desc.append(word.text)
        elif word.pos_ in ('ADV', 'SPACE', 'PUNCT', 'PART', 'CCONJ'):
            # tokens of these classes are dropped
            continue
        elif word.text in MEASURE_WORDS:
            measure.append(word.text)
        else:
            core.append(word.text)
    basic_text = u' '.join(core)
    # Second parse, over the core words only.
    new_doc = nlp(basic_text)
    for word in new_doc:
        print(word.text, word.pos_, word.dep_, word.head)
    print("Original: %s" % t)
    print("-----Done-----")
    

butter NOUN ROOT butter
Original: 1 cup (2 sticks) butter, softened
-----Done-----
sugar NOUN ROOT sugar
Original: 3/4 cup granulated sugar
-----Done-----
sugar NOUN ROOT sugar
Original: 3/4 cup firmly packed brown sugar
-----Done-----
eggs NOUN ROOT eggs
Original: 2 eggs
-----Done-----
vanilla NOUN ROOT vanilla
Original: 1 tsp vanilla
-----Done-----
Original: 2 1/4 cups flour
-----Done-----
baking VERB compound soda
soda NOUN ROOT soda
Original: 1 tsp baking soda
-----Done-----
salt NOUN ROOT salt
Original: 1/4  tsp salt
-----Done-----
baker NOUN compound chips
chocolate NOUN compound chips
chips NOUN ROOT chips
Original: 1  pkg (12 oz) BAKER'S Real Chocolate Chips
-----Done-----
Original: 1 cup chopped nuts (optional)
-----Done-----
fettuccini ADJ ROOT fettuccini
Original: 1 lb Fettuccini
-----Done-----
shrimp NOUN ROOT shrimp
Original: 1 lb Shrimp, peeled and cleaned
-----Done-----
zucchini NOUN ROOT zucchini
Original: 4  Zucchini
-----Done-----
mushrooms NOUN ROOT mushrooms
Origina

https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
https://explosion.ai/blog/chatbot-node-js-spacy
https://github.com/explosion/spaCy/blob/master/examples/training/train_new_entity_type.py

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
import string
import re

In [12]:
# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):
    """Tokenise `sample` with spaCy and return the list of its lemmas.

    Each token contributes its lemma, lower-cased and stripped of surrounding
    whitespace; a token whose lemma is the "-PRON-" placeholder contributes
    its lower-cased surface form instead. Entries that are empty or consist
    only of a space or one/two newlines are left out. No stop-word or symbol
    filtering is applied.
    """
    lemmas = [
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        for tok in nlp(sample)
    ]

    # whitespace-only leftovers that must not become features
    blanks = ("", " ", "\n", "\n\n")
    return [lemma for lemma in lemmas if lemma not in blanks]

def printNMostInformative(vectorizer, clf, N):
    """Print the N lowest- and N highest-weighted features.

    Feature names come from `vectorizer.get_feature_names()` and are paired
    with the first row of `clf.coef_`. The pairs are sorted ascending by
    coefficient; the first N are printed under "Class 1 best", and the last N
    (largest first) under "Class 2 best".
    """
    names = vectorizer.get_feature_names()
    ranked = sorted(zip(clf.coef_[0], names))
    sections = (
        ("Class 1 best: ", ranked[:N]),
        ("Class 2 best: ", ranked[:-(N + 1):-1]),
    )
    for heading, pairs in sections:
        print(heading)
        for pair in pairs:
            print(pair)
    

In [22]:
# Create a classifier: a support vector classifier
classifier = SVC(gamma=0.001)
vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))
# the pipeline to clean, tokenize, vectorize, and classify
pipe = Pipeline([('vectorizer', vectorizer), ('clf', classifier)])


train = ["1 oz chicken", "1 oz beef", "1 cup of mixed vegetables", "8 apples", "Two slices of chicken", "1/4 lb beef", "2 carrots"]

labelsTrain = ["meat/poultry", "meat/poultry", "vegetables", "fruits", "meat/poultry", "meat/poultry", "vegetables"]

pipe.fit(train, labelsTrain)

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
    ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [23]:
# Run the fitted pipeline on a single held-back example and report accuracy.
test = ["1 cup of mixed vegetables"]
labelsTest = ["vegetables"]
preds = pipe.predict(test)

print("----------------------------------------------------------------------------------------------")
print("results:")
for sample, pred in zip(test, preds):
    print(sample, ":", pred)
print("accuracy:", accuracy_score(labelsTest, preds))

----------------------------------------------------------------------------------------------
results:
1 cup of mixed vegetables : meat/poultry
accuracy: 0.0
