# NLTK and Spacy tokenizers for recipes

In [1]:
import spacy
from nltk.stem.snowball import SnowballStemmer
import nltk
from nltk.tokenize import sent_tokenize

In [2]:
class Tokenizer(object):
    """Turn raw text into per-token records.

    Sentences are split with NLTK's ``sent_tokenize``, each sentence is
    processed by a spaCy pipeline, and every token is additionally stemmed
    with an NLTK Snowball stemmer.
    """

    def __init__(self, language):
        """Load the spaCy pipeline and the Snowball stemmer for a language.

        Args:
            language: a pair ``(iso, extended)``; ``iso`` is passed to
                ``spacy.load`` and ``extended`` to ``SnowballStemmer``
                (e.g. ``('en', 'english')``).

        Raises:
            ValueError: if ``language`` cannot be unpacked into exactly
                two values.
        """
        try:
            self.iso, self.extended = language
        except (TypeError, ValueError) as err:
            # Fail here with a clear message instead of printing and then
            # crashing below on the attributes that were never set.
            raise ValueError(
                'Error: language should be provided in the form (iso-2, extended)'
            ) from err
        self.stemmer = SnowballStemmer(self.extended)
        self.nlp = spacy.load(self.iso)
        # Field names of a token record, in the order the values are
        # assembled in tokenize().
        self.keys = ['document', 'sentence', 'position', 'text', 'lower', 'lemma',
                     'pos', 'tag', 'dep',
                     'shape', 'is_alpha', 'is_stop', 'stem']

    def tokenize(self, text_id, text, text_label='full', drop_apostrophe=False):
        """Tokenize ``text`` and return one dict per token.

        Args:
            text_id: identifier stored under the ``'document'`` key of
                every record.
            text: the string to tokenize.
            text_label: value stored under the ``'label'`` key of every
                record.
            drop_apostrophe: when true, every ``'`` in ``text`` is replaced
                by a space before tokenization.

        Returns:
            A list of dicts with the keys listed in ``self.keys`` plus
            ``'label'``. ``'sentence'`` is the index of the sentence within
            ``text`` and ``'position'`` the index of the token within its
            sentence.
        """
        if drop_apostrophe:
            text = text.replace("'", " ")
        tokens = []
        for j, sentence in enumerate(sent_tokenize(text)):
            doc = self.nlp(sentence.strip())
            for i, token in enumerate(doc):
                lower = token.text.lower()
                # Split the tag on '|' and each piece on '='. If every piece
                # is a key=value pair the tag is stored as a dict; otherwise
                # dict() raises ValueError and the first element of the
                # first piece is stored instead.
                tag_data = [tuple(x.split('=')) for x in
                            token.tag_.split('|')]
                try:
                    tag = dict(tag_data)
                except ValueError:
                    tag = tag_data[0][0]
                data = [text_id, j, i, token.text, lower, token.lemma_,
                        token.pos_, tag, token.dep_,
                        token.shape_, token.is_alpha, token.is_stop,
                        self.stemmer.stem(lower)]
                record = dict(zip(self.keys, data))
                record['label'] = text_label
                tokens.append(record)
        return tokens

## Tokenization
Tokenize the recipe texts and store the resulting tokens in the database.

In [3]:
import pymongo
from IPython.display import clear_output

In [4]:
# Open a MongoDB client with no explicit connection arguments and select
# the 'inforet' database.
db = pymongo.MongoClient()['inforet']
# Collection the recipe documents are read from.
source = db['epicurious']
# Collection the token records are written to.
target = db['epicurious_tokens']

In [5]:
# Shared tokenizer instance: 'en' is handed to spacy.load and 'english' to
# the Snowball stemmer.
tokenizer = Tokenizer(('en', 'english'))

In [6]:
def tokenize(record):
    """Tokenize the text fields of one recipe record.

    The fields 'title', 'directions', 'ingredients' and 'desc' are visited
    in that order. A field holding a list has each element tokenized
    separately; any other non-None value is tokenized as a single text.
    Missing or None fields are skipped.

    Returns the concatenated list of token records, each labelled with the
    name of the field it came from and carrying record['_id'] as document.
    """
    collected = []
    for label in ('title', 'directions', 'ingredients', 'desc'):
        value = record.get(label)
        if value is None:
            # Field is absent or explicitly null: nothing to tokenize.
            continue
        texts = value if isinstance(value, list) else [value]
        for text in texts:
            collected.extend(
                tokenizer.tokenize(record['_id'], text, text_label=label)
            )
    return collected

In [7]:
# Tokenize every recipe in batches and store the token records in `target`.
N = source.count_documents({})
BATCH_SIZE = 1000
# Derive the batch offsets from N rather than from a fixed number of
# batches, so no recipe is skipped when the collection holds more than
# 22,000 documents.
for offset in range(0, N, BATCH_SIZE):
    entries = []
    # Sort on _id so that successive skip/limit pages are slices of one
    # consistent ordering.
    batch = (source.find()
             .sort('_id', pymongo.ASCENDING)
             .skip(offset)
             .limit(BATCH_SIZE))
    for i, recipe in enumerate(batch):
        entries += tokenize(recipe)
        # Progress counter; clear_output(wait=True) makes the next print
        # replace this one in the notebook output.
        print(i+offset+1, 'of', N)
        clear_output(wait=True)
    # Skip the write when the batch produced no token records.
    if entries:
        target.insert_many(entries)

20130 of 20130
