spaCy: 

## Overview
- Method
    1. Tokenize
    2. Lemmatize
        - Remove stopwords (words that occur frequently but don't contain important information)
    3. Pattern Matching
        - Can be used to reference against a set of terms
    4. Text Classification
        - __Bag of Words__: vector of per-term counts, one entry per vocabulary term (word order is discarded)
        - __TF-IDF (Term Frequency - Inverse Document Frequency)__: term count scaled down by how many documents in the corpus contain the term, so common terms get less weight

References:

In [1]:
import spacy

# Load the `en_core_web_sm` pipeline and run it over one sample sentence.
# Both `nlp` and `doc` are used again further down in this notebook.
nlp = spacy.load('en_core_web_sm')
sample_sentence = "Tea is healthy and calming, don't you think?"
doc = nlp(sample_sentence)

ModuleNotFoundError: No module named 'spacy'

In [None]:
# Print each token followed by its lemma and its stopword flag.
for token in doc:
    print(token)
    print(token.lemma_)
    print(token.is_stop)

# spaCy's Doc has no remove() method (and mutating a sequence while iterating
# over it is unsafe anyway), so stopwords are dropped by building a new list
# of the tokens worth keeping instead of editing `doc` in place.
content_tokens = [token for token in doc if not token.is_stop]

In [None]:
from spacy.matcher import PhraseMatcher

# attr='LOWER' makes the matcher compare lowercase token text, so the terms
# are found regardless of how they are capitalised in the input.
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']
patterns = [nlp(text) for text in terms]
matcher.add('TerminologyList', patterns)

# Adjacent string literals are only concatenated across lines when they are
# wrapped in parentheses.
original_text = (
    "Glowing review overall, and some really interesting side-by-side "
    "photography tests pitting the iPhone 11 Pro against the "
    "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3."
)
text_doc = nlp(original_text)
matches = matcher(text_doc)

# Each match is a (match_id, start, end) triple; start:end slices `text_doc`
# to recover the matched span, and match_id maps back to the pattern name.
for match_id, start, end in matches:
    print(nlp.vocab.strings[match_id], text_doc[start:end])

In [None]:
import pandas as pd

# Load the labelled messages; the code below reads the 'text' and 'label'
# columns of this CSV.
spam = pd.read_csv('../input/nlp-course/spam.csv')

# NOTE(review): textcat is added to whichever `nlp` pipeline is already in
# scope; confirm whether training should start from a blank pipeline instead.
textcat = nlp.add_pipe('textcat')
textcat.add_label('ham')
textcat.add_label('spam')

train_texts = spam['text'].values
# One annotation dict per row: {'cats': {...}} holding a boolean for each of
# the two labels registered on the text categorizer above.
train_labels = [
    {'cats': {'ham': label == 'ham', 'spam': label == 'spam'}}
    for label in spam['label']
]
# Pair every text with its annotation dict.
train_data = list(zip(train_texts, train_labels))

from spacy.util import minibatch
from spacy.training.example import Example

# Fix spaCy's random seed so the run is reproducible.
spacy.util.fix_random_seed(1)
# `begin_training` is the deprecated spaCy v3 alias of `initialize`.
optimizer = nlp.initialize()

# Create the batch generator with batch size = 8
batches = minibatch(train_data, size=8)
# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) pairs. Convert the whole batch to
    # Example objects and update once per batch; updating once per example
    # would make the effective batch size 1 regardless of `size=8`.
    examples = [
        Example.from_dict(nlp.make_doc(text), labels)
        for text, labels in batch
    ]
    nlp.update(examples, sgd=optimizer)

import random

# Seed both Python's and spaCy's RNGs so shuffling and training are
# reproducible.
random.seed(1)
spacy.util.fix_random_seed(1)
# `begin_training` is the deprecated spaCy v3 alias of `initialize`.
optimizer = nlp.initialize()

for epoch in range(10):
    # Reshuffle every epoch so the batches differ between passes.
    random.shuffle(train_data)
    # nlp.update adds to this dict, so start from an empty one each epoch;
    # otherwise the printed value is a running total across all epochs.
    losses = {}
    # Create the batch generator with batch size = 8
    for batch in minibatch(train_data, size=8):
        # One update per batch (not per example) so size=8 is actually used.
        examples = [
            Example.from_dict(nlp.make_doc(text), labels)
            for text, labels in batch
        ]
        nlp.update(examples, sgd=optimizer, losses=losses)
    print(losses)

# Two sample messages to classify.
texts = [
    "Are you ready for the tea party????? It's gonna be wild",
    "URGENT Reply to this message for GUARANTEED FREE TEA",
]
# Run only the tokenizer over each message.
docs = list(map(nlp.tokenizer, texts))

# Fetch the text categorizer from the pipeline and score every doc with it.
textcat = nlp.get_pipe('textcat')
scores = textcat.predict(docs)

# Take the index of the highest score in each row, then print the label name
# stored at that index.
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[index] for index in predicted_labels])
