# Using `spaCy` for text classification

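Before running this notebook, install `spaCy` and download the English model that is loaded below, `en_core_web_md` (for example with `python -m spacy download en_core_web_md`). The code uses the spaCy v2-style training API (`nlp.create_pipe`, `nlp.begin_training`), so a 2.x release of spaCy is presumably required.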

In [1]:
import spacy
import pandas as pd
from spacy.tokens import Doc
from spacy.vocab import Vocab

In [3]:
# Load the pre-trained medium English model. It must already be downloaded
# into the current environment, otherwise `spacy.load` fails.
# Alternative: `spacy.load('en')`, if the `en` shortcut is set up on your machine.
model_name = 'en_core_web_md'
nlp = spacy.load(model_name)

### Example from a `spaCy` GitHub issue

source: https://github.com/explosion/spaCy/issues/1997

In [4]:
# Toy sentiment corpus. Each entry pairs a sentence with its ground-truth
# annotation: a {"cats": {...}} dict giving a score of 0 or 1 for every label.
negative_texts = ["That was very bad", "it is so bad", "so terrible"]
positive_texts = ["I like it", "It is very good.", "That was great!"]

train_data = [
    (text, {"cats": {"POSITIVE": 0, "NEGATIVE": 1}}) for text in negative_texts
] + [
    (text, {"cats": {"POSITIVE": 1, "NEGATIVE": 0}}) for text in positive_texts
]

In [5]:
# Create a text-classification ("textcat") component and append it to the end
# of the processing pipeline.
# The guard keeps this cell idempotent: `nlp.add_pipe` raises if a component
# named 'textcat' is already in the pipeline, which is exactly what happens when
# the cell is re-run without restarting the kernel. In that case the existing
# component is reused instead.
if 'textcat' in nlp.pipe_names:
    mytextcat1 = nlp.get_pipe('textcat')
else:
    mytextcat1 = nlp.create_pipe('textcat')
    nlp.add_pipe(mytextcat1, last=True)

In [6]:
# Register the two sentiment labels on the text classifier. Once the model is
# trained, each label shows up as a key of a processed document's `.cats`
# dictionary, mapped to its predicted score (see the classification cells below).
# The `1` displayed under this cell is presumably the return value of the final
# `add_label` call, echoed because it is the cell's last expression.
mytextcat1.add_label('POSITIVE')
mytextcat1.add_label('NEGATIVE')

1

In [7]:
# Train the text classifier. Each training example is a (text, annotations)
# pair, where the annotations hold the "gold" (ground-truth) category scores.
#
# Only 'textcat' is trained: every other component of the loaded model is
# disabled for the duration of the block. `nlp.begin_training()` initialises the
# models of the components that are enabled, so calling it with the pre-trained
# pipes still active would reset them; spaCy's own textcat training example
# disables the other pipes for this reason. They are restored when the `with`
# block exits.
n_iter = 10  # number of passes over the (tiny) training set
other_pipes = [name for name in nlp.pipe_names if name != 'textcat']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        # One example per update step, in the order given by `train_data`.
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer)

In [8]:
# Score a sentence that was not part of the training data. The per-label
# predictions are read from the processed document's `.cats` attribute.
unseen_text = "It is good."
doc = nlp(unseen_text)
print(doc.cats)

{'POSITIVE': 0.9901213645935059, 'NEGATIVE': 0.018102729693055153}


In [9]:
# Score a second unseen sentence, this time a negative one.
# Note on reading the result: in the recorded run both labels score above 0.5
# here, so the two scores evidently do not sum to 1. Compare them with each
# other (NEGATIVE is the larger) rather than treating either as the probability
# of a single class.
unseen_text = "It is bad."
doc = nlp(unseen_text)
print(doc.cats)

{'POSITIVE': 0.5460399985313416, 'NEGATIVE': 0.8994844555854797}
