# Using `spaCy` for text classification

Before running this code, install `spaCy` on your machine and download its English language model (for example, `pip install spacy` followed by `python -m spacy download en`).

In [1]:
import spacy
import pandas as pd
from spacy.tokens import Doc
from spacy.vocab import Vocab

In [2]:
# Load spaCy's pre-built English pipeline into `nlp`.
# NOTE(review): 'en' is a shortcut name; newer spaCy releases may require the
# full package name (e.g. 'en_core_web_sm') -- confirm against the installed version.
model_name = 'en'
nlp = spacy.load(model_name)

### Example from the `spaCy` issue tracker

Source: [spaCy issue #1997](https://github.com/explosion/spaCy/issues/1997)

In [3]:
# Training data: each entry pairs a short text with its category annotations.
# The flag next to each text is 1 for a positive example and 0 for a negative one.
labelled_texts = [
    (u"That was very bad", 0),
    (u"it is so bad", 0),
    (u"so terrible", 0),
    (u"I like it", 1),
    (u"It is very good.", 1),
    (u"That was great!", 1),
]

# Expand every (text, flag) pair into the (text, {"cats": {...}}) tuples used for
# training; a fresh annotation dict is built for each example.
train_data = [
    (text, {"cats": {"POSITIVE": is_positive, "NEGATIVE": 1 - is_positive}})
    for text, is_positive in labelled_texts
]

In [4]:
# Make sure the pipeline ends with a "textcat" (text categorizer) component.
# The membership check keeps this cell safe to re-run: spaCy rejects adding a
# second component under a name that is already registered, so the component is
# only created and appended when it is missing; otherwise the existing one is reused.
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
else:
    textcat = nlp.get_pipe('textcat')

In [5]:
# Register both sentiment categories on the text classifier.
sentiment_labels = ('POSITIVE', 'NEGATIVE')
label_results = [textcat.add_label(label) for label in sentiment_labels]

# Show the value returned by the final add_label call as the cell output.
label_results[-1]

1

In [6]:
# Train the text classifier. Note that "gold" refers to the "ground truth"
# annotations, i.e. the {"cats": ...} dict paired with each training text.
#
# Every component other than "textcat" is disabled for the duration of training
# so that begin_training() and update() only affect the text classifier and
# leave the pre-trained components of the loaded model alone.
other_pipes = [name for name in nlp.pipe_names if name != 'textcat']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    # Ten passes over the training set, updating on one example at a time.
    for itn in range(10):
        for text, gold in train_data:
            nlp.update([text], [gold], sgd=optimizer)



In [7]:
# Run the pipeline on a sentence that is not in the training data and print
# the category scores stored on the resulting document.
unseen_text = u'It is good.'
doc = nlp(unseen_text)
print(doc.cats)

{'POSITIVE': 0.9991204142570496, 'NEGATIVE': 0.9986401200294495}


In [8]:
# Repeat the check with a second sentence that is not in the training data
# and print the category scores stored on the resulting document.
unseen_text = u'It is bad.'
doc = nlp(unseen_text)
print(doc.cats)

{'POSITIVE': 0.9823446273803711, 'NEGATIVE': 0.9844799637794495}


### Example using a pandas DataFrame

In [9]:
# Load the CSV of text abstracts into a DataFrame and preview its first three rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the file
# was saved together with its index; consider index_col=0 -- confirm nothing
# downstream relies on that column first.
abstracts_path = 'resources/fedreg.csv'
df = pd.read_csv(abstracts_path)
df.head(3)

Unnamed: 0,document_number,abstract
0,testing12345,The quick brown fox jumps over the lazy dog.
1,2018-10583,We are superseding Airworthiness Directive (AD...
2,2018-10902,The Commodity Futures Trading Commission (Comm...
