In [50]:
from pathlib import Path

import spacy
from spacy.tokens import DocBin
from tqdm.auto import tqdm

# We want to classify movie reviews as positive or negative
# http://ai.stanford.edu/~amaas/data/sentiment/
from ml_datasets import imdb

# Load the IMDB reviews. The call's result is unpacked into two splits:
# a training split and a validation split. Each split is expected to be a
# list of (text, label) tuples, which is the shape make_docs consumes.
train_data, valid_data = imdb()

# Load spaCy's medium-sized English pipeline; make_docs runs texts through
# it via nlp.pipe.
nlp = spacy.load("en_core_web_md")

In [52]:
def make_docs(data):
    """
    Convert labelled texts into spaCy documents carrying a sentiment category.

    data: sequence of (text, label) tuples; it must support len() because
          the length is used as the progress-bar total.

    returns: list of spaCy Doc objects, in input order, where
             doc.cats["sentiment"] is 1 when the label equals "pos"
             and 0 for any other label.
    """
    # Batch processing through nlp.pipe is much faster than calling
    # nlp(text) once per text. With as_tuples=True the first element of
    # each pair is processed as text and the second is handed back untouched
    # next to the resulting document.
    stream = nlp.pipe(data, as_tuples=True)

    annotated = []
    for parsed, sentiment in tqdm(stream, total=len(data)):
        # Store the (text)cat(egory) on the document itself.
        parsed.cats["sentiment"] = 1 if sentiment == "pos" else 0
        annotated.append(parsed)

    return annotated

In [53]:
# Only the first `num_texts` reviews of each split are used, which keeps
# preprocessing and training time short. In practice, use as much data as
# you can get; lower the number to make the notebook run even faster.
num_texts = 5000

# DocBin.to_disk does not create missing parent directories, so make sure
# the output directory exists before writing (needed on a fresh checkout).
data_dir = Path("./data")
data_dir.mkdir(parents=True, exist_ok=True)

# Convert the training reviews into spaCy documents ...
train_docs = make_docs(train_data[:num_texts])

# ... and serialize them to disk in spaCy's binary DocBin format.
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk(data_dir / "train.spacy")

# Repeat for the validation data.
valid_docs = make_docs(valid_data[:num_texts])
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk(data_dir / "valid.spacy")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

Create a `base_config.cfg` with the spaCy quickstart widget. See: https://spacy.io/usage/training#quickstart

Run the following command to fill in the remaining defaults and create the full configuration:
```python -m spacy init fill-config ./base_config.cfg ./config.cfg``` See: https://spacy.io/api/cli#init

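Run the following command to start training. The config's `paths.train` and `paths.dev` must point to `./data/train.spacy` and `./data/valid.spacy` (set them in the config, or pass them as `--paths.train` / `--paths.dev` overrides).
```python -m spacy train config.cfg --output ./output``` See: https://spacy.io/api/cli#train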

# Running the trained model

In [1]:
import spacy
# Load the pipeline stored under output/model-best (the training command
# above writes to ./output). NOTE: this rebinds `nlp`, which earlier in the
# notebook referred to the pretrained en_core_web_md pipeline.
nlp = spacy.load("output/model-best")

In [11]:
# A short review to try the loaded pipeline on.
text = "Acting was great, story was ok."
doc = nlp(text)
# Display the score stored under the "sentiment" category. The training docs
# were labelled 1 for "pos" and 0 otherwise, so (assuming this model was
# trained on them) higher values presumably mean more positive.
doc.cats['sentiment']

0.5937117338180542