In [4]:
! python3 -m spacy download en_core_web_md

/usr/bin/python3: No module named spacy


In [1]:
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# new entity label
LABEL = 'MOVIE'

# training data
# Note: If you're using an existing model, make sure to mix in examples of
# other entity types that spaCy correctly recognized before. Otherwise, your
# model might learn the new type, but "forget" what it previously knew.
# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
import pickle as pk
with open('/mnt/data/akshit.jain/kg/data/spacy_movie_ner_train.pk','rb') as f:
    TRAIN_DATA=pk.load(f)


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model='en_core_web_md', new_model_name='movie', output_dir='data/model', n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    # spacy.require_gpu()
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)

    # test the trained model
    test_text = 'Amir Khan gave a splendid performance in Dhoom 3'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)


plac.call(main)

usage: ipykernel_launcher.py [-h] [-m en_core_web_md] [-nm movie]
                             [-o data/model] [-n 10]
ipykernel_launcher.py: error: unrecognized arguments: -f /run/user/1004/jupyter/kernel-1c644a80-2330-4a94-9258-9adf1af17dbb.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [3]:
nlp = spacy.load('en_core_web_md')

In [6]:
for doc, meta in TRAIN_DATA[:20]:
    doc_parsed=nlp(doc)
    for ent in doc_parsed.ents:
        print(ent.label_, ent.text)

GPE Kurukshetra
PERSON Kirthi Chakra
GPE Mumbai
PERSON Shabana Azmi
ORG Global Leadership Ambassador
ORG Women in Public Service Project (
PERSON Hillary Clinton
WORK_OF_ART Tanu Weds Manu
GPE England
PERSON Kangana Ranaut
PERSON Jimmy Shergill
PERSON Kalanksui
PERSON Dhawan
PERSON Bahl
PERSON Hrithik Roshan
PERSON Salman Khan
ORG Race 3
ORG Salman
PERSON Dus Ka Dum
DATE nine years
PERSON Taimur Ali Khan
PERSON Innaya Kemmu
PERSON Yash
PERSON Roohi Johar
ORG Riteish Deshmukh
PERSON Genelia D'Souza's
ORG Rahyl
PERSON Naam Shabana
ORG Pink
PERSON Madhavan
CARDINAL two
PERSON Achanak
DATE 1973
PERSON Gulzar
PERSON Achanak
CARDINAL one
ORDINAL first
PERSON Vinod Khanna
CARDINAL Zero
DATE December
PERSON Anushka Sharma
PERSON Katrina Kaif
PERSON Abhishek Bachchan
PERSON Anurag Kashyap
PERSON Manmarziyaan
DATE Thursday
PERSON Manmarziyaan
DATE 10 days
ORG Firangi
GPE Punjab
GPE Rajasthan
PERSON Arjun Kapoor
DATE November 8, 2017  
PERSON Dibakar Banerjee
PERSON Sandeep Aur Pinky Faraar
PERSO

In [24]:
from spacy import displacy
# start_iter=0
for doc, meta in TRAIN_DATA[start_iter:start_iter+10]:
    start_iter+=10
    doc_parsed=nlp(doc)
    displacy.render(doc_parsed, style='ent', jupyter=True)
    for (start, end, label) in meta['entities']:
        print('expected: {}'.format(doc[start:end]))

expected: Kabali


expected: Sangamithra


expected: Firangi


expected: hichki


expected: Shabana


expected: Secret Superstar


expected: Kabali


expected: The Accidental Prime Minister


expected: detective


expected: Badhaai Ho


In [14]:
meta

{'entities': [(45, 56, 'MOVIE')]}

In [2]:
main()

Loaded model 'en_core_web_md'
Losses {'ner': 1341.7827462905614}
Losses {'ner': 1262.805256732811}
Losses {'ner': 1246.9496507780516}
Losses {'ner': 1219.2357510847858}
Losses {'ner': 1224.9805641085704}
Losses {'ner': 1219.3608174686274}
Losses {'ner': 1220.8676102825011}
Losses {'ner': 1200.7434523807528}
Losses {'ner': 1209.9190613545616}
Losses {'ner': 1184.8855585969877}
Entities in 'Amir Khan gave a splendid performance in Dhoom 3'
MOVIE Dhoom


FileNotFoundError: [Errno 2] No such file or directory: 'data/model'

In [13]:
import re
pattern = re.compile(r'\s+')
sentence=TRAIN_DATA[0][0]
sentence = re.sub(pattern, ' ', sentence)
print('{}\n{}'.format(sentence,TRAIN_DATA[0][0]))

Larger-than-life action sequences Salman is a perfect combination of an entertainer and an action hero.
Larger-than-life action sequences   Salman is a perfect combination of an entertainer and an action hero.
