In [3]:
import json
import spacy

In [38]:
# load the training data
with open('pvr_training_data.json') as fp:
  training_data = json.load(fp)

In [39]:
training_data

{'classes': ['NAME',
  'COMPANY',
  'ROLE',
  'PERCENTAGE',
  'WEEKDAY',
  'MONEY',
  'TIME PERIOD',
  'STATES'],
 'annotations': [['The share price of PVR rose over 7 percent on Wednesday after the multiplex chain said that it has reduced losses in Q2 despite nil revenue from the core movie exhibition business.',
   {'entities': [[19, 22, 'COMPANY'],
     [33, 42, 'PERCENTAGE'],
     [46, 55, 'WEEKDAY']]}],
  ['The company managed to get rent waivers from most landlords, CFO Nitin Sood said in an interview to CNBC-TV18. “The big focus for us right now as revenues have been nil is to really reduce our fixed cost and we have managed to do that, ” he added.',
   {'entities': [[61, 64, 'ROLE'], [65, 75, 'NAME']]}],
  ['Sood further said that they have brought down the fixed cost down by almost 75-80 percent.',
   {'entities': [[0, 4, 'NAME'], [76, 89, 'PERCENTAGE']]}],
  ["The stock rose as much as 7.6 percent to the day's high of Rs 1,186.85 per share on the BSE.",
   {'entities': [[26, 

In [40]:
# prepare an empty model to train
nlp = spacy.blank('en')
nlp.vocab.vectors.name = 'demo'
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner, last = True)

In [51]:
# Add the custome NER Tags as entities into the model
for label in training_data["classes"]:
  nlp.entity.add_label(label)

In [52]:
# Train the model
optimizer = nlp.begin_training()


In [26]:
training_data["annotations"]

[['The share price of PVR rose over 7 percent on Wednesday after the multiplex chain said that it has reduced losses in Q2 despite nil revenue from the core movie exhibition business.',
  {'entities': [[19, 22, 'COMPANY'],
    [33, 42, 'PERCENTAGE'],
    [46, 55, 'WEEKDAY']]}],
 ['The company managed to get rent waivers from most landlords, CFO Nitin Sood said in an interview to CNBC-TV18. “The big focus for us right now as revenues have been nil is to really reduce our fixed cost and we have managed to do that, ” he added.',
  {'entities': [[61, 64, 'ROLE'], [65, 75, 'NAME']]}],
 ['Sood further said that they have brought down the fixed cost down by almost 75-80 percent.',
  {'entities': [[0, 4, 'NAME'], [76, 89, 'PERCENTAGE']]}],
 ["The stock rose as much as 7.6 percent to the day's high of Rs 1,186.85 per share on the BSE.",
  {'entities': [[26, 37, 'PERCENTAGE'],
    [59, 70, 'PERCENTAGE'],
    [88, 91, 'COMPANY']]}],
 ['Meanwhile, for the September quarter, the company reported a 

In [57]:
for text, annotations in training_data["annotations"]:
    if len(text) > 0:
        nlp.update([text], [annotations], sgd=optimizer)

In [31]:
! python3 -m spacy download en_core_web_sm


Collecting en_core_web_sm==2.3.1 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz#egg=en_core_web_sm==2.3.1
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0MB)
[K     |████████████████████████████████| 12.1MB 727kB/s 
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.3.1-cp37-none-any.whl size=12047110 sha256=09300e1ce2738e32b12a1e25fcc9381fa864c7832799b9f08e9bfe31f3ceba30
  Stored in directory: /private/var/folders/rf/vq7cvscj7ks7pnn874v8tg3m0000gn/T/pip-ephem-wheel-cache-wmyxmlyg/wheels/2b/3f/41/f0b92863355c3ba34bb32b37d8a0c662959da0058202094f46
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.3.1
You should consider upgrading via t

In [36]:
import en_core_web_sm

nlp = en_core_web_sm.load()

In [35]:
import spacy 
import random
nlp = spacy.load('en_core_web_sm-2.3.1')

OSError: [E050] Can't find model 'en_core_web_sm-2.3.1'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [37]:
nlp.entity.add_label('CELEBRITY')
TRAIN_DATA = [
        (u"Modi", {"entities": [(0, 4, "CELEBRITY")]})]

optimizer = nlp.begin_training()
for i in range(20):
    random.shuffle(TRAIN_DATA)
    for text, annotations in TRAIN_DATA:
        nlp.update([text], [annotations],drop=0.3, sgd=optimizer)


text = "But Modi is starting India. The company made a late push\ninto hardware, and Apple’s Siri and Google available on iPhones, and Amazon’s Alexa\nsoftware, which runs on its Echo and Dot devices, have clear leads in\nconsumer adoption."
doc = nlp(text)
for ent in doc.ents:
    print(ent.text,ent.label_)

Modi ORG
India GPE
Apple ORG
Google NORP
iPhones WORK_OF_ART
Amazon ORG
Alexa ORG
Dot FAC


In [46]:
for text, annotations in TRAIN_DATA:
    print(text,annotations)

Modi {'entities': [(0, 4, 'CELEBRITY')]}


In [45]:
training_data["annotations"]

[['Meanwhile, for the September quarter, the company reported a consolidated net loss of Rs 184.06 crore versus a net profit of Rs 47.67 crore in the year ago quarter.',
  {'entities': [[86, 101, 'MONEY'], [125, 139, 'MONEY']]}],
 ['Sood further said that they have brought down the fixed cost down by almost 75-80 percent.',
  {'entities': [[0, 4, 'NAME'], [76, 89, 'PERCENTAGE']]}],
 ['The company managed to get rent waivers from most landlords, CFO Nitin Sood said in an interview to CNBC-TV18. “The big focus for us right now as revenues have been nil is to really reduce our fixed cost and we have managed to do that, ” he added.',
  {'entities': [[61, 64, 'ROLE'], [65, 75, 'NAME']]}],
 ['PVR Ltd Chairman cum Managing Director Ajay Bijli said: " We are eagerly waiting for the reopening of other states, specifically Maharashtra and Telangana so that business can gradually get back to normal. We are taking all possible precautions so that both our customers and employees feel safe while vi

In [44]:
optimizer = nlp.begin_training()
for i in range(20):
    random.shuffle(training_data["annotations"])
    for text, annotations in training_data["annotations"]:
        nlp.update([text], [annotations],drop=0.3, sgd=optimizer)

ValueError: need at least one array to concatenate

In [56]:
for text, annotations in training_data["annotations"]:
    print(len(text),len([annotations]))

164 1
90 1
247 1
337 1
0 1
318 1
0 1
180 1
210 1
103 1
209 1
92 1


In [59]:
text = "The share price of PVR rose over 7 percent on Wednesday after the multiplex chain said that it has reduced losses in Q2 despite nil revenue from the core movie exhibition business. The company managed to get rent waivers from most landlords, CFO Nitin Sood said in an interview to CNBC-TV18. “The big focus for us right now as revenues have been nil is to really reduce our fixed cost and we have managed to do that,” he added. Sood further said that they have brought down the fixed cost down by almost 75-80 percent. The stock rose as much as 7.6 percent to the day's high of Rs 1,186.85 per share on the BSE.Meanwhile, for the September quarter, the company reported a consolidated net loss of Rs 184.06 crore versus a net profit of Rs 47.67 crore in the year ago quarter. Its total income was at Rs 110.61 crore during the quarter under review against Rs 979.40 crore in the corresponding quarter last fiscal. PVR's total expenses were at Rs 389.37 crore in July-September 2020-21.Financial performance of the company for Q2, FY 21 was impacted by the continued lockdown announced due to COVID-19 outbreak, which disrupted the company's operations, said PVR in a post-earnings statement. PVR said its results were not comparable as business was impacted due to temporary closures of cinemas.Under Unlock 5.0 guidelines, the government has permitted cinemas to reopen October 15 onwards with 50 percent capacity. So far, 16 states and UTs, where PVR has a presence, have permitted cinemas to restart operations. Out of total of 831 screens of the company, over 575 have received permission to reopen, it said.PVR Ltd Chairman cum Managing Director Ajay Bijli said: We are eagerly waiting for the reopening of other states, specifically Maharashtra and Telangana so that business can gradually get back to normal. We are taking all possible precautions so that both our customers and employees feel safe while visiting their favourite cinema."

In [60]:
doc = nlp(text)
for ent in doc.ents:
    print(ent.text,ent.label_)

7 percent PERCENT
Wednesday DATE
as much as 7.6 percent PERCENT
day DATE
Rs 1,186.85 MONEY
Rs 184.06 MONEY
Rs 47.67 crore MONEY
Rs 110.61 MONEY
Rs 979.40 MONEY
Rs 389.37 MONEY
October 15 DATE
50 percent PERCENT
Managing Director Ajay Bijli ORG
Maharashtra ORG
Telangana PERSON
