# Method 2 - Rehearsal
## Train a SpaCy model with code using the rehearse function

In [9]:
# sanity test, to ensure that training by code works (no rehearsal)

# original code modified by Brad Payne
# original code ref: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

# Import and load the spacy model
import spacy
from spacy.training.example import Example
import json
activated = spacy.prefer_gpu()
nlp_md=spacy.load("en_core_web_md", enable=["ner"])
# nlp_md=spacy.load("en_core_web_md")
# nlp_lg=spacy.load("en_core_web_lg")
# nlp_trf=spacy.load("en_core_web_trf")

# Getting the ner component
ner = nlp_md.get_pipe("ner")

# Training examples in the required format
with open('../../data/annotated/echr_train_spacy.jsonl', "r", encoding="utf-8") as f1, open('../../data/annotated/rehearse_silver_sent_train.jsonl', "r", encoding="utf-8") as f2:

    train_data = json.load(f1)
    rehearse = json.load(f2)

    tab_ents =['DEM']
    for ent in tab_ents:
        ner.add_label(ent)

    # Resume training
    optimizer = nlp_md.resume_training()

    # List of pipes you want to train
    pipe_exceptions = ["ner"]

    # List of pipes which should remain unaffected in training
    other_pipes = [pipe for pipe in nlp_md.pipe_names if pipe not in pipe_exceptions]

    # Importing requirements
    from spacy.util import minibatch, compounding
    import random

    # Begin training by disabling other pipeline components
    with nlp_md.disable_pipes(*other_pipes) :

      sizes = compounding(1.0, 4.0, 1.001)
      # Training for 30 iterations
      for itn in range(10):
        # shuffle examples before training
        random.shuffle(train_data)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=sizes)
        # dictionary to store losses
        losses = {}
        for batch in batches:
          texts, annotations = zip(*batch)
          doc = nlp_md.make_doc(texts[0])
          spacy_entry = Example.from_dict(doc, annotations[0])
          # Calling update() over the iteration
          nlp_md.update([spacy_entry], sgd=optimizer, drop=0.35, losses=losses)
          print("Losses", losses)

nlp_md.to_disk("../../data/models/spacy/code")

Losses {'ner': 80.88438052476357}
Losses {'ner': 129.34152767517537}
Losses {'ner': 192.5361600763011}
Losses {'ner': 261.18575975568007}
Losses {'ner': 292.67075365985335}
Losses {'ner': 402.3844054443162}
Losses {'ner': 453.8582282725881}
Losses {'ner': 537.2274083372947}
Losses {'ner': 566.0719477703612}
Losses {'ner': 585.9472401619373}
Losses {'ner': 599.3312381481715}
Losses {'ner': 636.9945505082727}
Losses {'ner': 655.9266814344242}
Losses {'ner': 859.2306893570776}
Losses {'ner': 895.0060092412929}
Losses {'ner': 903.7040792867676}
Losses {'ner': 938.6574043246287}
Losses {'ner': 960.3622066543851}
Losses {'ner': 972.7109038626147}
Losses {'ner': 989.1333007695513}
Losses {'ner': 1012.5653322365479}
Losses {'ner': 1035.086916379125}
Losses {'ner': 1130.1623839151464}
Losses {'ner': 1253.8490465863463}
Losses {'ner': 1265.1532528323473}
Losses {'ner': 1320.3973080959242}
Losses {'ner': 1354.8918921239792}
Losses {'ner': 1364.5157496470106}
Losses {'ner': 1615.2685221657935}
Los

In [18]:
# now that we've established the code works, we are going to set a baseline for an untrained model against the ECHR test dataset.

!python -m spacy benchmark accuracy --gpu-id=0 "en_core_web_md" "../../data/annotated/dev.spacy"

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK      100.00
TAG      -     
POS      -     
MORPH    -     
LEMMA    -     
UAS      -     
LAS      -     
NER P    31.29 
NER R    49.38 
NER F    38.31 
SENT P   -     
SENT R   -     
SENT F   -     
SPEED    8085  

[1m

                  P       R       F
CARDINAL       0.00    0.00    0.00
GPE           44.95   67.34   53.91
ORG            9.57   13.56   11.22
LAW            0.00    0.00    0.00
NORP           0.00    0.00    0.00
PERSON        18.41   20.68   19.48
DATE          78.61   91.17   84.42
ORDINAL        0.00    0.00    0.00
DEM            0.00    0.00    0.00
FAC            0.00    0.00    0.00
TIME           0.00    0.00    0.00
QUANTITY       0.00    0.00    0.00
MONEY          0.00    0.00    0.00
PERCENT        0.00    0.00    0.00
LANGUAGE       0.00    0.00    0.00
PRODUCT        0.00    0.00    0.00
LOC            0.00    0.00    0.00
WORK_OF_ART    0.00    0.00    0.00
EVENT       

In [12]:
# try Method 2, rehearsing while updating

# this code has been modified from the original
# original ref: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

# Import requirements
import spacy
import json
from collections import Counter
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random

activated = spacy.prefer_gpu()
nlp_md=spacy.load("en_core_web_md", enable=["ner"])
# nlp_md=spacy.load("en_core_web_sm")
# nlp_lg=spacy.load("en_core_web_lg")
# nlp_trf=spacy.load("en_core_web_trf")

# Getting the ner component
ner = nlp_md.get_pipe("ner")

# Training examples in the required format
with open('../../data/annotated/echr_train_spacy.jsonl', "r", encoding="utf-8") as f1, open('../../data/annotated/rehearse_silver_sent_train.jsonl', "r", encoding="utf-8") as f2:

    train_data = json.load(f1)
    rehearse = json.load(f2)

    # fetch and add labels
    tab_ents =['DEM']
    for ent in tab_ents:
        ner.add_label(ent)

    # fetch and add labels for rehearsal
    count_per_entity_new = Counter()
    for _, annotations in rehearse:
        for ent in annotations.get("entities"):
           count_per_entity_new[ent[2]] +=1

    for k in count_per_entity_new:
        ner.add_label(k)

    # Resume training
    # ref: https://spacy.io/api/language#rehearse
    optimizer = nlp_md.resume_training()

    # List of pipes you want to train
    pipe_exceptions = ["ner"]

    # List of pipes which should remain unaffected in training
    other_pipes = [pipe for pipe in nlp_md.pipe_names if pipe not in pipe_exceptions]

    # Begin training by disabling other pipeline components
    with nlp_md.disable_pipes(*other_pipes) :
      sizes = compounding(1.0, 4.0, 1.001)

      # Training for 30 iterations
      for itn in range(30):

        # shuffle examples before training
        random.shuffle(train_data)
        random.shuffle(rehearse)

        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=sizes)
        batches2 = minibatch(rehearse, size=sizes)

        # dictionary to store losses
        losses = {}
        r_losses = {}

        '''
        Example
        >>> raw_text_batches = minibatch(raw_texts)
        >>> for labelled_batch in minibatch(examples):
        >>>     nlp.update(labelled_batch)
        >>>     raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
        >>>     nlp.rehearse(raw_batch)
        '''

        for batch in batches:
          texts, annotations = zip(*batch)
          doc = nlp_md.make_doc(texts[0])
          spacy_entry = Example.from_dict(doc, annotations[0])
          # Calling update() over the iteration
          nlp_md.update([spacy_entry], sgd=optimizer, drop=0.35, losses=losses)
          raw_batch = [Example.from_dict(nlp_md.make_doc(text[0]), {}) for text in next(batches2)]
          nlp_md.rehearse(raw_batch, sgd=optimizer, losses=r_losses, exclude=['DATE', 'GPE'])
          print("Losses Update ", itn, losses)
          print("Losses Rehearsal ", itn, r_losses)

nlp_md.to_disk("../../data/models/spacy/code-rehearse")

Losses Update  0 {'ner': 140.9401456043056}
Losses Rehearsal  0 {'ner': 3.6570789434970954}
Losses Update  0 {'ner': 255.91502660580636}
Losses Rehearsal  0 {'ner': 11.563036483373397}
Losses Update  0 {'ner': 495.6253242126918}
Losses Rehearsal  0 {'ner': 19.497437560252653}
Losses Update  0 {'ner': 620.8056827348223}
Losses Rehearsal  0 {'ner': 27.199945532181125}
Losses Update  0 {'ner': 678.5603868299172}
Losses Rehearsal  0 {'ner': 35.05115486795972}
Losses Update  0 {'ner': 1084.9576899392073}
Losses Rehearsal  0 {'ner': 42.23081790079887}
Losses Update  0 {'ner': 1147.7873995563114}
Losses Rehearsal  0 {'ner': 49.74991264509808}
Losses Update  0 {'ner': 1178.0865346267817}
Losses Rehearsal  0 {'ner': 57.504604527109294}
Losses Update  0 {'ner': 1245.2790992473647}
Losses Rehearsal  0 {'ner': 63.89812714289525}
Losses Update  0 {'ner': 1260.7024588145532}
Losses Rehearsal  0 {'ner': 71.25187339638744}
Losses Update  0 {'ner': 1348.929001442123}
Losses Rehearsal  0 {'ner': 79.8493

In [13]:
# evaluate against the ECHR test dataset (trained on ECHR, rehearsed on Resume)
!python -m spacy benchmark accuracy --gpu-id=0 "../../data/models/spacy/code-rehearse" "../../data/annotated/dev.spacy"

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   88.19 
NER R   83.63 
NER F   85.85 
SPEED   21329 

[1m

               P       R       F
ORG        82.20   77.51   79.79
DEM        83.84   39.71   53.90
PERSON     90.99   94.17   92.56
DATE       91.40   91.36   91.38
GPE        89.09   81.93   85.36
LANGUAGE    0.00    0.00    0.00



In [15]:
# evaluate against the Resume test dataset (trained on ECHR, rehearsed on Resume)
!python -m spacy benchmark accuracy --gpu-id=0 "../../data/models/spacy/code-rehearse" "../../data/annotated/test_silver_resume.spacy"

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   79.01 
NER R   55.06 
NER F   64.90 
SPEED   23285 

[1m

                   P       R       F
ORG            75.39   64.33   69.42
PERSON         69.16   33.48   45.12
PRODUCT        70.59   25.26   37.21
CARDINAL      100.00    2.04    4.00
DATE           88.22   64.32   74.39
GPE            90.87   64.29   75.30
WORK_OF_ART    33.33    5.00    8.70
NORP          100.00   33.33   50.00
LOC             0.00    0.00    0.00
LAW             0.00    0.00    0.00
LANGUAGE      100.00   37.50   54.55
FAC             0.00    0.00    0.00
ORDINAL         0.00    0.00    0.00
QUANTITY        0.00    0.00    0.00
EVENT           0.00    0.00    0.00
MONEY           0.00    0.00    0.00
PERCENT         0.00    0.00    0.00
TIME            0.00    0.00    0.00



In [19]:
import spacy
import re
activated = spacy.prefer_gpu()
nlp = spacy.load('../../data/models/spacy/code-rehearse')

text = ['''SIDDHARTH RAGHUVANSHI                                Roll No. 06CS3025                                            DOB: 08/08/1988
Email: siddharth.iitkharagpur@gmail.com                                                                                          Mobile No.:   +91 9932584135
Degree/Certificate
Dual Degree[B. Tech (H) + M. Tech]
(Computer Science & Engineering)
Class XII:  C.B.S.E.
Class X:   C.B.S.E.
ACADEMIC ACHIEVEMENTS
Institute/ School, City
Indian Institute of Technology, Kharagpur
Central Hindu School, Varanasi
St. Atulanand Convent School, Varanasi
CGPA/ %  Completion
8.26/10
86.0%
90.8%
2011
2005
2003
Competitive
Examinations
  All India Rank 116 in AIEEE, 2006 among 470,000 students, State Rank 8 in Uttar Pradesh.
  All India Rank 119 in 7th National Science Olympiad, 2005.
  All India Rank 22 in All India Level Mathematics & Science Test organized by Central Institute  for
Proficiency in English Language (CIPEL).
Scholastic
Achievements
  National top 1% out of 26968 candidates appeared in National Standard Examination in Physics’05
  Receiving CBSE Merit Scholarship for the past 4 years.
ACADEMIC PROJECTS
M. Tech Project                                                                                      IIT Kharagpur                                           May’10-Nov’10
•
•
Studied the performance of text indexing algorithms on Hadoop MapReduce architecture.
Future work includes implementing more efficient indexing and retrieval techniques in MapReduce for distributed parallel
computing.
B. Tech Project                                                                                        IIT Kharagpur                                           Aug’09-May’10
  Developed a software with can handle all sorts of query related to geographical information extracted from maps.
  Developed a client interface which can fetch data from different incompatible geospatial web services and make that data
compatible for resolving queries.
Integrated my framework engine with different underlying heterogeneous spatial databases.

Static Instrumentation Of Java Programs                                          IIT Kharagpur                                                   May’08
  Developed a program using Byte Code Engineering Library to do automated testing of java program at byte code level.
WORK EXPERIENCE / INTERNSHIP
Extreme Blue Internship Program                                                                   ISL, IBM, Pune, India                                       May’09 – July’09
Business
Perspective
Technical
Perspective
 Achievements
  Conducted survey in Pune region on the current home delivery status of organized retails

Proposed and implemented a solution on how to increase home delivery sales in order to compete with the
localized general (kirana) stores

Built an independent Home Delivery module on Java EE platform using open standards such as XML and
Web Services
Integrated the Home Delivery module with IBM WebSphere Commerce.

  Received highest grade 10/10 in summer internship evaluation at IIT Kharagpur, 2009.
RELEVANT COURSES TAKEN
  Machine learning
  Algorithms-I
  Algorithms-II
Information Retrieval

  Distributed Systems

Probability and Statistics
POSITION OF RESPONSIBILITY

Student coordinator of IIT Kharagpur Student Counselling Service.
  Student member of team that conceptualized and publicized Counselling Centre in IIT Kharagpur after 5 successive suicides
in the campus within a span of 6 months in between Feb’09 and Jul’09.
  More than 100 students are counselled every month.
  No mishaps in the campus as of Sep’10 after the establishment of the centre.
  Went through Gate Keepers Training to identify behavioral change in a person.
  Managed  the  systems  team  of  Bitwise-2010,  an  international  algorithmic  intensive  programming  contest  leading  to  the
participation of 3000 teams across 75 countries.

Family Sub-head of accommodation team in Spring Fest, 2008.
  Head boy of my Senior Secondary School (Central Hindu School).
e
EXTRA CURRICULAR ACHIEVEMENTS
  Member of Silver winning team in inter hall OPENSOFT Competition in the session 2007-08.
  National Sports Organization: Among Top 30 students in Lawn Tennis Team at IIT Kharagpur’06. ''']

# normalize whitespace as per https://github.com/explosion/spaCy/discussions/10243
r = []
for t in text:
    r.append(re.sub(r"\s+", " ", t))

for doc in nlp.pipe(r):
    print([(ent.text, ent.label_) for ent in doc.ents])

[('Computer Science & Engineering', 'ORG'), ('Kharagpur Central Hindu School', 'ORG'), ('Varanasi St. Atulanand Convent School', 'ORG'), ('Varanasi', 'GPE'), ('2011 2005', 'DATE'), ('2003', 'DATE'), ('AIEEE', 'ORG'), ('2006', 'DATE'), ('Uttar Pradesh', 'GPE'), ('2005', 'DATE'), ('India Level Mathematics & Science Test', 'ORG'), ('Central Institute for Proficiency', 'ORG'), ('26968', 'DATE'), ('National Standard Examination', 'ORG'), ('ACADEMIC PROJECTS M. Tech Project', 'ORG'), ('Hadoop', 'ORG'), ('B. Tech Project', 'ORG'), ('Byte Code Engineering Library', 'ORG'), ('IBM', 'ORG'), ('Pune', 'GPE'), ('India', 'GPE'), ('Pune', 'GPE'), ('Java EE', 'ORG'), ('XML', 'ORG'), ('IBM WebSphere Commerce', 'ORG'), ('summer', 'DATE'), ('IIT Kharagpur', 'ORG'), ('2009', 'DATE'), ('IIT Kharagpur Student Counselling Service', 'ORG'), ('Counselling Centre', 'ORG'), ('IIT Kharagpur', 'GPE'), ('6 months', 'DATE'), ('every month', 'DATE'), ('Bitwise-2010', 'ORG'), ('Spring Fest', 'ORG'), ('2008', 'DATE'), 

In [20]:
# try Method 2, rehearsing while updating, don't exclude any entities from the rehearsal process. Exclude = null.

# this code has been modified from the original
# original ref: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

# Import requirements
import spacy
import json
from collections import Counter
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random

activated = spacy.prefer_gpu()
nlp_md=spacy.load("en_core_web_md", enable=["ner"])
# nlp_md=spacy.load("en_core_web_sm")
# nlp_lg=spacy.load("en_core_web_lg")
# nlp_trf=spacy.load("en_core_web_trf")

# Getting the ner component
ner = nlp_md.get_pipe("ner")

# Training examples in the required format
with open('../../data/annotated/echr_train_spacy.jsonl', "r", encoding="utf-8") as f1, open('../../data/annotated/rehearse_silver_sent_train.jsonl', "r", encoding="utf-8") as f2:

    train_data = json.load(f1)
    rehearse = json.load(f2)

    # fetch and add labels
    tab_ents =['DEM']
    for ent in tab_ents:
        ner.add_label(ent)

    # fetch and add labels for rehearsal
    count_per_entity_new = Counter()
    for _, annotations in rehearse:
        for ent in annotations.get("entities"):
           count_per_entity_new[ent[2]] +=1

    for k in count_per_entity_new:
        ner.add_label(k)

    # Resume training
    # ref: https://spacy.io/api/language#rehearse
    optimizer = nlp_md.resume_training()

    # List of pipes you want to train
    pipe_exceptions = ["ner"]

    # List of pipes which should remain unaffected in training
    other_pipes = [pipe for pipe in nlp_md.pipe_names if pipe not in pipe_exceptions]

    # Begin training by disabling other pipeline components
    with nlp_md.disable_pipes(*other_pipes) :
      sizes = compounding(1.0, 4.0, 1.001)

      # Training for 30 iterations
      for itn in range(30):

        # shuffle examples before training
        random.shuffle(train_data)
        random.shuffle(rehearse)

        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=sizes)
        batches2 = minibatch(rehearse, size=sizes)

        # dictionary to store losses
        losses = {}
        r_losses = {}

        '''
        Example
        >>> raw_text_batches = minibatch(raw_texts)
        >>> for labelled_batch in minibatch(examples):
        >>>     nlp.update(labelled_batch)
        >>>     raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
        >>>     nlp.rehearse(raw_batch)
        '''

        for batch in batches:
          texts, annotations = zip(*batch)
          doc = nlp_md.make_doc(texts[0])
          spacy_entry = Example.from_dict(doc, annotations[0])
          # Calling update() over the iteration
          nlp_md.update([spacy_entry], sgd=optimizer, drop=0.35, losses=losses)
          raw_batch = [Example.from_dict(nlp_md.make_doc(text[0]), {}) for text in next(batches2)]
          nlp_md.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
          print("Losses Update ", itn, losses)
          print("Losses Rehearsal ", itn, r_losses)

nlp_md.to_disk("../../data/models/spacy/code-rehearse")

Losses Update  0 {'ner': 102.59177653949894}
Losses Rehearsal  0 {'ner': 1.360908873351939}
Losses Update  0 {'ner': 338.5931518230792}
Losses Rehearsal  0 {'ner': 6.374382149343525}
Losses Update  0 {'ner': 423.4938221126605}
Losses Rehearsal  0 {'ner': 10.763660023238632}
Losses Update  0 {'ner': 529.8392184430987}
Losses Rehearsal  0 {'ner': 14.653126157334436}
Losses Update  0 {'ner': 717.2605874414868}
Losses Rehearsal  0 {'ner': 21.85620509839708}
Losses Update  0 {'ner': 795.1265146436558}
Losses Rehearsal  0 {'ner': 29.23268177748988}
Losses Update  0 {'ner': 842.5079517700738}
Losses Rehearsal  0 {'ner': 35.25369889672849}
Losses Update  0 {'ner': 933.3521866904298}
Losses Rehearsal  0 {'ner': 42.77182525882163}
Losses Update  0 {'ner': 960.3015802374404}
Losses Rehearsal  0 {'ner': 50.30884645454034}
Losses Update  0 {'ner': 985.1793954938132}
Losses Rehearsal  0 {'ner': 57.744181909091694}
Losses Update  0 {'ner': 1166.4579746663023}
Losses Rehearsal  0 {'ner': 65.2968630613

In [21]:
# evaluate (No entity exclusions) against the ECHR test dataset (trained on ECHR, rehearsed on Resume)
!python -m spacy benchmark accuracy --gpu-id=0 "../../data/models/spacy/code-rehearse" "../../data/annotated/dev.spacy"

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   85.35 
NER R   84.90 
NER F   85.12 
SPEED   20059 

[1m

               P       R       F
ORG        75.34   80.31   77.75
DEM        76.10   45.69   57.10
PERSON     90.33   94.27   92.26
DATE       91.95   92.58   92.27
GPE        85.71   76.64   80.92
LANGUAGE    0.00    0.00    0.00



In [22]:
# evaluate (No entity exclusions) against the Resume test dataset (trained on ECHR, rehearsed on Resume)
!python -m spacy benchmark accuracy --gpu-id=0 "../../data/models/spacy/code-rehearse" "../../data/annotated/test_silver_resume.spacy"

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   75.51 
NER R   62.69 
NER F   68.50 
SPEED   23146 

[1m

                   P       R       F
ORG            71.21   73.81   72.48
PERSON         70.37   34.39   46.20
PRODUCT        78.57   11.58   20.18
CARDINAL      100.00    4.08    7.84
GPE            83.46   75.51   79.29
DATE           86.84   75.55   80.80
WORK_OF_ART     0.00    0.00    0.00
DEM             0.00    0.00    0.00
NORP          100.00   33.33   50.00
LOC             0.00    0.00    0.00
LAW             0.00    0.00    0.00
LANGUAGE      100.00   37.50   54.55
FAC             0.00    0.00    0.00
ORDINAL         0.00    0.00    0.00
QUANTITY        0.00    0.00    0.00
EVENT           0.00    0.00    0.00
MONEY           0.00    0.00    0.00
PERCENT         0.00    0.00    0.00
TIME            0.00    0.00    0.00



In [23]:
import spacy
import re
activated = spacy.prefer_gpu()
nlp = spacy.load('../../data/models/spacy/code-rehearse')

text = ['''SIDDHARTH RAGHUVANSHI                                Roll No. 06CS3025                                            DOB: 08/08/1988
Email: siddharth.iitkharagpur@gmail.com                                                                                          Mobile No.:   +91 9932584135
Degree/Certificate
Dual Degree[B. Tech (H) + M. Tech]
(Computer Science & Engineering)
Class XII:  C.B.S.E.
Class X:   C.B.S.E.
ACADEMIC ACHIEVEMENTS
Institute/ School, City
Indian Institute of Technology, Kharagpur
Central Hindu School, Varanasi
St. Atulanand Convent School, Varanasi
CGPA/ %  Completion
8.26/10
86.0%
90.8%
2011
2005
2003
Competitive
Examinations
  All India Rank 116 in AIEEE, 2006 among 470,000 students, State Rank 8 in Uttar Pradesh.
  All India Rank 119 in 7th National Science Olympiad, 2005.
  All India Rank 22 in All India Level Mathematics & Science Test organized by Central Institute  for
Proficiency in English Language (CIPEL).
Scholastic
Achievements
  National top 1% out of 26968 candidates appeared in National Standard Examination in Physics’05
  Receiving CBSE Merit Scholarship for the past 4 years.
ACADEMIC PROJECTS
M. Tech Project                                                                                      IIT Kharagpur                                           May’10-Nov’10
•
•
Studied the performance of text indexing algorithms on Hadoop MapReduce architecture.
Future work includes implementing more efficient indexing and retrieval techniques in MapReduce for distributed parallel
computing.
B. Tech Project                                                                                        IIT Kharagpur                                           Aug’09-May’10
  Developed a software with can handle all sorts of query related to geographical information extracted from maps.
  Developed a client interface which can fetch data from different incompatible geospatial web services and make that data
compatible for resolving queries.
Integrated my framework engine with different underlying heterogeneous spatial databases.

Static Instrumentation Of Java Programs                                          IIT Kharagpur                                                   May’08
  Developed a program using Byte Code Engineering Library to do automated testing of java program at byte code level.
WORK EXPERIENCE / INTERNSHIP
Extreme Blue Internship Program                                                                   ISL, IBM, Pune, India                                       May’09 – July’09
Business
Perspective
Technical
Perspective
 Achievements
  Conducted survey in Pune region on the current home delivery status of organized retails

Proposed and implemented a solution on how to increase home delivery sales in order to compete with the
localized general (kirana) stores

Built an independent Home Delivery module on Java EE platform using open standards such as XML and
Web Services
Integrated the Home Delivery module with IBM WebSphere Commerce.

  Received highest grade 10/10 in summer internship evaluation at IIT Kharagpur, 2009.
RELEVANT COURSES TAKEN
  Machine learning
  Algorithms-I
  Algorithms-II
Information Retrieval

  Distributed Systems

Probability and Statistics
POSITION OF RESPONSIBILITY

Student coordinator of IIT Kharagpur Student Counselling Service.
  Student member of team that conceptualized and publicized Counselling Centre in IIT Kharagpur after 5 successive suicides
in the campus within a span of 6 months in between Feb’09 and Jul’09.
  More than 100 students are counselled every month.
  No mishaps in the campus as of Sep’10 after the establishment of the centre.
  Went through Gate Keepers Training to identify behavioral change in a person.
  Managed  the  systems  team  of  Bitwise-2010,  an  international  algorithmic  intensive  programming  contest  leading  to  the
participation of 3000 teams across 75 countries.

Family Sub-head of accommodation team in Spring Fest, 2008.
  Head boy of my Senior Secondary School (Central Hindu School).
e
EXTRA CURRICULAR ACHIEVEMENTS
  Member of Silver winning team in inter hall OPENSOFT Competition in the session 2007-08.
  National Sports Organization: Among Top 30 students in Lawn Tennis Team at IIT Kharagpur’06. ''']

# normalize whitespace as per https://github.com/explosion/spaCy/discussions/10243
r = []
for t in text:
    r.append(re.sub(r"\s+", " ", t))

for doc in nlp.pipe(r):
    print([(ent.text, ent.label_) for ent in doc.ents])

[('Computer Science & Engineering', 'ORG'), ('City Indian Institute of Technology', 'ORG'), ('Kharagpur Central Hindu School', 'ORG'), ('Varanasi St. Atulanand Convent School', 'ORG'), ('Varanasi', 'GPE'), ('2011 2005', 'DATE'), ('2003', 'DATE'), ('AIEEE', 'ORG'), ('2006', 'DATE'), ('Uttar Pradesh', 'GPE'), ('2005', 'DATE'), ('India Level Mathematics & Science Test', 'ORG'), ('Central Institute for Proficiency', 'ORG'), ('26968', 'DATE'), ('National Standard Examination', 'ORG'), ('IIT', 'ORG'), ('Hadoop', 'ORG'), ('B. Tech Project', 'ORG'), ('IIT Kharagpur', 'ORG'), ('IIT', 'ORG'), ('Byte Code Engineering Library', 'ORG'), ('ISL', 'ORG'), ('IBM', 'ORG'), ('Pune', 'GPE'), ('India', 'GPE'), ('Pune', 'GPE'), ('Java EE', 'ORG'), ('XML', 'ORG'), ('IBM WebSphere Commerce', 'ORG'), ('IIT Kharagpur', 'ORG'), ('2009', 'DATE'), ('IIT Kharagpur Student Counselling Service', 'ORG'), ('Counselling Centre', 'ORG'), ('IIT Kharagpur', 'GPE'), ('6 months', 'DATE'), ('every month', 'DATE'), ('Bitwise-2

In [24]:
# try Method 2, rehearsing while updating, don't exclude any entities from the rehearsal process. Exclude = null.

# this code has been modified from the original
# original ref: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

# Import requirements
import spacy
import json
from collections import Counter
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random

activated = spacy.prefer_gpu()
nlp_md=spacy.load("en_core_web_md", enable=["ner"])
# nlp_md=spacy.load("en_core_web_sm")
# nlp_lg=spacy.load("en_core_web_lg")
# nlp_trf=spacy.load("en_core_web_trf")

# Getting the ner component
ner = nlp_md.get_pipe("ner")

# Training examples in the required format
with open('../../data/annotated/echr_train_spacy.jsonl', "r", encoding="utf-8") as f1, open('../../data/annotated/rehearse_silver_sent_train.jsonl', "r", encoding="utf-8") as f2:

    train_data = json.load(f1)
    rehearse = json.load(f2)

    # fetch and add labels
    tab_ents =['DEM']
    for ent in tab_ents:
        ner.add_label(ent)

    # fetch and add labels for rehearsal
    count_per_entity_new = Counter()
    for _, annotations in rehearse:
        for ent in annotations.get("entities"):
           count_per_entity_new[ent[2]] +=1

    for k in count_per_entity_new:
        ner.add_label(k)

    # Resume training
    # ref: https://spacy.io/api/language#rehearse
    optimizer = nlp_md.resume_training()

    # List of pipes you want to train
    pipe_exceptions = ["ner"]

    # List of pipes which should remain unaffected in training
    other_pipes = [pipe for pipe in nlp_md.pipe_names if pipe not in pipe_exceptions]

    # Begin training by disabling other pipeline components
    with nlp_md.disable_pipes(*other_pipes) :
      sizes = compounding(1.0, 4.0, 1.001)

      # Training for 30 iterations
      for itn in range(30):

        # shuffle examples before training
        random.shuffle(train_data)
        random.shuffle(rehearse)

        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=sizes)
        batches2 = minibatch(rehearse, size=sizes)

        # dictionary to store losses
        losses = {}
        r_losses = {}

        '''
        Example
        >>> raw_text_batches = minibatch(raw_texts)
        >>> for labelled_batch in minibatch(examples):
        >>>     nlp.update(labelled_batch)
        >>>     raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
        >>>     nlp.rehearse(raw_batch)
        '''

        for batch in batches:
          texts, annotations = zip(*batch)
          doc = nlp_md.make_doc(texts[0])
          spacy_entry = Example.from_dict(doc, annotations[0])
          # Calling update() over the iteration
          nlp_md.update([spacy_entry], sgd=optimizer, drop=0.35, losses=losses)
          raw_batch = [Example.from_dict(nlp_md.make_doc(text[0]), {}) for text in next(batches2)]
          nlp_md.rehearse(raw_batch, sgd=optimizer, losses=r_losses, exclude=['DATE', 'GPE', 'PERSON', 'ORG'])
          print("Losses Update ", itn, losses)
          print("Losses Rehearsal ", itn, r_losses)

nlp_md.to_disk("../../data/models/spacy/code-rehearse")

Losses Update  0 {'ner': 105.92794870108547}
Losses Rehearsal  0 {'ner': 4.1536034217247595}
Losses Update  0 {'ner': 171.75243119687104}
Losses Rehearsal  0 {'ner': 10.918676953438002}
Losses Update  0 {'ner': 213.81402842219552}
Losses Rehearsal  0 {'ner': 16.993621616319068}
Losses Update  0 {'ner': 290.05844680087955}
Losses Rehearsal  0 {'ner': 23.395238508902757}
Losses Update  0 {'ner': 381.6983803479428}
Losses Rehearsal  0 {'ner': 29.125454278903277}
Losses Update  0 {'ner': 418.88762364927777}
Losses Rehearsal  0 {'ner': 36.692658135989795}
Losses Update  0 {'ner': 450.0174328250548}
Losses Rehearsal  0 {'ner': 45.95358388198602}
Losses Update  0 {'ner': 471.70008415214744}
Losses Rehearsal  0 {'ner': 52.04510726501184}
Losses Update  0 {'ner': 700.1640848238201}
Losses Rehearsal  0 {'ner': 59.17089086251785}
Losses Update  0 {'ner': 726.7670482756691}
Losses Rehearsal  0 {'ner': 67.5170477995546}
Losses Update  0 {'ner': 749.3676764184804}
Losses Rehearsal  0 {'ner': 73.7843

In [25]:
# evaluate (All entity exclusions) against the ECHR test dataset (trained on ECHR, rehearsed on Resume)
!python -m spacy benchmark accuracy --gpu-id=0 "../../data/models/spacy/code-rehearse" "../../data/annotated/dev.spacy"

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   87.13 
NER R   83.98 
NER F   85.53 
SPEED   21342 

[1m

               P       R       F
ORG        79.59   77.97   78.77
DEM        73.93   45.45   56.30
PERSON     90.68   93.50   92.07
DATE       91.88   92.62   92.25
GPE        90.17   75.36   82.11
LANGUAGE    0.00    0.00    0.00



In [26]:
# evaluate (All entity exclusions) against the Resume test dataset (trained on ECHR, rehearsed on Resume)
!python -m spacy benchmark accuracy --gpu-id=0 "../../data/models/spacy/code-rehearse" "../../data/annotated/test_silver_resume.spacy"

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   76.91 
NER R   53.68 
NER F   63.23 
SPEED   22198 

[1m

                   P       R       F
ORG            72.34   64.47   68.18
PERSON         77.65   29.86   43.14
PRODUCT        75.00   18.95   30.25
CARDINAL      100.00    2.04    4.00
DATE           90.37   59.91   72.05
GPE            83.94   62.24   71.48
WORK_OF_ART     0.00    0.00    0.00
DEM             0.00    0.00    0.00
NORP          100.00   37.78   54.84
LOC             0.00    0.00    0.00
LAW             0.00    0.00    0.00
LANGUAGE      100.00   25.00   40.00
FAC             0.00    0.00    0.00
ORDINAL         0.00    0.00    0.00
QUANTITY        0.00    0.00    0.00
EVENT           0.00    0.00    0.00
MONEY           0.00    0.00    0.00
PERCENT         0.00    0.00    0.00
TIME            0.00    0.00    0.00



In [27]:
import spacy
import re
activated = spacy.prefer_gpu()
nlp = spacy.load('../../data/models/spacy/code-rehearse')

text = ['''SIDDHARTH RAGHUVANSHI                                Roll No. 06CS3025                                            DOB: 08/08/1988
Email: siddharth.iitkharagpur@gmail.com                                                                                          Mobile No.:   +91 9932584135
Degree/Certificate
Dual Degree[B. Tech (H) + M. Tech]
(Computer Science & Engineering)
Class XII:  C.B.S.E.
Class X:   C.B.S.E.
ACADEMIC ACHIEVEMENTS
Institute/ School, City
Indian Institute of Technology, Kharagpur
Central Hindu School, Varanasi
St. Atulanand Convent School, Varanasi
CGPA/ %  Completion
8.26/10
86.0%
90.8%
2011
2005
2003
Competitive
Examinations
  All India Rank 116 in AIEEE, 2006 among 470,000 students, State Rank 8 in Uttar Pradesh.
  All India Rank 119 in 7th National Science Olympiad, 2005.
  All India Rank 22 in All India Level Mathematics & Science Test organized by Central Institute  for
Proficiency in English Language (CIPEL).
Scholastic
Achievements
  National top 1% out of 26968 candidates appeared in National Standard Examination in Physics’05
  Receiving CBSE Merit Scholarship for the past 4 years.
ACADEMIC PROJECTS
M. Tech Project                                                                                      IIT Kharagpur                                           May’10-Nov’10
•
•
Studied the performance of text indexing algorithms on Hadoop MapReduce architecture.
Future work includes implementing more efficient indexing and retrieval techniques in MapReduce for distributed parallel
computing.
B. Tech Project                                                                                        IIT Kharagpur                                           Aug’09-May’10
  Developed a software with can handle all sorts of query related to geographical information extracted from maps.
  Developed a client interface which can fetch data from different incompatible geospatial web services and make that data
compatible for resolving queries.
Integrated my framework engine with different underlying heterogeneous spatial databases.

Static Instrumentation Of Java Programs                                          IIT Kharagpur                                                   May’08
  Developed a program using Byte Code Engineering Library to do automated testing of java program at byte code level.
WORK EXPERIENCE / INTERNSHIP
Extreme Blue Internship Program                                                                   ISL, IBM, Pune, India                                       May’09 – July’09
Business
Perspective
Technical
Perspective
 Achievements
  Conducted survey in Pune region on the current home delivery status of organized retails

Proposed and implemented a solution on how to increase home delivery sales in order to compete with the
localized general (kirana) stores

Built an independent Home Delivery module on Java EE platform using open standards such as XML and
Web Services
Integrated the Home Delivery module with IBM WebSphere Commerce.

  Received highest grade 10/10 in summer internship evaluation at IIT Kharagpur, 2009.
RELEVANT COURSES TAKEN
  Machine learning
  Algorithms-I
  Algorithms-II
Information Retrieval

  Distributed Systems

Probability and Statistics
POSITION OF RESPONSIBILITY

Student coordinator of IIT Kharagpur Student Counselling Service.
  Student member of team that conceptualized and publicized Counselling Centre in IIT Kharagpur after 5 successive suicides
in the campus within a span of 6 months in between Feb’09 and Jul’09.
  More than 100 students are counselled every month.
  No mishaps in the campus as of Sep’10 after the establishment of the centre.
  Went through Gate Keepers Training to identify behavioral change in a person.
  Managed  the  systems  team  of  Bitwise-2010,  an  international  algorithmic  intensive  programming  contest  leading  to  the
participation of 3000 teams across 75 countries.

Family Sub-head of accommodation team in Spring Fest, 2008.
  Head boy of my Senior Secondary School (Central Hindu School).
e
EXTRA CURRICULAR ACHIEVEMENTS
  Member of Silver winning team in inter hall OPENSOFT Competition in the session 2007-08.
  National Sports Organization: Among Top 30 students in Lawn Tennis Team at IIT Kharagpur’06. ''']

# normalize whitespace as per https://github.com/explosion/spaCy/discussions/10243
r = []
for t in text:
    r.append(re.sub(r"\s+", " ", t))

for doc in nlp.pipe(r):
    print([(ent.text, ent.label_) for ent in doc.ents])

[('Computer Science & Engineering', 'ORG'), ('City Indian Institute of Technology', 'ORG'), ('Kharagpur Central Hindu School', 'ORG'), ('Varanasi St. Atulanand Convent School', 'ORG'), ('Varanasi', 'GPE'), ('2011 2005', 'DATE'), ('AIEEE', 'ORG'), ('2006', 'DATE'), ('Uttar Pradesh', 'GPE'), ('7th National Science Olympiad', 'ORG'), ('2005', 'DATE'), ('India Level Mathematics & Science Test', 'ORG'), ('Central Institute for Proficiency', 'ORG'), ('National Standard Examination', 'ORG'), ('IIT', 'GPE'), ('Hadoop', 'ORG'), ('B. Tech Project', 'ORG'), ('IIT Kharagpur Aug’09-May’10 \uf0a7', 'ORG'), ('Byte Code Engineering Library', 'ORG'), ('IBM', 'ORG'), ('Pune', 'GPE'), ('India May’09', 'GPE'), ('Pune', 'GPE'), ('Home Delivery', 'ORG'), ('Java EE', 'ORG'), ('XML', 'ORG'), ('IBM WebSphere Commerce', 'ORG'), ('IIT Kharagpur', 'ORG'), ('2009', 'DATE'), ('IIT Kharagpur Student Counselling Service', 'ORG'), ('Counselling Centre', 'ORG'), ('IIT Kharagpur', 'GPE'), ('6 months', 'DATE'), ('Feb’09'