In [69]:
# extract text from resumes, output to jsonl

import re
import json
import spacy as spacy
import srsly
'''
Make a "rehearsal" update to the models in the pipeline, to prevent forgetting. Rehearsal updates run an initial copy of the model over some data, and update the model so its current predictions are more like the initial ones. This is useful for keeping a pretrained model on-track, even if you're updating it with a smaller set of examples.
'''

# avoids JSONDecodeError due to malformed json file
data = [json.loads(line)
        for line in open("../../data/annotated/resumedata.json", "r", encoding="utf-8")]

content = []
for k in data:
    dct = {
        "text": re.sub(r"\s+", " ", k["content"])
    }
    content.append(dct)

srsly.write_json("../../data/resume.jsonl", content)


In [124]:
# create a silver annotated dataset
'''
Rehearsal updates run an initial copy of the model over some data
'''
nlp = spacy.load("en_core_web_md")

with open("../../data/resume.jsonl", "r", encoding="utf-8") as f1:
    resume = json.load(f1)

    annotated = list()
    for k in resume:
        doc = nlp(k["text"])
        for sent in doc.sents:
            labels = list()
            for e in sent.ents:
                labels.append([e.start_char, e.end_char, e.label_])
            if labels:
                spacy_entry = (sent.text, {"entities": labels})
                annotated.append(spacy_entry)

    srsly.write_json("../../data/annotated/rehearse_silver_spacy.jsonl", annotated)


In [155]:
# convert echr training data to jsonl file so that we can train a spacy model

from spacy.tokens import DocBin

doc_bin = DocBin().from_disk("../../data/annotated/train.spacy")
examples = []
for doc in doc_bin.get_docs(nlp.vocab):
    entities = []
    for ent in doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))

    spacy_entry = (doc.text, {"entities": entities})
    examples.append(spacy_entry)

    srsly.write_json("../../data/annotated/echr_train_spacy.jsonl", examples)


In [None]:
# convert echr training data to jsonl file so that we can test a spacy model

doc_bin = DocBin().from_disk("../../data/annotated/dev.spacy")
examples = []
for doc in doc_bin.get_docs(nlp.vocab):
    entities = []
    for ent in doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))

    spacy_entry = (doc.text, {"entities": entities})
    examples.append(spacy_entry)

    srsly.write_json("../../data/annotated/echr_dev_spacy.jsonl", examples)

In [None]:
# Stage 1 of Method 1 (Incremental), train the first model on echr data

import json
from spacy.training.example import Example
from collections import Counter
import spacy
nlp = spacy.load("en_core_web_md", enable="ner")
ner = nlp.get_pipe('ner')

# from GH https://github.com/explosion/spaCy/issues/7161
output_dir = "../../data/models/spacy/output"

with open('../../data/annotated/echr_train_spacy.jsonl', "r", encoding="utf-8") as f1:
    train = json.load(f1)

    # fetch and add labels
    count_per_entity_new = Counter()
    for _, annotations in train:
        for ent in annotations.get("entities"):
           count_per_entity_new[ent[2]] +=1

    for k in count_per_entity_new:
        ner.add_label(k)

optimizer = nlp.create_optimizer()
for itn in range(2):
    for raw_text,entity_offsets in train:
        doc = nlp.make_doc(raw_text)
        example = Example.from_dict(doc,entity_offsets)
        nlp.update([example],sgd=optimizer)

nlp.to_disk(output_dir)


In [19]:
# testing the model
!python -m spacy benchmark accuracy "../../../data/models/spacy/output" "../../../data/annotated/dev.spacy"

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   29.27 
NER R   39.50 
NER F   33.63 
SPEED   22339 

[1m

                  P       R       F
CARDINAL       0.00    0.00    0.00
GPE            0.68    0.36    0.48
ORG            9.32   10.29    9.78
LAW            0.00    0.00    0.00
NORP           0.00    0.00    0.00
PERSON        18.95   20.78   19.82
DATE          89.70   82.87   86.15
LOC            0.00    0.00    0.00
PRODUCT        0.00    0.00    0.00
ORDINAL        0.00    0.00    0.00
DEM            0.00    0.00    0.00
TIME           0.00    0.00    0.00
QUANTITY       0.00    0.00    0.00
MONEY          0.00    0.00    0.00
PERCENT        0.00    0.00    0.00
LANGUAGE       0.00    0.00    0.00
WORK_OF_ART    0.00    0.00    0.00
FAC            0.00    0.00    0.00
EVENT          0.00    0.00    0.00



In [164]:
# stage 2 of Method 1 (Incremental) involves separating the rehearsal step in another training session
# rehearsal

import random
from collections import Counter
# step 2
# ref: https://github.com/explosion/spaCy/discussions/10041

with open('../../data/annotated/rehearse_silver_sent_spacy.jsonl', "r") as f2:
    rehearse = json.load(f2)


    # fetch and add labels
    count_per_entity_new = Counter()
    for _, annotations in rehearse:
        for ent in annotations.get("entities"):
           count_per_entity_new[ent[2]] +=1

    for k in count_per_entity_new:
        ner.add_label(k)

    nlp_incremental_try = spacy.load("../../data/models/spacy/output")
    ner = nlp_incremental_try.get_pipe('ner')

# '''
# Make a "rehearsal" update to the models in the pipeline, to prevent forgetting. Rehearsal updates run an initial copy of the model over some data, and update the model so its current predictions are more like the initial ones. This is useful for keeping a pretrained model on-track, even if you're updating it with a smaller set of examples.
#
# rehearsal, collect samples of text you want the models to retain performance on, and call nlp.rehearse() with a batch of Example objects.
# raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
# '''
# # https://spacy.io/api/pipe#rehearse
#
    optimizer = nlp_incremental_try.resume_training()
    for itn in range(20):
        random.shuffle(rehearse)
        for raw_text, entity_offsets in rehearse:
            doc = nlp_incremental_try.make_doc(raw_text)
            example = Example.from_dict(doc,entity_offsets)
            nlp_incremental_try.rehearse([example],sgd=optimizer)
    nlp.to_disk("../../data/models/spacy/rehearse")




In [165]:
!python -m spacy benchmark accuracy "../../data/models/spacy/rehearse" "../../data/annotated/rehearse_silver_sent_test.jsonl"

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     -
NER P   -
NER R   -
NER F   -
SPEED   0



In [177]:
# sanity test, to ensure that training by code works
# original code modified by Brad Payne
# original code ref: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

# Import and load the spacy model
import spacy
import json
from collections import Counter
nlp_md=spacy.load("en_core_web_md", enable=["ner"])
# nlp_md=spacy.load("en_core_web_md")
# nlp_lg=spacy.load("en_core_web_lg")
# nlp_trf=spacy.load("en_core_web_trf")

# Getting the ner component
ner = nlp_md.get_pipe("ner")

# Training examples in the required format
with open('../../data/annotated/echr_train_spacy.jsonl', "r", encoding="utf-8") as f1, open('../../data/annotated/rehearse_silver_sent_train.jsonl', "r", encoding="utf-8") as f2:

    train_data = json.load(f1)
    rehearse = json.load(f2)

    tab_ents =['DEM']
    for ent in tab_ents:
        ner.add_label(ent)

    # Resume training
    optimizer = nlp_md.resume_training()
    move_names = list(ner.move_names)

    # List of pipes you want to train
    pipe_exceptions = ["ner"]

    # List of pipes which should remain unaffected in training
    other_pipes = [pipe for pipe in nlp_md.pipe_names if pipe not in pipe_exceptions]

    # Importing requirements
    from spacy.util import minibatch, compounding
    import random

    # Begin training by disabling other pipeline components
    with nlp_md.disable_pipes(*other_pipes) :

      sizes = compounding(1.0, 4.0, 1.001)
      # Training for 30 iterations
      for itn in range(30):
        # shuffle examples before training
        random.shuffle(train_data)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=sizes)
        # dictionary to store losses
        losses = {}
        for batch in batches:
          texts, annotations = zip(*batch)
          doc = nlp.make_doc(texts[0])
          spacy_entry = Example.from_dict(doc, annotations[0])
          # Calling update() over the iteration
          nlp_md.update([spacy_entry], sgd=optimizer, drop=0.35, losses=losses)
          print("Losses", losses)
        # random.shuffle(rehearse)
        # for raw_text, entity_offsets in rehearse:
        #   doc = nlp_md.make_doc(raw_text)
        #   example = Example.from_dict(doc,entity_offsets)
        #   nlp_md.rehearse([example],sgd=optimizer)

nlp.to_disk("../../data/models/spacy/code")


Losses {'ner': 587.816671035951}
Losses {'ner': 629.0669782317264}
Losses {'ner': 680.3494846069867}
Losses {'ner': 762.3984505268845}
Losses {'ner': 806.9999703111314}
Losses {'ner': 839.342910930684}
Losses {'ner': 895.7072057904684}
Losses {'ner': 1003.2232497902346}
Losses {'ner': 1023.9745774587267}
Losses {'ner': 1102.682226476216}
Losses {'ner': 1125.4257169029254}
Losses {'ner': 1302.893343617917}
Losses {'ner': 1322.9219930381364}
Losses {'ner': 1340.8177488766041}
Losses {'ner': 1536.7606172018204}
Losses {'ner': 1611.8914114106428}
Losses {'ner': 1650.6642842192882}
Losses {'ner': 1685.3306310063504}
Losses {'ner': 1800.552590148352}
Losses {'ner': 1813.6691235031926}
Losses {'ner': 1846.1021724618054}
Losses {'ner': 1903.668818478241}
Losses {'ner': 1917.6283925232315}
Losses {'ner': 1983.0518439421762}
Losses {'ner': 1994.9272618833363}
Losses {'ner': 2013.8959575703118}
Losses {'ner': 2069.31874041786}
Losses {'ner': 2092.267217419614}
Losses {'ner': 2221.7035829647593}
L

In [183]:
!python -m spacy benchmark accuracy "../../data/models/spacy/code" "../../data/annotated/dev.spacy"

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   85.29 
NER R   83.97 
NER F   84.62 
SPEED   20987 

[1m

             P       R       F
ORG      74.44   75.64   75.03
DEM      69.28   50.72   58.56
PERSON   91.26   91.26   91.26
DATE     92.19   93.46   92.82
GPE      88.62   79.56   83.85



In [180]:
# stage 2 - rehearsal
# ref: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

# Import and load the spacy model
import spacy
import json
from collections import Counter
nlp_md=spacy.load("../../data/models/spacy/code", enable=["ner"])
# nlp_md=spacy.load("en_core_web_md")
# nlp_lg=spacy.load("en_core_web_lg")
# nlp_trf=spacy.load("en_core_web_trf")

# Getting the ner component
ner = nlp_md.get_pipe("ner")

# Training examples in the required format
with open('../../data/annotated/rehearse_silver_sent_train.jsonl', "r", encoding="utf-8") as f2:

    rehearse = json.load(f2)

    # fetch and add labels
    count_per_entity_new = Counter()
    for _, annotations in rehearse:
        for ent in annotations.get("entities"):
           count_per_entity_new[ent[2]] +=1

    for k in count_per_entity_new:
        ner.add_label(k)

    # Resume training
    optimizer = nlp_md.resume_training()
    move_names = list(ner.move_names)

    # List of pipes you want to train
    pipe_exceptions = ["ner"]

    # List of pipes which should remain unaffected in training
    other_pipes = [pipe for pipe in nlp_md.pipe_names if pipe not in pipe_exceptions]

    # Importing requirements
    from spacy.util import minibatch, compounding
    import random

    # Begin training by disabling other pipeline components
    with nlp_md.disable_pipes(*other_pipes) :

      sizes = compounding(1.0, 4.0, 1.001)
      # Training for 30 iterations
      for itn in range(30):
        # shuffle examples before training
        random.shuffle(rehearse)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(rehearse, size=sizes)
        # dictionary to store losses
        losses = {}
        for batch in batches:
          texts, annotations = zip(*batch)
          doc = nlp.make_doc(texts[0])
          spacy_entry = Example.from_dict(doc, annotations[0])
          # Calling update() over the iteration
          nlp_md.rehearse([spacy_entry], sgd=optimizer, losses=losses)
          print("Losses", losses)
        # random.shuffle(rehearse)
        # for raw_text, entity_offsets in rehearse:
        #   doc = nlp_md.make_doc(raw_text)
        #   example = Example.from_dict(doc,entity_offsets)
        #   nlp_md.rehearse([example],sgd=optimizer)

nlp.to_disk("../../data/models/spacy/code/code-rehearse")

Losses {'ner': 1.8689805666605632}
Losses {'ner': 6.469605159759522}
Losses {'ner': 16.2473093855512}
Losses {'ner': 22.247847551303906}
Losses {'ner': 33.109042694721694}
Losses {'ner': 47.55709203008147}
Losses {'ner': 54.5108162519503}
Losses {'ner': 67.17891772051449}
Losses {'ner': 76.28926502815985}
Losses {'ner': 81.57012497407743}
Losses {'ner': 95.58951654746986}
Losses {'ner': 105.28572089877888}
Losses {'ner': 120.68903495502488}
Losses {'ner': 136.8572349954998}
Losses {'ner': 154.24307908248917}
Losses {'ner': 164.08489530936563}
Losses {'ner': 168.99545660196503}
Losses {'ner': 176.29801926352644}
Losses {'ner': 185.15297567435954}
Losses {'ner': 190.01778623893864}
Losses {'ner': 194.91515848880275}
Losses {'ner': 202.27526661982017}
Losses {'ner': 217.94647301700277}
Losses {'ner': 227.62196067484032}
Losses {'ner': 234.388779223641}
Losses {'ner': 243.6288383394559}
Losses {'ner': 256.70241156819037}
Losses {'ner': 270.6231336572322}
Losses {'ner': 287.2323440719964}
L

In [258]:
# try Method 2, rehearsing while updating

# this code has been modified from the original
# original ref: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

# Import requirements
import spacy
import json
from collections import Counter
from spacy.util import minibatch, compounding
import random
nlp_md=spacy.load("en_core_web_md", enable=["ner"])
# nlp_md=spacy.load("en_core_web_sm")
# nlp_lg=spacy.load("en_core_web_lg")
# nlp_trf=spacy.load("en_core_web_trf")

# Getting the ner component
ner = nlp_md.get_pipe("ner")

# Training examples in the required format
with open('../../data/annotated/echr_train_spacy.jsonl', "r", encoding="utf-8") as f1, open('../../data/annotated/rehearse_silver_sent_train.jsonl', "r", encoding="utf-8") as f2:

    train_data = json.load(f1)
    rehearse = json.load(f2)

    # fetch and add labels
    tab_ents =['DEM']
    for ent in tab_ents:
        ner.add_label(ent)

    # fetch and add labels for rehearsal
    # count_per_entity_new = Counter()
    # for _, annotations in rehearse:
    #     for ent in annotations.get("entities"):
    #        count_per_entity_new[ent[2]] +=1
    #
    # for k in count_per_entity_new:
    #     ner.add_label(k)

    # Resume training
    # ref: https://spacy.io/api/language#rehearse
    optimizer = nlp_md.resume_training()
    # move_names = list(ner.move_names)

    # List of pipes you want to train
    pipe_exceptions = ["ner"]

    # List of pipes which should remain unaffected in training
    other_pipes = [pipe for pipe in nlp_md.pipe_names if pipe not in pipe_exceptions]

    # Begin training by disabling other pipeline components
    with nlp_md.disable_pipes(*other_pipes) :
      sizes = compounding(1.0, 4.0, 1.001)

      # Training for 30 iterations
      for itn in range(30):

        # shuffle examples before training
        random.shuffle(train_data)
        random.shuffle(rehearse)

        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=sizes)
        batches2 = minibatch(rehearse, size=sizes)

        # dictionary to store losses
        losses = {}
        r_losses = {}

        '''
        Example
        >>> raw_text_batches = minibatch(raw_texts)
        >>> for labelled_batch in minibatch(examples):
        >>>     nlp.update(labelled_batch)
        >>>     raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
        >>>     nlp.rehearse(raw_batch)
        '''

        for batch in batches:
          texts, annotations = zip(*batch)
          doc = nlp_md.make_doc(texts[0])
          spacy_entry = Example.from_dict(doc, annotations[0])
          # Calling update() over the iteration
          nlp_md.update([spacy_entry], sgd=optimizer, drop=0.35, losses=losses)
          raw_batch = [Example.from_dict(nlp_md.make_doc(text[0]), {}) for text in next(batches2)]
          nlp_md.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
          print("Losses Update ", itn, losses)
          print("Losses Rehearsal ", itn, r_losses)


nlp.to_disk("../../data/models/spacy/code-rehearse")


Losses Update  0 {'ner': 320.4926636480959}
Losses Rehearsal  0 {'ner': 8.881138795286745}
Losses Update  0 {'ner': 630.8037558966316}
Losses Rehearsal  0 {'ner': 16.616528728331403}
Losses Update  0 {'ner': 808.2258282176153}
Losses Rehearsal  0 {'ner': 24.932976998688975}
Losses Update  0 {'ner': 1038.7047583584026}
Losses Rehearsal  0 {'ner': 34.78655114280032}
Losses Update  0 {'ner': 1078.753635403684}
Losses Rehearsal  0 {'ner': 41.39555964494542}
Losses Update  0 {'ner': 1147.2179577117267}
Losses Rehearsal  0 {'ner': 48.825309125624926}
Losses Update  0 {'ner': 1200.0641468633187}
Losses Rehearsal  0 {'ner': 57.04939542411918}
Losses Update  0 {'ner': 1250.8126078264313}
Losses Rehearsal  0 {'ner': 64.75556754281267}
Losses Update  0 {'ner': 1387.3654764897647}
Losses Rehearsal  0 {'ner': 71.91297648417242}
Losses Update  0 {'ner': 1425.6881966659007}
Losses Rehearsal  0 {'ner': 79.22220529044102}
Losses Update  0 {'ner': 1455.324885173281}
Losses Rehearsal  0 {'ner': 86.040124

In [262]:
!python -m spacy benchmark accuracy "../../data/models/spacy/code-rehearse" "../../data/annotated/dev.spacy"

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK      100.00
TAG      -     
POS      -     
MORPH    -     
LEMMA    -     
UAS      -     
LAS      -     
NER P    31.04 
NER R    49.66 
NER F    38.20 
SENT P   -     
SENT R   -     
SENT F   -     
SPEED    9031  

[1m

                  P       R       F
CARDINAL       0.00    0.00    0.00
GPE           40.98   61.31   49.12
ORG            7.83   12.36    9.59
LAW            0.00    0.00    0.00
WORK_OF_ART    0.00    0.00    0.00
NORP           0.00    0.00    0.00
PERSON        29.34   28.74   29.03
DATE          82.44   90.83   86.43
PRODUCT        0.00    0.00    0.00
EVENT          0.00    0.00    0.00
ORDINAL        0.00    0.00    0.00
DEM            0.00    0.00    0.00
TIME           0.00    0.00    0.00
MONEY          0.00    0.00    0.00
PERCENT        0.00    0.00    0.00
LOC            0.00    0.00    0.00
FAC            0.00    0.0

In [261]:
import spacy

nlp = spacy.load('../../data/models/spacy/code-rehearse')

text = ['''SIDDHARTH RAGHUVANSHI                                Roll No. 06CS3025                                            DOB: 08/08/1988
Email: siddharth.iitkharagpur@gmail.com                                                                                          Mobile No.:   +91 9932584135
Degree/Certificate
Dual Degree[B. Tech (H) + M. Tech]
(Computer Science & Engineering)
Class XII:  C.B.S.E.
Class X:   C.B.S.E.
ACADEMIC ACHIEVEMENTS
Institute/ School, City
Indian Institute of Technology, Kharagpur
Central Hindu School, Varanasi
St. Atulanand Convent School, Varanasi
CGPA/ %  Completion
8.26/10
86.0%
90.8%
2011
2005
2003
Competitive
Examinations
  All India Rank 116 in AIEEE, 2006 among 470,000 students, State Rank 8 in Uttar Pradesh.
  All India Rank 119 in 7th National Science Olympiad, 2005.
  All India Rank 22 in All India Level Mathematics & Science Test organized by Central Institute  for
Proficiency in English Language (CIPEL).
Scholastic
Achievements
  National top 1% out of 26968 candidates appeared in National Standard Examination in Physics’05
  Receiving CBSE Merit Scholarship for the past 4 years.
ACADEMIC PROJECTS
M. Tech Project                                                                                      IIT Kharagpur                                           May’10-Nov’10
•
•
Studied the performance of text indexing algorithms on Hadoop MapReduce architecture.
Future work includes implementing more efficient indexing and retrieval techniques in MapReduce for distributed parallel
computing.
B. Tech Project                                                                                        IIT Kharagpur                                           Aug’09-May’10
  Developed a software with can handle all sorts of query related to geographical information extracted from maps.
  Developed a client interface which can fetch data from different incompatible geospatial web services and make that data
compatible for resolving queries.
Integrated my framework engine with different underlying heterogeneous spatial databases.

Static Instrumentation Of Java Programs                                          IIT Kharagpur                                                   May’08
  Developed a program using Byte Code Engineering Library to do automated testing of java program at byte code level.
WORK EXPERIENCE / INTERNSHIP
Extreme Blue Internship Program                                                                   ISL, IBM, Pune, India                                       May’09 – July’09
Business
Perspective
Technical
Perspective
 Achievements
  Conducted survey in Pune region on the current home delivery status of organized retails

Proposed and implemented a solution on how to increase home delivery sales in order to compete with the
localized general (kirana) stores

Built an independent Home Delivery module on Java EE platform using open standards such as XML and
Web Services
Integrated the Home Delivery module with IBM WebSphere Commerce.

  Received highest grade 10/10 in summer internship evaluation at IIT Kharagpur, 2009.
RELEVANT COURSES TAKEN
  Machine learning
  Algorithms-I
  Algorithms-II
Information Retrieval

  Distributed Systems

Probability and Statistics
POSITION OF RESPONSIBILITY

Student coordinator of IIT Kharagpur Student Counselling Service.
  Student member of team that conceptualized and publicized Counselling Centre in IIT Kharagpur after 5 successive suicides
in the campus within a span of 6 months in between Feb’09 and Jul’09.
  More than 100 students are counselled every month.
  No mishaps in the campus as of Sep’10 after the establishment of the centre.
  Went through Gate Keepers Training to identify behavioral change in a person.
  Managed  the  systems  team  of  Bitwise-2010,  an  international  algorithmic  intensive  programming  contest  leading  to  the
participation of 3000 teams across 75 countries.

Family Sub-head of accommodation team in Spring Fest, 2008.
  Head boy of my Senior Secondary School (Central Hindu School).
e
EXTRA CURRICULAR ACHIEVEMENTS
  Member of Silver winning team in inter hall OPENSOFT Competition in the session 2007-08.
  National Sports Organization: Among Top 30 students in Lawn Tennis Team at IIT Kharagpur’06. ''']

for doc in nlp.pipe(text):
    print([(ent.text, ent.label_) for ent in doc.ents])

[('SIDDHARTH', 'ORG'), ('Computer Science & Engineering', 'ORG'), ('School, City', 'GPE'), ('Indian Institute of Technology', 'ORG'), ('Kharagpur\nCentral Hindu School', 'ORG'), ('Varanasi', 'GPE'), ('St. Atulanand Convent School', 'ORG'), ('Varanasi', 'GPE'), ('8.26/10', 'CARDINAL'), ('86.0%', 'PERCENT'), ('90.8%', 'PERCENT'), ('2011\n2005', 'DATE'), ('2003', 'DATE'), ('Competitive\nExaminations', 'ORG'), ('All India Rank', 'ORG'), ('116', 'PRODUCT'), ('AIEEE', 'ORG'), ('2006', 'DATE'), ('470,000', 'CARDINAL'), ('State Rank 8', 'ORG'), ('Uttar', 'GPE'), ('All India Rank', 'ORG'), ('119', 'CARDINAL'), ('7th', 'ORDINAL'), ('National Science Olympiad', 'ORG'), ('2005', 'DATE'), ('All India Rank', 'ORG'), ('All India Level Mathematics & Science Test', 'ORG'), ('Central Institute', 'ORG'), ('English', 'LANGUAGE'), ('Scholastic', 'NORP'), ('National', 'ORG'), ('1%', 'PERCENT'), ('26968', 'DATE'), ('National Standard Examination', 'ORG'), ('Receiving CBSE Merit Scholarship', 'PERSON'), ('the

In [208]:
# import spacy
# import random
#
# # Load the pre-trained model
# model_name = "en_core_web_sm"
# nlp = spacy.load(model_name)
#
# with open('../../data/annotated/echr_train_spacy.jsonl', "r", encoding="utf-8") as f1, open('../../data/annotated/rehearse_silver_sent_train.jsonl', "r", encoding="utf-8") as f2:
#
#     training_data = json.load(f1)
#     rehearse = json.load(f2)
#
#     # fetch and add labels
#     tab_ents =['DEM']
#     for ent in tab_ents:
#         ner.add_label(ent)
#
#     # fetch and add labels for rehearsal
#     count_per_entity_new = Counter()
#     for _, annotations in rehearse:
#         for ent in annotations.get("entities"):
#            count_per_entity_new[ent[2]] +=1
#
#     for k in count_per_entity_new:
#         ner.add_label(k)
#
#     # Extract the text and annotations from the training data
#     texts = [example[0] for example in training_data]
#     annotations = [example[1] for example in training_data]
#     texts_rehearse = [example2[0] for example2 in rehearse]
#     annotations_rehearse = [example2[1] for example2 in rehearse]
#
#     # Prepare the data in the format expected by spaCy
#     train_data = list(zip(texts, [{"entities": annots} for annots in annotations]))
#     rehearse = list(zip(texts_rehearse, [{"entities": annots2} for annots2 in annotations_rehearse]))
#
#     # Set random seed for reproducibility
#     random.seed(42)
#
#     # Disable other pipeline components except NER
#     disable_components = [component for component in nlp.pipe_names if component != "ner"]
#
#     # Train the NER component only
#     with nlp.select_pipes(enable=disable_components):
#         optimizer = nlp.resume_training()
#
#         # Prevent catastrophic forgetting
#         nlp.initialize(rehearse)
#
#         for _ in range(10):  # You can adjust the number of iterations based on your dataset size
#             random.shuffle(train_data)
#             losses = {}
#             for text, annotations in train_data:
#                 nlp.update([text], [annotations], drop=0.5, losses=losses)
#             print(losses)
#
#     # Save the fine-tuned model
#     nlp.to_disk("./fine_tuned_ner_model")


TypeError: [E930] Received invalid get_examples callback in `Language.initialize`. Expected function that returns an iterable of Example objects but got: <class 'list'>