In [38]:
import random
import pandas
import json
import re

In [39]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    training_data = []
    lines=[]
    with open(dataturks_JSON_FilePath, 'r', encoding="utf8") as f:
        lines = f.readlines()

    for line in lines:
        data = json.loads(line)
        text = data['content'].replace("\n", " ")
        entities = []
        data_annotations = data['annotation']
        if data_annotations is not None:
            for annotation in data_annotations:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    point_start = point['start']
                    point_end = point['end']
                    point_text = point['text']

                    lstrip_diff = len(point_text) - len(point_text.lstrip())
                    rstrip_diff = len(point_text) - len(point_text.rstrip())
                    if lstrip_diff != 0:
                        point_start = point_start + lstrip_diff
                    if rstrip_diff != 0:
                        point_end = point_end - rstrip_diff
                    entities.append((point_start, point_end + 1 , label))
        training_data.append((text, {"entities" : entities}))
    return training_data

def trim_entity_spans(data: list) -> list:
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

In [40]:
data = trim_entity_spans(convert_dataturks_to_spacy("Entity Recognition in Resumes.json"))
data[0]

["Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

In [41]:
!pip install spacy==2.1.4

Collecting spacy==2.1.4
  Using cached spacy-2.1.4.tar.gz (29.8 MB)
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'error'


  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [367 lines of output]
      Collecting setuptools
        Using cached setuptools-65.3.0-py3-none-any.whl (1.2 MB)
      Collecting wheel>0.32.0.<0.33.0
        Using cached wheel-0.37.1-py2.py3-none-any.whl (35 kB)
      Collecting Cython
        Using cached Cython-0.29.32-py2.py3-none-any.whl (986 kB)
      Collecting cymem<2.1.0,>=2.0.2
        Using cached cymem-2.0.6-cp310-cp310-win_amd64.whl (36 kB)
      Collecting preshed<2.1.0,>=2.0.1
        Using cached preshed-2.0.1.tar.gz (113 kB)
        Preparing metadata (setup.py): started
        Preparing metadata (setup.py): finished with status 'done'
      Collecting murmurhash<1.1.0,>=0.28.0
        Using cached murmurhash-1.0.8-cp310-cp310-win_amd64.whl (18 kB)
      Collecting thinc==7.0.0.dev6
        Using cached thinc-7.0.0.dev6.tar.gz (1.9 MB)
        Preparing metadata (setup.py): star

In [42]:

model = spacy.blank('en')

def train(data) :
    if 'ner' not in model.pipe_names :
        model.add_pipe('ner', last = True)
        ner = model.get_pipe(name = 'ner')
        #print('yes')
    
    for _, annotation in data :
        for entity in annotation['entities'] :
            ner.add_label(entity[2])
            #print(entity[2])

    other_pipes = [pipe for pipe in model.pipe_names if pipe != 'ner']
    print(other_pipes)
    with model.disable_pipes(*other_pipes) :
        optimizer = model.begin_training()
        for i in range(10) :
            print(f"Iteration : {i}")
            #print(data[0])
            random.shuffle(data)
            losses = {}
            index = 0
            for text, annotations in data :
                try :
                    model.update(
                        [text],
                        [annotations],
                        drop = 0.2,
                        sgd = optimizer,
                        losses = losses
                    )
                except Exception as e:
                    print(e)
                    break
            print(losses)

train(data)

[]
Iteration : 0
[E989] `nlp.update()` was called with two positional arguments. This may be due to a backwards-incompatible change to the format of the training data in spaCy 3.0 onwards. The 'update' function should now be called with a batch of Example objects, instead of `(text, annotation)` tuples. 
{}
Iteration : 1
[E989] `nlp.update()` was called with two positional arguments. This may be due to a backwards-incompatible change to the format of the training data in spaCy 3.0 onwards. The 'update' function should now be called with a batch of Example objects, instead of `(text, annotation)` tuples. 
{}
Iteration : 2
[E989] `nlp.update()` was called with two positional arguments. This may be due to a backwards-incompatible change to the format of the training data in spaCy 3.0 onwards. The 'update' function should now be called with a batch of Example objects, instead of `(text, annotation)` tuples. 
{}
Iteration : 3
[E989] `nlp.update()` was called with two positional arguments. T