In [1]:
import json
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score
import spacy

In [2]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    try:
        training_data = []
        lines=[]
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    #dataturks indices are both inclusive [start, end] but spacy is not [start, end)
                    entities.append((point['start'], point['end'] + 1 ,label))


            training_data.append((text, {"entities" : entities}))

        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None

In [3]:
def train_spacy():

    TRAIN_DATA = convert_dataturks_to_spacy("traindata.json")
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
       

    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
            

    #test the model and evaluate it
    examples = convert_dataturks_to_spacy("testdata.json")
    tp=0
    tr=0
    tf=0

    ta=0
    c=0        
    for text,annot in examples:

        f=open("resume"+str(c)+".txt","w")
        doc_to_test=nlp(text)
        d={}
        for ent in doc_to_test.ents:
            d[ent.label_]=[]
        for ent in doc_to_test.ents:
            d[ent.label_].append(ent.text)

        for i in set(d.keys()):

            f.write("\n\n")
            f.write(i +":"+"\n")
            for j in set(d[i]):
                f.write(j.replace('\n','')+"\n")
        d={}
        for ent in doc_to_test.ents:
            d[ent.label_]=[0,0,0,0,0,0]
        for ent in doc_to_test.ents:
            doc_gold_text= nlp.make_doc(text)
            gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
            y_true = [ent.label_ if ent.label_ in x else 'Not '+ent.label_ for x in gold.ner]
            y_pred = [x.ent_type_ if x.ent_type_ ==ent.label_ else 'Not '+ent.label_ for x in doc_to_test]  
            if(d[ent.label_][0]==0):
                #f.write("For Entity "+ent.label_+"\n")   
                #f.write(classification_report(y_true, y_pred)+"\n")
                (p,r,f,s)= precision_recall_fscore_support(y_true,y_pred,average='weighted')
                a=accuracy_score(y_true,y_pred)
                d[ent.label_][0]=1
                d[ent.label_][1]+=p
                d[ent.label_][2]+=r
                d[ent.label_][3]+=f
                d[ent.label_][4]+=a
                d[ent.label_][5]+=1
        c+=1
    for i in d:
        print("\n For Entity "+i+"\n")
        print("Accuracy : "+str((d[i][4]/d[i][5])*100)+"%")
        print("Precision : "+str(d[i][1]/d[i][5]))
        print("Recall : "+str(d[i][2]/d[i][5]))
        print("F-score : "+str(d[i][3]/d[i][5]))

In [4]:
spacy.prefer_gpu()
train_spacy()

Statring iteration 0
{'ner': 6024.980022157161}
Statring iteration 1
{'ner': 4156.934329157899}
Statring iteration 2
{'ner': 3471.2245884451604}
Statring iteration 3
{'ner': 2950.4357274090185}
Statring iteration 4
{'ner': 2647.330996542007}
Statring iteration 5
{'ner': 2372.3683268549976}
Statring iteration 6
{'ner': 2151.2988492280506}
Statring iteration 7
{'ner': 1874.086188932862}
Statring iteration 8
{'ner': 1677.6490600345708}
Statring iteration 9
{'ner': 1612.822831661878}


  'recall', 'true', average, warn_for)



 For Entity Name

Accuracy : 99.83805668016194%
Precision : 0.9983831936194594
Recall : 0.9983805668016195
F-score : 0.9981113185060555

 For Entity Location

Accuracy : 99.27125506072875%
Precision : 0.9927657005623397
Recall : 0.9927125506072875
F-score : 0.9897446574315648

 For Entity Email Address

Accuracy : 99.43319838056681%
Precision : 1.0
Recall : 0.994331983805668
F-score : 0.9971579374746244

 For Entity Companies worked at

Accuracy : 98.78542510121457%
Precision : 1.0
Recall : 0.9878542510121457
F-score : 0.9938900203665988

 For Entity Designation

Accuracy : 99.83805668016194%
Precision : 1.0
Recall : 0.9983805668016195
F-score : 0.9991896272285252

 For Entity Graduation Year

Accuracy : 99.83805668016194%
Precision : 0.9983805668016195
Recall : 0.9983805668016195
F-score : 0.9983805668016195

 For Entity College Name

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0

 For Entity Skills

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0


In [None]:
# add labels
TRAIN_DATA = convert_dataturks_to_spacy("traindata.json")
    

In [9]:
for txt, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        print(f"L:{ent[2]} --> {txt[ent[0]:ent[1]]}")

L:Companies worked at --> Oracle
L:Companies worked at --> Oracle
L:Companies worked at --> Oracle
L:Skills --> Languages: Core Java, Go Lang, Data Structures & Algorithms, Oracle
PL-SQL programming, Sales Force with APEX.
Tools: RADTool, Jdeveloper, NetBeans, Eclipse, SQL developer,
PL/SQL Developer, WinSCP, Putty
Web Technologies: JavaScript, XML, HTML, Webservice

Operating Systems: Linux, Windows
Version control system SVN & Git-Hub
Databases: Oracle
Middleware: Web logic, OC4J
Product FLEXCUBE: Oracle FLEXCUBE Versions 10.x, 11.x and 12.x
L:Companies worked at --> Oracle
L:Skills --> APEX. (Less than 1 year), Data Structures (3 years), FLEXCUBE (5 years), Oracle (5 years),
Algorithms (3 years)

L:Graduation Year --> 2012
L:College Name --> Adithya Institute of Technology
L:Degree --> B.E in Computer Science Engineering
L:Graduation Year --> 2012
L:Companies worked at --> Oracle
L:Designation --> Associate Consultant
L:Companies worked at --> Oracle
L:Designation --> Staff Consulta