In [1]:
#import required libraries
import spacy
import random
import time
import numpy as np
import sys
from spacy import displacy
from itertools import chain
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

In [2]:
def load_data_spacy(file_path, encoding="utf-8"):
    '''Convert a token-per-line TSV file into spaCy-style training examples.

    Input format: one "word<TAB>label" pair per line. A line without a tab
    (normally an empty line) ends the current sentence.

    Output format: a list of [sentence, {'entities': [(start, end, label), ...]}]
    where the sentence is the words joined by single spaces and start/end are
    character offsets into that sentence (end is exclusive).

    Every label other than "O" gets the suffix "_Disease". Each token tagged
    "B" or "I" is emitted as its own entity; consecutive B/I tokens are NOT
    merged into one span. Sentences without any entity are dropped.

    Args:
        file_path: path of the TSV file to read.
        encoding: text encoding used to decode the file.

    Returns:
        (training_data, unique_labels): the examples described above and the
        list of distinct non-"O" labels in order of first appearance.
    '''
    training_data, unique_labels = [], []
    words, entities = [], []
    offset = 0  # character offset at which the next word starts

    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            fields = line.strip("\n").split("\t")

            # Lines with more than one field are "word<TAB>label" rows
            if len(fields) > 1:
                word, tag = fields[0], fields[1]
                # Compare against the letter "O" (outside), not the digit zero,
                # so outside tokens never turn into a bogus "O_Disease" label
                label = tag if tag == 'O' else tag + "_Disease"

                start = offset
                end = start + len(word)
                words.append(word)
                offset = end + 1  # +1 for the space that will follow the word

                if label in ('B_Disease', 'I_Disease'):
                    entities.append((start, end, label))

                if label != 'O' and label not in unique_labels:
                    unique_labels.append(label)

            # Any other line is a break between sentences
            else:
                if entities:
                    training_data.append([" ".join(words), {'entities': entities}])
                # reset the per-sentence state
                words, entities = [], []
                offset = 0

    # Keep the final sentence even when the file has no trailing blank line
    if entities:
        training_data.append([" ".join(words), {'entities': entities}])

    return training_data, unique_labels

In [3]:
# Directory holding the BC5CDR-disease TSV splits; change it here only.
DATA_DIR = "C:\\NERdata\\BC5CDR-disease"

TRAIN_DATA, LABELS = load_data_spacy(DATA_DIR + "\\train.tsv")
print(TRAIN_DATA[:2])  # preview a couple of examples instead of dumping the whole corpus
print(len(TRAIN_DATA))

TEST_DATA, _ = load_data_spacy(DATA_DIR + "\\test.tsv")
print(len(TEST_DATA))

# NOTE(review): "train_dev.tsv" presumably contains the training sentences as
# well (the name suggests train + dev). If so, validation scores are inflated;
# confirm and consider a dev-only split.
VALID_DATA, _ = load_data_spacy(DATA_DIR + "\\train_dev.tsv")
print(len(VALID_DATA))


2658


2842
5385


In [6]:
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
nlp = spacy.blank("en")  # blank English pipeline, used here only to create Doc objects


def save_docbin(nlp, examples, output_path):
    """Serialize annotated examples to a spaCy DocBin file.

    Args:
        nlp: spaCy pipeline whose ``make_doc`` turns each text into a Doc.
        examples: iterable of ``[text, {"entities": [(start, end, label), ...]}]``
            items with character offsets into ``text``.
        output_path: destination of the ``.spacy`` file.

    An entity whose character span cannot be mapped onto tokens
    (``char_span`` returns None) is reported and left out.
    """
    doc_bin = DocBin()
    for text, annot in tqdm(examples):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            # "contract" shrinks a misaligned character span to the tokens fully inside it
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping Entity")
            else:
                ents.append(span)
        doc.ents = ents  # label the text with the ents
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)


save_docbin(nlp, TRAIN_DATA, "./train.spacy")
save_docbin(nlp, VALID_DATA, "./valid.spacy")

100%|████████████████████████████████████████████████████████████████████████████| 2658/2658 [00:02<00:00, 1188.86it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5385/5385 [00:03<00:00, 1539.03it/s]


In [2]:
# Expand the partial ./base_config.cfg into a complete training config and
# save it as ./config.cfg (shell command run through the notebook's "!" escape).
!python -m spacy init fill-config ./base_config.cfg ./config.cfg

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


2022-01-12 17:47:33.477631: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-01-12 17:47:33.478026: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [None]:
# Train the pipeline described by ./config.cfg on train.spacy, evaluating on
# valid.spacy; results are written under ./ner_demo/training/ (the evaluation
# cell loads "model-best" from that directory).
!python -m spacy train ./config.cfg --verbose --output ./ner_demo/training/ --paths.train train.spacy --paths.dev valid.spacy 

In [None]:
# spacy.load (module attribute), not the undefined name spacy_load
ner = spacy.load(R"ner_demo/training/model-best") #load the best model

# Keep the raw sentence strings here; the model is applied once, in the loop below
test_sentences = [x[0] for x in TEST_DATA[0:4000]] #extract the sentences from [sentence, entity]
for sentence in test_sentences:
    doc = ner(sentence)
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
    displacy.render(doc, jupyter=True, style="ent")
    