In [1]:
import json
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [2]:
def load_data(file):
    """Read the JSON document stored at `file` and return the decoded object.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

In [3]:
def write_data(file, data):
    """Serialise `data` as JSON into `file`, replacing any existing content.

    The output is UTF-8 encoded text, pretty-printed with a 4-space indent.
    Returns nothing.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)

In [4]:
# Load the pipeline named "commodities_ner_rules" (presumably a rule-based
# commodity NER pipeline saved earlier -- TODO confirm). extract_results() reads
# this module-level `nlp` to find entities.
nlp = spacy.load("commodities_ner_rules")

In [5]:
def extract_results(text):
    """Run the module-level `nlp` pipeline over `text` and collect its entities.

    Returns `[text, {"entities": [(start_char, end_char, label), ...]}]` when the
    pipeline finds at least one entity, and `None` when it finds none.
    """
    spans = [(ent.start_char, ent.end_char, ent.label_) for ent in nlp(text).ents]
    if not spans:
        # Nothing was recognised: signal "no training example" to the caller.
        return None
    return [text, {"entities": spans}]

In [6]:
# Accumulates the [text, {"entities": [...]}] records returned by
# extract_results() for every segment that contains at least one entity.
TRAINING_DATA = []

In [7]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every example.
TRAINING_DATA.clear()

# Each non-blank line is expected to look like "<uuid>|||<segment>||<segment>||...".
# The file is decoded as UTF-8 explicitly: the articles contain non-ASCII
# punctuation, which the platform default encoding may mis-decode.
with open("data/ft-articles-training.txt", encoding="utf-8") as f:
    # Stream line by line instead of loading the whole corpus into memory.
    for line_number, line in enumerate(f, start=1):
        article = line.rstrip("\n")
        if not article.strip():
            # Blank lines (e.g. a trailing newline at end of file) carry no article.
            continue
        fields = article.split("|||")
        if len(fields) != 2:
            raise ValueError(
                f"data/ft-articles-training.txt line {line_number}: expected "
                f"'<uuid>|||<body>' but found {len(fields)} field(s)"
            )
        _article_uuid, article_body_text = fields
        for segment in article_body_text.split("||"):
            results = extract_results(segment)
            # extract_results() returns None for segments without entities.
            if results is not None:
                TRAINING_DATA.append(results)

print(TRAINING_DATA[0])
print(len(TRAINING_DATA))

['China Hongqiao, the world’s largest producer of aluminium by output, used its latest annual report to warn that fierce competition in its home market would lead to “survival of the fittest”.', {'entities': [(48, 57, 'COMMODITY')]}]
16648


In [8]:
# Persist the annotated examples as JSON. The (start, end, label) tuples are
# serialised as JSON arrays, so they come back as lists when reloaded.
write_data("data/training_data.json", TRAINING_DATA)

In [9]:
# Blank English pipeline; create_spacy_format_data() uses its make_doc() to
# tokenise the training texts.
# NOTE(review): this rebinds the module-level `nlp` that extract_results() also
# reads. Re-running the extraction cell after this one would use the blank
# pipeline instead of "commodities_ner_rules" -- consider a distinct name.
nlp = spacy.blank("en")

In [10]:
# Reload the examples written to data/training_data.json; each item is
# [text, {"entities": [[start, end, label], ...]}].
training_data = load_data("data/training_data.json")

In [11]:
def create_spacy_format_data(DATA):
    """Pack annotated examples into a spaCy `DocBin`.

    `DATA` is an iterable of `(text, annotation)` pairs, where
    `annotation["entities"]` holds `(start, end, label)` character offsets.
    Each text is tokenised with the module-level `nlp`; offsets that cannot be
    mapped to a token span are reported and dropped.

    Returns the populated `DocBin`.
    """
    doc_bin = DocBin()
    for text, annotation in tqdm(DATA):
        doc = nlp.make_doc(text)
        spans = []
        for start, end, label in annotation["entities"]:
            # "contract" snaps misaligned offsets inward to token boundaries;
            # char_span returns None when no token span is left.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is not None:
                spans.append(span)
                continue
            print ("Skipping entity")
        doc.ents = spans
        doc_bin.add(doc)
    return doc_bin

In [12]:
# Convert the examples to a DocBin and save it as data/training_data.spacy.
# NOTE(review): `training_data` is rebound from the loaded list to the DocBin,
# so this cell cannot be re-run without first re-running the load_data cell.
training_data = create_spacy_format_data(training_data)
training_data.to_disk("data/training_data.spacy")

100%|██████████████████████████████████████████████████████████████████████████████████████████| 16648/16648 [00:03<00:00, 5056.86it/s]
