In [19]:
import json
import random
import spacy
from spacy.language import Language
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import DocBin
from tqdm import tqdm

In [20]:
def load_data(file):
    """Read a UTF-8 encoded JSON file and return the decoded Python object.

    Args:
        file: Path of the JSON file to read.

    Returns:
        Whatever the JSON document decodes to (list, dict, ...).
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.loads(handle.read())

In [21]:
def write_data(file, data):
    """Serialise `data` as JSON into `file` (UTF-8, 4-space indentation).

    Args:
        file: Destination path; an existing file is overwritten.
        data: Any JSON-serialisable object.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(obj=data, fp=handle, indent=4)

In [22]:
# Load the commodity names from the JSON file and print them as a quick
# sanity check of the input.
commodities = load_data("data/commodities.json")
print(commodities)

['aluminium', 'Aluminium', 'cattle', 'Cattle', 'cobalt', 'Cobalt', 'cocoa', 'Cocoa', 'coffee', 'Coffee', 'copper', 'Copper', 'corn', 'Corn', 'cotton', 'Cotton', 'crude oil', 'Crude oil', 'gold', 'Gold', 'iron', 'Iron', 'lithium', 'Lithium', 'natural gas', 'Natural gas', 'palm oil', 'Palm oil', 'poultry', 'Poultry', 'rice', 'Rice', 'silver', 'Silver', 'sugar', 'Sugar', 'wheat', 'Wheat', 'zinc', 'Zinc']


In [23]:
def create_patterns(file, type):
    """Turn every entry of a JSON list file into an entity-ruler pattern.

    Args:
        file: Path of a JSON file; each item it contains becomes one pattern.
        type: Label stored under the "label" key of every pattern.

    Returns:
        A list of dicts of the form {"label": type, "pattern": item}, in the
        same order as the items in the file.
    """
    return [{"label": type, "pattern": entry} for entry in load_data(file)]

In [24]:
def generate_rules(patterns, output_dir="commodities_ner"):
    """Build a rule-based English pipeline from `patterns` and save it to disk.

    A fresh `English` pipeline gets an "entity_ruler" component, the patterns
    are registered on that ruler, and the pipeline is serialised.

    Args:
        patterns: List of pattern dicts (each with "label" and "pattern"
            keys) to hand to the entity ruler.
        output_dir: Directory the pipeline is written to. Defaults to
            "commodities_ner", the location used previously.
    """
    nlp = English()
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    nlp.to_disk(output_dir)

In [25]:
# Build one "COMMODITY" pattern per entry in the commodities file and print
# the resulting list for inspection.
patterns = create_patterns("data/commodities.json", "COMMODITY")
print(patterns)

[{'label': 'COMMODITY', 'pattern': 'aluminium'}, {'label': 'COMMODITY', 'pattern': 'Aluminium'}, {'label': 'COMMODITY', 'pattern': 'cattle'}, {'label': 'COMMODITY', 'pattern': 'Cattle'}, {'label': 'COMMODITY', 'pattern': 'cobalt'}, {'label': 'COMMODITY', 'pattern': 'Cobalt'}, {'label': 'COMMODITY', 'pattern': 'cocoa'}, {'label': 'COMMODITY', 'pattern': 'Cocoa'}, {'label': 'COMMODITY', 'pattern': 'coffee'}, {'label': 'COMMODITY', 'pattern': 'Coffee'}, {'label': 'COMMODITY', 'pattern': 'copper'}, {'label': 'COMMODITY', 'pattern': 'Copper'}, {'label': 'COMMODITY', 'pattern': 'corn'}, {'label': 'COMMODITY', 'pattern': 'Corn'}, {'label': 'COMMODITY', 'pattern': 'cotton'}, {'label': 'COMMODITY', 'pattern': 'Cotton'}, {'label': 'COMMODITY', 'pattern': 'crude oil'}, {'label': 'COMMODITY', 'pattern': 'Crude oil'}, {'label': 'COMMODITY', 'pattern': 'gold'}, {'label': 'COMMODITY', 'pattern': 'Gold'}, {'label': 'COMMODITY', 'pattern': 'iron'}, {'label': 'COMMODITY', 'pattern': 'Iron'}, {'label': '

In [26]:
# Build the entity-ruler pipeline from the patterns and write it to disk.
generate_rules(patterns)

In [27]:
# Load the pipeline stored in the "commodities_ner" directory and bind it to
# the module-level name `nlp`.
nlp = spacy.load("commodities_ner")

In [28]:
def test_model(model, text):
    doc = nlp(text)
    results = []
    entities = []
    for ent in doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))
    if len(entities) > 0:
        results = [text, {"entities": entities}]
        return results

In [29]:
# Accumulator for the annotated examples; starts empty each time this cell runs.
TRAINING_DATA = []

In [30]:
# Annotate every article segment with the loaded pipeline and collect the
# segments that contain at least one entity.
#
# Each non-blank line of the input file has the form
#     <articleUuid>|||<segment>||<segment>||...
# The file is read explicitly as UTF-8 (matching load_data/write_data) so the
# character offsets do not depend on the platform's default encoding.
with open("data/ft-articles-training.txt", encoding="utf-8") as f:
    text = f.read()

articles = text.split("\n")
for article in articles:
    # A trailing newline (or stray empty line) yields an empty record that
    # cannot be unpacked below; skip it instead of crashing.
    if not article.strip():
        continue
    articleUuid, articleBodyText = article.split("|||")
    segments = articleBodyText.split("||")
    for segment in segments:
        results = test_model(nlp, segment)
        if results is not None:
            TRAINING_DATA.append(results)

print(TRAINING_DATA[0])
print(len(TRAINING_DATA))

['For much of the past two years, aluminium was the metal everyone loved to hate. While copper, the darling of hedge funds, has raced to one record after another, hitting $10,000 a tonne last week for the first time, aluminium, the most widely used metal after steel, lagged far behind.', {'entities': [(32, 41, 'COMMODITY'), (86, 92, 'COMMODITY'), (215, 224, 'COMMODITY')]}]
19220


In [31]:
# Persist the collected examples as JSON. The (start, end, label) tuples are
# written as JSON arrays, so they come back as lists when reloaded.
write_data("data/training_data.json", TRAINING_DATA)

In [32]:
# Rebind the module-level `nlp` to a blank English pipeline.
# NOTE(review): this replaces the pipeline loaded from "commodities_ner";
# anything that reads the global `nlp` from here on sees the blank pipeline.
nlp = spacy.blank("en")

In [33]:
# Reload the annotated examples from the JSON file.
training_data = load_data("data/training_data.json")

In [34]:
def create_spacy_format_data(DATA):
    """Convert (text, annotation) examples into a spaCy `DocBin`.

    Args:
        DATA: Iterable of two-item examples `(text, annotation)` where
            `annotation["entities"]` holds `(start, end, label)` character
            offsets into `text`.

    Returns:
        A `DocBin` with one `Doc` per example, built with the module-level
        `nlp` and carrying the entity spans that could be aligned to tokens.
    """
    doc_bin = DocBin()
    for text, annotation in tqdm(DATA):
        doc = nlp.make_doc(text)
        aligned_spans = []
        for start, end, label in annotation["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is not None:
                aligned_spans.append(span)
                continue
            # No span could be built for these offsets: report and drop it.
            print ("Skipping entity")
        doc.ents = aligned_spans
        doc_bin.add(doc)
    return doc_bin

In [35]:
# Convert the examples to a DocBin and save it in spaCy's binary format.
# NOTE(review): `training_data` is rebound from the loaded example list to the
# DocBin, so re-running this cell on its own feeds the DocBin back into
# create_spacy_format_data; re-run the load_data cell first.
training_data = create_spacy_format_data(training_data)
training_data.to_disk("data/training_data.spacy")

100%|███████████████████████████████████████████████████████████████████| 19220/19220 [00:03<00:00, 4983.94it/s]
