In [10]:
import json
import spacy

In [11]:
def load_data(file):
    """Parse the JSON document stored at ``file`` and return the result.

    Args:
        file: Path of the JSON file to read; it is opened as UTF-8 text.

    Returns:
        The decoded JSON value (whatever top-level type the file holds).
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

In [12]:
def write_data(file, data):
    """Serialise ``data`` as JSON into ``file``.

    Args:
        file: Destination path; opened for writing as UTF-8 text, so any
            existing content is replaced.
        data: JSON-serialisable object to store.

    The output is pretty-printed with a four-space indent.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(obj=data, fp=handle, indent=4)

In [13]:
def create_training_data(file, type):
    """Turn the entries of a JSON file into labelled pattern dictionaries.

    Args:
        file: Path of a JSON file whose top-level value is iterated over;
            it is read with ``load_data``.
        type: Label stored under the ``"label"`` key of every pattern.

    Returns:
        A list with one ``{"label": type, "pattern": entry}`` dict per entry,
        in the order the entries appear in the file.
    """
    return [{"label": type, "pattern": entry} for entry in load_data(file)]

In [14]:
def generate_rules(patterns, output_dir="commodities_ner"):
    """Build a rule-based entity pipeline from ``patterns`` and save it to disk.

    Args:
        patterns: List of pattern dicts of the form
            ``{"label": ..., "pattern": ...}``, e.g. the output of
            ``create_training_data``.
        output_dir: Directory the pipeline is written to. Defaults to
            ``"commodities_ner"``, the location previously hard-coded here.
    """
    # spacy.blank("en") creates an empty English pipeline using only the
    # `import spacy` this notebook already has; the `English` class that was
    # referenced here before is never imported in the visible cells.
    pipeline = spacy.blank("en")
    ruler = pipeline.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    pipeline.to_disk(output_dir)

In [15]:
# Build one COMMODITY-labelled pattern per entry of the commodities list,
# then print the result for inspection.
patterns = create_training_data(file="data/commodities.json", type="COMMODITY")
print(patterns)

[{'label': 'COMMODITY', 'pattern': 'aluminium'}, {'label': 'COMMODITY', 'pattern': 'Aluminium'}, {'label': 'COMMODITY', 'pattern': 'cattle'}, {'label': 'COMMODITY', 'pattern': 'Cattle'}, {'label': 'COMMODITY', 'pattern': 'cobalt'}, {'label': 'COMMODITY', 'pattern': 'Cobalt'}, {'label': 'COMMODITY', 'pattern': 'cocoa'}, {'label': 'COMMODITY', 'pattern': 'Cocoa'}, {'label': 'COMMODITY', 'pattern': 'coffee'}, {'label': 'COMMODITY', 'pattern': 'Coffee'}, {'label': 'COMMODITY', 'pattern': 'copper'}, {'label': 'COMMODITY', 'pattern': 'Copper'}, {'label': 'COMMODITY', 'pattern': 'corn'}, {'label': 'COMMODITY', 'pattern': 'Corn'}, {'label': 'COMMODITY', 'pattern': 'cotton'}, {'label': 'COMMODITY', 'pattern': 'Cotton'}, {'label': 'COMMODITY', 'pattern': 'crude oil'}, {'label': 'COMMODITY', 'pattern': 'Crude oil'}, {'label': 'COMMODITY', 'pattern': 'gold'}, {'label': 'COMMODITY', 'pattern': 'Gold'}, {'label': 'COMMODITY', 'pattern': 'iron'}, {'label': 'COMMODITY', 'pattern': 'Iron'}, {'label': '

In [16]:
# Load the pipeline stored in the "commodities_ner" directory (the location
# generate_rules saves to by default).
# NOTE(review): no visible cell calls generate_rules(patterns), so this
# presumably depends on a directory written by an earlier run — confirm that
# it exists on a fresh kernel.
nlp = spacy.load("commodities_ner")

In [17]:
def test_model(model, text):
    doc = nlp(text)
    results = []
    for ent in doc.ents:
        results.append(ent.text)
    return results

In [18]:
# Maps each article UUID to the list of entity strings found in its body.
ie_data = {}

# Read the corpus with an explicit encoding, matching the other file helpers
# in this notebook, so the result does not depend on the platform default.
with open("data/ft-articles-training.txt", encoding="utf-8") as f:
    text = f.read()

# Each non-empty line is "<uuid>|||<body>"; the body is then split into
# segments on "||".
articles = text.split("\n")
for article in articles:
    # A trailing newline or stray blank line produces an empty record that
    # cannot be unpacked into (uuid, body); skip it instead of crashing.
    if not article.strip():
        continue
    # Split on the first "|||" only, so the UUID is always the leading field
    # even if the separator sequence shows up again inside the body.
    articleUuid, articleBodyText = article.split("|||", 1)
    segments = articleBodyText.split("||")
    hits = []
    for segment in segments:
        hits.extend(test_model(nlp, segment))
    ie_data[articleUuid] = hits

In [19]:
# Spot-check: show the entity strings collected for one article UUID.
print(ie_data['c424190e-7e7f-11dc-8fac-0000779fd2ac'])

['ethanol', 'Cotton', 'corn', 'ethanol', 'corn', 'wheat', 'cotton', 'cotton', 'wheat', 'cotton', 'wheat', 'cotton', 'cotton', 'nickel', 'cotton', 'cotton']


In [20]:
# Save the article-UUID -> entity-list mapping as JSON via write_data.
write_data("data/entities_per_article_data.json", ie_data)