In [1]:
import json
import random
import spacy
from spacy.language import Language
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import DocBin
from tqdm import tqdm

In [2]:
def load_data(file):
    """Parse the UTF-8 encoded JSON document stored at ``file``.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file's JSON content.
    """
    with open(file, mode="r", encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed

In [3]:
def write_data(file, data):
    """Serialise ``data`` as JSON into ``file``, replacing any existing content.

    The file is opened as UTF-8 and the JSON is pretty-printed with a
    four-space indent.

    Args:
        file: Destination path.
        data: JSON-serialisable object to store.
    """
    with open(file, mode="w", encoding="utf-8") as sink:
        json.dump(obj=data, fp=sink, indent=4)

In [4]:
# Load the commodity names from the JSON file and print the full list as a sanity check.
commodities = load_data("data/commodities.json")
print(commodities)

['aluminium', 'Aluminium', 'amber', 'Amber', 'Brent crude', 'cattle', 'Cattle', 'cobalt', 'Cobalt', 'cocoa', 'Cocoa', 'coffee', 'Coffee', 'copper', 'Copper', 'corn', 'Corn', 'cotton', 'Cotton', 'crude oil', 'Crude oil', 'ethanol', 'Ethanol', 'gold', 'Gold', 'grain', 'Grain', 'heating oil', 'Heating oil', 'hogs', 'Hogs', 'iron', 'Iron', 'lead', 'Lead', 'lithium', 'Lithium', 'milk', 'Milk', 'molybdenum', 'Molybdenum', 'natural gas', 'Natural gas', 'nickel', 'Nickel', 'oats', 'Oats', 'palladium', 'Palladium', 'palm oil', 'Palm oil', 'platinum', 'Platinum', 'poultry', 'Poultry', 'propane', 'Propane', 'rapeseed', 'Rapeseed', 'rice', 'Rice', 'rubber', 'Rubber', 'silver', 'Silver', 'soybeans', 'Soybeans', 'soya beans', 'Soya beans', 'sugar', 'Sugar', 'tin', 'Tin', 'wheat', 'Wheat', 'wool', 'Wool', 'zinc', 'Zinc']


In [5]:
def create_training_data(file, type):
    """Turn every entry of a JSON list file into a labelled pattern dict.

    Args:
        file: Path of a JSON file, read with ``load_data``, whose items are
            used as the pattern values.
        type: Label stored under ``"label"`` in every pattern. The name
            shadows the builtin inside this function; it is kept because it is
            part of the signature.

    Returns:
        A list with one ``{"label": type, "pattern": item}`` dict per item, in
        file order.
    """
    return [{"label": type, "pattern": entry} for entry in load_data(file)]

In [6]:
def generate_rules(patterns, output_dir="commodities_ner"):
    """Build a rule-based entity pipeline from ``patterns`` and save it to disk.

    A fresh ``English()`` pipeline gets a single ``entity_ruler`` component
    loaded with ``patterns``; the pipeline is then written with ``to_disk``.

    Args:
        patterns: Pattern dicts handed to the entity ruler's ``add_patterns``.
        output_dir: Directory the pipeline is written to. Defaults to
            ``"commodities_ner"``, the path the notebook later passes to
            ``spacy.load``.

    Returns:
        None. The pipeline is only persisted, not returned.
    """
    pipeline = English()
    ruler = pipeline.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    pipeline.to_disk(output_dir)

In [7]:
# Build one COMMODITY-labelled pattern per commodity name and print them for inspection.
patterns = create_training_data("data/commodities.json", "COMMODITY")
print(patterns)

[{'label': 'COMMODITY', 'pattern': 'aluminium'}, {'label': 'COMMODITY', 'pattern': 'Aluminium'}, {'label': 'COMMODITY', 'pattern': 'amber'}, {'label': 'COMMODITY', 'pattern': 'Amber'}, {'label': 'COMMODITY', 'pattern': 'Brent crude'}, {'label': 'COMMODITY', 'pattern': 'cattle'}, {'label': 'COMMODITY', 'pattern': 'Cattle'}, {'label': 'COMMODITY', 'pattern': 'cobalt'}, {'label': 'COMMODITY', 'pattern': 'Cobalt'}, {'label': 'COMMODITY', 'pattern': 'cocoa'}, {'label': 'COMMODITY', 'pattern': 'Cocoa'}, {'label': 'COMMODITY', 'pattern': 'coffee'}, {'label': 'COMMODITY', 'pattern': 'Coffee'}, {'label': 'COMMODITY', 'pattern': 'copper'}, {'label': 'COMMODITY', 'pattern': 'Copper'}, {'label': 'COMMODITY', 'pattern': 'corn'}, {'label': 'COMMODITY', 'pattern': 'Corn'}, {'label': 'COMMODITY', 'pattern': 'cotton'}, {'label': 'COMMODITY', 'pattern': 'Cotton'}, {'label': 'COMMODITY', 'pattern': 'crude oil'}, {'label': 'COMMODITY', 'pattern': 'Crude oil'}, {'label': 'COMMODITY', 'pattern': 'ethanol'},

In [8]:
# Build the entity-ruler pipeline from the patterns and write it to disk.
generate_rules(patterns)

In [9]:
# Load the saved rule-based pipeline; `nlp` refers to it until the name is rebound further down.
nlp = spacy.load("commodities_ner")

In [10]:
def test_model(model, text):
    doc = nlp(text)
    results = []
    entities = []
    for ent in doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))
    if len(entities) > 0:
        results = [text, {"entities": entities}]
        return results

In [11]:
# Accumulators for the labelled segments; the next cell appends to both lists.
TRAINING_DATA = []
VALIDATION_DATA = []

In [12]:
# Label every article segment with the rule-based pipeline and deal the labelled
# segments alternately into the training and validation lists.
# Explicit UTF-8 matches load_data/write_data; the platform default encoding could
# otherwise mis-decode the non-ASCII characters present in the article text.
with open("data/ft-commodities-articles.txt", encoding="utf-8") as f:
    articles = f.read().split("\n")

for article in articles:
    if not article.strip():
        # A trailing newline or blank line yields a record with no "|||" to unpack.
        continue
    # Each record is "<uuid>|||<body>", and the body is "||"-separated segments.
    articleUuid, articleBodyText = article.split("|||")
    for segment in articleBodyText.split("||"):
        results = test_model(nlp, segment)
        if results is not None:
            # Alternate destinations so the two lists stay within one item of each other.
            if len(TRAINING_DATA) > len(VALIDATION_DATA):
                VALIDATION_DATA.append(results)
            else:
                TRAINING_DATA.append(results)

print(TRAINING_DATA[0])
print(len(TRAINING_DATA))

print(VALIDATION_DATA[0])
print(len(VALIDATION_DATA))

['China giveth and China taketh away. That certainly resonates with those taking a punt on aluminium. The metals market is focused on one huge long position built up recently. It is thought to be equivalent to between 50 and 80 per cent of all the aluminium sitting in London Metal Exchange’s warehouses, just over 800,000 tonnes. There is also an unusually large number of March 2007 aluminium calls at strike prices of $3,000 a tonne or more outstanding.', {'entities': [(89, 98, 'COMMODITY'), (246, 255, 'COMMODITY'), (383, 392, 'COMMODITY')]}]
1433
['Yet in spite of several runs up towards $3,000, spot aluminium has not breached that level and now sits at less than $2,800. The squeeze means even high-cost producers can keep their smelters running, and metal has been flooding into LME warehouses, with inventories rising by almost 70,000 tonnes in the past month alone. Producer stocks, held outside the LME, have also seen a big increase across the world.', {'entities': [(53, 62, 'COMMODITY

In [13]:
# Persist both splits as JSON so the later cells can reload them from disk.
write_data("data/commodities_training_data.json", TRAINING_DATA)
write_data("data/commodities_validation_data.json", VALIDATION_DATA)

In [14]:
# Rebind `nlp` to a blank English pipeline; create_spacy_format_data calls nlp.make_doc on it.
# After this cell `nlp` no longer refers to the loaded "commodities_ner" pipeline.
nlp = spacy.blank("en")

In [16]:
# Reload the two splits from the JSON files written earlier.
commodities_training_data = load_data("data/commodities_training_data.json")
commodities_validation_data = load_data("data/commodities_validation_data.json")

In [17]:
def create_spacy_format_data(TRAINING_DATA):
    """Pack ``[text, {"entities": [...]}]`` examples into a ``DocBin``.

    Every text is tokenised with the notebook-level ``nlp`` pipeline. Each
    ``(start, end, label)`` entity is converted to a span with
    ``doc.char_span(..., alignment_mode="contract")``; when that returns
    ``None`` the entity is reported with a printed message and left out.

    Args:
        TRAINING_DATA: Iterable of ``(text, annotation)`` pairs where
            ``annotation["entities"]`` holds ``(start, end, label)`` triples.

    Returns:
        The ``DocBin`` holding one annotated doc per example.
    """
    packed = DocBin()
    for raw_text, annotation in tqdm(TRAINING_DATA):
        doc = nlp.make_doc(raw_text)
        kept_spans = []
        for begin, finish, tag in annotation["entities"]:
            candidate = doc.char_span(begin, finish, label=tag, alignment_mode="contract")
            if candidate is not None:
                kept_spans.append(candidate)
                continue
            # The offsets could not be mapped to a span: report it and move on.
            print ("Skipping entity")
        doc.ents = kept_spans
        packed.add(doc)
    return packed

In [18]:
# Convert the loaded training examples and write them to data/commodities_training_data.spacy.
# NOTE: this rebinds commodities_training_data from the loaded JSON list to the value
# returned by create_spacy_format_data, so re-running this cell on its own feeds that
# value back in; re-run the JSON load cell first.
commodities_training_data = create_spacy_format_data(commodities_training_data)
commodities_training_data.to_disk("data/commodities_training_data.spacy")

100%|████████████████████████████████████████████████████████████████████████████████| 1433/1433 [00:00<00:00, 2850.10it/s]


In [19]:
# Convert the loaded validation examples and write them to data/commodities_validation_data.spacy.
# NOTE: this rebinds commodities_validation_data from the loaded JSON list to the value
# returned by create_spacy_format_data, so re-running this cell on its own feeds that
# value back in; re-run the JSON load cell first.
commodities_validation_data = create_spacy_format_data(commodities_validation_data)
commodities_validation_data.to_disk("data/commodities_validation_data.spacy")

100%|████████████████████████████████████████████████████████████████████████████████| 1432/1432 [00:00<00:00, 3666.48it/s]
