In [None]:
# Colab runtime setup: install a pinned spaCy, download the en_core_web_trf pipeline
# package, and install spacy-lookups-data (the training log below shows the
# lexeme_norm table being loaded from it).
# NOTE(review): spacy-lookups-data is not version-pinned — consider pinning it so the
# environment is reproducible.
!pip install spacy==3.0.6 --quiet
!python -m spacy download en_core_web_trf --quiet
!pip install spacy-lookups-data --quiet

[K     |████████████████████████████████| 12.8MB 212kB/s 
[K     |████████████████████████████████| 51kB 7.9MB/s 
[K     |████████████████████████████████| 460kB 39.9MB/s 
[K     |████████████████████████████████| 1.1MB 40.7MB/s 
[K     |████████████████████████████████| 9.1MB 52.9MB/s 
[K     |████████████████████████████████| 122kB 51.6MB/s 
[?25h  Building wheel for smart-open (setup.py) ... [?25l[?25hdone
2021-05-11 19:14:39.840013: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[K     |████████████████████████████████| 459.7MB 35kB/s 
[K     |████████████████████████████████| 1.0MB 42.6MB/s 
[K     |████████████████████████████████| 2.1MB 42.6MB/s 
[K     |████████████████████████████████| 901kB 53.2MB/s 
[K     |████████████████████████████████| 3.3MB 47.9MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[K     |███████

In [None]:
# Mount Google Drive at /content/drive. Later cells read inputs from, and copy results
# to, the relative path drive/MyDrive/ner (presumably the working directory is /content
# — confirm if the notebook is run elsewhere).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the spaCy training config (passed to `spacy train` by train.sh) into the working directory.
!cp drive/MyDrive/ner/model_config/config.cfg .

# Copy the annotated training data into the working directory.
# The file name is hard-coded here and must match CFG.filename.
!cp drive/MyDrive/ner/data/laptop-ner.json .

In [None]:
import pandas as pd
from sklearn.model_selection import KFold
from spacy.tokens import DocBin, Span
from spacy.util import filter_spans
from tqdm import tqdm
from pathlib import Path
import json
import spacy
import os

In [None]:
class CFG:
    """Notebook-wide experiment settings, accessed as class attributes (``CFG.seed`` etc.)."""

    # --- experiment identity and input -------------------------------------
    # Prefix of every generated .spacy file; also exported to the shell as $exp_name.
    exp_name = "A"
    # JSON-lines annotation file read by the separate_data* functions.
    filename = "laptop-ner.json"

    # --- reproducibility ----------------------------------------------------
    # Passed as random_state to the KFold shuffles.
    seed = 42

    # --- pipeline switches --------------------------------------------------
    # True: load every label together via separate_data();
    # False: load one example set per label via separate_data_exclusive().
    single_model = False
    # True: pad each label's example set with entity-free sentences via balance_examples().
    balance_labels = False

    # --- entity labels handled by the notebook ------------------------------
    labels = [
        "PRODUCT", "POS_EXP", "ISSUE", "BRAND",
        "ATTR", "PERSON", "CONTXT_USE", "RETAILER",
    ]

In [None]:
def separate_data(filename=None):
    """Load the annotated JSON-lines file into ``(text, {"entities": [...]})`` pairs.

    Each non-blank line of the file is parsed as one JSON object with a ``"text"``
    field and an optional ``"spans"`` list whose items carry ``"start"``, ``"end"``
    and ``"label"``. Examples without any spans are dropped.

    Args:
        filename: path of the JSON-lines file; defaults to ``CFG.filename``.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples, in
        file order, with every label kept together (spans may overlap).
    """
    if filename is None:
        filename = CFG.filename

    examples = []
    # Stream the file line by line instead of materialising it with readlines().
    with open(filename, encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a stray trailing newline) is not valid JSON; skip it
            # rather than crashing the whole load.
            if not line.strip():
                continue

            example = json.loads(line)
            spans = example.get("spans")
            if not spans:
                continue

            entities = [(span["start"], span["end"], span["label"]) for span in spans]
            examples.append((example["text"], {"entities": entities}))

    return examples

def separate_data_exclusive(filename=None, labels=None):
    """Split the annotations into one example set per label.

    Overlapping spans with different labels are separated so that every example
    carries spans of a single label only, e.g. ::

        samsung is great
        -BRAND-
        ----POS_EXP----

    becomes one example under "BRAND" (with only the BRAND span) and one example
    under "POS_EXP" (with only the POS_EXP span), both with the same text.

    Args:
        filename: path of the JSON-lines file; defaults to ``CFG.filename``.
        labels: labels to build example sets for; defaults to ``CFG.labels``.
            Spans whose label is not listed are ignored in the example sets.

    Returns:
        A tuple ``(examples, sent_label_lookup)`` where

        * ``examples`` maps each label to a list of
          ``(text, {"entities": [(start, end, label), ...]})`` tuples, and
        * ``sent_label_lookup`` maps each sentence text to the list of labels of
          all its spans (in span order, duplicates kept; ``[]`` when the sentence
          has no spans), so the labels of the original sentence can be looked up.
    """
    if filename is None:
        filename = CFG.filename
    if labels is None:
        labels = CFG.labels

    examples = {label: [] for label in labels}
    sent_label_lookup = {}

    with open(filename, encoding="utf-8") as f:
        for line in f:
            # Skip blank lines instead of failing in json.loads.
            if not line.strip():
                continue

            example = json.loads(line)
            text = example["text"]
            spans = example.get("spans") or []

            # Group the spans by label once per example, rather than rescanning
            # the full span list for every label.
            spans_by_label = {}
            for span in spans:
                spans_by_label.setdefault(span["label"], []).append(
                    (span["start"], span["end"], span["label"])
                )

            for label in labels:
                if label in spans_by_label:
                    examples[label].append(
                        (text, {"entities": list(spans_by_label[label])})
                    )

            sent_label_lookup[text] = [span["label"] for span in spans]

    return examples, sent_label_lookup

def balance_examples(examples, ex_label_pairing, labels=None):
    """Pad each label's example set with entity-free (negative) sentences.

    For every label, up to as many negatives as there are positive examples are
    appended. A negative is a sentence whose original annotation does not contain
    the label; it is added as ``(text, {"entities": []})``. Negatives are taken in
    the iteration order of ``ex_label_pairing``.

    Args:
        examples: dict mapping label -> list of ``(text, {"entities": [...]})``.
            The lists are extended in place.
        ex_label_pairing: dict mapping sentence text -> list of labels annotated
            on that sentence.
        labels: labels to balance; defaults to ``CFG.labels``.

    Returns:
        The same ``examples`` dict, after being extended.
    """
    if labels is None:
        labels = CFG.labels

    for label in labels:
        n_positive = len(examples[label])
        # The membership test must be against the labels of *this sentence*;
        # testing against the dict itself compares the label with sentence texts
        # and would treat (almost) every sentence as a negative.
        negatives = [
            (text, {"entities": []})
            for text, sentence_labels in ex_label_pairing.items()
            if label not in sentence_labels
        ]
        examples[label].extend(negatives[:n_positive])

    return examples

def _build_docbin(nlp, examples, report_skips=False):
    """Convert ``(text, {"entities": [...]})`` pairs into a spaCy ``DocBin``.

    Character offsets are aligned to token boundaries with
    ``alignment_mode="contract"``; a span that cannot be aligned is dropped
    (and reported with a printed message when ``report_skips`` is true). A doc
    whose entities cannot be assigned is left out of the ``DocBin``.
    """
    db = DocBin()
    for text, annot in examples:
        doc = nlp.make_doc(text)  # tokenise only; no pipeline components are run
        ents = []
        for start, end, ent_label in annot["entities"]:
            span = doc.char_span(start, end, label=ent_label, alignment_mode="contract")
            if span is None:
                if report_skips:
                    print("Skipping entity")
                continue
            ents.append(span)
        try:
            doc.ents = ents
        except ValueError:
            # doc.ents rejects overlapping spans; skip the doc, as before, but
            # without hiding unrelated errors behind a bare except.
            continue
        db.add(doc)
    return db


def data_to_spacy(examples, n_repeats=5, n_splits=2):
    """Write repeated k-fold train/dev ``.spacy`` files for every label.

    For each label in ``CFG.labels`` the label's examples are split
    ``n_repeats`` times with a shuffled ``KFold(n_splits)``. Every fold is
    written to the working directory as
    ``{CFG.exp_name}-train-{label}-it{i}-fold{k}.spacy`` and
    ``{CFG.exp_name}-dev-{label}-it{i}-fold{k}.spacy`` (``i`` and ``k`` are
    1-based).

    Each repetition uses ``random_state=CFG.seed + i`` so that the repetitions
    are different, yet reproducible, splits; reusing the same seed for every
    repetition would make all of them identical.

    Args:
        examples: dict mapping label -> list of
            ``(text, {"entities": [(start, end, label), ...]})``.
        n_repeats: number of repetitions of the k-fold split. The default (5)
            matches the ``i`` loop hard-coded in train.sh.
        n_splits: number of folds per repetition. The default (2) matches the
            ``k`` loop hard-coded in train.sh.
    """
    nlp = spacy.blank("en")  # blank English pipeline, used for tokenisation only
    for label in tqdm(CFG.labels):
        label_examples = examples[label]
        for i in range(n_repeats):
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=CFG.seed + i)
            for k, (train_idxs, test_idxs) in enumerate(kf.split(label_examples), start=1):
                train = [label_examples[idx] for idx in train_idxs]
                test = [label_examples[idx] for idx in test_idxs]

                train_db = _build_docbin(nlp, train, report_skips=True)
                train_db.to_disk(f"./{CFG.exp_name}-train-{label}-it{i+1}-fold{k}.spacy")

                test_db = _build_docbin(nlp, test)
                test_db.to_disk(f"./{CFG.exp_name}-dev-{label}-it{i+1}-fold{k}.spacy")

def get_longest_span(examples):
    """Resolve overlapping entity spans in each example with ``filter_spans``.

    Every ``(start, end, label)`` annotation is aligned to token boundaries
    (``alignment_mode="contract"``) and the aligned spans are passed through
    ``spacy.util.filter_spans``, which removes overlaps. Annotations that
    cannot be aligned to any token are dropped instead of being passed on as
    ``None``, which ``filter_spans`` cannot handle.

    Args:
        examples: iterable of ``(text, {"entities": [(start, end, label), ...]})``.

    Returns:
        A new list of ``(text, {"entities": [...]})`` tuples with
        non-overlapping spans, using the character offsets of the aligned spans.
    """
    nlp = spacy.blank("en")
    new_examples = []
    for example in examples:
        text = example[0]
        annotations = example[1]["entities"]
        doc = nlp.make_doc(text)

        candidates = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            # char_span returns None when the offsets do not cover a whole token.
            if span is not None:
                candidates.append(span)

        # Convert the surviving spans back to the (start, end, label) format.
        formatted_spans = [
            (span.start_char, span.end_char, span.label_)
            for span in filter_spans(candidates)
        ]
        new_examples.append((text, {"entities": formatted_spans}))
    return new_examples


In [None]:
# Load every annotated example with all labels together, then remove overlapping
# spans with get_longest_span.
# NOTE(review): `long_examples` is not used by any later visible cell and `examples`
# is reassigned in the next cell — confirm whether this cell is still needed.
examples = separate_data()
long_examples = get_longest_span(examples)

In [None]:
# Choose the dataset layout: a single list with every label, or one example set per
# label plus a sentence -> labels lookup.
# NOTE(review): on the single_model path `ex_label_pairing` is never defined and
# `examples` is a list, while the later visible cells call examples.items() and
# index examples by label — that path looks unsupported as written; confirm.
if CFG.single_model:
    examples = separate_data()
else:
    examples, ex_label_pairing = separate_data_exclusive()

In [None]:
# Optionally add entity-free sentences to each label's example set.
# Requires `ex_label_pairing`, which is only defined when CFG.single_model is False.
if CFG.balance_labels:
    examples = balance_examples(examples, ex_label_pairing)

In [None]:
# Sanity check: number of examples per label (expects `examples` to be the per-label dict).
# Note that `label` and `exs` remain defined in the notebook namespace after this cell.
for label, exs in examples.items():
    print(label, len(exs))

PRODUCT 841
POS_EXP 1066
ISSUE 1042
BRAND 325
ATTR 2139
PERSON 140
CONTXT_USE 668
RETAILER 81


In [None]:
# Write the train/dev .spacy files for every label, repetition and fold into the
# working directory; train.sh reads them by name.
data_to_spacy(examples)

100%|██████████| 8/8 [01:16<00:00,  9.60s/it]


In [None]:
# Export the experiment name to the process environment so that shell commands
# started from the notebook (train.sh, cp) can read it as $exp_name.
os.environ["exp_name"] = CFG.exp_name

In [None]:
%%writefile train.sh
#!/bin/bash
# Train one spaCy model per label, repetition (it) and fold, then archive the best
# checkpoint to Google Drive and remove the local copies to free disk space.
# Expects: exp_name in the environment, config.cfg and the
# <exp_name>-train/dev-<label>-it<i>-fold<k>.spacy files in the working directory.
# Only POSIX sh features are used because the notebook runs this file with `sh`.
echo "RUNNING EXPERIMENT $exp_name"
for label in ATTR BRAND CONTXT_USE ISSUE PERSON POS_EXP PRODUCT RETAILER
do
    # cp does not create missing directories, so make sure the Drive target exists.
    dest="drive/MyDrive/ner/$exp_name/$label/"
    mkdir -p "$dest"
    for i in 1 2 3 4 5
    do
        for k in 1 2
        do
            run="$exp_name-$label-it$i-fold$k"
            echo "Training model for $label at fold $k"
            python -m spacy train "config.cfg" --output "./$run" --paths.train "$exp_name-train-$label-it$i-fold$k.spacy" --paths.dev "$exp_name-dev-$label-it$i-fold$k.spacy" --gpu-id 0 --training.patience 400 --verbose
            # The glob stays outside the quotes so the shell still expands it.
            zip -FSr "$run.zip" "$run"/model-best/*
            cp "$run.zip" "$dest"
            rm -r "$run.zip" "$run"
        done
    done
done

Overwriting train.sh


In [None]:
# Run the training script. It is invoked through `sh`, so the #!/bin/bash line in
# train.sh is not what selects the interpreter.
# NOTE(review): the output is only shown in the cell; nothing here writes it to a file.
!sh train.sh

RUNNING EXPERIMENT A
Training model for ATTR at fold 1
2021-05-11 15:09:36.570066: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[38;5;2m✔ Created output directory: A-ATTR-it1-fold1[0m
[2021-05-11 15:09:38,242] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev', 'training.patience']
[38;5;4mℹ Using GPU: 0[0m
[1m
[2021-05-11 15:09:38,924] [INFO] Set up nlp object from config
[2021-05-11 15:09:38,933] [DEBUG] Loading corpus from path: A-dev-ATTR-it1-fold1.spacy
[2021-05-11 15:09:38,934] [DEBUG] Loading corpus from path: A-train-ATTR-it1-fold1.spacy
[2021-05-11 15:09:38,934] [INFO] Pipeline: ['transformer', 'ner']
[2021-05-11 15:09:38,940] [DEBUG] Loading lookups from spacy-lookups-data: ['lexeme_norm']
[2021-05-11 15:09:38,952] [INFO] Added vocab lookups: lexeme_norm
[2021-05-11 15:09:38,952] [INFO] Created vocabulary
[2021-05-11 15:09:38,952] [INFO] Finished initializing nlp object
[2021-05-11 15:09

In [None]:
# Copy the experiment log file to Google Drive.
# NOTE(review): no visible cell creates "$exp_name-logfile.txt", and only exp_name is
# exported to the environment (not label) — confirm where the log file comes from and
# which directory $label is meant to resolve to.
!cp "$exp_name-logfile.txt" "drive/MyDrive/ner/$exp_name/$label/"