# Export annotations from Prodigy

In [35]:
from collections import defaultdict
import copy
import json
import sys

import pandas as pd

sys.path.append("../..")
from src.training.dataset_utils import train_dev_test_split

In [9]:
# JSONL file of gold annotations that the cells below read line by line
# (presumably a Prodigy export of the food NER dataset; confirm the source).
db_file = "/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition/labeled_data/food_ner_gold_2.jsonl"

## Count records and see if they're complete

In [10]:
# Group the subpart ids of every labeled record under the recipe row it
# belongs to: {row: [subpart, subpart, ...]} in file order.
records = defaultdict(list)
with open(db_file, 'r') as infile:
    for record in map(json.loads, infile):
        meta = record['meta']
        records[meta['row']].append(meta['subpart'])

In [11]:
# compare to input data
# Build the same {row: [subpart, ...]} mapping from the original input file so
# the labeled export can be checked against it.  The file opened here must be
# `orig_file`: opening `db_file` again would compare the export with itself
# and make the completeness check below pass trivially.
# NOTE(review): assumes the input file carries the same meta.row /
# meta.subpart fields as the export -- confirm.
orig_file = "/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition/use_directions.jsonl"
orig_records = defaultdict(list)
with open(orig_file, 'r') as infile:
    for line in infile:
        record = json.loads(line)
        num = record['meta']['row']
        part = record['meta']['subpart']
        orig_records[num].append(part)

In [12]:
# Every labeled recipe must list exactly the same subparts, in the same
# order, as the reference mapping.
for rec, labeled_parts in records.items():
    assert orig_records[rec] == labeled_parts

In [13]:
# count labeled records
# One key per recipe; each recipe's list holds one entry per labeled sentence.
recipes = len(records)
sentences = sum(len(parts) for parts in records.values())
print(recipes)
print(sentences)

301
981


# Convert to CoNLL format

In [19]:
def tokens_overlap(token_1, token_2):
    """Return True if two character spans share at least one position.

    Args:
        token_1: Mapping with integer ``'start'`` (inclusive) and ``'end'``
            (exclusive) offsets.
        token_2: Mapping with the same two keys.

    Returns:
        bool: True when the half-open ranges ``[start, end)`` intersect.
        An empty span (``start >= end``) never overlaps anything.
    """
    # Two half-open ranges intersect exactly when the later start lies before
    # the earlier end.  This avoids building a set of every offset in each
    # span on every comparison.
    latest_start = max(token_1['start'], token_2['start'])
    earliest_end = min(token_1['end'], token_2['end'])
    return latest_start < earliest_end

def assign_labels_to_tokens(token_spans, label_spans):
    """Copy the tokens and attach the label of any overlapping labeled span.

    Args:
        token_spans: List of token dicts with ``'start'`` and ``'end'`` offsets.
        label_spans: Iterable of span dicts with ``'start'``, ``'end'`` and
            ``'label'``.

    Returns:
        A deep copy of ``token_spans``. Tokens that overlap at least one
        labeled span gain a ``'label'`` key; when several spans overlap the
        same token, the last one in ``label_spans`` wins. Tokens without any
        overlap are left without a ``'label'`` key.
    """
    labeled_tokens = copy.deepcopy(token_spans)
    for token in labeled_tokens:
        matches = [span['label'] for span in label_spans if tokens_overlap(token, span)]
        if matches:
            token['label'] = matches[-1]
    return labeled_tokens

def assign_metadata_to_tokens(token_spans, record_id):
    """Return a deep copy of the tokens, each stamped with ``record_id``.

    Args:
        token_spans: List of token dicts.
        record_id: Value stored under the ``'record_id'`` key of every token.

    Returns:
        A new list of token dicts; the input list is not modified.
    """
    stamped = copy.deepcopy(token_spans)
    for entry in stamped:
        entry.update(record_id=record_id)
    return stamped

def add_bio_tags(token_spans):
    """Return a deep copy of the tokens with a ``'bio-tag'`` key added.

    A token without a ``'label'`` key (or labeled ``'O'``) is tagged ``'O'``.
    A labeled token is tagged ``'I-<label>'`` when the token immediately
    before it carries the same label, and ``'B-<label>'`` otherwise.

    Args:
        token_spans: List of token dicts, optionally carrying ``'label'``.

    Returns:
        A new list of token dicts; the input list is not modified.
    """
    tagged = copy.deepcopy(token_spans)
    previous = 'O'
    for token in tagged:
        current = token.get('label', 'O')
        if current == 'O':
            token['bio-tag'] = 'O'
        else:
            # Same label as the preceding token continues the entity.
            prefix = "I-" if current == previous else "B-"
            token['bio-tag'] = prefix + current
        previous = current
    return tagged


def get_token_and_label_spans_from_jsonl(db_file):
    """Read tokens, labeled spans and record ids from a JSONL file.

    Every non-blank line must be a JSON object with a ``'tokens'`` list and a
    ``'meta'`` object holding ``'row'`` and ``'subpart'``. A ``'spans'`` list
    is optional; a record without it is treated as having no labeled spans.

    Args:
        db_file: Path of the JSONL file to read.

    Returns:
        A tuple ``(token_spans, label_spans, record_ids)`` of three parallel
        lists with one entry per record: the record's token list, its labeled
        span list, and the id string ``"<row>_<subpart>"``.

    Raises:
        KeyError: If a record lacks ``'tokens'`` or the ``'meta'`` fields.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    token_spans = []
    label_spans = []
    record_ids = []
    # Read as UTF-8 explicitly so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(db_file, 'r', encoding='utf-8') as infile:
        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            record = json.loads(line)
            meta = record['meta']
            token_spans.append(record['tokens'])
            label_spans.append(record.get('spans', []))
            record_ids.append(f"{meta['row']}_{meta['subpart']}")
    return token_spans, label_spans, record_ids


def get_bio_tagged_spans_from_jsonl(db_file):
    """Load a JSONL file and return its tokens with labels, ids and BIO tags.

    Args:
        db_file: Path of the JSONL file to read.

    Returns:
        A list with one entry per record in the file; each entry is that
        record's list of token dicts after label assignment, record-id
        stamping and BIO tagging have been applied in that order.
    """
    tokens_by_record, labels_by_record, ids = get_token_and_label_spans_from_jsonl(db_file)
    return [
        add_bio_tags(
            assign_metadata_to_tokens(
                assign_labels_to_tokens(tokens, labels),
                record_id,
            )
        )
        for tokens, labels, record_id in zip(tokens_by_record, labels_by_record, ids)
    ]


def write_conll(bio_tagged_spans, outfile):
    """Write BIO-tagged sentences to a space-separated CoNLL-style file.

    Each token becomes one line, ``text start end record_id bio-tag``, and
    every sentence is followed by a blank line.

    Args:
        bio_tagged_spans: Iterable of sentences, each an iterable of token
            dicts with ``'text'``, ``'start'``, ``'end'``, ``'record_id'``
            and ``'bio-tag'`` keys.
        outfile: Path of the file to create or overwrite.
    """
    # Write UTF-8 explicitly so non-ASCII token text does not depend on the
    # platform's default encoding.
    with open(outfile, 'w', encoding='utf-8') as out:
        for sent in bio_tagged_spans:
            for tok in sent:
                # 'end' is already the token's exclusive end offset; adding
                # the text length again would write a position past the token.
                out.write(f"{tok['text']} {tok['start']} {int(tok['end'])} {tok['record_id']} {tok['bio-tag']}\n")
            out.write("\n")

                
# One list of BIO-tagged token dicts per record in the labeled file.
token_spans = get_bio_tagged_spans_from_jsonl(db_file)

In [17]:
# Number of tagged records (one per line of the labeled file).
len(token_spans)

981

In [36]:
# split into training, dev, and test sets
# NOTE(review): 0.6 and 0.2 are presumably the train and dev fractions, with
# the remainder going to test -- confirm against
# src.training.dataset_utils.train_dev_test_split.
train, dev, test = train_dev_test_split(token_spans, 0.6, 0.2)

In [38]:
# Sanity check: sizes of the three splits.
print(len(train), len(dev), len(test))

588 196 197


In [39]:
# Write each split to its own CoNLL file; "{}" in the stem is replaced by
# the split name.
outfilestem = "/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition/food_gold_{}.conll"
for name, dataset in (("train", train), ("dev", dev), ("test", test)):
    write_conll(dataset, outfilestem.format(name))

# To import back to Prodigy

prodigy db-in [dataset] [in_file] [--loader] [--answer] [--overwrite] [--dry]
