# Previously, I split the data into train, dev, and test sets at the sentence level, so sentences from the same recipe could end up in different sets.

## Now, make a new split where sentences from the same recipe are kept together.

In [1]:
from collections import defaultdict
import json
import sys

sys.path.append("../..")
from src.data_prep.split_dataset import train_dev_test_split
from src.data_prep.prodigy_to_conll import get_bio_tagged_spans_from_jsonl, write_conll

In [2]:
# Labeled data exported from Prodigy (JSONL).
db_file = (
    "/Users/Carol/Dropbox/epicurious-recipes-with-rating-and-nutrition"
    "/labeled_data/food_ner_gold_2.jsonl"
)

In [3]:
# Group the labeled sentences by the recipe they belong to.
# Key: recipe number (record['meta']['row']).
# Value: list of that recipe's record['meta']['subpart'] values, in file order.
records = defaultdict(list)
# Pass the encoding explicitly so that reading the JSON lines does not depend
# on the platform's default locale encoding.
with open(db_file, 'r', encoding='utf-8') as infile:
    for line in infile:
        if not line.strip():
            # A blank line (e.g. a stray trailing newline) is not a record;
            # json.loads would raise on it.
            continue
        record = json.loads(line)
        num = record['meta']['row']
        part = record['meta']['subpart']
        records[num].append(part)

In [4]:
# Iterating a dict yields its keys, i.e. one entry per distinct recipe number.
recipe_ids = list(records)
len(recipe_ids)

301

In [5]:
# Split at the recipe level: the helper receives recipe ids (not sentences),
# so every sentence of a recipe follows its recipe into a single set.
# NOTE(review): 0.6 and 0.2 look like the train and dev fractions, with the
# remainder going to test — confirm against train_dev_test_split.
partitions = train_dev_test_split(recipe_ids, 0.6, 0.2, random_seed=16)
train, dev, test = partitions

In [6]:
# Report, for each split, how many recipes it contains and how many
# sentences those recipes contribute in total.
splits = {"Training": train, "Dev": dev, "Test": test}
for split_name, split_recipe_ids in splits.items():
    print("{} set".format(split_name))
    print("Total recipes: ", len(split_recipe_ids))
    sentence_total = sum(len(records[rid]) for rid in split_recipe_ids)
    print("Total sentences: ", sentence_total)
    print("_________________")

Training set
Total recipes:  180
Total sentences:  592
_________________
Dev set
Total recipes:  60
Total sentences:  195
_________________
Test set
Total recipes:  61
Total sentences:  194
_________________


In [7]:
# Load the same Prodigy export again, this time as a list of token lists;
# each token is a dict carrying a 'bio-tag' and a 'record_id'.
token_spans = get_bio_tagged_spans_from_jsonl(db_file)

In [8]:
# How many token lists were produced (presumably one per labeled sentence — TODO confirm).
len(token_spans)

981

In [9]:
# Inspect the first entry to see the per-token fields
# (text, start, end, id, record_id, bio-tag).
token_spans[0]

[{'text': 'Combine',
  'start': 0,
  'end': 7,
  'id': 0,
  'record_id': '17006_1',
  'bio-tag': 'O'},
 {'text': 'all',
  'start': 8,
  'end': 11,
  'id': 1,
  'record_id': '17006_1',
  'bio-tag': 'O'},
 {'text': 'ingredients',
  'start': 12,
  'end': 23,
  'id': 2,
  'record_id': '17006_1',
  'bio-tag': 'O'},
 {'text': 'in',
  'start': 24,
  'end': 26,
  'id': 3,
  'record_id': '17006_1',
  'bio-tag': 'O'},
 {'text': 'large',
  'start': 27,
  'end': 32,
  'id': 4,
  'record_id': '17006_1',
  'bio-tag': 'O'},
 {'text': 'bowl',
  'start': 33,
  'end': 37,
  'id': 5,
  'record_id': '17006_1',
  'bio-tag': 'O'},
 {'text': ';',
  'start': 37,
  'end': 38,
  'id': 6,
  'record_id': '17006_1',
  'bio-tag': 'O'},
 {'text': 'toss',
  'start': 39,
  'end': 43,
  'id': 7,
  'record_id': '17006_1',
  'bio-tag': 'O'},
 {'text': 'to',
  'start': 44,
  'end': 46,
  'id': 8,
  'record_id': '17006_1',
  'bio-tag': 'O'},
 {'text': 'blend',
  'start': 47,
  'end': 52,
  'id': 9,
  'record_id': '17006_1'

In [10]:
# Make separate token span lists for train, dev, and test by routing each
# sentence to the split that owns its recipe.
train_docs = []
dev_docs = []
test_docs = []

# Build the id sets once so each membership test is a hash lookup instead of
# a scan over the id collection for every sentence.
docs_by_split = (
    (set(train), train_docs),
    (set(dev), dev_docs),
    (set(test), test_docs),
)

for doc in token_spans:
    # record_id has the form "<recipe id>_<subpart>", e.g. '17006_1';
    # every token of a sentence carries the same one, so the first suffices.
    record_id = doc[0]['record_id']
    recipe_id = int(record_id.split("_")[0])
    for split_ids, split_docs in docs_by_split:
        if recipe_id in split_ids:
            split_docs.append(doc)
            break
    else:
        # No split claimed this recipe: the split and the token data disagree.
        raise Exception("Couldn't find recipe id {} in train, dev, or test!".format(recipe_id))
    

In [11]:
# Number of sentences routed to the training split.
len(train_docs)

592

In [12]:
# Number of sentences routed to the dev split.
len(dev_docs)

195

In [13]:
# Number of sentences routed to the test split.
len(test_docs)

194

# Convert to CoNLL format

In [14]:
# Export each split to its own CoNLL file; the split name fills the "{}"
# placeholder in the output path.
outfilestem = "/Users/Carol/Dropbox/epicurious-recipes-with-rating-and-nutrition/20200523_food_gold_{}.conll"
conll_targets = {"train": train_docs, "dev": dev_docs, "test": test_docs}
for name, dataset in conll_targets.items():
    write_conll(dataset, outfilestem.format(name))