# Data Validation

After getting sentences that match our patterns using SPIKE's API, we would want to make sure the following hold, before tagging the data.

1. Sentences are not too short (like titles).
2. Captures make sense ( no non-alphabetical results etc.)
3. Spike is captures-oriented, that is, it returns a match per set of capture. We'd like to merge matches that are the same sentence that is because it has more than a single capture - for example
  a. sent 1: [David Bowie] and Freddie Mercury
  b. sent 2: David Bowie and [Freddie Mercury]
This should be merge that both are labeled with the musician label.
5. Similarly for non-musicians we'd like to ignore the captures and just look at the NER results.
6. `'s` is not part of the entity
7. Sentences in the train set do not appear in the test/dev sets.

and so on...

In [3]:
import string
import json
import pandas as pd
import glob
import jsonlines
from random import sample, shuffle
from termcolor import colored

In [4]:
LABEL = 'positive'
DATAPATH = '../data'
DATASET_PATH = f'{DATAPATH}/schools'
SPIKE_MATCH_PATH = f'{DATAPATH}/spike_matches'
VERSION_NAME = "all_without_person"
SUFFIX = "_schools"

### Extract dev/test sentences

In [5]:
def remove_tags(sentence):
    tokens = []
    for t in sentence.split():
        if t:
            tokens.append(t.split('-[',1)[0])
    return clean_punct(" ".join(tokens))

def clean_punct(sentence):
    s = sentence.translate(str.maketrans('', '', string.punctuation))
    s = s.replace("  ", " ")
    return s

def get_dev_and_test_sentences(dataset_path):
    test_path = dataset_path + '/test.txt'
    dev_path = dataset_path + '/dev.txt'
    with open(test_path, 'r') as ft, open(dev_path, 'r') as fd:
        test_set = [remove_tags(sent.strip()) for sent in ft.readlines()]
        dev_set = [remove_tags(sent.strip()) for sent in fd.readlines()]
    dev_and_test = dev_set + test_set
    return dev_and_test

### validations

In [10]:
#validations
def sentence_is_not_too_short(sentence_text):
    return len(sentence_text) > 50

def capture_is_not_non_alphabetical(capture_text):
    alphabet = 'q w e r t y u i o p a s d f g h j k l z x c v b n m'.split()
    return any(x in capture_text for x in alphabet)


def validate_sentence(sentence_dict, label, capture_text, sentence_text, dev_and_test):
    if not capture_is_not_non_alphabetical(capture_text): 
        return False
    if not sentence_is_not_too_short(sentence_text): 
        return False
#     if sentence_text in dev_and_test: 
#         return False
    return True

### Collect sentences

In [12]:
def get_capture(sentence, label):
    tokens = sentence["words"]
    capture = sentence['captures'].get(label)
    if capture:
        first = capture['first'] 
        last = capture['last']
        capture_tokens = [t for i, t in enumerate(tokens) if first <= i <= last ]
        return " ".join(capture_tokens), first, last
    else:
        return "", -1, -1

def get_entities(sentence, cap_first, cap_last):
    entities = set()
    for e in sentence['entities']:
        all_entity_indices = [*range(e['first'], e['last'])]
        if all(x not in all_entity_indices for x in [cap_first, cap_last]):
            entities.add((e['first'], e['last']))
    return entities


def collect_train_set_sentences():
    train_set = dict()
#     dev_and_test = get_dev_and_test_sentences(DATASET_PATH)
    dev_and_test = None
    invalids = 0
    same_sent = 0
    for file in glob.glob(f'{SPIKE_MATCH_PATH}/**/*{SUFFIX}.jsonl', recursive=True):
        with jsonlines.open(file, "r") as f:
            for sentence_dict in f:
                label = LABEL if 'positive' in file else 'negative'
                sentence_text = clean_punct(" ".join(sentence_dict["words"])).strip()
                capture_text, cap_first, cap_last = get_capture(sentence_dict, label)
                if capture_text:
                    if not validate_sentence(sentence_dict, label, capture_text, sentence_text, dev_and_test):
                        invalids += 1
                        continue
                    if sentence_text not in train_set.keys():
                        if label == 'positive':
                            train_set[sentence_text] = {
                                "id": sentence_dict["sentence_index"],
                                "label": label,
                                "words": sentence_dict["words"],
                                "captures": {(cap_first, cap_last)},
                                "entities": get_entities(sentence_dict, cap_first, cap_last),
                                "need_tagging": True
                            }
                        else:
                            entities = get_entities(sentence_dict, cap_first, cap_last)
                            entities.update({(cap_first, cap_last)})
                            train_set[sentence_text] = {
                                "id": sentence_dict["sentence_index"],
                                "label": label,
                                "words": sentence_dict["words"],
                                "captures": {},
                                "entities": entities,
                                "need_tagging": True
                            }    
                    else:
                        if label == 'positive':
                            train_set[sentence_text]["captures"].add((cap_first, cap_last))
                            new_entities = get_entities(sentence_dict, cap_first, cap_last)
                            train_set[sentence_text]["entities"].update(new_entities)
                        elif (cap_first, cap_last) not in train_set[sentence_text]["captures"]:
                            train_set[sentence_text]["entities"].add((cap_first, cap_last))
                        else:
                            # not a true negative!
                            continue
                else:
                    if sentence_text not in train_set:
                        train_set[sentence_text] = {
                            "id": sentence_dict["sentence_index"],
                            "label": label,
                            "words": sentence_dict["words"],
                            "captures": {},
                            "entities": {},
                            "need_tagging": False
                        }
                    else:
                        same_sent += 1
                 
    
    # make sure there are significantly more negative examples than positive ones. 
    print("Number of negatives: ", len([x for x, y in train_set.items() if y["label"] != 'positive']))
    print("Number of positives: ", len([x for x, y in train_set.items() if y["label"] == 'positive']))
    print("invalids: ", invalids)
    print("same_sent: ", same_sent)
    
    return train_set


## Tag Dataset

In [8]:
# Alternatively, save as one token per line
def flatten_list(ent_list):
    return [item for sublist in ent_list for item in sublist]
    

def tag_sentence_one_token_per_row(sentence, with_person=True):
    if sentence["need_tagging"]:
        tags = []
        captures = [[*range(span[0], span[1]+1)] for span in sentence["captures"]]
        entities = [[*range(span[0], span[1]+1)] for span in sentence["entities"]]
        flat_captures = flatten_list(captures)
        flat_entities = flatten_list(entities)

        for i, word in enumerate(sentence['words']):
            if word != "'s":
                if i in flat_captures:
                    captures, tags = tag_span(captures, i, word, 'MUS', tags)
                elif i in flat_entities and with_person:
                    entities, tags = tag_span(entities, i, word, 'PER', tags)
                else:
                    tags.append((word,"O"))
            else:
                tags.append((word,"O"))
        return tags
    else:
        return [(word, "O") for word in sentence['words']]


def tag_span(span_list, i, word, tag_suffix, tags):
    for span in span_list:
        if i == span[0]:
            tags.append((word, f"B-{tag_suffix}"))
        elif i in span:
            tags.append((word,f"I-{tag_suffix}"))
        elif i == span[-1]:
            tags.append((word,f"I-{tag_suffix}"))
            span_list.remove(cap)
    return span_list, tags

In [13]:
train_set = collect_train_set_sentences()

# with jsonlines.open(f'{DATASET_PATH}/{VERSION_NAME}.jsonl', 'w') as f:
#     for sent in sample([v for v in train_set.values()], len(train_set)):
#         tags = tag_sentence_one_token_per_row(sent, with_person=False)
#         sent_json = {"id": sent["id"], "sent_items": tags}
#         f.write(sent_json)

Number of negatives:  22424
Number of positives:  22744
invalids:  5986
same_sent:  0


In [14]:
train_set.keys()



In [16]:
[x for x, y in train_set.items() if y["label"] == "negative"]

['He contributed both his voice and his musical compositions to advertisements for companies such as Yamaha which used the music from Silver Bird as the background to one of its commercials',
 'Produced by Touchstone Television the show was to star Eddie Cibrian as Brock KaDee Strickland Mario Van Peebles and Hal Holbrook',
 'The Masonite agency contracts unlike those in General Electric  effectually removed the most important factor in competition between Masonite and its distributors—price',
 'Established in 1983 Sportsmans Warehouse is a sporting goods retailer operating in 4 states across Australia',
 'On 2 May 2015 Surman was part of the Bournemouth team which won the Championship title and promotion to the Premier League',
 'Despite this grand lineage however the name of Temple Bar street seems to have been more directly borrowed from the storied Temple Bar district in London where the main toll gate into London was located dating back to medieval times',
 'Ahead of the second le

In [22]:
a = 'The Masonite agency contracts unlike those in General Electric  effectually removed the most important factor in competition between Masonite and its distributors—price'
print(train_set[a])
tag_sentence_one_token_per_row(train_set[a])

{'id': 67043, 'label': 'negative', 'words': ['The', '"', 'Masonite', '"', 'agency', 'contracts', ',', 'unlike', 'those', 'in', '"', 'General', 'Electric', '"', ',', '"', 'effectually', 'removed', 'the', 'most', 'important', 'factor', 'in', 'competition', 'between', 'Masonite', 'and', 'its', 'distributors"—price', '.'], 'captures': {}, 'entities': {(11, 12), (25, 25)}, 'need_tagging': True}


[('The', 'O'),
 ('"', 'O'),
 ('Masonite', 'O'),
 ('"', 'O'),
 ('agency', 'O'),
 ('contracts', 'O'),
 (',', 'O'),
 ('unlike', 'O'),
 ('those', 'O'),
 ('in', 'O'),
 ('"', 'O'),
 ('General', 'B-ORG'),
 ('Electric', 'I-ORG'),
 ('"', 'O'),
 (',', 'O'),
 ('"', 'O'),
 ('effectually', 'O'),
 ('removed', 'O'),
 ('the', 'O'),
 ('most', 'O'),
 ('important', 'O'),
 ('factor', 'O'),
 ('in', 'O'),
 ('competition', 'O'),
 ('between', 'O'),
 ('Masonite', 'B-ORG'),
 ('and', 'O'),
 ('its', 'O'),
 ('distributors"—price', 'O'),
 ('.', 'O')]

In [23]:
def tag_sentence(sentence, with_superclass=True):
    if sentence["need_tagging"]:
        tags = []
        captures = [[*range(span[0], span[1] + 1)] for span in sentence["captures"]]
        entities = [[*range(span[0], span[1] + 1)] for span in sentence["entities"]]
        flat_captures = flatten_list(captures)
        flat_entities = flatten_list(entities)

        for i, word in enumerate(sentence['words']):
            if word != "'s":
                if i in flat_captures:
                    captures, tags = tag_span(captures, i, word, 'SCHOOL', tags)
                elif i in flat_entities and with_superclass:
                    entities, tags = tag_span(entities, i, word, 'ORG', tags)
                else:
                    tags.append((word, "O"))
            else:
                tags.append((word, "O"))
        return tags
    else:
        return [(word, "O") for word in sentence['words']]

## Split Sets

Create dev and test sets by splitting the current train set. The code below splits the current train set into 80%-10%-10% for train dev and test sets respectively 

In [11]:
def split_train_dev_test(fp, sample=False):
    with open(fp, "r") as f:
        all_lines = f.readlines()
        shuffle(all_lines)
        datasize = len(all_lines)
        dev_border = int(datasize*0.1) if not sample else 300
        test_border = int(datasize*0.9) if not sample else datasize-300
        with open(fp.replace("train", "split_dev"), "w") as f:
            for line in all_lines[0:dev_border]:
                f.write(line)
        with open(fp.replace("train", "split_train"), "w") as f:
            for line in all_lines[dev_border:test_border]:
                f.write(line)
        with open(fp.replace("train", "split_test"), "w") as f:
            for line in all_lines[test_border:]:
                f.write(line)
    
split_train_dev_test(f'{DATASET_PATH}/{VERSION_NAME}.jsonl', sample=True)

In [12]:
%%bash -s "$DATASET_PATH" "$VERSION_NAME"
du -sh $1/split_$2.jsonl

8.7M	../data/musicians_dataset/split_train_only_hearst_uniques.jsonl


## convert dumps to pandas DF

In [None]:
def import_jsons_to_df(fp):
    tagged_data = []
    with jsonlines.open(fp, 'r') as f:
        for line in f:
            for token in line["sent_items"]:
                tagged_data.append([line["id"], token[0], token[1]])
    return pd.DataFrame(tagged_data, columns=["sentence_id", "words", "labels"])
    

## Inspect Dev Set

Here are some methods that can help us detect bad sentences in the dev set

In [None]:
df = import_jsons_to_df(f"{DATASET_PATH}/sample_dev.jsonl")
df.head(50)

In [None]:
first_mus = df[df["labels"] == "B-MUS"]["sentence_id"].unique().tolist()[1]
df[df["sentence_id"] == first_mus]

In [None]:
non_alpha_persons = [x for x in df[df["labels"] == "I-PER"]["words"].unique().tolist() if not x.isalpha()]
# this c
sentences_with_non_alpha_persons = df[df["words"].isin(non_alpha_persons)]
for i, row in sentences_with_non_alpha_persons.iterrows():
    print(row["sentence_id"], row["words"])

In [None]:
all_musicians = df[df["labels"] == "B-MUS"]["words"].unique().tolist()
all_musicians

### Print Dev sentences with PER and MUS highlighted
Another way of reading the sentences comfortably

In [2]:
with jsonlines.open("../data/schools/split_dev_py_scripts.jsonl", 'r') as f:
    for i, line in enumerate(f):
        color_text = [x[0] if x[1] == "O" else colored(x[0], 'red') if "ORG" in x[1] else colored(x[0], 'green') for x in line["sent_items"]]
        print(i, line["id"], " ".join(color_text))
        print()

0 8204098 [32mTauhara[0m [32mCollege[0m is one of three high schools in Taupo ; the others are Taupo - nui - a - Tia College and Lake Taupo Christian School ( state integrated ) .

1 86357507 He has been a panelist at the Museum of Contemporary Art ( MOCA ) , Los Angeles County Museum of Art ( LACMA ) , galleries and universities such as [32mThe[0m [32mUniversidad[0m [32mAutonoma[0m [32mde[0m [32mMexico[0m ( UAM ) and [32mCalArts[0m .

2 102153 The 2001 Qatar Total Fina Elf Open was a tennis tournament played on outdoor hard courts at the Khalifa International Tennis Complex in Doha in Qatar and was part of Tier III of the 2001 WTA Tour .

3 111295 Israel Contreras ( born December 27 , 1960 ) is a Venezuelan former professional boxer who won the World Boxing Association bantamweight title .

4 6490526 His papers were donated to [32mthe[0m [32mAdministrative[0m [32mStaff[0m [32mCollege[0m , by then renamed Henley Management College .

5 159513928 After an extensi

In [None]:
with jsonlines.open(f"{DATASET_PATH}/dev_converted.jsonl", 'r') as f:
    for i, line in enumerate(f):
        color_text = [x[0] if x[1] == "O" else colored(x[0], 'red') if "PER" in x[1] else colored(x[0], 'green') for x in line["sent_items"]]
        print(i, line["id"], " ".join(color_text))
        print()

### Convert Dev Set format
When collecting data using the `tag_sentences` notebook, we created sentences in the following format:
```
The-[O] General-[O] --[O] Director-[O] of-[O] Rustavi-[O] 2-[O]
```
We would like to convert the manually curated dev and test sets to the same format like the jsonlines train set.
```
{"id": 15731, "sent_items": [["His", "O"], ["wife", "O"], ["Elizabeth", "B-PER"], ["died", "O"], ...]}
```

In [None]:
LABELS = {"B": "B-MUS", "PB": "B-PER", "I": "I-MUS", "PI": "I-PER", "O": "O"}

def convert_format_to_jsonl(fp):
    """
    input: The-[O] General-[O] --[O] Director-[O] of-[O] Rustavi-[O] 2-[O] ...
    output : {"id": 15731, "sent_items": [["His", "O"], ["wife", "O"], ["Elizabeth", "B-PER"], ["died", "O"], ...]}
    
    """
    def convert_token(token):
        t, l = token.split("-[")
        l = LABELS[l[:-1]]
        return [t, l]
        
    with open(fp, "r") as fin:
        with jsonlines.open(f"{fp.replace('.txt', '_converted.jsonl')}", "w") as fout:
            for i, line in enumerate(fin.readlines()):
                line = line.split()
                sent_items = [convert_token(x) for x in line]
                fout.write({"id": i, "sent_items": sent_items})

convert_format_to_jsonl("../data/musicians_dataset/test.txt")
convert_format_to_jsonl("../data/musicians_dataset/dev.txt")