# Data Validation

After getting sentences that match our patterns using SPIKE's API, we would want to make sure the following hold, before tagging the data.

1. Sentences are not too short (like titles).
2. Captures make sense ( no non-alphabetical results etc.)
3. Spike is captures-oriented, that is, it returns a match per set of capture. We'd like to merge matches that are the same sentence that is because it has more than a single capture - for example
  a. sent 1: [David Bowie] and Freddie Mercury
  b. sent 2: David Bowie and [Freddie Mercury]
This should be merge that both are labeled with the musician label.
5. Similarly for non-musicians we'd like to ignore the captures and just look at the NER results.
6. `'s` is not part of the entity
7. Sentences in the train set do not appear in the test/dev sets.

and so on...

In [1]:
import string
import json
import pandas as pd
import glob
import jsonlines
from random import sample, shuffle

In [2]:
LABEL = 'musician'
DATAPATH = '../data'
DATASET_PATH = f'{DATAPATH}/musicians_dataset'
SPIKE_MATCH_PATH = f'{DATAPATH}/spike_matches'

### Extract dev/test sentences

In [3]:
def remove_tags(sentence):
    tokens = []
    for t in sentence.split():
        if t:
            tokens.append(t.split('-[',1)[0])
    return clean_punct(" ".join(tokens))

def clean_punct(sentence):
    s = sentence.translate(str.maketrans('', '', string.punctuation))
    s = s.replace("  ", " ")
    return s

def get_dev_and_test_sentences(dataset_path):
    test_path = dataset_path + '/test.txt'
    dev_path = dataset_path + '/dev.txt'
    with open(test_path, 'r') as ft, open(dev_path, 'r') as fd:
        test_set = [remove_tags(sent.strip()) for sent in ft.readlines()]
        dev_set = [remove_tags(sent.strip()) for sent in fd.readlines()]
    dev_and_test = dev_set + test_set
    return dev_and_test

### validations

In [4]:
#validations
def sentence_is_not_too_short(sentence_text):
    return len(sentence_text) > 50

def capture_is_not_non_alphabetical(capture_text):
    alphabet = 'q w e r t y u i o p a s d f g h j k l z x c v b n m'.split()
    return any(x in capture_text for x in alphabet)


def validate_sentence(sentence_dict, label, capture_text, sentence_text, dev_and_test):
    if not capture_is_not_non_alphabetical(capture_text): return False
    if not sentence_is_not_too_short(sentence_text): return False
    if sentence_text in dev_and_test: return False
    return True

### Collect sentences

In [5]:
def get_capture(sentence, label):
    tokens = sentence["words"]
    first = sentence['captures'][label]['first']
    last = sentence['captures'][label]['last']
    capture_tokens = [t for i, t in enumerate(tokens) if first <= i <= last ]
    return " ".join(capture_tokens), first, last

def get_entities(sentence, cap_first, cap_last):
    entities = set()
    for e in sentence['entities']:
        all_entity_indices = [*range(e['first'], e['last'])]
        if all(x not in all_entity_indices for x in [cap_first, cap_last]):
            entities.add((e['first'], e['last']))
    return entities
#     return {(e['first'], e['last']) for e in sentence['entities']}  - {(cap_first, cap_last)}


def collect_train_set_sentences():
    train_set = dict()
    dev_and_test = get_dev_and_test_sentences(DATASET_PATH)
    
    for file in glob.glob(f'{SPIKE_MATCH_PATH}/**/*.json', recursive=True):
        with open(file, "r") as f:
            j = json.load(f)
            for sentence_dict in j:
                label = LABEL if 'positive' in file else 'negative'
                sentence_text = clean_punct(" ".join(sentence_dict["words"])).strip()
                capture_text, cap_first, cap_last = get_capture(sentence_dict, label)
                if not validate_sentence(sentence_dict, label, capture_text, sentence_text, dev_and_test): continue
                if sentence_text not in train_set.keys():
                    if label == 'musician':
                        train_set[sentence_text] = {
                            "label": label,
                            "words": sentence_dict["words"],
                            "captures": {(cap_first, cap_last)},
                            "entities": get_entities(sentence_dict, cap_first, cap_last)
                        }
                    else:
                        entities = get_entities(sentence_dict, cap_first, cap_last)
                        entities.update({(cap_first, cap_last)})
                        train_set[sentence_text] = {
                            "label": label,
                            "words": sentence_dict["words"],
                            "captures": {},
                            "entities": entities
                        }                       
                else:
                    if label == 'musician':
                        train_set[sentence_text]["captures"].add((cap_first, cap_last))
                        new_entities = get_entities(sentence_dict, cap_first, cap_last)
                        train_set[sentence_text]["entities"].update(new_entities)
                    else:                        
                        if (cap_first, cap_last) not in train_set[sentence_text]["captures"]:
                            train_set[sentence_text]["entities"].add((cap_first, cap_last))
                        else:
                            # not a true negative!
                            continue
    
    # make sure there are significantly more negative examples than positive ones. 
    print("Number of negatives: ", len([x for x, y in train_set.items() if y["label"] != 'musician']))
    print("Number of positives: ", len([x for x, y in train_set.items() if y["label"] == 'musician']))
    
    return train_set


## Tag Dataset

In [6]:
# Alternatively, save as one token per line
def flatten_list(ent_list):
    return [item for sublist in ent_list for item in sublist]
    

def tag_sentence_one_token_per_row_2(sentence):
    tags = []
    captures = [[*range(span[0], span[1]+1)] for span in sentence["captures"]]
    entities = [[*range(span[0], span[1]+1)] for span in sentence["entities"]]
    flat_captures = flatten_list(captures)
    flat_entities = flatten_list(entities)
    
    for i, word in enumerate(sentence['words']):
        if word != "'s":
            if i in flat_captures:
                captures, tags = tag_span(captures, i, word, 'MUS', tags)
            elif i in flat_entities:
                entities, tags = tag_span(entities, i, word, 'PER', tags)
            else:
                tags.append((word,"O"))
        else:
            tags.append((word,"O"))
    return tags


def tag_span(span_list, i, word, tag_suffix, tags):
    for span in span_list:
        if i == span[0]:
            tags.append((word, f"B-{tag_suffix}"))
        elif i in span:
            tags.append((word,f"I-{tag_suffix}"))
        elif i == span[-1]:
            tags.append((word,f"I-{tag_suffix}"))
            span_list.remove(cap)
    return span_list, tags

In [20]:
# OBSOLETE
def tag_span(i, word, sentence, span_type, span_tagging):
    for span in sentence[span_type]:
        if span[0] <= i <= span[1]:
            if i == span[0]:
                sentence["tagged_sentence"] += f"{word}-[{span_tagging}B] "
            elif i == span[1]:
                if word == "'s":
                    sentence["tagged_sentence"] += f"{word}-[O] "
                else:
                    sentence["tagged_sentence"] += f"{word}-[{span_tagging}I] "
                sentence[span_type].remove(span)                
            else:
                sentence["tagged_sentence"] += f"{word}-[{span_tagging}I] "
            return True


def tag_sentence_one_sentence_per_row(sentence):
    sentence["tagged_sentence"] = ""
    captures = list(sentence["captures"])
    entities = list(sentence["entities"])
    for i, word in enumerate(sentence['words']):
        if sentence['label'] == LABEL:
            is_capture = tag_span(i, word, sentence, 'captures', '')
        else:
            is_capture = tag_span(i, word, sentence, 'captures', 'P')
        is_entity = tag_span(i, word, sentence, 'entities', 'P')
        if not (is_capture or is_entity):
            sentence["tagged_sentence"] += f"{word}-[O] "        
    sentence.update({"entities": entities, "captures": captures})
    
    return sentence   
        

In [10]:
train_set = collect_train_set_sentences()

with jsonlines.open(f'{DATASET_PATH}/train.jsonl', 'w') as f:
    for i, sent in enumerate(sample([v for v in train_set.values()], len(train_set))):
        tags = tag_sentence_one_token_per_row_2(sent)
        sent_json = {"id":i, "sent_items": tags}
        f.write(sent_json)

Number of negatives:  28393
Number of positives:  5322


## Alternative Sets

Create other dev and test sets. The code below splits the current train set into 80%-10%-10% for train dev and test sets respectively 

In [16]:
DATASET = '../data/musicians_dataset'

In [11]:
def split_train_dev_test(fp, sample=False):
    with open(fp, "r") as f:
        all_lines = f.readlines()
        shuffle(all_lines)
        datasize = len(all_lines)
        dev_border = int(datasize*0.1) if not sample else 300
        test_border = int(datasize*0.9) if not sample else datasize-300
        with open(fp.replace("train", "sample_dev"), "w") as f:
            for line in all_lines[0:dev_border]:
                f.write(line)
        with open(fp.replace("train", "split_train"), "w") as f:
            for line in all_lines[dev_border:test_border]:
                f.write(line)
        with open(fp.replace("train", "sample_test"), "w") as f:
            for line in all_lines[test_border:]:
                f.write(line)
    
split_train_dev_test(f'{DATASET_PATH}/train.jsonl', sample=True)

## convert dumps to pandas DF

In [13]:
def import_jsons_to_df(fp):
    tagged_data = []
    with jsonlines.open(fp, 'r') as f:
        for line in f:
            for token in line["sent_items"]:
                tagged_data.append([line["id"], token[0], token[1]])
    return pd.DataFrame(tagged_data, columns=["sentence_id", "words", "labels"])
    

## Dev Set cleanup

Here are some methods that can help us detect bad sentences in the dev set

In [14]:
df = import_jsons_to_df(f"{DATASET_PATH}/sample_dev.jsonl")
df.head(50)

Unnamed: 0,sentence_id,words,labels
0,13389,South,O
1,13389,Dakota,O
2,13389,was,O
3,13389,won,O
4,13389,by,O
5,13389,Governor,O
6,13389,George,B-PER
7,13389,W.,I-PER
8,13389,Bush,I-PER
9,13389,by,O


In [15]:
first_mus = df[df["labels"] == "B-MUS"]["sentence_id"].unique().tolist()[1]
df[df["sentence_id"] == first_mus]

Unnamed: 0,sentence_id,words,labels
220,17189,Sikkil,B-MUS
221,17189,Mala,I-MUS
222,17189,Chandrasekhar,I-MUS
223,17189,is,O
224,17189,a,O
225,17189,noted,O
226,17189,South,O
227,17189,Indian,O
228,17189,carnatic,O
229,17189,flautist,O


In [19]:
non_alpha_persons = [x for x in df[df["labels"] == "I-PER"]["words"].unique().tolist() if not x.isalpha()]
# this c
sentences_with_non_alpha_persons = df[df["words"].isin(non_alpha_persons)]
for i, row in sentences_with_non_alpha_persons.iterrows():
    print(row["sentence_id"], row["words"])

13389 W.
6613 -
24987 -
16505 "
16505 "
16505 "
16505 "
12907 E.
8009 "
8009 "
12687 -
17497 -
9367 L.
9367 E.
9367 -
9367 -
6450 "
6450 "
13373 -
17037 "
17037 "
18913 "
8922 -
26154 -
26154 2
2549 -
2549 -
712 "
712 "
27903 "
27903 "
3576 -
3576 -
3576 -
17161 -
6554 -
6554 "
6554 "
10091 "
10091 "
2369 l'Espadella
23521 "
23521 "
23521 "
23521 "
28246 -
28246 M'Baye
29463 C.
6477 "
26803 "
26803 "
26803 "
26803 "
9506 -
9506 -
12720 "
12720 "
12720 "
12720 "
23494 "
23494 "
23494 "
23494 "
19873 2
26992 "
26992 "
18437 -
18437 -
24244 "
24244 -
24244 "
24244 "
1530 -
1530 -
22042 "
1368 -
10457 -
14913 2
14645 -
18756 -
18756 -
29736 -
23022 D.
2924 -
2924 "
2924 "
30438 "
30438 "
30438 "
30438 D.
24190 -
22157 "
22157 "
25525 -
1394 -
21973 -
21045 -
21045 "
21045 "
31838 "
31838 "
31838 -
7755 "
7755 "
3830 -
3830 -
3830 "
3830 "
8219 4
8219 2
23874 -
23874 "
23874 "
23874 "
23874 "
5367 "
5367 "
5367 "
5367 "
5367 "
5367 "
3268 -
20858 -
32528 "
32528 "
32528 "
32528 "
19523 -
21

In [20]:
all_musicians = df[df["labels"] == "B-MUS"]["words"].unique().tolist()
all_musicians

['Glen',
 'Sikkil',
 'Steve',
 'Missy',
 'Danny',
 'Merle',
 'Jerry',
 'Kim',
 'James',
 'Patrice',
 'Patsy',
 'Wu',
 'Arnold',
 'Healey',
 'Calvin',
 'Högn',
 'Mariah',
 '6lack',
 'Stacey',
 'Janis',
 'Ray',
 'Elsa',
 'Charles',
 'Beatles',
 'Keyshia',
 'Frank',
 'Merissa',
 'Cooly',
 'Santitos',
 'Whitey',
 'Jimi',
 'David',
 'Chaabi',
 'Edward',
 'Madchester',
 'Joe',
 'Bill',
 'Powter',
 'Ryan',
 'Ernie',
 'Apareceu',
 'Adrian',
 'Jim',
 'Matthew',
 'Neil',
 'Cheryl',
 'Chet',
 'Richard',
 'Giacomo',
 'Pete']

Another way of reading the sentences comfortably

In [21]:
from termcolor import colored

In [27]:
with jsonlines.open(f"{DATASET_PATH}/sample_dev.jsonl", 'r') as f:
    for line in f:
        color_text = [x[0] if x[1] == "O" else colored(x[0], 'red') if "PER" in x[1] else colored(x[0], 'green') for x in line["sent_items"]]
        print(line["id"], " ".join(color_text))
        print()

13389 South Dakota was won by Governor [31mGeorge[0m [31mW.[0m [31mBush[0m by a 22.74 point margin of victory .

6613 Fantasy is an album by American singer - songwriter [31mCarole[0m [31mKing[0m , released in 1973 .

2771 His son , [31mBernard[0m , is the Conservative Member of Parliament for Harwich and North Essex .

24987 [32mGlen[0m [32mBurtnik[0m ( born [31mGlenn[0m [31mBurtnick[0m ; April 8 , 1955 ) is a singer , songwriter , entertainer and multi - instrumentalist , best known as a former member of [31mStyx[0m .

16505 An archival   recording of her performing " Balladeer of Cole Younger " was presented on [31mDanny[0m [31mDozier[0m 's Ozark Highlands Radio show where she was introduced as a " prodigious Ozark folk balladeer " .

15710 [31mHenry[0m admitted [31mArnulf[0m 's right to appoint bishops , which greatly reduced Pilgrim 's status .

2228 The company was created in July 2004 in a deal by former Ukrainian President [31mLeonid[0m [31mKuch