# Data Validation

After getting sentences that match our patterns using SPIKE's API, we would want to make sure the following hold, before tagging the data.

1. Sentences are not too short (like titles).
2. Captures make sense (no non-alphabetical results, etc.).
3. SPIKE is capture-oriented, that is, it returns one match per set of captures. We'd like to merge matches that come from the same sentence, which happens when the sentence has more than one capture - for example:
  a. sent 1: [David Bowie] and Freddie Mercury
  b. sent 2: David Bowie and [Freddie Mercury]
These should be merged so that both captures are labeled with the musician label.
4. Similarly, for non-musicians we'd like to ignore the captures and just look at the NER results.
5. `'s` is not part of the entity.
6. Sentences in the train set do not appear in the test/dev sets.

and so on...

In [None]:
import string
import json
import pandas as pd
import glob

### Extract dev/test sentences

In [None]:
def remove_tags(sentence):
    """Drop the ``-[...`` suffix from every token and strip punctuation.

    The sentence is split on whitespace; each token is cut at its first
    ``-[`` (tokens without one are kept whole), the pieces are re-joined
    with single spaces, and the result is passed through ``clean_punct``.
    """
    # str.split() with no argument never yields empty tokens, so no
    # emptiness check is needed here.
    bare_tokens = (token.split('-[', 1)[0] for token in sentence.split())
    return clean_punct(" ".join(bare_tokens))

def clean_punct(sentence):
    """Remove ASCII punctuation from ``sentence`` and normalise whitespace.

    Every character in ``string.punctuation`` is deleted, then each run of
    whitespace is collapsed to a single space and leading/trailing
    whitespace is dropped.

    A single ``replace("  ", " ")`` pass is not enough here: two adjacent
    punctuation tokens leave three spaces behind (collapsed only to two),
    and a sentence-final punctuation token leaves a trailing space. Both
    cases make otherwise identical sentences compare unequal.

    Args:
        sentence: The text to clean.

    Returns:
        The cleaned text, with words separated by exactly one space.
    """
    without_punct = sentence.translate(str.maketrans('', '', string.punctuation))
    # split() with no argument splits on any whitespace run and ignores
    # leading/trailing whitespace, so re-joining fully normalises spacing.
    return " ".join(without_punct.split())

def get_dev_and_test_sentences(dataset_path):
    """Load the tag-free sentences of the dev and test splits.

    Reads ``<dataset_path>/test.txt`` and ``<dataset_path>/dev.txt``, one
    sentence per line. Each line is stripped and passed through
    ``remove_tags``.

    Returns:
        A list holding the dev sentences followed by the test sentences.
    """
    with open(dataset_path + '/test.txt', 'r') as test_file, \
            open(dataset_path + '/dev.txt', 'r') as dev_file:
        test_sentences = [remove_tags(line.strip()) for line in test_file]
        dev_sentences = [remove_tags(line.strip()) for line in dev_file]
    return dev_sentences + test_sentences

In [None]:
def get_capture(sentence, label):
    """Return the text and token span of the capture named ``label``.

    ``sentence["captures"][label]`` supplies the inclusive token indices
    ``first`` and ``last``; the tokens of ``sentence["words"]`` inside that
    range are joined with single spaces.

    Returns:
        A ``(capture_text, first, last)`` tuple.
    """
    words = sentence["words"]
    span = sentence['captures'][label]
    start, end = span['first'], span['last']
    # Both bounds are inclusive: keep every token that is not outside them.
    capture_text = " ".join(
        word for index, word in enumerate(words)
        if not (index < start or index > end)
    )
    return capture_text, start, end

def get_entities(sentence):
    """Return the ``(first, last)`` token span of each entity in ``sentence``.

    Spans are listed in the order they appear in ``sentence['entities']``.
    """
    spans = []
    for entity in sentence['entities']:
        spans.append((entity['first'], entity['last']))
    return spans

In [None]:
#validations
def sentence_is_not_too_short(sentence_text, min_length=50):
    """Return True when ``sentence_text`` is longer than ``min_length``.

    Used to filter out very short matches such as titles.

    Args:
        sentence_text: The sentence to check.
        min_length: Character count the sentence must exceed (strictly).
            Defaults to 50.

    Returns:
        ``True`` if ``len(sentence_text) > min_length``, else ``False``.
    """
    return len(sentence_text) > min_length

def capture_is_not_non_alphabetical(capture_text):
    """Return True when ``capture_text`` contains at least one letter.

    ``str.isalpha`` is used so that uppercase-only captures (e.g. "ABBA")
    and names with non-ASCII letters count as alphabetical; checking only
    lowercase ASCII letters would reject them.

    Args:
        capture_text: The captured span as text.

    Returns:
        ``True`` if any character is alphabetic, ``False`` otherwise
        (including for the empty string).
    """
    return any(char.isalpha() for char in capture_text)


def validate_sentence(sentence_dict, label, capture_text, sentence_text, dev_and_test):
    """Decide whether a matched sentence may go into the training set.

    A sentence is rejected when its capture contains no letters, when the
    sentence is too short, or when it already appears in the dev/test
    sentences.

    Args:
        sentence_dict: The raw match; currently unused, kept for the
            existing call signature.
        label: The label of the match; currently unused, kept for the
            existing call signature.
        capture_text: Text of the captured span.
        sentence_text: Punctuation-free sentence text.
        dev_and_test: Container of dev/test sentence texts, tested with ``in``.

    Returns:
        ``True`` if the sentence passes every check, ``False`` otherwise.
    """
    # Failures must return a falsy value: a tuple such as (None, None) is
    # non-empty and therefore truthy, which would let every sentence pass
    # a caller's `if not validate_sentence(...)` check.
    if not capture_is_not_non_alphabetical(capture_text):
        return False
    if not sentence_is_not_too_short(sentence_text):
        return False
    if sentence_text in dev_and_test:
        return False
    return True

In [None]:
# merge similar sentences
# NOTE(review): `positives` is not defined in the visible part of this
# notebook, so this cell raises NameError unless a cell outside this view
# defines it — presumably a leftover inspection of the first positive match;
# confirm or remove.
positives[0]

In [None]:
# Build the training set: one entry per distinct punctuation-free sentence.
# Matches for a sentence that was already seen are merged into its entry
# instead of creating a duplicate.
train_set = dict()
LABEL = 'musician'
dev_and_test = get_dev_and_test_sentences('../data/musicians_dataset')
# Set copy of the dev/test sentences so each leakage check is O(1) rather
# than a scan of the whole list.
dev_and_test_lookup = set(dev_and_test)


for file in glob.glob('../data/spike_matches/**/*.json', recursive=True):
    # The label depends only on the file path, so compute it once per file.
    label = LABEL if 'positive' in file else 'negative'
    with open(file, "r") as f:
        j = json.load(f)
    for sentence_dict in j:
        # start validations:
        capture_text, cap_first, cap_last = get_capture(sentence_dict, label)
        entities = get_entities(sentence_dict)
        sentence_text = clean_punct(" ".join(sentence_dict["words"])).strip()
        if not validate_sentence(sentence_dict, label, capture_text, sentence_text, dev_and_test_lookup):
            continue

        capture_span = (cap_first, cap_last)
        if sentence_text not in train_set:
            train_set[sentence_text] = {
                "label": label,
                "words": sentence_dict["words"],
                "captures": {capture_span},
                "entities": entities,
            }
            continue

        # The sentence was already seen: report it and merge the new span.
        # The entry keeps the label of the first match that created it.
        print(label, sentence_text)
        existing = train_set[sentence_text]
        if label == LABEL:
            existing["captures"].add(capture_span)
        elif capture_span not in existing["entities"]:
            # "entities" is a list (from get_entities), which has no .add();
            # append while keeping the no-duplicates behaviour of a set.
            existing["entities"].append(capture_span)



All in all, 15 sentences from test/dev appear in the train set, and have been removed. 

In [None]:
# Inspect the keys of train_set, i.e. the punctuation-free sentence texts.
train_set.keys()

In [None]:
# Spot-check a single entry: look up one long sentence by its punctuation-free
# text, which is the form used for train_set keys.
text = "YMO are considered pioneers in the field of popular electronic music and continue to be remixed or sampled by modern artists including experimental artist Yamantaka Eye electronica group LFO jungle band 4hero electrolatino artist Senor Coconut ambient house pioneers The Orb and 808 State electronic music groups Orbital and The Human League hip hop pioneer Afrika Bambaataa and mainstream pop musicians such as Michael Jackson Quincy Jones Greg Phillinganes Eric Clapton Mariah Carey and Jennifer Lopez"

train_set[text]

In [None]:
# Token list of the same sentence, used to check which tokens a span covers.
# The slice end is exclusive, so words[78:80] yields tokens 78 and 79
# ("Greg", "Phillinganes").
words = ["YMO", "are", "considered", "pioneers", "in", "the", "field", "of", "popular", "electronic", "music", ",", "and", "continue", "to", "be", "remixed", "or", "sampled", "by", "modern", "artists", ",", "including", "experimental", "artist", "Yamantaka", "Eye", ",", "electronica", "group", "LFO", ",", "jungle", "band", "4hero", ",", "electrolatino", "artist", "Senor", "Coconut", ",", "ambient", "house", "pioneers", "The", "Orb", "and", "808", "State", ",", "electronic", "music", "groups", "Orbital", "and", "The", "Human", "League", ",", "hip", "hop", "pioneer", "Afrika", "Bambaataa", ",", "and", "mainstream", "pop", "musicians", "such", "as", "Michael", "Jackson", ",", "Quincy", "Jones", ",", "Greg", "Phillinganes", ",", "Eric", "Clapton", ",", "Mariah", "Carey", ",", "and", "Jennifer", "Lopez", "."]
words[78:80]

In [None]:
# Show every training sentence that ended up with more than one capture span,
# followed by a blank line.
for sentence_key, record in train_set.items():
    if len(record["captures"]) <= 1:
        continue
    print(sentence_key, record["captures"], record["entities"], record["label"])
    print()