# Data Validation

After getting sentences that match our patterns using SPIKE's API, we would want to make sure the following hold, before tagging the data.

1. Sentences are not too short (like titles).
2. Captures make sense ( no non-alphabetical results etc.)
3. Spike is captures-oriented, that is, it returns a match per set of capture. We'd like to merge matches that are the same sentence that is because it has more than a single capture - for example
  a. sent 1: [David Bowie] and Freddie Mercury
  b. sent 2: David Bowie and [Freddie Mercury]
This should be merge that both are labeled with the musician label.
5. Similarly for non-musicians we'd like to ignore the captures and just look at the NER results.
6. `'s` is not part of the entity
7. Sentences in the train set do not appear in the test/dev sets.

and so on...

In [2]:
import string
import json
import pandas as pd
import glob
import jsonlines
from random import sample

In [3]:
LABEL = 'musician'
DATAPATH = '../data'

### Extract dev/test sentences

In [4]:
def remove_tags(sentence):
    tokens = []
    for t in sentence.split():
        if t:
            tokens.append(t.split('-[',1)[0])
    return clean_punct(" ".join(tokens))

def clean_punct(sentence):
    s = sentence.translate(str.maketrans('', '', string.punctuation))
    s = s.replace("  ", " ")
    return s

def get_dev_and_test_sentences(dataset_path):
    test_path = dataset_path + '/test.txt'
    dev_path = dataset_path + '/dev.txt'
    with open(test_path, 'r') as ft, open(dev_path, 'r') as fd:
        test_set = [remove_tags(sent.strip()) for sent in ft.readlines()]
        dev_set = [remove_tags(sent.strip()) for sent in fd.readlines()]
    dev_and_test = dev_set + test_set
    return dev_and_test

### validations

In [5]:
#validations
def sentence_is_not_too_short(sentence_text):
    return len(sentence_text) > 50

def capture_is_not_non_alphabetical(capture_text):
    alphabet = 'q w e r t y u i o p a s d f g h j k l z x c v b n m'.split()
    return any(x in capture_text for x in alphabet)


def validate_sentence(sentence_dict, label, capture_text, sentence_text, dev_and_test):
    if not capture_is_not_non_alphabetical(capture_text): return False
    if not sentence_is_not_too_short(sentence_text): return False
    if sentence_text in dev_and_test: return False
    return True

### Collect sentences

In [6]:
def get_capture(sentence, label):
    tokens = sentence["words"]
    first = sentence['captures'][label]['first']
    last = sentence['captures'][label]['last']
    capture_tokens = [t for i, t in enumerate(tokens) if first <= i <= last ]
    return " ".join(capture_tokens), first, last

def get_entities(sentence):
    return {(e['first'], e['last']) for e in sentence['entities']}


def collect_train_set_sentences():
    train_set = dict()
    dev_and_test = get_dev_and_test_sentences(f'{DATAPATH}/musicians_dataset')
    
    for file in glob.glob(f'{DATAPATH}/spike_matches/**/*.json', recursive=True):
        with open(file, "r") as f:
            j = json.load(f)
            for sentence_dict in j:
                label = LABEL if 'positive' in file else 'negative'
                sentence_text = clean_punct(" ".join(sentence_dict["words"])).strip()
                capture_text, cap_first, cap_last = get_capture(sentence_dict, label)
                if not validate_sentence(sentence_dict, label, capture_text, sentence_text, dev_and_test): continue
                if sentence_text not in train_set.keys():
                    train_set[sentence_text] = {
                        "label": label,
                        "words": sentence_dict["words"],
                        "captures": {(cap_first, cap_last)},
                        "entities": get_entities(sentence_dict) - {(cap_first, cap_last)}
                    }
                else:
                    if label == 'musician':
                        train_set[sentence_text]["captures"].add((cap_first, cap_last))
                        updated_entities = train_set[sentence_text]["entities"] - train_set[sentence_text]["captures"]
                        train_set[sentence_text].update({"entities": updated_entities})
                    else:
                        if (cap_first, cap_last) not in train_set[sentence_text]["captures"]:
                            train_set[sentence_text]["entities"].add((cap_first, cap_last))
                        else:
                            # not a true negative!
                            continue
    
    # make sure there are significantly more negative examples than positive ones. 
    print("Number of negatives: ", len([x for x, y in train_set.items() if y["label"] != 'musician']))
    print("Number of positives: ", len([x for x, y in train_set.items() if y["label"] == 'musician']))
    
    return train_set


## Tag Dataset

In [7]:
def tag_span(i, word, sentence, span_type, span_tagging):
    for span in sentence[span_type]:
        if span[0] <= i <= span[1]:
            if i == span[0]:
                sentence["tagged_sentence"] += f"{word}-[{span_tagging}B] "
            elif i == span[1]:
                if word == "'s":
                    sentence["tagged_sentence"] += f"{word}-[O] "
                else:
                    sentence["tagged_sentence"] += f"{word}-[{span_tagging}I] "
                sentence[span_type].remove(span)                
            else:
                sentence["tagged_sentence"] += f"{word}-[{span_tagging}I] "
            return True


def tag_sentence(sentence):
    sentence["tagged_sentence"] = ""
    captures = list(sentence["captures"])
    entities = list(sentence["entities"])
    for i, word in enumerate(sentence['words']):
        if sentence['label'] == LABEL:
            is_capture = tag_span(i, word, sentence, 'captures', '')
        else:
            is_capture = tag_span(i, word, sentence, 'captures', 'P')
        is_entity = tag_span(i, word, sentence, 'entities', 'P')
        if not (is_capture or is_entity):
            sentence["tagged_sentence"] += f"{word}-[O] "        
    sentence.update({"entities": entities, "captures": captures})
    
    return sentence
            
        

In [6]:
train_set = collect_train_set_sentences()

Number of negatives:  28393
Number of positives:  5322


In [9]:
with open(f'{DATAPATH}/musicians_dataset/train_set_with_hearst.txt', 'w') as f:
    for sent in sample([v for v in train_set.values()], len(train_set)):
        tagged = tag_sentence(sent)
        f.write(tagged["tagged_sentence"] + '\n')

## Alternative Sets

Create other dev and test sets. The code below splits the current train set into 80%-10%-10% for train dev and test sets respectively 

In [12]:
from sklearn.model_selection import train_test_split

DATASET = '../data/musicians_dataset'

In [13]:
def split_train_set(train_set_name):
    trainpath = f'{DATASET}/{train_set_name}.txt'
    df = pd.read_csv(trainpath, sep='\t', names=['sentence'])
    sp_train, sp_dev = train_test_split(df, test_size=0.2)
    
    # this is significantly slower, but ensures the output is exactly like the input, with no extra escaping and quotes.
    with open(f'{DATASET}/split_{train_set_name}.txt', "w") as ft:
        for i, row in sp_train.iterrows():
            ft.write(row['sentence']+"\n")
    with open(f'{DATASET}/split_{train_set_name.replace("train", "dev")}.txt', "w") as fd:
        for i, row in sp_dev.iterrows():
            fd.write(row['sentence']+"\n")

In [14]:
split_train_set("train_set_with_hearst")