# Soft label documents using weak labelers

Use a set of weak learners or non-fine tuned models to predict which words are the crowd size.

For each document, each labeler will return the character offset~a vector of len(doc) indicating whether a word is in the crowd size span. These can be aggregated into an overall prediction and converted into a character offset span.

Thoughts:

- the QA model probably isn't compatible with a continuous/regression outcome.

In [6]:
import jsonlines
import spacy
from word2number import word_to_num
import numpy as np
import re
from tqdm.autonotebook import tqdm

In [7]:
nlp = spacy.load("en_core_web_lg")

In [140]:
with jsonlines.open("../ccc_train.jsonl", "r") as f:
    train = list(f.iter())

In [9]:
train[44]

{'size_cat': 1,
 'size_text': 'a few dozen',
 'text': 'The protests are in response to the death of George Floyd, a black man who died after a white officer pressed his knee into Floyd’s neck for minutes, even after he stopped moving.\n\nLas Vegas Metropolitan Police officers approached the protesters Tuesday night to discuss their plans prior to marching. A protester asked the two officers if they would march with them. "We all go together," he said.\n\nAn officer responded that they supported their message, but marching with the protesters could make them a target after a police shooting the night prior.\n\n"This is the risk that we\'re willing to take, it\'s our job, but we shouldn\'t be putting you guys at risk," the officer said.\n\nBy 11 p.m., most of the group had gone home while a few dozen remained in the area of Harmon Avenue and Paradise Road.\n\n@LVMPD officers approached the crowd & one man said, “why don’t you march with us?” Implying they would be safer with officers by 

## Labeler 1: use Roberta QA non-fine tuned

The first method uses a non-fine tuned Roberta QA model.

In [141]:
with jsonlines.open("ccc_train-qa-output.jsonl", "r") as f:
    qa_output = list(f.iter())

In [158]:
NUM = 333
qa = qa_output[NUM]
ex = train[NUM]
doc = nlp(ex['text'])

In [166]:
def qa_match(doc, qa, weight=1):
    word_labels = np.zeros(len(doc))
    for n, i in enumerate(doc):
        if i.idx >= qa['start'] and i.idx < qa['end']:
            word_labels[n] = weight
            
    return word_labels

output = qa_match(doc, qa)
[i for i in doc if output[i.i]]

[Dozens]

In [164]:
np.where(qa_match(doc, qa))[0]

array([92])

## Labeler 2: Use spaCy spans matching the order of magnitude

In [13]:
_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
              'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
              'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
              'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
              'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
              'gajillion', 'bazillion', 'dozen', 'dozens', 'hundreds', 'thousands']

def like_num(text):
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    return False

def cand_generator(doc):
    for i in doc:
        if like_num(i.text):
            if any([like_num(j.text) for j in i.ancestors]):
                continue
            phrase = [i] + [j for j in i.children if j.dep_ in ['amod', 'quantmod', 'compound', 'advmod']]
            phrase.sort(key=lambda x: x.i)
            yield(phrase)
            
def cand_text(doc):
    spans = [i for i in cand_generator(doc)]
    text_list = []
    for s in spans:
        text = ''.join([i.text_with_ws for i in s if like_num(i.text)]).strip()
        text_list.append(text)
    return text_list

In [126]:
doc = nlp("A protest of a few dozen people occurred in the park.")
list(cand_generator(doc))

[[a, few, dozen]]

In [127]:
cand_text(doc)

['dozen']

In [129]:
def oom_match(doc, size_cat, normalize=True, debug=False):
    # get spacy spans for number phrases
    cands_raw = list(cand_generator(doc))
    # ...and also in text form, stripping out non-number words
    cands_text = cand_text(doc)
    # convert number words to numeric value
    nums = []
    for i in cands_text:
        try:
            nn = word_to_num(i) 
        except ValueError:
            # this is caused by some weird dates ("4/22")
            # for now, assign a too-large number to exclude it
            nn = 1000000000
        nums.append(nn)
    if debug:
        print(cands_raw)
    # convert to CCC size_cat scale
    oom = [len(str(int(i)))-1 for i in nums]
    # check if the extracted span is the right size_cat
    oom_locs = [int(i == int(size_cat)) for i in oom]
    
    # initialize a vector of zeros
    word_labels = np.zeros(len(doc))
    # find the spans that are an oom match, and assign 1 to their tokens
    for n, loc in enumerate(oom_locs):
        if loc:
            poses = [i.i for i in cands_raw[n]]
            if normalize:
                weight = 1 / sum(oom_locs)
            else:
                weight = 1
            word_labels[poses] = weight
            
    return word_labels


In [131]:
doc = nlp('Drivers honked their horns as about 150 protesters lined the Perkiomen Bridge in Collegeville on Saturday afternoon chanting "I can\'t breathe" in honor of George Floyd.')
size_cat = 2
oom_match(doc, size_cat, debug=True)

[[about, 150]]


array([0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [167]:
oom_match(doc, size_cat, debug=True)

[[12], [thousands], [13], [dozens], [26], [54], [27], [54], [47], [2013], [2015], [1,000], [4], [13], [2016], [2014], [2015]]


array([0., 0., 0., ..., 0., 0., 0.])

In [16]:
#oom_match_norm = oom_match / np.sum(oom_match)
#oom_match_norm

## Labeler 3: keywords

Find sentences that have have a matching term, and return those.

In [17]:
#ex = train[233]
#ex = train[1115]
#doc = nlp(ex['text'])

def keyword_match(doc, weight=1):

    search_terms = ["protesters", "demonstrators", "gathered", "crowd", "rallied", "attended",
               "picketed", "protest"]

    # check each sentence to see if it has a keyword
    keyword_match = []
    for c in cand_generator(doc):
        matches = [bool(re.search(s, c[0].sent.text)) for s in search_terms]
        if sum(matches) > 0:
            keyword_match.append([i.i for i in c])
            
    # initialize a vector of zeros
    word_labels = np.zeros(len(doc))
    # assign 1 to the spans that are in a keyword sentence
    for i in keyword_match:
        word_labels[i] = weight
        
    return word_labels

keyword_match(doc)

array([0., 0., 0., ..., 0., 0., 0.])

In [76]:
train[414]

{'size_cat': 1,
 'size_text': 'about 20',
 'text': 'Jayce Storrs, from Green Canyon High School, and Kade White, from Sky View High School, organized a rally Friday to show support for America’s 2nd Amendment.\n\nStorrs said, “We’re sick of our Second Amendment rights being stepped on every day. We just want to show the people of Cache Valley, and more that see in the newspaper, that we’re here to show that there are people out here to fight for Second Amendment rights.”\n\nThe group of about 20 students drove 13 vehicles with flags down Main Street through Logan Friday afternoon, with waves and honks of support along the way.\n\nWhite said, “We just want to show there is support for the Second Amendment even though we’re not as loud as the people who are against it.”'}

In [170]:
docs = list(nlp.pipe([i['text'] for i in train]))

In [185]:
def label_doc(doc, ex, qa=None):
    #qa_labels = qa_match(doc, qa)
    oom_labels = oom_match(doc, ex['size_cat'])
    keyword_labels = keyword_match(doc)
    all_labels = np.array([#qa_labels, 
                            keyword_labels, 
                            oom_labels])
    votes = np.sum(all_labels, axis=0)
    votes2 = np.multiply(votes, oom_labels) # remove ones that are the wrong oom.
    maj_vote = votes2 == np.max(votes2)
    
    voted_words = [i for n, i in enumerate(doc) if maj_vote[n]]
    start_char = voted_words[0].idx
    # Get the last word from the first contiguous span
    for n, current_word in enumerate(voted_words):
        prev_word = voted_words[n-1]
        if (n + 1) == len(voted_words):                
            end_word = current_word
            break
        if n == 0:
            continue
        if current_word.i - prev_word.i > 1:
            end_word = prev_word
            break
        if current_word.i == len(doc):
            end_word = current_word
            
    end_char = end_word.idx + len(end_word)
    answer_text = doc.text[start_char:end_char]

    ## get the sentence with the soft labeled answer
    context = end_word.sent
    
    output = {#"token_labels" : maj_vote.tolist(),
        "start_char": start_char,
        "end_char": end_char,
        "context": context.text,
        "labeled_text": answer_text}
    return output

def label_all(docs, train, qa_output):
    doc_labels = []
    for doc, ex, qa in tqdm(zip(docs, train, qa_output), total=len(train)):
        doc_labels.append(label_doc(doc, ex, qa))
    return doc_labels
        
#doc_labels = label_all(docs, train, qa_output)

In [188]:
NUM = 194
doc = docs[NUM]
ex = train[NUM]
qa = qa_output[NUM]

label_doc(doc, ex, qa)

{'start_char': 77,
 'end_char': 86,
 'context': 'SEATTLE — After several hours of peaceful gatherings and marches Saturday by thousands of people in Seattle protesting the death of George Floyd in Minneapolis, Seattle police said the crowd turned violent, throwing bottles and Molotov cocktails, setting fires, breaking windows and looting businesses in the downtown core.\n\n',
 'labeled_text': 'thousands'}

In [157]:
correct = []
for t, l in zip(train[0:50], doc_labels[0:50]):
    correct.append(t['size_text'].lower() == l['labeled_text'].lower())
    
np.mean(correct)

0.62

In [175]:
correct = []
for t, l in zip(train, doc_labels):
    correct.append(t['size_text'].lower() == l['labeled_text'].lower())
    
np.mean(correct)
# 0.558648849294729  without Roberta
# 0.5545657015590201  with Roberta


0.5545657015590201

In [139]:
ex = train[417]
doc = nlp(ex['text'])
oom = oom_match(doc, ex['size_cat'], debug=True)
[i for n, i in enumerate(doc) if oom[n]]

[[2], [about, a, dozen], [1]]


[about, a, dozen]

In [None]:
ex = {"text": "A group of about 200 protestors met...",
     "soft_label_text": "200"}

In [154]:
doc_labels[553]

{'token_labels': [False,
  False,
  False,
  False,
  False,
  True,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  Fa

In [150]:
[(i['size_text'].lower(), j['labeled_text'].lower()) for i, j in zip(train[0:50], doc_labels[0:50])]

[('hundreds', 'hundreds'),
 ('dozens',
  'oakland students walked out of class today to support teachers who are on the verge of a strike. they plan to march down broadway to school district headquarters. pic.twitter.com/ryqa7kmlcq — amy hollyfield (@amyhollyfield) february 8, 2019\n\noakland, calif. (kgo) -- dozens of high school students in oakland walked out of class today to show support for their teachers, who are on the verge of a strike.they gathered in front of oakland tech high school this morning and then marched together down broadway for a rally in front of school district offices.this is at least the third time oakland students have walked out of class to support their teachers this school year. students say they aren\'t sure whether these demonstrations will make a difference in the teacher\'s contract negotiations but they feel like they have to try.abc7 news caught up with oakland tech senior alex arriola before the march to ask him if this makes a difference."we are ma

In [39]:
for_dump = []
for i in doc_labels:
    for_dump.append(i.tolist())
    
with jsonlines.open("maj_vote_labels.jsonl", "w") as f:
    f.write_all(for_dump)

## Evaluate on test data

In [98]:
with jsonlines.open("../ccc_test.jsonl", "r") as f:
    test = list(f.iter())
    
with jsonlines.open("../ccc_test-qa-output.jsonl", "r") as f:
    qa_output = list(f.iter())
    
assert len(test) == len(qa_output)

In [99]:
test_labels = label_all(test, qa_output)

HBox(children=(FloatProgress(value=0.0, max=1130.0), HTML(value='')))




In [100]:
exact_match = []
for ex, out in zip(test, test_labels):
    pred_text = ex['text'][out['start_char']:out['end_char']]
    m = pred_text.lower().strip() == ex['size_text'].lower().strip()
    if not m:
        if  out['end_char'] - out['start_char'] < 100:
        #print(out['start_char'], out['end_char'])
            print("PREDICTED:", pred_text, "  ACTUAL:", ex['size_text'])
    exact_match.append(m)

PREDICTED: 50   ACTUAL: scores
PREDICTED: 200   ACTUAL: roughly 200
PREDICTED: 100   ACTUAL: about 100
PREDICTED: 150   ACTUAL: about 150
PREDICTED: 24   ACTUAL: more than a dozen
PREDICTED: 80   ACTUAL: about 80
PREDICTED: roughly 30   ACTUAL: a group of roughly 30
PREDICTED: Over more than a dozen   ACTUAL: more than a dozen
PREDICTED: 200   ACTUAL: about 200
PREDICTED: 200   ACTUAL: at least 200
PREDICTED: 75   ACTUAL: about 75
PREDICTED: Approximately 500   ACTUAL: from about 150 to perhaps 800
PREDICTED: 15   ACTUAL: dozens
PREDICTED: 600   ACTUAL: about 200
PREDICTED: more than 30   ACTUAL: dozens
PREDICTED: 15   ACTUAL: About 15
PREDICTED: 301   ACTUAL: Hundreds
PREDICTED: 30   ACTUAL: some 30
PREDICTED: 200   ACTUAL: nearly 200
PREDICTED: 100   ACTUAL: about 200
PREDICTED: about 20 students held a protest on UNLV campus on Thursday, Oct. 11   ACTUAL: about 20
PREDICTED: 100   ACTUAL: estimated 100
PREDICTED: 75   ACTUAL: about 75
PREDICTED: thousands   ACTUAL: nearly a thousand

In [101]:
np.mean(exact_match)

0.5371681415929204

In [None]:
# PREDICTED: roughly two dozen   ACTUAL: two dozen
# PREDICTED: around two dozen   ACTUAL: around two dozen teachers and parents
# PREDICTED: a few dozen   ACTUAL: a few dozen participants
# PREDICTED: two dozen   ACTUAL: around two dozen
# PREDICTED: 300   ACTUAL: A crowd of 300


In [128]:
qa_output[55]

{'score': 0.6445919275283813,
 'start': 23241,
 'end': 23249,
 'answer': 'About 80'}

In [129]:
test[55]

{'size_cat': 2,
 'size_text': 'nearly 100',
 'text': 'PINCKNEYVILLE — Sophie Kelly wondered why no one in her hometown of Pinckneyville had planned a protest or rally to stand with the Black Lives Matter movement.\n\nSince George Floyd, a black man, died in Minneapolis Police custody on Memorial Day after a white police officer pressed his knee to Floyd\'s neck for nearly nine minutes, protests have been held around the world, including in several small Southern Illinois towns like Anna, Benton and Carterville.\n\nKelly posed the question about a Pinckneyville rally on Facebook. Her friends Mikayla Rhienecker and Logan Wildermuth were considering organizing a rally there, too.\n\n“We felt like Pinckneyville needs a change. I’ve heard stories of people who are afraid to come to Pinckneyville,” Rhienecker said. “They call us the friendly little city, but we’re not really that friendly.”\n\nWildermuth decided he had to help his friends.\n\nThe three friends in Pinckneyville started planni

## Add Roberta features

In [187]:
doc_labels[55]

{'token_labels': [False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  True,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  Fa

In [None]:
# make a span of 200

In [176]:
test_labels[44]

{'token_labels': [False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  F