In [668]:
import pandas as pd 
import json
import glob
import re 
# Non-greedy pattern matching anything enclosed in angle brackets; cleanhtml() removes every match.
CLEANR = re.compile('<.*?>') 


def check_source_row(row):
    """Return True when every source head of ``row`` occurs in its sentence text.

    Parameters
    ----------
    row : mapping-like (e.g. one row of a grouped DataFrame)
        ``row['head']`` holds one or more source names separated by ``';'``;
        ``row['sent']`` is an iterable of sentence strings.

    Returns
    -------
    bool
        True only if each head, after ``-<digit>`` markers and surrounding
        whitespace are removed, is a case-insensitive substring of the
        space-joined sentences.
    """
    # Drop "-<digit>" markers (e.g. "bill-7" -> "bill"). The raw string avoids
    # the invalid-escape warning that a plain '-\d' literal triggers.
    head = re.sub(r'-\d', '', row['head'])
    # Build the lowercase haystack once instead of once per head.
    haystack = ' '.join(row['sent']).lower()
    return all(name.strip().lower() in haystack for name in head.split(';'))


def get_combined_df(annotated_fn, input_fn):
    """Join the annotations of one document onto its input sentences.

    Parameters
    ----------
    annotated_fn : str
        Path to a JSON file whose ``'data'`` entry holds the annotation rows,
        either directly or nested under a ``'row_data'`` key.
    input_fn : str
        Path to a JSON file whose ``'html_data'`` entry holds the input rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``sent``, ``sent_idx``, ``head``, ``quote_type`` and
        ``source_type``; one row for every input sentence whose ``sent_idx``
        equals an annotation's ``row_idx``.
    """
    # Context managers close both files; the original left the handles open.
    with open(annotated_fn) as annotated_file:
        json_dat = json.load(annotated_file)['data']
    if isinstance(json_dat, dict) and 'row_data' in json_dat:
        json_dat = json_dat['row_data']

    def unwrap(cell):
        # Annotation cells may be wrapped as {'field_value': ...}; keep only the value.
        return cell['field_value'] if isinstance(cell, dict) else cell

    # Element-wise unwrap via Series.map per column, which does not rely on
    # the deprecated DataFrame.applymap.
    annot_df = pd.DataFrame(json_dat).apply(lambda column: column.map(unwrap))

    with open(input_fn) as input_file:
        input_dat = json.load(input_file)['html_data']
    input_df = pd.DataFrame(input_dat)

    annot_df_with_input = (
        input_df[['sent', 'sent_idx']]
             .merge(annot_df[['row_idx', 'head', 'quote_type', 'source_type']], left_on='sent_idx', right_on='row_idx')
             .drop(['row_idx', ], axis=1)
        # Disabled filter kept for reference:
        #      .loc[lambda df: df['sent'].str.strip().str.len() > 1]
    )

    return annot_df_with_input


def cleanhtml(raw_html):
    """Return ``raw_html`` with every match of the module-level ``CLEANR`` pattern removed."""
    return CLEANR.sub('', raw_html)

####### 
# Build one combined frame per annotated document, preferring the checked
# version of an annotation file whenever one exists on disk.
input_data_files = glob.glob('../app/data/input_data/*/*')
annotated_files = glob.glob('../app/data/output_data_affil-role/*/*')
checked_files = glob.glob('../app/data/checked_data_affil-role/*/*')
checked_file_set = set(checked_files)  # constant-time membership test in the loop below
all_multiply_annotated_sentences = []
all_sources = []

for annot_fn in annotated_files:
    # Document id: the first run of digits in the file name.
    doc_id = re.search(r'\d+', annot_fn.split('/')[-1])[0]
    # Sibling paths are derived by rewriting directory and file-name prefixes.
    input_fn = annot_fn.replace('output_', 'input_').replace('_affil-role', '').replace('annotated-', 'to-annotate-')
    checked_cand = annot_fn.replace('output_', 'checked_').replace('annotated-', 'checked-')
    if checked_cand in checked_file_set:
        annot_fn = checked_cand

    annot_df_w_input = get_combined_df(annot_fn, input_fn)
    annot_df_w_input['doc_id'] = doc_id
    # Rows whose head carries a "-<digit>" marker; na=False makes missing heads
    # count as non-matching, which is what the former `== True` comparison did.
    multiply_annotated = annot_df_w_input.loc[lambda df: df['head'].str.contains(r'-\d', na=False)]
    all_multiply_annotated_sentences.append(multiply_annotated)
    all_sources.append(annot_df_w_input)

all_sources_df = pd.concat(all_sources)
# Strip markup from the sentence text.
all_sources_df['sent'] = all_sources_df['sent'].apply(cleanhtml)


def cache_doc_tokens(input_doc, tokenizer, nlp):
    """Tokenize every sentence of a document once and precompute offset tables.

    Parameters
    ----------
    input_doc : iterable of 4-item rows
        Only the first item of each row (the sentence text) is used.
    tokenizer : object
        Must provide ``encode(word, add_special_tokens=..., add_prefix_space=...)``
        as well as ``bos_token_id`` and ``eos_token_id``.
    nlp : callable
        Called on the stripped sentence; every item it yields is converted
        with ``str`` and treated as one word.

    Returns
    -------
    tuple
        ``(doc_tokens_by_word, doc_tokens_by_sentence, doc_tokens,
        word_lens_by_sent_cumsum, sent_lens, sent_lens_cumsum)``:
        per-sentence lists of per-word token ids; per-sentence flat token ids
        wrapped in BOS/EOS; all sentence tokens concatenated; one cumulative
        array of word boundaries per sentence (starting at 1); the token count
        of each sentence; and the cumulative sentence start offsets
        (starting at 0).
    """
    per_word_ids = []
    per_sentence_ids = []
    for text, _, _, _ in input_doc:
        words = [str(item) for item in nlp(text.strip())]
        # Every word except the first one in the sentence is encoded with a prefix space.
        word_ids = [
            tokenizer.encode(word, add_special_tokens=False, add_prefix_space=(position != 0))
            for position, word in enumerate(words)
        ]
        per_word_ids.append(word_ids)

        flat_ids = [token_id for ids in word_ids for token_id in ids]
        per_sentence_ids.append([tokenizer.bos_token_id, *flat_ids, tokenizer.eos_token_id])

    all_ids = [token_id for sentence_ids in per_sentence_ids for token_id in sentence_ids]

    # Each running total starts at 1 because position 0 of every sentence is
    # the extra BOS token.
    word_boundaries = [
        np.cumsum([1] + [len(ids) for ids in word_ids])
        for word_ids in per_word_ids
    ]

    sentence_sizes = [len(sentence_ids) for sentence_ids in per_sentence_ids]
    sentence_starts = np.cumsum([0] + sentence_sizes)

    return (
        per_word_ids,
        per_sentence_ids,
        all_ids,
        word_boundaries,
        sentence_sizes,
        sentence_starts
    )

In [522]:
# Probe: ids for 'hello' encoded with a prefix space and no special tokens.
tokenizer.encode('hello', add_special_tokens=False, add_prefix_space=True)

[20760]

In [523]:
# Probe: ids for 'there' encoded with a prefix space and no special tokens.
tokenizer.encode('there', add_special_tokens=False, add_prefix_space=True)

[89]

In [524]:
# Probe: same word without the prefix space; the recorded id differs from the prefixed one.
tokenizer.encode('hello', add_special_tokens=False, add_prefix_space=False)

[42891]

In [525]:
# Probe: 'there' without the prefix space; the recorded id differs from the prefixed one.
tokenizer.encode('there', add_special_tokens=False, add_prefix_space=False)

[8585]

In [528]:
# Probe: decode the prefixed id for 'there'; the recorded output keeps the leading space.
tokenizer.decode([89])

' there'

In [527]:
# Probe: decode the non-prefixed id for 'there'; the recorded output has no leading space.
tokenizer.decode([8585])

'there'

# Check which sources are not found in their directly-tagged sentences

In [286]:
# Collect, per (doc_id, head), the list of tagged sentences and keep only the
# groups for which check_source_row reports that a head is missing from them.
unfound = (
    all_sources_df
        .loc[all_sources_df['head'].ne('')]
        .groupby(['doc_id', 'head'])
        .agg(list)
        .reset_index()
        .pipe(lambda grouped: grouped.loc[~grouped.apply(check_source_row, axis=1)])
)

In [287]:
# Distinct heads that were not found, in value_counts() order.
unfound['head'].value_counts().index.tolist()

['journalist',
 'passive-voice',
 'Joseph R. Biden Jr.',
 'Donald J. Trump',
 'Elizabeth Warren',
 'Trump',
 'David Cameron',
 'John J. Bates',
 'Mahmoud Badr; Mohammed Abdel-Aziz; Hassan Shahin; Mai Wahba; Mohammed Heikal',
 'Mai Wahba',
 'Ron DeSantis',
 'Trevor Haynes',
 'Fu Cheng Qiu',
 'Amy McGrath',
 'Sputnik news agency',
 'Mika Brzezinski',
 'I. Launa',
 'Donald Trump',
 ' Food and Drug Administration',
 'Vladimir V. Putin',
 'Austin Gilbert',
 'Jonathan Segal',
 'Mike Pence',
 'Ryan Ellis',
 'some men',
 'David W. Eaton; Xuewei Bao',
 "Kenya's top politicians",
 "mine's owners",
 'Mahmoud Badr',
 "Paul Ryan's spokesman",
 'Hassan Shahin',
 'bill-7',
 'Giulio Regeni',
 'epytians',
 'Gov. Kate Brown',
 'Oregon State Senate',
 "Oregon's house",
 'Vermont Senate Judiciary Committee',
 "bill's sponsors",
 'bill-2',
 'bill-3',
 'bill-4',
 'bill-8',
 'Cody Wilson ',
 'Jerome H. Powell',
 'Fire departement',
 'Asia Development Bank',
 'Mary Walsh',
 'Ohio State University',
 'campaign

# Tag sources in document

In [495]:
def find_rk(seq, subseq):
    """Locate the first occurrence of ``subseq`` in ``seq`` using a rolling hash.

    Parameters
    ----------
    seq, subseq : list
        Sequences of hashable items that support slicing and ``==``.

    Returns
    -------
    int or bool
        Index of the first match, or ``False`` when there is none. Because
        ``False == 0`` in Python, callers must test ``result is False`` and
        must not compare the result with ``==`` or rely on truthiness.
    """
    n = len(seq)
    m = len(subseq)
    if seq[:m] == subseq:
        return 0
    # The window hash is an order-insensitive sum of item hashes, so a hash hit
    # is only a candidate; the slice comparison below confirms it.
    hash_subseq = sum(hash(x) for x in subseq)
    curr_hash = sum(hash(x) for x in seq[:m])
    for i in range(1, n - m + 1):
        # Slide the window: add the entering item, remove the leaving one.
        curr_hash += hash(seq[i + m - 1]) - hash(seq[i - 1])
        if hash_subseq == curr_hash and seq[i:i + m] == subseq:
            return i
    return False


def get_source_in_sentence(source_head, sentence):
    """Return the word index at which ``source_head`` starts in ``sentence``.

    Both arguments are split on whitespace and matched word by word, after
    ``-<digit>`` markers are removed from ``source_head``.

    Returns
    -------
    int
        Zero-based index of the first matching word, or ``-1`` when the head
        is empty, is not a substring of the sentence, or is a substring but
        does not line up with whole whitespace-separated words
        (e.g. ``"trump"`` inside ``"trump's"``).
    """
    source_head = re.sub(r'-\d', '', source_head)
    if not source_head.strip():
        # An empty head cannot be located; without this guard it would be
        # reported as a zero-length match at word 0.
        return -1
    if source_head not in sentence:
        return -1
    offset = find_rk(sentence.split(), source_head.split())
    # find_rk signals "no match" with False, which equals 0 and so would be
    # read as "found at word 0" by callers testing `!= -1`.
    return -1 if offset is False else offset

In [515]:
import spacy
# Load the spaCy pipeline 'en_core_web_lg'; the notebook passes `nlp` to cache_doc_tokens, which uses it to split sentences into words.
nlp = spacy.load('en_core_web_lg')

In [656]:
def find_source_offset(source_head, source_sents, doc_sents, tok_lens_by_sent, sent_lens, doc_idx=None):
    """Find the document-level token span of ``source_head``.

    The sentences attributed to the source are searched first; if none of them
    contains the head, every sentence of the document is searched.

    Parameters
    ----------
    source_head : str
        Name of the source to locate.
    source_sents, doc_sents : iterable of 4-item rows
        Rows whose first item is the sentence text and whose third item is the
        sentence index; the other two items are ignored.
    tok_lens_by_sent : indexable
        Per sentence, the cumulative token offsets of its word boundaries.
    sent_lens : indexable
        Per sentence, the document-level token offset at which it starts (the
        caller in this notebook passes the cumulative sum of sentence lengths).
    doc_idx : optional
        Document identifier stored in the result. When omitted, the
        notebook-level variable ``doc_idx`` is used, as before this parameter
        existed.

    Returns
    -------
    dict
        ``source``, ``s_idx``, ``start_tok_idx``, ``end_tok_idx`` and
        ``doc_idx``. When the head is not found, the index fields are ``-1``
        and an additional ``e_idx`` key (also ``-1``) is present.

    NOTE(review): word offsets come from whitespace splitting in
    get_source_in_sentence, while the notebook builds ``tok_lens_by_sent``
    from ``nlp`` tokens in cache_doc_tokens; the two may disagree when ``nlp``
    separates punctuation from a word — TODO confirm.
    """
    if doc_idx is None:
        # Backward compatibility: existing callers rely on a notebook-level `doc_idx`.
        try:
            doc_idx = globals()['doc_idx']
        except KeyError:
            raise NameError("name 'doc_idx' is not defined") from None

    needle = source_head.lower()
    n_words = len(source_head.split())

    # Pass 1: sentences attributed to the source. Pass 2: the whole document.
    for candidates in (source_sents, doc_sents):
        for sentence, _, s_idx, _ in candidates:
            offset = get_source_in_sentence(needle, unidecode(sentence).lower())
            # -1 is the documented miss value; `False` is what find_rk yields
            # for a miss and must not be mistaken for word 0.
            if offset is False or offset == -1:
                continue
            sent_toks = tok_lens_by_sent[int(s_idx)]
            sent_start = sent_lens[int(s_idx)]
            return {
                'source': source_head,
                's_idx': s_idx,
                'start_tok_idx': sent_start + sent_toks[offset],
                'end_tok_idx': sent_start + sent_toks[offset + n_words],
                'doc_idx': doc_idx
            }

    # Nothing found in either pass.
    return {
        'source': source_head,
        's_idx': -1,
        'e_idx': -1,
        'start_tok_idx': -1,
        'end_tok_idx': -1,
        'doc_idx': doc_idx
    }

In [666]:
def generate_training_chunk_from_source_offset(source_offset_chunk, all_doc_tokens, sent_lens):
    """Turn one source-offset record into a training example.

    Parameters
    ----------
    source_offset_chunk : dict
        Must contain ``'s_idx'`` (convertible with ``int``),
        ``'start_tok_idx'`` and ``'end_tok_idx'``.
    all_doc_tokens : list
        Token ids of the document; stored as-is (not copied) under ``'context'``.
    sent_lens : iterable of int
        Number of tokens in each sentence, in sentence order.

    Returns
    -------
    dict
        ``start_position``, ``end_position``, ``context`` and
        ``sentence_indicator_tokens`` — a list holding 1 for every token of
        sentence ``s_idx`` and 0 for every other token.
    """
    target_sentence = int(source_offset_chunk['s_idx'])
    span_start = source_offset_chunk['start_tok_idx']
    span_end = source_offset_chunk['end_tok_idx']

    indicator = []
    for position, n_tokens in enumerate(sent_lens):
        flag = 1 if position == target_sentence else 0
        indicator.extend([flag] * n_tokens)

    return {
        'start_position': span_start,
        'end_position': span_end,
        'context': all_doc_tokens,
        'sentence_indicator_tokens': indicator,
    }

In [670]:
import numpy as np 
import csv, itertools
from tqdm.auto import tqdm
from unidecode import unidecode

data_path = '../models_neural/quote_attribution/data/our-annotated-data__stage-2.tsv'
split, data_chunk = [], []
with open(data_path) as f:
    csv_reader = csv.reader(f, delimiter="\t")
    csv_data = list(csv_reader)

# Each row is used as (sentence, source head(s), sentence id, document id).
# NOTE(review): itertools.groupby only merges adjacent rows, so this assumes
# the TSV is already ordered by document id — confirm.
grouped = []
for doc_idx, doc in itertools.groupby(csv_data, key=lambda x: x[3]):  # group by doc_id
    sorted_doc = sorted(doc, key=lambda x: int(x[2]))  # sort by sent_id
    sorted_doc = list(map(lambda x: [x[0].strip(), x[1], x[2], x[3]] , sorted_doc))
    grouped.append((doc_idx, sorted_doc))

### 
training_data = []
# The loop variable `doc_idx` is also read by find_source_offset as a
# notebook-level name, so it must keep this exact name.
for doc_idx, doc_to_group in tqdm(grouped, total=len(grouped)):
    # Prepend two pseudo-source words to the first sentence — presumably so
    # the heads 'journalist' and 'passive-voice' can be located in the text.
    doc_to_group[0][0] = 'journalist passive-voice ' + doc_to_group[0][0]
    (
        doc_tok_by_word, 
        doc_tok_by_sent,
        all_doc_tokens, 
        word_len_cumsum,
        sent_lens,
        sent_len_cumsum
    ) = cache_doc_tokens(doc_to_group, tokenizer, nlp)    
    
    doc_to_group = sorted(doc_to_group, key=lambda x: x[1]) # sort by source
    
    for source_heads, source_sentences in itertools.groupby(doc_to_group, key=lambda x: x[1]):
        if source_heads == 'None':
            continue

        # groupby yields a one-shot iterator. Materialise it so every head of
        # a ';'-separated group searches all of the group's sentences, rather
        # than whatever the previous head left unconsumed.
        source_sentences = list(source_sentences)

        for source_head in source_heads.split(';'):
            source_head = unidecode(source_head).strip()
            source_chunk = find_source_offset(source_head, source_sentences, doc_to_group, word_len_cumsum, sent_len_cumsum)
            training_chunk = generate_training_chunk_from_source_offset(source_chunk, all_doc_tokens, sent_lens)
            training_data.append(training_chunk)

  0%|          | 0/296 [00:00<?, ?it/s]

In [658]:
# NOTE(review): `source_to_word_offset` is not assigned in any visible cell of this notebook — TODO confirm where it is built.
source_to_word_offset[-1]

[{'source': 'Food and Drug Administration',
  's_idx': '11',
  'start_tok_idx': 419,
  'end_tok_idx': 423,
  'doc_idx': '/test/716'},
 {'source': 'Elizabeth Holmes',
  's_idx': '1',
  'start_tok_idx': 60,
  'end_tok_idx': 62,
  'doc_idx': '/test/716'},
 {'source': 'Theranos',
  's_idx': '0',
  'start_tok_idx': 4,
  'end_tok_idx': 5,
  'doc_idx': '/test/716'},
 {'source': 'Wall Street Journal',
  's_idx': '11',
  'start_tok_idx': 394,
  'end_tok_idx': 397,
  'doc_idx': '/test/716'},
 {'source': 'company spokesman',
  's_idx': '8',
  'start_tok_idx': 294,
  'end_tok_idx': 296,
  'doc_idx': '/test/716'}]

In [650]:
# Sanity check: decode tokens 11:14 of sentence 11 and compare with the expected source text.
tokenizer.decode(doc_tok_by_sent[11][11:14])

' Wall Street Journal'

In [661]:
# Sanity check: decode the document-level span 294:296 and compare with the expected source text.
tokenizer.decode(all_doc_tokens[294:296])

' company spokesman'

In [628]:
# Model identifier whose configuration is loaded with AutoConfig.from_pretrained.
model_name = "deepset/roberta-base-squad2"

In [629]:
from transformers import AutoConfig

In [631]:
# Load the configuration for `model_name`.
config= AutoConfig.from_pretrained(model_name)

In [632]:
# Number of labels declared by the loaded configuration.
config.num_labels

2

In [621]:
# TODO: run sanity checks and clean up these sources.

In [518]:
# NOTE(review): `source_to_word_offset` is not assigned in any visible cell — TODO confirm its origin.
# Flatten the nested lists of source records into one DataFrame row per record.
source_word_offset_df = pd.DataFrame([i for s in source_to_word_offset for i in s])

# Count records per (doc_idx, source) and keep the pairs that occur more than once;
# this is likely because of rows with multiple sources (sep by ';')
# NOTE(review): this expression is not the last statement of the cell, so its value is not displayed.
source_word_offset_df.assign(c=1).groupby(['doc_idx', 'source'])['c'].sum().loc[lambda s: s>1].head()

## todo: go through and correct all these sources
# Records whose source was not found (s_idx == -1); the result is bound to `_` and not displayed.
_ = source_word_offset_df.loc[lambda df: df['s_idx'] == -1]

In [519]:
# Put the rows back in sentence-id order (the training loop leaves doc_to_group sorted by source).
doc_to_group = sorted(doc_to_group, key=lambda x: int(x[2]))

In [566]:
# cache_doc_tokens returns six values; unpack all of them in its return order.
# The former five-name unpack raised ValueError against the current definition.
(
    doc_tok_by_word,
    doc_tok_by_sent,
    all_doc_tokens,
    word_len_cumsum,
    sent_lens,
    sent_len_cumsum,
) = cache_doc_tokens(
    doc_to_group, tokenizer, nlp
)

In [567]:
# Inspect the per-sentence cumulative word-boundary offsets (each array starts at 1).
word_len_cumsum

[array([ 1,  3,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40, 41, 42, 43, 44, 45, 46]),
 array([ 1,  2,  3,  4,  5,  6,  7,  8, 11, 12, 13, 14, 15, 16, 17, 18, 21,
        22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
        39, 40, 41, 42]),
 array([ 1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12, 14, 15, 16, 17, 20, 21,
        22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]),
 array([ 1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51]),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 35, 36, 37, 38,
        39, 40, 41, 42]),
 array([ 1,  2,  3,  4,  6,  7,  8, 10, 11, 13, 14, 15, 18, 19, 20, 21, 23

In [568]:
# NOTE(review): `toks` is not assigned in any visible cell of this notebook — TODO confirm where it comes from.
tokenizer.decode(toks)

' Food and Drug Administration'

In [579]:
# Fifth record of the last entry; the recorded output carries word-level keys (start_word_idx / end_word_idx).
source_to_word_offset[-1][4]

{'source': 'company spokesman',
 's_idx': '8',
 'start_word_idx': 1,
 'end_word_idx': 3,
 'doc_idx': '/test/716'}

In [580]:
# Token offset of word boundary 1 in sentence 8 (start_word_idx of the record inspected before).
word_len_cumsum[8][1]

2

In [581]:
# Token offset of word boundary 3 in sentence 8 (end_word_idx of the record inspected before).
word_len_cumsum[8][3]

4

In [582]:
# Decode the token span 2:4 of sentence 8 to check that it reproduces the source text.
tokenizer.decode(doc_tok_by_sent[8][2:4])

' company spokesman'

# Make sure we have a good tokenizing pattern

In [132]:
from transformers import AutoTokenizer, RobertaTokenizer

In [653]:
# NOTE(review): absolute, user-specific path — this cell cannot run unchanged on another machine.
# `tokenizer` is used by cells that appear earlier in the notebook, so this cell has to be executed before them.
tokenizer = RobertaTokenizer.from_pretrained('/Users/alex/.cache/torch/transformers/named-models/roberta-base-expanded-embeddings')

In [143]:
# Stripped first field (the sentence text) of every row in `sorted_doc`.
doc_sents = list(map(lambda x: x[0].strip(), sorted_doc))

In [144]:
# All sentences joined with single spaces.
doc_str = ' '.join(doc_sents)

In [148]:
# First sentence of the document.
s = doc_sents[0]

In [672]:
import torch

In [674]:
# Probe: torch.ones_like with a tensor argument (recorded output: tensor([1, 1, 1])).
torch.ones_like(torch.tensor([1,2,3]))

tensor([1, 1, 1])

In [675]:
# Probe: torch.ones_like with a plain list; the recorded run raised TypeError (input must be a Tensor).
torch.ones_like([1,2,3])

TypeError: ones_like(): argument 'input' (position 1) must be Tensor, not list