# PolNeAR

In [1]:
import numpy as np 
import os
import glob
import pandas as pd 
import spacy
from tqdm.auto import tqdm
from IPython.display import display, HTML
import ast

nlp = spacy.load('en_core_web_lg')

def get_loc_in_sent(char_idx, sent_lens_cumsum):
    if not isinstance(char_idx, int):
        char_idx = int(char_idx)
    
    sent_bin = np.digitize(char_idx, sent_len_cumsum)
    offset_in_sent = char_idx - sent_len_cumsum[sent_bin - 1]
    return sent_bin, offset_in_sent

In [2]:
text_files = glob.glob('../data/academic-datasets/PolNeAR/data/*/text/*')

In [3]:
all_text = []
all_annotations = []
all_sentences = []
for text_file in tqdm(text_files):
    base_dir, filename = text_file.split('text/')
    base_filename = filename.replace('.txt', '')
    
    ## read text
    with open(text_file) as f:
        text = f.read()
        doc = nlp(text)
        sents = list(doc.sents)
        text_sents = list(map(str, sents))
        sent_len_cumsum = list(map(lambda x: x.end_char, sents))
        all_text.append({
            'text_id': base_filename,
            'text': text,
            'sentences': text_sents
        })
        
    ## read annotation
    attr_dir = os.path.join(base_dir, 'attributions', '*')
    annot_files = list(filter(lambda x: base_filename in x, glob.glob(attr_dir)))
    for a in annot_files:
        f_id = a.split('/')[-1]        
        d = open(a).read()
        t = list(map(lambda x: x.split('\t'), d.split('\n')))
        t = list(filter(lambda x: len(x) > 1, t))
        annotated_file = pd.DataFrame(t)

        if len(annotated_file) == 0:
            continue

        annotated_file['source'] = np.nan
        annotated_file['start_sentence'] = np.nan
        annotated_file['end_sentence'] = np.nan

        for row_idx, annotated_id, info_block in (
            annotated_file
                 .loc[lambda df: df[0].str.contains('T') == True]
                 [[0, 1]].itertuples()
        ):
            chunks = info_block.split(' ')
            block_type, char_idx_chunks = chunks[0], ' '.join(chunks[1:])
            start_idxs, end_idxs = [], []
            
            for start_end_chunk in char_idx_chunks.split(';'):
                start, end = start_end_chunk.split(' ')
                start_sent_idx, char_start = get_loc_in_sent(start, sent_len_cumsum)
                end_sent_idx, char_end = get_loc_in_sent(int(end) - 1, sent_len_cumsum)
                start_idxs.append(start_sent_idx)
                end_idxs.append(end_sent_idx)
                
            annotated_file.loc[row_idx, 'start_sentence'] = str(start_idxs)
            annotated_file.loc[row_idx, 'end_sentence'] = str(end_idxs)

        entities = annotated_file.loc[lambda df: df[0].str.contains('E')]
        for source_block in entities[1]:
            chunks = source_block.strip().split(' ')
            ids = list(map(lambda x: x.split(':')[1], chunks))
            source_ids = list(filter(lambda x: 'Source' in x, chunks ))
            if len(source_ids) == 1:
                source_id = source_ids[0].split(':')[1]
                source_name = annotated_file.loc[lambda df: df[0] == source_id][2].iloc[0]
                annotated_file.loc[lambda df: df[0].isin(ids), 'source'] = source_name 
            
        annotated_file['t_id'] = base_filename
        annotated_file['a_id'] = f_id
        all_annotations.append(annotated_file)

  0%|          | 0/1012 [00:00<?, ?it/s]

In [4]:
all_annotations_df = pd.concat(all_annotations)

In [6]:
all_text_df = (
    pd.DataFrame(all_text)
    .assign(sentences=lambda df: df['sentences'].apply(lambda x: list(map(lambda y: y.strip(), x))))
)

In [7]:
source_annotations = (
    all_annotations_df
     .loc[lambda df: df['source'].notnull()]
     .assign(start_sentence=lambda df: df['start_sentence'].apply(ast.literal_eval))
     .assign(end_sentence=lambda df: df['end_sentence'].apply(ast.literal_eval))    
)

In [9]:
source_annotations['start_sentence'].str.len().value_counts()

1     96531
2      3277
3       132
4        50
6        12
5        12
7         2
11        2
20        1
9         1
10        1
12        1
8         1
Name: start_sentence, dtype: int64

In [10]:
source_sentence_mapping = (
    source_annotations
     .groupby(['t_id', 'a_id', 'source'])
     [['start_sentence', 'end_sentence']]
     .aggregate(list)
     .applymap(lambda x: list(set([s for l in x for s in l])))
     .apply(lambda x: list(set(x['start_sentence'] + x['end_sentence'])), axis=1)
     .to_frame('source_sentences')
     .reset_index()
)

In [12]:
questionable_words = [
#     'it',
#     'which',
#     'that',
#     'some',
#     'you',
#     'many'
]

anonymous_sources = ['some', 'many']
messy_sources = ['you', 'that', 'which', 'i']

In [13]:
## check questionable sources
if False:
    t = (source_sentence_mapping
             .loc[lambda df: df['source'].isin(questionable_words)]
             .merge(all_text_df[['text_id', 'sentences']], left_on='t_id', right_on='text_id')
             .assign(source_sentences_text=lambda df: 
                 df.apply(lambda x: list(map(lambda y: x['sentences'][y], x['source_sentences'])) , axis=1)
            )
    )

    t[['source', 'source_sentences_text']].iloc[3].values

In [14]:
source_sentence_mapping = source_sentence_mapping.loc[lambda df: ~df['source'].str.lower().isin(messy_sources)]

In [15]:
quote_sentence_lists = (
    source_sentence_mapping
     .groupby('t_id')[['source', 'source_sentences']]
     .aggregate(list)
     .assign(source=lambda df: df.apply(lambda x: [[x['source'][i]] * len(s) for i, s in enumerate(x['source_sentences'])], axis=1))
     .applymap(lambda x: [s for l in x for s in l])
#      .apply(lambda x: list(set(x)))
#      .to_frame('quote_sentences')
)

In [16]:
quote_sentence_lists_exploded = (
    quote_sentence_lists
     .apply(lambda x: list(map(lambda y: {'source': y[0], 'sent': y[1], 't_id': x.name}, zip(x['source'], x['source_sentences']))) , axis=1)
     .pipe(lambda s: pd.DataFrame([i for x in s for i in x]))
)

In [17]:
all_text_df_exploded = (all_text_df
 .apply(lambda x: list(map(lambda y: {'s': y[1], 's_idx': y[0], 't_id': x['text_id']}, enumerate(x['sentences']))), axis=1)
 .pipe(lambda s: pd.DataFrame([i for x in s for i in x]))
)

In [18]:
matched_quotes_exploded = (
    all_text_df_exploded
     .merge(quote_sentence_lists_exploded, how='left', left_on=['t_id', 's_idx'], right_on=['t_id', 'sent'])
     .set_index('t_id')
     .drop(['sent', 
#             's_idx'
           ], axis=1)
)

In [50]:
t_ids = (matched_quotes_exploded.index.unique().tolist())


t_ids = (
    matched_quotes_exploded
     .loc[lambda df: df['source'].notnull()]
     .loc[lambda df: df.apply(lambda x: x['source'] not in x['s'], axis=1)]
     .index.unique().tolist() 
)

In [73]:
coreferent_sources = ['he', 'she', 'who', 'they', 'her', 'his', 'a', 'some', 'q']

In [95]:
t_ids = (matched_quotes_exploded
 .loc[lambda df: ~df['source'].str.lower().isin(coreferent_sources)]
 # source not in the sentence
 .loc[lambda df: df['source'].notnull()]
 .loc[lambda df: df.apply(lambda x: x['source'] not in x['s'], axis=1)] 
 .reset_index()
 .assign(c=1)
 .groupby(['t_id', 'source'])
 ['c'].sum()
 .reset_index()
 .loc[lambda df: df['c'] > 1]
 ['t_id'].unique().tolist()
)

In [99]:
idx = 5
d = matched_quotes_exploded.loc[t_ids[idx]]

In [100]:
html_all = '<table>'
for sent, s_idx, source in d.itertuples(index=False):
    if pd.notnull(source):
        html_all += '<tr><td style="background-color: pink">' +  sent + '</td><td>' + source + '</tr>'
    else:
        html_all += '<tr><td>' + sent + '</td><td></td></tr>'
html_all += '</table>'

In [101]:
display(HTML(html_all))

0,1
"Exclusive–Maria Espinoza, Anti-Illegal-Immigration Leader, Running For Congress.",
Anti-illegal immigration leader Maria Espinoza is running for Congress in the 7th congressional district in Texas.,
"Espinoza is Co-founder and National Director of The Remembrance Project, a non-profit that advocates for families of Americans who have been killed by illegal aliens.",a non-profit
She will be challenging incumbent Rep. John Culberson (R-TX).,She
“Culberson just isn’t working any more.,Espinoza
"In fact, last year, he missed the second most votes of any Texas Congressman and was among the bottom ten percent of all members of Congress for missed votes,” Espinoza tells Breitbart News.",Espinoza
"“Culberson is now part of the problem in Washington, not the solution,” Espinoza said.",Espinoza
“Fourteen years in Congress is long enough.,Espinoza
Conservatives can no longer count on Culberson.”,Espinoza
Espinoza is of Mexican descent and is a sixth generation Texan from her mother’s side.,


In [162]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(matched_quotes_exploded.reset_index()['t_id'].drop_duplicates().tolist())
train_test_info = pd.concat([
    pd.Series(train).to_frame('files').assign(group='/train/'),
    pd.Series(test).to_frame('files').assign(group='/test/')
])

In [165]:
(matched_quotes_exploded
 .assign(source=lambda df: df['source'].isnull())
 .reset_index()
 .merge(train_test_info, left_on='t_id', right_on='files', how='left')
 .assign(t_id=lambda df: df['group'] + df['t_id'])
 .groupby(['t_id', 's', 's_idx'])['source'].any()
 .reset_index()
 .sort_values(['t_id', 's_idx'])
 [['source', 's', 't_id', 's_idx'] ]
  .to_csv('../models/neural_models/quote_detection/data/polnear-training-data-stage-1.tsv', index=False, sep='\t')
)

In [168]:
notebook.__package__

'notebook'

In [154]:
t3 = (source_sentence_mapping
 .groupby(['t_id', 'a_id'])
 ['source'].aggregate(list).iloc[6]
)

t3

['Author Roger Stone',
 'Breitbart Jerusalem editor Aaron Klein',
 'Breitbart London’s Raheem Kassam',
 'Breitbart Texas editor Brandon Darby',
 'Dave Gorab',
 'Investigative reporter\xa0Julia Hahn',
 'Pope Francis']

In [155]:
t = 'Former Secretary of State and Democratic presidential frontrunner Hillary Clinton'
t = 'Author Roger Stone'
t = t3[2]

In [156]:
ents = list(nlp(t).ents)

In [157]:
ents

[Breitbart London’s, Raheem Kassam]

In [158]:
t2 = ents[0]

In [159]:
t2.label_

'PERSON'

In [160]:
list(map(lambda x: x.label_, ents))

['PERSON', 'PERSON']

In [986]:
data_df = all_text_df.merge(quote_sentence_lists, right_index=True, left_on='text_id')
data_df

Unnamed: 0,text_id,text,sentences,source,source_sentences
0,wash-post_2016-10-19_so-guess-who-s-the-demogo...,So guess who's the Demogorgon in this scenario...,[So guess who's the Demogorgon in this scenari...,"[Clinton, He, His office, Liberals, Newt Gingr...","[32, 35, 10, 41, 15, 9, 38, 7, 28, 36, 32, 33,..."
1,breitbart_2016-11-02_asian-markets-on-edge-on-...,Asian markets on edge on US election uncertain...,[Asian markets on edge on US election uncertai...,"[Chris Weston, chief market strategist in Melb...","[5, 6, 7, 19, 4, 18]"
2,west-journal_2016-10-11_donald-trump-claims-th...,Donald Trump Claims ‘The Shackles’ Have Now Be...,[Donald Trump Claims ‘The Shackles’ Have Now B...,"[Donald Trump, House Speaker Paul Ryan, Joshua...","[0, 5, 17, 18, 16, 1, 8, 9, 15, 11, 4, 6, 10, ..."
3,usa-today_2016-10-26_congress-could-never-cont...,Congress could never control President Trump.\...,[Congress could never control President Trump....,"[An unusual number of Republican lawmakers, Co...","[4, 24, 1, 10, 29, 11, 15, 25, 10, 27, 20, 16,..."
4,west-journal_2016-10-19_lone-player-stands-dur...,Lone Player Stands During National Anthem Whil...,"[Lone Player Stands During National Anthem, Wh...","[Brewer, Jewell Young, a Decatur resident, Jew...","[6, 20, 21, 22, 14, 12, 26, 29, 11, 23, 19, 24..."
...,...,...,...,...,...
1007,politico_2016-09-30_flight-attendants-union-en...,Flight attendants' union endorses Clinton.\n\n...,"[Flight attendants' union endorses Clinton., A...","[A union representing flight attendants, Fligh...","[1, 0, 5, 3, 2, 6, 1, 2, 3]"
1008,breitbart_2016-03-11_activists-who-lit-syria-r...,Activists who lit Syria revolt washed away in ...,[Activists who lit Syria revolt washed away in...,"[He, He, He, Jimmy Shahinian, a 28-year-old ac...","[32, 26, 39, 3, 4, 33, 34, 35, 27, 28, 29, 22,..."
1009,breitbart_2016-03-11_africa-builds-expertise-i...,"Africa builds expertise in science, tech, engi...","[Africa builds expertise in science, tech, eng...","[Alta Schutte, Gitau, Gitau, who specializes i...","[2, 43, 33, 32, 10, 17, 28, 44, 7, 13, 14, 36,..."
1010,breitbart_2016-03-11_activists-brick-up-entran...,Activists Brick Up Entrance To Migrants-Only P...,[Activists Brick Up Entrance To Migrants-Only ...,[A source from the German Identitarian Movemen...,"[18, 19, 22, 21, 23, 1, 7, 6, 8, 4, 24, 17, 15..."


# QuoBERT Annotated

In [178]:
import pandas as pd 

In [182]:
quotbert_annotated_data = pd.read_json('../models_other/Quotebank/quobert/annotated_mturk.json', lines=True)

In [186]:
(
    quotbert_annotated_data['speaker']
     .value_counts()
     .pipe(lambda s: pd.concat([s[:4], pd.Series({'other': s[4:].sum()})]))
)

not_quote        210
none             201
not_mentioned    159
ambiguous         42
other            888
dtype: int64

In [196]:
quotbert_annotated_data.loc[4].to_dict()

{'articleUID': '2015121004_00119255_W',
 'articleOffset': 0,
 'leftContext': '. The Council has extended the deadline for further submissions from the original closing date of next Wednesday until the end of next week, Friday 18 December. Tony Avery, the Interim General Manager of Planning and Development, said that the decision had been made in light of the large number of original submissions, and the short 10 day time frame specified in the Resource Management Act for making further submissions.',
 'quotation': 'This will assist people in digesting the large number of rezoning requests, which took longer than expected to map accurately, and which are now easily viewable online.',
 'rightContext': "Mr Avery also cautioned that while the map was a helpful tool, anyone interested in making a further submission should always refer back to the original submission. The map, the submissions and a summary are all on the Council's website www.qldc.govt.nz",
 'speaker': 'Tony Avery',
 'entiti

# IBM Datasets

In [209]:
ls ../data/academic-datasets/IBMDebaterEvidenceSentences/

README.txt  test.csv    train.csv


In [210]:
wiki_evidence = pd.read_csv('../data/academic-datasets/wikipedia_evidence_dataset_29429.csv')
ibm_debater_evidence_sentences = pd.read_csv('../data/academic-datasets/IBMDebaterEvidenceSentences/train.csv')

In [208]:
wiki_evidence['Evidence'][3]

'In 2001 and 2002 Simeoni was suspended for several months for doping use.'

In [219]:
ibm_debater_evidence_sentences.iloc[5].to_dict()

{'topic': 'We should limit executive compensation',
 'the concept of the topic': 'executive compensation',
 'candidate': 'In 2007, the Chairman of the Financial Services Committee Rep. Barney Frank passed legislation in the House of Representatives that gave shareholders a non-binding vote on executive compensation.',
 'candidate masked': 'In 2007, the Chairman of the Financial Services Committee Rep. Barney Frank passed legislation in the House of Representatives that gave shareholders a non-binding vote on TOPIC_CONCEPT.',
 'label': 0,
 'wikipedia article name': 'Say on pay',
 'wikipedia url': 'https://en.wikipedia.org/wiki/Say_on_pay'}

In [239]:
(ibm_debater_evidence_sentences
 .loc[lambda df: df['label'] == 1]
 [['topic', 'candidate', 'label']]
 .iloc[15]
 .to_dict()
)

{'topic': 'We should ban partial birth abortions',
 'candidate': 'Kennedy\'s majority opinion argued that the case differed from Stenberg v. Carhart, a 2000 case in which the Supreme Court struck down a state ban on "partial-birth abortion" as unconstitutional, in that the Partial Birth Abortion Act defined the banned procedure more clearly.',
 'label': 1}

In [249]:
(ibm_debater_evidence_sentences
 .loc[lambda df: df['label'] == 0]
 [['topic', 'candidate', 'label']]
 .iloc[8]
 .to_dict()
)

{'topic': 'We should limit executive compensation',
 'candidate': 'In 2005, Goodwill Industries of the Columbia Willamette (GICW), Goodwill\'s Portland, Oregon branch, came under scrutiny due to executive compensation that the Oregon attorney general\'s office concluded was "unreasonable."',
 'label': 0}

In [84]:
annotated_file.loc[98][1]

'Attribution:T80 Cue:T79 Source:T78 Content:T77'

In [85]:
annotated_file.loc[[97, 96, 95, 94]]

Unnamed: 0,0,1,2,f_id
97,T80,Attribution 5896 5900,said,../data/academic-datasets/PolNeAR/data/annotat...
96,T79,Cue 5896 5900,said,../data/academic-datasets/PolNeAR/data/annotat...
95,T78,Source 5892 5895,she,../data/academic-datasets/PolNeAR/data/annotat...
94,T77,Content 5851 5891;5902 6000,"“Africa might be luckier going forward,” “Here...",../data/academic-datasets/PolNeAR/data/annotat...


In [97]:
text_file = '../data/academic-datasets/PolNeAR/data/annotator-training/text/breitbart_2016-03-11_africa-builds-expertise-in-scien.txt'

In [99]:
with open(text_file) as f:
    text_file = f.read()

In [103]:
text_file[5851:6000]

'“Africa might be luckier going forward,” she said. “Here we have to discuss and address deep, serious social challenges, and from there we can leap.”'

In [110]:
all_annotations_df = pd.concat(all_annotations)

In [114]:
all_annotations_df[1].isnull().value_counts()

False    145003
True          3
Name: 1, dtype: int64

In [122]:
(all_annotations_df
 .loc[lambda df: df[0].str.contains('T')]
 .loc[lambda df: df[1].str.contains('Cue') == True][2]
 .value_counts()
)

said                                            6142
says                                             802
told                                             527
according to                                     410
say                                              377
                                                ... 
to show off later on their Instagram account       1
depicting                                          1
soon recalibrated                                  1
scoffed                                            1
tries to                                           1
Name: 2, Length: 9641, dtype: int64

In [136]:
polnear_sources = (
    all_annotations_df
     .loc[lambda df: df[0].str.contains('T')]
     .loc[lambda df: df[1].str.contains('Source') == True][2]
     .str.lower()
     .value_counts()
)

In [None]:
list(filter(lambda x: any(map(lambda y: y in x, desired_checklist_of_anonymous_sources)), polnear_sources.index))

In [None]:
list(filter(lambda x: any(map(lambda y: y in x and 'clinton' not in x, desired_checklist_of_documents)), polnear_sources.index))

In [163]:
(all_annotations_df
 .loc[lambda df: df[1].str.contains('Cue') == True].dropna()[2]
 .loc[lambda s: s.str.split().str.len() > 1]
 .str.lower()
 .loc[lambda s: ~s.str.contains('said')]
 .value_counts()
 .iloc[40:50]
)

also told         9
has announced     9
also noted        9
attacks on        9
put it            9
agree with        9
calling for       9
to discuss        9
have shown        9
went on to say    8
Name: 2, dtype: int64

In [172]:
one_word_cues = (
    all_annotations_df
     .loc[lambda df: df[1].str.contains('Cue') == True].dropna()[2]
     .loc[lambda s: s.str.split().str.len() == 1]
     .str.lower()
     .value_counts()
     .index.tolist()[:500]
)

In [181]:
import pyperclip
pyperclip.copy(',\n'.join(sorted(list(map(lambda x: '"%s"' % x, one_word_cues)))))

In [183]:
all_annotations_df

Unnamed: 0,0,1,2,f_id
0,T1,Source 352 362,their club,../data/academic-datasets/PolNeAR/data/test/at...
1,T2,Cue 363 386,should publicly support,../data/academic-datasets/PolNeAR/data/test/at...
2,T3,Content 387 399,Donald Trump,../data/academic-datasets/PolNeAR/data/test/at...
3,T4,Attribution 363 386,should publicly support,../data/academic-datasets/PolNeAR/data/test/at...
4,E1,Attribution:T4 Content:T3 Cue:T2 Source:T1,,../data/academic-datasets/PolNeAR/data/test/at...
...,...,...,...,...
94,T77,Content 5851 5891;5902 6000,"“Africa might be luckier going forward,” “Here...",../data/academic-datasets/PolNeAR/data/annotat...
95,T78,Source 5892 5895,she,../data/academic-datasets/PolNeAR/data/annotat...
96,T79,Cue 5896 5900,said,../data/academic-datasets/PolNeAR/data/annotat...
97,T80,Attribution 5896 5900,said,../data/academic-datasets/PolNeAR/data/annotat...
