# PolNeAR

In [1]:
import numpy as np 
import os
import glob
import pandas as pd 
import spacy
from tqdm.auto import tqdm
from IPython.display import display, HTML
import ast

nlp = spacy.load('en_core_web_lg')

def get_loc_in_sent(char_idx, sent_lens_cumsum):
    if not isinstance(char_idx, int):
        char_idx = int(char_idx)
    
    sent_bin = np.digitize(char_idx, sent_len_cumsum)
    offset_in_sent = char_idx - sent_len_cumsum[sent_bin - 1]
    return sent_bin, offset_in_sent

In [2]:
text_files = glob.glob('../data/academic-datasets/PolNeAR/data/*/text/*')

In [5]:
all_text = []
all_annotations = []
all_sentences = []
for text_file in tqdm(text_files):
    base_dir, filename = text_file.split('text/')
    base_filename = filename.replace('.txt', '')
    
    ## read text
    with open(text_file) as f:
        text = f.read()
        doc = nlp(text)
        sents = list(doc.sents)
        text_sents = list(map(str, sents))
        sent_len_cumsum = list(map(lambda x: x.end_char, sents))
        all_text.append({
            'text_id': base_filename,
            'text': text,
            'sentences': text_sents
        })
        
    ## read annotation
    attr_dir = os.path.join(base_dir, 'attributions', '*')
    annot_files = list(filter(lambda x: base_filename in x, glob.glob(attr_dir)))
    for a in annot_files:
        f_id = a.split('/')[-1]        
        d = open(a).read()
        t = list(map(lambda x: x.split('\t'), d.split('\n')))
        t = list(filter(lambda x: len(x) > 1, t))
        annotated_file = pd.DataFrame(t)

        if len(annotated_file) == 0:
            continue

        annotated_file['source'] = np.nan
        annotated_file['start_sentence'] = np.nan
        annotated_file['end_sentence'] = np.nan

        for row_idx, annotated_id, info_block in (
            annotated_file
                 .loc[lambda df: df[0].str.contains('T') == True]
                 [[0, 1]].itertuples()
        ):
            chunks = info_block.split(' ')
            block_type, char_idx_chunks = chunks[0], ' '.join(chunks[1:])
            start_idxs, end_idxs = [], []
            
            for start_end_chunk in char_idx_chunks.split(';'):
                start, end = start_end_chunk.split(' ')
                start_sent_idx, char_start = get_loc_in_sent(start, sent_len_cumsum)
                end_sent_idx, char_end = get_loc_in_sent(int(end) - 1, sent_len_cumsum)
                start_idxs.append(start_sent_idx)
                end_idxs.append(end_sent_idx)
                
            annotated_file.loc[row_idx, 'start_sentence'] = str(start_idxs)
            annotated_file.loc[row_idx, 'end_sentence'] = str(end_idxs)

        entities = annotated_file.loc[lambda df: df[0].str.contains('E')]
        for source_block in entities[1]:
            chunks = source_block.strip().split(' ')
            ids = list(map(lambda x: x.split(':')[1], chunks))
            source_ids = list(filter(lambda x: 'Source' in x, chunks ))
            if len(source_ids) == 1:
                source_id = source_ids[0].split(':')[1]
                source_name = annotated_file.loc[lambda df: df[0] == source_id][2].iloc[0]
                annotated_file.loc[lambda df: df[0].isin(ids), 'source'] = source_name 
            
        annotated_file['t_id'] = base_filename
        annotated_file['a_id'] = f_id
        all_annotations.append(annotated_file)

  0%|          | 0/1012 [00:00<?, ?it/s]

In [6]:
all_annotations_df = pd.concat(all_annotations)

In [7]:
all_text_df = (
    pd.DataFrame(all_text)
    .assign(sentences=lambda df: df['sentences'].apply(lambda x: list(map(lambda y: y.strip(), x))))
)

In [8]:
source_annotations = (
    all_annotations_df
     .loc[lambda df: df['source'].notnull()]
     .assign(start_sentence=lambda df: df['start_sentence'].apply(ast.literal_eval))
     .assign(end_sentence=lambda df: df['end_sentence'].apply(ast.literal_eval))    
)

In [9]:
source_annotations['start_sentence'].str.len().value_counts()

1     96531
2      3277
3       132
4        50
6        12
5        12
7         2
11        2
20        1
9         1
10        1
12        1
8         1
Name: start_sentence, dtype: int64

In [10]:
source_sentence_mapping = (
    source_annotations
     .groupby(['t_id', 'a_id', 'source'])
     [['start_sentence', 'end_sentence']]
     .aggregate(list)
     .applymap(lambda x: list(set([s for l in x for s in l])))
     .apply(lambda x: list(set(x['start_sentence'] + x['end_sentence'])), axis=1)
     .to_frame('source_sentences')
     .reset_index()
)

In [11]:
questionable_words = [
#     'it',
#     'which',
#     'that',
#     'some',
#     'you',
#     'many'
]

anonymous_sources = ['some', 'many']
messy_sources = ['you', 'that', 'which', 'i']

In [12]:
## check questionable sources
if False:
    t = (source_sentence_mapping
             .loc[lambda df: df['source'].isin(questionable_words)]
             .merge(all_text_df[['text_id', 'sentences']], left_on='t_id', right_on='text_id')
             .assign(source_sentences_text=lambda df: 
                 df.apply(lambda x: list(map(lambda y: x['sentences'][y], x['source_sentences'])) , axis=1)
            )
    )

    t[['source', 'source_sentences_text']].iloc[3].values

In [13]:
source_sentence_mapping = source_sentence_mapping.loc[lambda df: ~df['source'].str.lower().isin(messy_sources)]

In [15]:
quote_sentence_lists = (
    source_sentence_mapping
     .groupby('t_id')[['source', 'source_sentences']]
     .aggregate(list)
     .assign(source=lambda df: df.apply(lambda x: [[x['source'][i]] * len(s) for i, s in enumerate(x['source_sentences'])], axis=1))
     .applymap(lambda x: [s for l in x for s in l])
#      .apply(lambda x: list(set(x)))
#      .to_frame('quote_sentences')
)

In [16]:
quote_sentence_lists_exploded = (quote_sentence_lists
 .apply(lambda x: list(map(lambda y: {'source': y[0], 'sent': y[1], 't_id': x.name}, zip(x['source'], x['source_sentences']))) , axis=1)
 .pipe(lambda s: pd.DataFrame([i for x in s for i in x]))
)

In [17]:
all_text_df_exploded = (all_text_df
 .apply(lambda x: list(map(lambda y: {'s': y[1], 's_idx': y[0], 't_id': x['text_id']}, enumerate(x['sentences']))), axis=1)
 .pipe(lambda s: pd.DataFrame([i for x in s for i in x]))
)

In [93]:
matched_quotes_exploded = (
    all_text_df_exploded
     .merge(quote_sentence_lists_exploded, how='left', left_on=['t_id', 's_idx'], right_on=['t_id', 'sent'])
     .set_index('t_id')
     .drop(['sent', 
#             's_idx'
           ], axis=1)
)

In [94]:
t_ids = matched_quotes_exploded.index.unique().tolist()

In [95]:
idx = 3
d = matched_quotes_exploded.loc[t_ids[idx]]

In [96]:
html_all = '<table>'
for sent, source in d.itertuples(index=False):
    if pd.notnull(source):
        html_all += '<tr><td style="background-color: pink">' +  sent + '</td><td>' + source + '</tr>'
    else:
        html_all += '<tr><td>' + sent + '</td><td></td></tr>'
html_all += '</table>'

ValueError: too many values to unpack (expected 2)

In [22]:
display(HTML(html_all))

0,1
Congress could never control President Trump.,
Donald Trump's small band of intellectual defenders can no longer claim that he is good man.,Donald Trump's small band of intellectual defenders
"Instead, they say his dishonesty and cruelty are acceptable because Congress could check him if he really got out of hand.",they
"Like so much else about the Trump campaign, this assertion rests on fantasy.",
"An unusual number of Republican lawmakers have come out against him, but most have not.",An unusual number of Republican lawmakers
Why are so many siding with a candidate who is so unfit?,so many
High on the list of probable motives is fear of a primary challenge.,
"If GOP lawmakers are cowering before candidate Trump, how could they stand up to President Trump?",
"In addition to his wealth and political base, he would wield the vast power of the executive branch.",
There is little doubt that he'd use it to punish those who displease him.,


In [162]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(matched_quotes_exploded.reset_index()['t_id'].drop_duplicates().tolist())
train_test_info = pd.concat([
    pd.Series(train).to_frame('files').assign(group='/train/'),
    pd.Series(test).to_frame('files').assign(group='/test/')
])

In [165]:
(matched_quotes_exploded
 .assign(source=lambda df: df['source'].isnull())
 .reset_index()
 .merge(train_test_info, left_on='t_id', right_on='files', how='left')
 .assign(t_id=lambda df: df['group'] + df['t_id'])
 .groupby(['t_id', 's', 's_idx'])['source'].any()
 .reset_index()
 .sort_values(['t_id', 's_idx'])
 [['source', 's', 't_id', 's_idx'] ]
  .to_csv('../models/neural_models/quote_detection/data/polnear-training-data-stage-1.tsv', index=False, sep='\t')
)

In [168]:
notebook.__package__

'notebook'

In [154]:
t3 = (source_sentence_mapping
 .groupby(['t_id', 'a_id'])
 ['source'].aggregate(list).iloc[6]
)

t3

['Author Roger Stone',
 'Breitbart Jerusalem editor Aaron Klein',
 'Breitbart London’s Raheem Kassam',
 'Breitbart Texas editor Brandon Darby',
 'Dave Gorab',
 'Investigative reporter\xa0Julia Hahn',
 'Pope Francis']

In [155]:
t = 'Former Secretary of State and Democratic presidential frontrunner Hillary Clinton'
t = 'Author Roger Stone'
t = t3[2]

In [156]:
ents = list(nlp(t).ents)

In [157]:
ents

[Breitbart London’s, Raheem Kassam]

In [158]:
t2 = ents[0]

In [159]:
t2.label_

'PERSON'

In [160]:
list(map(lambda x: x.label_, ents))

['PERSON', 'PERSON']

In [986]:
data_df = all_text_df.merge(quote_sentence_lists, right_index=True, left_on='text_id')
data_df

Unnamed: 0,text_id,text,sentences,source,source_sentences
0,wash-post_2016-10-19_so-guess-who-s-the-demogo...,So guess who's the Demogorgon in this scenario...,[So guess who's the Demogorgon in this scenari...,"[Clinton, He, His office, Liberals, Newt Gingr...","[32, 35, 10, 41, 15, 9, 38, 7, 28, 36, 32, 33,..."
1,breitbart_2016-11-02_asian-markets-on-edge-on-...,Asian markets on edge on US election uncertain...,[Asian markets on edge on US election uncertai...,"[Chris Weston, chief market strategist in Melb...","[5, 6, 7, 19, 4, 18]"
2,west-journal_2016-10-11_donald-trump-claims-th...,Donald Trump Claims ‘The Shackles’ Have Now Be...,[Donald Trump Claims ‘The Shackles’ Have Now B...,"[Donald Trump, House Speaker Paul Ryan, Joshua...","[0, 5, 17, 18, 16, 1, 8, 9, 15, 11, 4, 6, 10, ..."
3,usa-today_2016-10-26_congress-could-never-cont...,Congress could never control President Trump.\...,[Congress could never control President Trump....,"[An unusual number of Republican lawmakers, Co...","[4, 24, 1, 10, 29, 11, 15, 25, 10, 27, 20, 16,..."
4,west-journal_2016-10-19_lone-player-stands-dur...,Lone Player Stands During National Anthem Whil...,"[Lone Player Stands During National Anthem, Wh...","[Brewer, Jewell Young, a Decatur resident, Jew...","[6, 20, 21, 22, 14, 12, 26, 29, 11, 23, 19, 24..."
...,...,...,...,...,...
1007,politico_2016-09-30_flight-attendants-union-en...,Flight attendants' union endorses Clinton.\n\n...,"[Flight attendants' union endorses Clinton., A...","[A union representing flight attendants, Fligh...","[1, 0, 5, 3, 2, 6, 1, 2, 3]"
1008,breitbart_2016-03-11_activists-who-lit-syria-r...,Activists who lit Syria revolt washed away in ...,[Activists who lit Syria revolt washed away in...,"[He, He, He, Jimmy Shahinian, a 28-year-old ac...","[32, 26, 39, 3, 4, 33, 34, 35, 27, 28, 29, 22,..."
1009,breitbart_2016-03-11_africa-builds-expertise-i...,"Africa builds expertise in science, tech, engi...","[Africa builds expertise in science, tech, eng...","[Alta Schutte, Gitau, Gitau, who specializes i...","[2, 43, 33, 32, 10, 17, 28, 44, 7, 13, 14, 36,..."
1010,breitbart_2016-03-11_activists-brick-up-entran...,Activists Brick Up Entrance To Migrants-Only P...,[Activists Brick Up Entrance To Migrants-Only ...,[A source from the German Identitarian Movemen...,"[18, 19, 22, 21, 23, 1, 7, 6, 8, 4, 24, 17, 15..."


In [525]:
all_annotations_df.shape 

(134523, 8)

In [528]:
(all_annotations_df
 .loc[lambda df: df[0].str.contains('T')]
 .loc[lambda df: df[1].str.contains('Attribution')==True][1]
 .str.split(' ')
 .str.get(0)
 .str.strip()
 .value_counts()
)

Attribution              25382
Attribution-no-source     1955
Name: 1, dtype: int64

In [529]:
(all_annotations_df
 .loc[lambda df: df[0].str.contains('T')]
 .loc[lambda df: df[1].str.contains('Attribution-no-source')==True]
 [2].value_counts()
#  .str.split(' ')
#  .str.get(0)
#  .str.strip()
#  .value_counts()
)

seemed                          48
is expected                     43
appeared                        41
seems                           36
Asked                           36
                                ..
growing concern about            1
had been previously reported     1
recognition                      1
recognizing                      1
was designated as                1
Name: 2, Length: 1243, dtype: int64

In [531]:
all_annotations_df

Unnamed: 0,0,1,2,source,start_sentence,end_sentence,t_id,a_id
0,T1,Cue 3 8,guess,,[0],[0],wash-post_2016-10-19_so-guess-who-s-the-demogo...,wash-post_2016-10-19_so-guess-who-s-the-demogo...
1,T2,Content 9 46,who's the Demogorgon in this scenario,,[0],[0],wash-post_2016-10-19_so-guess-who-s-the-demogo...,wash-post_2016-10-19_so-guess-who-s-the-demogo...
2,T3,Attribution-no-source 3 8,guess,,[0],[0],wash-post_2016-10-19_so-guess-who-s-the-demogo...,wash-post_2016-10-19_so-guess-who-s-the-demogo...
3,E1,Attribution-no-source:T3 Content:T2 Cue:T1,,,,,wash-post_2016-10-19_so-guess-who-s-the-demogo...,wash-post_2016-10-19_so-guess-who-s-the-demogo...
4,T4,Source 121 142,government scientists,government scientists,[1],[1],wash-post_2016-10-19_so-guess-who-s-the-demogo...,wash-post_2016-10-19_so-guess-who-s-the-demogo...
...,...,...,...,...,...,...,...,...
0,T1,Content 1124 1236,"“Very happy with my performance, especially in...",Nabi,[7],[7],breitbart_2016-03-11_afghanistan-thrash-zimbab...,breitbart_2016-03-11_afghanistan-thrash-zimbab...
1,T2,Cue 1237 1241,said,Nabi,[7],[7],breitbart_2016-03-11_afghanistan-thrash-zimbab...,breitbart_2016-03-11_afghanistan-thrash-zimbab...
2,T3,Source 1242 1246,Nabi,Nabi,[7],[7],breitbart_2016-03-11_afghanistan-thrash-zimbab...,breitbart_2016-03-11_afghanistan-thrash-zimbab...
3,T4,Attribution 1237 1241,said,Nabi,[7],[7],breitbart_2016-03-11_afghanistan-thrash-zimbab...,breitbart_2016-03-11_afghanistan-thrash-zimbab...


In [186]:
annotation_files[0]

'../data/academic-datasets/PolNeAR/data/test/attributions/wash-post_2016-10-25_on-campus-trump-backers-have-lea_6b86.ann'

In [189]:
ls ../data/academic-datasets/PolNeAR/data/test/text

breitbart_2016-10-12_washington-post-backs-hillary-cl.txt
breitbart_2016-10-13_cnn-s-jake-tapper-on-donna-brazi.txt
breitbart_2016-10-18_obama-says-trump-modeling-polici.txt
breitbart_2016-10-19_election-uncertainty-weighs-on-p.txt
breitbart_2016-10-22_canada-eu-failure-signals-more-b.txt
breitbart_2016-10-25_the-latest-fbi-ready-to-respond-.txt
breitbart_2016-10-28_us-stocks-sink-as-fbi-reopens-cl.txt
breitbart_2016-10-31_sheldon-adelson-commits-million-.txt
breitbart_2016-11-02_asian-markets-on-edge-on-us-elec.txt
breitbart_2016-11-05_hillary-clinton-documentary-skip.txt
breitbart_2016-11-06_clinton-aims-high-trump-goes-har.txt
breitbart_2016-11-07_wikileaks-hillary-clinton-flatte.txt
huff-post_2016-10-10_new-poll-gives-hillary-clinton-a.txt
huff-post_2016-10-12_donald-trump-tells-florida-suppo.txt
huff-post_2016-10-14_donald-trump-jr-women-who-can-t-.txt
huff-post_2016-10-21_michaela-angela-davis-it-s-time-.txt
huff-post_2016-10-25_trump-warns-clinton-policy-on-sy.tx

In [84]:
annotated_file.loc[98][1]

'Attribution:T80 Cue:T79 Source:T78 Content:T77'

In [85]:
annotated_file.loc[[97, 96, 95, 94]]

Unnamed: 0,0,1,2,f_id
97,T80,Attribution 5896 5900,said,../data/academic-datasets/PolNeAR/data/annotat...
96,T79,Cue 5896 5900,said,../data/academic-datasets/PolNeAR/data/annotat...
95,T78,Source 5892 5895,she,../data/academic-datasets/PolNeAR/data/annotat...
94,T77,Content 5851 5891;5902 6000,"“Africa might be luckier going forward,” “Here...",../data/academic-datasets/PolNeAR/data/annotat...


In [97]:
text_file = '../data/academic-datasets/PolNeAR/data/annotator-training/text/breitbart_2016-03-11_africa-builds-expertise-in-scien.txt'

In [99]:
with open(text_file) as f:
    text_file = f.read()

In [103]:
text_file[5851:6000]

'“Africa might be luckier going forward,” she said. “Here we have to discuss and address deep, serious social challenges, and from there we can leap.”'

In [110]:
all_annotations_df = pd.concat(all_annotations)

In [114]:
all_annotations_df[1].isnull().value_counts()

False    145003
True          3
Name: 1, dtype: int64

In [122]:
(all_annotations_df
 .loc[lambda df: df[0].str.contains('T')]
 .loc[lambda df: df[1].str.contains('Cue') == True][2]
 .value_counts()
)

said                                            6142
says                                             802
told                                             527
according to                                     410
say                                              377
                                                ... 
to show off later on their Instagram account       1
depicting                                          1
soon recalibrated                                  1
scoffed                                            1
tries to                                           1
Name: 2, Length: 9641, dtype: int64

In [136]:
polnear_sources = (
    all_annotations_df
     .loc[lambda df: df[0].str.contains('T')]
     .loc[lambda df: df[1].str.contains('Source') == True][2]
     .str.lower()
     .value_counts()
)

In [None]:
list(filter(lambda x: any(map(lambda y: y in x, desired_checklist_of_anonymous_sources)), polnear_sources.index))

In [None]:
list(filter(lambda x: any(map(lambda y: y in x and 'clinton' not in x, desired_checklist_of_documents)), polnear_sources.index))

In [163]:
(all_annotations_df
 .loc[lambda df: df[1].str.contains('Cue') == True].dropna()[2]
 .loc[lambda s: s.str.split().str.len() > 1]
 .str.lower()
 .loc[lambda s: ~s.str.contains('said')]
 .value_counts()
 .iloc[40:50]
)

also told         9
has announced     9
also noted        9
attacks on        9
put it            9
agree with        9
calling for       9
to discuss        9
have shown        9
went on to say    8
Name: 2, dtype: int64

In [172]:
one_word_cues = (
    all_annotations_df
     .loc[lambda df: df[1].str.contains('Cue') == True].dropna()[2]
     .loc[lambda s: s.str.split().str.len() == 1]
     .str.lower()
     .value_counts()
     .index.tolist()[:500]
)

In [181]:
import pyperclip
pyperclip.copy(',\n'.join(sorted(list(map(lambda x: '"%s"' % x, one_word_cues)))))

In [183]:
all_annotations_df

Unnamed: 0,0,1,2,f_id
0,T1,Source 352 362,their club,../data/academic-datasets/PolNeAR/data/test/at...
1,T2,Cue 363 386,should publicly support,../data/academic-datasets/PolNeAR/data/test/at...
2,T3,Content 387 399,Donald Trump,../data/academic-datasets/PolNeAR/data/test/at...
3,T4,Attribution 363 386,should publicly support,../data/academic-datasets/PolNeAR/data/test/at...
4,E1,Attribution:T4 Content:T3 Cue:T2 Source:T1,,../data/academic-datasets/PolNeAR/data/test/at...
...,...,...,...,...
94,T77,Content 5851 5891;5902 6000,"“Africa might be luckier going forward,” “Here...",../data/academic-datasets/PolNeAR/data/annotat...
95,T78,Source 5892 5895,she,../data/academic-datasets/PolNeAR/data/annotat...
96,T79,Cue 5896 5900,said,../data/academic-datasets/PolNeAR/data/annotat...
97,T80,Attribution 5896 5900,said,../data/academic-datasets/PolNeAR/data/annotat...
