In [2]:
import pandas as pd 
import ast
from sklearn.metrics import f1_score
import unidecode

In [3]:
def read_pred_output_file(fname):
    """Load a prediction dump and return it as a long-format DataFrame.

    The parsing assumes the file's non-empty lines alternate between a key
    line and a value line, where the value is a Python literal that expands
    into one entry per position (presumably a list of per-sentence
    predictions -- confirm against the script that writes the file).
    Any literal ``{}`` substring is stripped from the value before parsing.

    Parameters
    ----------
    fname : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``file_id`` (the key line), ``sent_idx`` (position within the
        parsed value) and ``pred`` (the entry at that position), sorted by
        ``file_id`` then ``sent_idx`` with a fresh integer index.
    """
    with open(fname) as handle:
        raw_text = handle.read()

    # Drop blank lines, then pair every even-positioned line (key) with the line after it (value).
    lines = [line for line in raw_text.split('\n') if line != '']
    key_to_raw_value = dict(zip(lines[::2], lines[1::2]))

    parsed = (
        pd.Series(key_to_raw_value)
        .apply(lambda raw_value: raw_value.replace('{}', ''))
        .apply(ast.literal_eval)
    )

    # One row per key, one column per position; shorter values are padded with NaN.
    wide = parsed.apply(pd.Series)

    # unstack() yields a (position, key) MultiIndex; the NaN padding is dropped again here.
    long_df = wide.unstack().dropna().reset_index()
    long_df = long_df.rename(columns={'level_1': 'file_id', 'level_0': 'sent_idx', 0: 'pred'})
    long_df = long_df.sort_values(['file_id', 'sent_idx'])
    return long_df[['file_id', 'sent_idx', 'pred']].reset_index(drop=True)

# Stage 1: PolNeAR-only Training

In [99]:
polnear_only_fn = '../models_neural/quote_detection/output/first_run_annotated_scores.txt'
polnear_only_pred_df = read_pred_output_file(polnear_only_fn)

In [100]:
input_data_df = pd.read_csv('../data/our-annotated-data-full.csv.gz')

In [89]:
# Attach the annotated columns to each prediction row; rows are matched on
# (document id, sentence index), and the redundant key column is dropped afterwards.
polnear_only_pred_and_label_df = pd.merge(
    polnear_only_pred_df,
    input_data_df,
    left_on=['file_id', 'sent_idx'],
    right_on=['entry_id', 'sent_idx'],
).drop(columns='entry_id')

In [78]:
# F1 of the predictions against the labels, skipping rows whose sentence
# (stripped and ASCII-folded) is nothing but a double-quote character.
(
    polnear_only_pred_and_label_df
    .loc[lambda frame: ~frame['sentence'].str.strip().apply(unidecode.unidecode).eq('"')]
    .pipe(lambda frame: f1_score(y_true=frame['label'], y_pred=frame['pred']))
)

0.32312070986104136

In [None]:
# NOTE(review): this cell recomputes the same F1 as the cell directly above it
# (same frame, same filter); it looks like a leftover copy.
(
    polnear_only_pred_and_label_df
    .loc[lambda frame: ~frame['sentence'].str.strip().apply(unidecode.unidecode).eq('"')]
    .pipe(lambda frame: f1_score(y_true=frame['label'], y_pred=frame['pred']))
)

In [79]:
polnear_only_pred_and_label_df

Unnamed: 0,file_id,sent_idx,pred,label,sentence,quote_type,tagline,source_type,affiliation,role,role_status
0,doc_0,0,0.0,False,A two - day rally in global stocks looked set ...,,,,,,
1,doc_0,1,1.0,False,Tokyo stocks were down significantly at midday...,,,,,,
2,doc_0,2,1.0,False,Other markets in the Asian - Pacific region we...,,,,,,
3,doc_0,3,1.0,False,"U.S. Treasury bonds , typically seen by invest...",,,,,,
4,doc_0,4,0.0,False,Global stocks have been buoyed this week by pr...,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
6830,doc_956,51,0.0,True,"Gabriela Ramirez , 33 , said that “ there ’s s...",QUOTE,33,Named Individual,Witness,Participant,Current
6831,doc_956,52,0.0,True,“,QUOTE,,,,,
6832,doc_956,53,1.0,True,What kind of society do you live in,QUOTE,,,,,
6833,doc_956,54,0.0,True,", ” she said , “ if you just shoot with no con...",QUOTE,,,,,


# Stage 1: Our-Dataset-Only Training

In [4]:
second_run_fn = '../models_neural/quote_detection/output/second_run_annotated_scores.txt'
second_run_pred_df = read_pred_output_file(second_run_fn)

In [5]:
orig_training_df = pd.read_csv('../data/our-annotated-source-training-df.tsv', sep='\t')

In [6]:
orig_training_df.head(2)

Unnamed: 0,source,s,t_id,sent_idx
0,False,BANGKOK,/train/doc_902,0
1,False,—,/train/doc_902,1


In [7]:
full_data_df = pd.read_csv('../data/our-annotated-data-full.csv.gz')

In [8]:
# `t_id` has the form "/<split>/<doc id>" (e.g. "/train/doc_902"), so splitting on "/"
# gives the split name at position 1 and the document id at position 2.
second_run_pred_df_w_labels = (
    second_run_pred_df
    # 1) recover the split each predicted sentence belongs to
    .merge(
        orig_training_df.assign(join_key=lambda frame: frame['t_id'].str.split('/').str[2]),
        left_on=['file_id', 'sent_idx'],
        right_on=['join_key', 'sent_idx'],
    )
    .assign(split=lambda frame: frame['t_id'].str.split('/').str[1])
    # 2) attach the full annotation columns
    .merge(
        full_data_df,
        left_on=['file_id', 'sent_idx'],
        right_on=['entry_id', 'sent_idx'],
    )
    .drop(columns=['t_id', 'join_key', 's', 'entry_id', 'source'])
    # 3) evaluate on held-out documents only
    .loc[lambda frame: frame['split'] == 'test']
)

In [9]:
# Sentence-level F1 over the test split.
f1_score(
    second_run_pred_df_w_labels['label'],
    second_run_pred_df_w_labels['pred'],
)

0.8461137193531559

In [10]:
# Sentence-level F1 computed separately for each quote type.
(
    second_run_pred_df_w_labels
    .groupby('quote_type')
    .apply(lambda group: f1_score(y_true=group['label'], y_pred=group['pred']))
)

quote_type
BACKGROUND                      0.859873
COMMUNICATION, NOT TO JOURNO    0.888889
DECLINED COMMENT                0.941176
DOCUMENT                        0.914286
LAWSUIT                         0.000000
Other: Data Analysis            1.000000
Other: LAWSUIT                  0.969697
PRESS REPORT                    0.892857
PROPOSAL                        0.500000
PUBLIC SPEECH, NOT TO JOURNO    1.000000
PUBLISHED WORK                  0.789474
QUOTE                           0.944649
STATEMENT                       0.851852
TWEET                           0.833333
VOTE/POLL                       0.666667
dtype: float64

In [20]:
# Distinct test-document ids, in order of first appearance.
test_docs = second_run_pred_df_w_labels['file_id'].drop_duplicates().tolist()

In [24]:
import pyperclip
# Put the test document ids on the clipboard as a comma-separated list of double-quoted strings.
pyperclip.copy(', '.join('"%s"' % doc_id for doc_id in test_docs))

In [25]:
# Roll sentence-level rows up to one row per (document, source head).
second_run_source_grouped = (
    second_run_pred_df_w_labels
         # Give null heads an explicit 'None' key (groupby drops NaN keys by default).
         .assign(head=lambda df: df['head'].fillna('None'))
         .groupby(['file_id', 'head'])
         [['label', 'pred', 'source_type', 'affiliation', 'role', 'role_status']]
         # Every cell becomes the list of that column's values within the group.
         .aggregate(list)
        #  ['label'].iloc[0]
         # Remove null entries from each list.
         .applymap(lambda x: list(filter(lambda y: pd.notnull(y), x)))
         
         # Group prediction is positive when more than 30% of its sentence predictions are positive.
         .assign(pred=lambda df: df['pred'].apply(lambda x: (sum(x) / len(x)) > .3))
# (disabled alternative) positive when any sentence prediction is positive:
#          .assign(pred=lambda df: df['pred'].apply(any))
         # Group label is positive when any sentence label is truthy.
         .assign(label=lambda df: df['label'].apply(any))
         # Cells that are still lists collapse to their first element, or '' when empty.
         .applymap(lambda x: x if not isinstance(x, list) else ('' if len(x) == 0 else x[0]))
)

In [26]:
# Groups without a source type: F1 on the negated labels/predictions,
# i.e. scoring the negative class as the positive one.
(
    second_run_source_grouped
    .loc[lambda grouped: grouped['source_type'].eq('')]
    .pipe(lambda grouped: f1_score(~grouped['label'], ~grouped['pred']))
)

0.923076923076923

In [27]:
# F1 over all (document, head) groups.
f1_score(second_run_source_grouped['label'], second_run_source_grouped['pred'])

0.951048951048951

In [28]:
ind_order = [
    'Named Individual',
    'Named Group',
    'Report/Document',
    'Unnamed Group',
    'Unnamed Individual',
    'Vote/Poll'
]

In [34]:
# Per-source-type F1, in the row order given by `ind_order`, as percentages with one decimal.
(
    second_run_source_grouped
    .groupby('source_type')
    .apply(lambda group: f1_score(group['label'], group['pred']))
    .loc[ind_order]
    .to_frame('BERT Classification')
    .mul(100)
    .round(1)
)

Unnamed: 0_level_0,BERT Classification
source_type,Unnamed: 1_level_1
Named Individual,99.6
Named Group,90.6
Report/Document,94.7
Unnamed Group,90.9
Unnamed Individual,100.0
Vote/Poll,66.7


### PARC3

In [247]:
import glob
parc3_files = glob.glob('../data/academic-datasets/PARC3_complete/*/*')

In [250]:
# Keep only the XML files from the glob result.
parc3_files = [path for path in parc3_files if path.endswith('.xml')]

In [257]:
from lxml import etree

In [289]:
# Debug aid (disabled): pretty-print one PARC3 file to inspect its XML structure.
if False:
    with open(parc3_files[400]) as xml_file:
        xml_str = xml_file.read()
    tree = etree.fromstring(xml_str)
    print(etree.tostring(tree, pretty_print=True).decode())


# Collect the `candidate` elements that sit directly under each file's root element.
all_candidates = []
for p in parc3_files:
    # Context manager so the handle is closed even when parsing fails.
    with open(p) as xml_file:
        xml_str = xml_file.read()
    try:
        tree = etree.fromstring(xml_str)
    except (etree.LxmlError, ValueError) as err:
        # Best effort: skip files lxml cannot parse, but say which one and why.
        # ValueError covers lxml rejecting a str that carries an encoding declaration.
        # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        print('failed...', p, err)
        continue
    all_candidates += tree.findall('candidate')

failed...
failed...
failed...
failed...
failed...
failed...
failed...
failed...
failed...
failed...
failed...
failed...


In [291]:
# Replace each element with its text content.
# NOTE: this rebinds `all_candidates`, so the cell cannot be re-run without re-running the parsing cell first.
all_candidates = [candidate.text for candidate in all_candidates]

In [320]:
pd.options.display.max_rows = 200

In [326]:
# Distinct candidate strings, most frequent first, restricted to those in which no
# whitespace-separated word starts with a character equal to its own upper-case form
# (so no capitalised words, and no words starting with a digit or punctuation mark).
anon_candidate_list = (
    pd.Series(all_candidates)
    .value_counts()
    .reset_index()
    .rename(columns={0: 'count', 'index': 'name'})
    .loc[
        lambda counts: counts['name'].str.split().apply(
            lambda words: not any(word[0].upper() == word[0] for word in words)
        )
    ]
    ['name']
    .tolist()
)

In [331]:
# Pronouns (and the stray verb "said") that should not be treated as anonymous-source phrases.
to_exclue = ['he', 'she', 'they', 'it', 'i', 'me', 'you', 'we', "said",]
additional_anon_source_list = [
    candidate for candidate in anon_candidate_list if candidate not in to_exclue
]

In [334]:
def list_to_string(inp_list, split_lines_on_k=4):
    """Format strings as double-quoted, comma-separated text, ``k`` items per line.

    Each item is stripped of surrounding whitespace and wrapped in double
    quotes. Items on a line are joined with ``", "`` and lines are joined with
    ``",\\n"``, so the result can be pasted between list brackets.

    The previous version always appended a "remainder" row, even when the list
    length was an exact multiple of ``split_lines_on_k``; that produced a
    dangling ``",\\n"`` at the end of the output. Rows are now built by plain
    chunking, so no empty row is ever emitted.

    Parameters
    ----------
    inp_list : list of str
        Items to format.
    split_lines_on_k : int, default 4
        Maximum number of items per output line; must be at least 1.

    Returns
    -------
    str
        The formatted text; ``''`` for an empty input.

    Raises
    ------
    ValueError
        If ``split_lines_on_k`` is smaller than 1.
    """
    if split_lines_on_k < 1:
        raise ValueError('split_lines_on_k must be a positive integer')

    rows = [
        inp_list[start:start + split_lines_on_k]
        for start in range(0, len(inp_list), split_lines_on_k)
    ]
    return ',\n'.join(
        ', '.join('"%s"' % item.strip() for item in row)
        for row in rows
    )

In [353]:
# Determiners / quantifiers / filler words removed from each candidate phrase.
mods = ['a', 'the', 'one', 'some', 'several', 'many', 'its', 'even', 'big', 'this']

# 1) drop the modifier words from every phrase
additional_anon_source_list = [
    ' '.join(word for word in phrase.split() if word not in mods)
    for phrase in additional_anon_source_list
]
# 2) keep only phrases with fewer than five words left
additional_anon_source_list = [
    phrase for phrase in additional_anon_source_list if len(phrase.split()) < 5
]

In [352]:
pyperclip.copy(list_to_string(additional_anon_source_list, split_lines_on_k=6))

# Prepare for Stage 2

### Examine ambiguous sources

In [4]:
import re
import glob
import json
doc_to_look_at = 'doc_564'

In [5]:
input_data_files = glob.glob('../app/data/input_data/*/*')
annotated_files = glob.glob('../app/data/output_data_affil-role/*/*')
checked_files = glob.glob('../app/data/checked_data_affil-role/*/*')

In [6]:
# First annotated (or checked) path and first input path containing the substring '564'.
# NOTE(review): this is a substring match on the whole path, so it is not limited to document 564 -- confirm.
annotated_target_file = [path for path in annotated_files + checked_files if '564' in path][0]
input_target_file = [path for path in input_data_files if '564' in path][0]

In [84]:
def get_combined_df(annotated_fn, input_fn):
    """Join one document's annotations onto its input sentences.

    Parameters
    ----------
    annotated_fn : str
        Path to the annotation JSON. Its ``'data'`` entry is either the row
        records themselves or a dict holding them under ``'row_data'``. Cell
        values that are dicts are replaced by their ``'field_value'`` entry.
    input_fn : str
        Path to the input JSON; its ``'html_data'`` entry holds the sentence
        records (must provide ``sent`` and ``sent_idx``).

    Returns
    -------
    pandas.DataFrame
        Columns ``sent``, ``sent_idx``, ``head`` and ``quote_type`` for every
        sentence whose ``sent_idx`` equals an annotation's ``row_idx``
        (inner join).
    """
    # Context managers so the file handles are closed deterministically.
    with open(annotated_fn) as annotated_file:
        json_dat = json.load(annotated_file)['data']
    if isinstance(json_dat, dict) and 'row_data' in json_dat:
        json_dat = json_dat['row_data']
    annot_df = pd.DataFrame(json_dat)

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 (applymap is deprecated);
    # use whichever the installed pandas provides.
    elementwise = annot_df.map if hasattr(pd.DataFrame, 'map') else annot_df.applymap
    annot_df = elementwise(lambda x: x['field_value'] if isinstance(x, dict) else x)

    with open(input_fn) as input_file:
        input_dat = json.load(input_file)['html_data']
    input_df = pd.DataFrame(input_dat)

    annot_df_with_input = (
        input_df[['sent', 'sent_idx']]
             .merge(annot_df[['row_idx', 'head', 'quote_type']], left_on='sent_idx', right_on='row_idx')
             .drop(['row_idx', ], axis=1)
        # (disabled) drop sentences that are at most one character long:
        # .loc[lambda df: df['sent'].str.strip().str.len() > 1]
    )

    return annot_df_with_input

In [85]:
pd.options.display.max_colwidth = 200

In [86]:
# Per-document frames: every sentence (all_sources) and only the sentences whose
# head contains a hyphen followed by a digit (all_multiply_annotated_sentences).
all_multiply_annotated_sentences = []
all_sources = []

for annot_fn in annotated_files:
    # Raw strings: '\d' in a normal string literal is an invalid escape sequence.
    doc_id = re.search(r'\d+', annot_fn.split('/')[-1])[0]
    # Derive the matching input path and the candidate "checked" path from the annotation path.
    input_fn = annot_fn.replace('output_', 'input_').replace('_affil-role', '').replace('annotated-', 'to-annotate-')
    checked_cand = annot_fn.replace('output_', 'checked_').replace('annotated-', 'checked-')
    # Use the checked version of the annotation when one exists.
    if checked_cand in checked_files:
        annot_fn = checked_cand

    annot_df_w_input = get_combined_df(annot_fn, input_fn)
    annot_df_w_input['doc_id'] = doc_id
    # `== True` also turns the NaN that str.contains returns for null heads into False.
    multiply_annotated = annot_df_w_input.loc[lambda df: df['head'].str.contains(r'-\d') == True]
    all_multiply_annotated_sentences.append(multiply_annotated)
    all_sources.append(annot_df_w_input)

In [87]:
all_doc_df = pd.concat(all_sources)

In [89]:
all_multiply_annotated_df = pd.concat(all_multiply_annotated_sentences)

In [12]:
import pyperclip

pyperclip.copy(
    (
        all_doc_df
         .loc[lambda df: df['doc_id'] ==  all_multiply_annotated_df['doc_id'].drop_duplicates().iloc[12]]
         .to_csv(sep='\t')
    )
)

In [13]:
all_multiply_annotated_df['doc_id'].drop_duplicates()

45    564
25    592
47    852
34     99
82     77
18    514
62    510
24    431
39    446
24    204
55    165
6     602
23    109
Name: doc_id, dtype: object

# Make Dataset for Stage 2

In [14]:
!head ../models_neural/quote_attribution/data/our-annotated-data__stage-2.tsv

BANGKOK	None	0	/train/902
—	None	1	/train/902
A plane carrying key senior Laotian government officials crashed Saturday morning , leaving at least four people dead , Laotian diplomats said Saturday .  	Laotian diplomats	2	/train/902
Killed in the crash were two top figures in the security apparatus of the authoritarian Lao government : the deputy prime minister , Douangchay Phichit , and Thongbane Sengaphone , the minister of public security , according to two Lao diplomats .  	None	3	/train/902
For a Communist party that relies on force and intimidation to stay in power , the loss of what were arguably the two most powerful people in the security apparatus was a significant blow .  	None	4	/train/902
The governor of Vientiane province was also killed in the crash .  	None	5	/train/902
In addition to his post as deputy prime minister Mr. Douangchay was defense minister and a member of the Politburo , the highest decision - making body of the Communist party .	None	6	/train/902
M

In [90]:
import re 
# Shortest-possible "<...>" run; '.' does not match newlines here, so a tag must sit on one line.
CLEANR = re.compile('<.*?>')


def cleanhtml(raw_html):
    """Return ``raw_html`` with every match of ``CLEANR`` removed."""
    return CLEANR.sub('', raw_html)

In [104]:
from sklearn.model_selection import train_test_split

out_file = '../models_neural/quote_attribution/data/our-annotated-data__stage-2.tsv'

# Fixed seed: without it every run re-draws the document-level train/test split,
# so the file written below could not be regenerated with the same assignment.
SPLIT_SEED = 42

entry_ids = all_doc_df['doc_id'].unique().tolist()
train_files, test_files = train_test_split(entry_ids, random_state=SPLIT_SEED)
split_df = pd.concat([
    pd.Series(train_files).to_frame('file_id').assign(split='train'),
    pd.Series(test_files).to_frame('file_id').assign(split='test')
])

# Write one headerless TSV row per sentence: sent, head, sent_idx, "/<split>/<doc_id>".
(
    all_doc_df
         .merge(split_df, left_on='doc_id', right_on='file_id')
         .assign(entry_id=lambda df: '/' + df['split'] + '/' + df['doc_id'])
         .assign(sent=lambda df: df['sent'].apply(cleanhtml))
         # BACKGROUND / NARRATIVE sentences get no head.
         .assign(head=lambda df: df.apply(lambda x: x['head'] if x['quote_type'] not in ['BACKGROUND', 'NARRATIVE'] else '', axis=1))
         # Missing and empty heads are both written as the literal string 'None'.
         .assign(head=lambda df: df['head'].fillna('None').apply(lambda x: {'':'None'}.get(x, x)))
        [['sent', 'head', 'sent_idx', 'entry_id']]
     .to_csv(
         out_file,
         sep='\t',
         index=False,
         header=False
     )
)

# Divide up data 

In [19]:
import pandas as pd

In [10]:
input_data_files = glob.glob('../app/data/input_data/*/*')
annotated_files = glob.glob('../app/data/output_data_affil-role/*/*')
checked_files = glob.glob('../app/data/checked_data_affil-role/*/*')

In [12]:
annotated_files[0]

'../app/data/output_data_affil-role/18/annotated-902.json'

In [13]:
input_data_files[0]

'../app/data/input_data/18/to-annotate-931.json'

In [15]:
# Membership is checked once per input file, so look paths up in sets
# rather than scanning the full lists on every iteration.
annotated_lookup = set(annotated_files)
checked_lookup = set(checked_files)

# Input files that have neither an annotated nor a checked counterpart.
left_to_annotate = []
for input_fn in input_data_files:
    # .../input_data/<dir>/to-annotate-N.json -> .../output_data_affil-role/<dir>/annotated-N.json
    annot_fn = input_fn.replace('input_data', 'output_data_affil-role').replace('to-annotate-', 'annotated-')
    if annot_fn in annotated_lookup:
        continue
    # .../output_data_affil-role/<dir>/annotated-N.json -> .../checked_data_affil-role/<dir>/checked-N.json
    checked_cand = annot_fn.replace('output_', 'checked_').replace('annotated-', 'checked-')
    if checked_cand in checked_lookup:
        continue
    left_to_annotate.append(input_fn)

In [39]:
# Shuffle the remaining files. The seed keeps the order -- and therefore the annotator
# assignment derived from it in the following cells -- identical across kernel restarts.
left_to_annotate_df = pd.Series(left_to_annotate).to_frame('fn').sample(frac=1, random_state=42)

In [46]:
# Give the first half of the shuffled files to 'alex' and the rest to 'james',
# then persist the assignment. `file_id` is the part of the path after the last '-'
# (e.g. '931.json'). A stray backtick after `pd.concat([` made this cell a SyntaxError.
(
    left_to_annotate_df
        .pipe(lambda df:
                pd.concat([
                    df.iloc[:int(len(df)/2)].assign(annotator='alex'),
                    df.iloc[int(len(df)/2):].assign(annotator='james'),
                ])
        )
        .assign(file_id=lambda df: df['fn'].str.split('-').str.get(-1))
        .to_csv('../app/data/2022-07-08__annotator-assignments.csv.gz', compression='gzip')
)

In [50]:
# File ids (text after the last '-' in the path) of the half assigned to 'alex'.
(
    pd.concat([
        left_to_annotate_df.iloc[:len(left_to_annotate_df) // 2].assign(annotator='alex'),
        left_to_annotate_df.iloc[len(left_to_annotate_df) // 2:].assign(annotator='james'),
    ])
    .assign(file_id=lambda frame: frame['fn'].str.split('-').str[-1])
    .loc[lambda frame: frame['annotator'].eq('alex'), 'file_id']
    .tolist()
)

['746.json',
 '364.json',
 '806.json',
 '917.json',
 '814.json',
 '48.json',
 '655.json',
 '506.json',
 '820.json',
 '555.json',
 '843.json',
 '187.json',
 '561.json',
 '813.json',
 '329.json',
 '614.json',
 '488.json',
 '523.json',
 '464.json',
 '716.json',
 '659.json',
 '682.json',
 '319.json',
 '954.json',
 '670.json',
 '529.json',
 '671.json',
 '388.json',
 '356.json',
 '240.json',
 '857.json',
 '770.json',
 '430.json',
 '213.json',
 '594.json',
 '465.json',
 '691.json',
 '789.json',
 '371.json',
 '260.json',
 '834.json',
 '301.json',
 '497.json',
 '538.json',
 '702.json',
 '706.json',
 '330.json',
 '251.json',
 '926.json',
 '22.json',
 '503.json',
 '743.json',
 '142.json',
 '350.json',
 '654.json',
 '617.json',
 '548.json',
 '950.json',
 '853.json',
 '825.json',
 '766.json',
 '589.json',
 '314.json',
 '199.json',
 '439.json',
 '863.json',
 '218.json',
 '605.json',
 '705.json',
 '693.json',
 '726.json',
 '489.json',
 '765.json',
 '939.json',
 '737.json',
 '534.json',
 '664.json',
 

In [193]:
680 / 3

226.66666666666666