In [25]:
import pandas as pd 
import ast
from sklearn.metrics import f1_score
import unidecode

In [97]:
def read_pred_output_file(fname):
    """Load a prediction dump into a long-format DataFrame.

    The file is read as UTF-8 text and split on newlines; lines that are
    exactly empty are ignored.  The remaining lines are consumed in pairs:
    the first line of each pair is used as the ``file_id`` and the second is
    parsed with ``ast.literal_eval`` after every literal ``'{}'`` substring
    has been removed from it.  For a list-valued line, position ``i`` of the
    parsed list becomes the row with ``sent_idx == i``.

    Notes
    -----
    * If the same id line occurs more than once, the last occurrence wins
      (the pairs are collected in a dict).
    * A trailing id line without a following prediction line is ignored
      (``zip`` stops at the shorter slice).
    * Missing predictions (e.g. ``None``, or the padding introduced when
      files have different numbers of predictions) are dropped.

    Parameters
    ----------
    fname : str or path-like
        Path of the prediction file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``file_id``, ``sent_idx`` and ``pred``, sorted by
        ``file_id`` then ``sent_idx`` with a fresh 0..n-1 index.  When the
        file contains no (id, predictions) pair, an empty frame with these
        three columns is returned.
    """
    columns = ['file_id', 'sent_idx', 'pred']

    # Explicit encoding so the result does not depend on the platform locale;
    # the file handle is released before any parsing happens.
    with open(fname, encoding='utf-8') as f:
        lines = [line for line in f.read().split('\n') if line != '']

    raw_by_file = dict(zip(lines[::2], lines[1::2]))
    if not raw_by_file:
        # The reshaping pipeline below cannot handle an empty input, so
        # return the expected schema directly.
        return pd.DataFrame(columns=columns)

    parsed_by_file = {
        file_id: ast.literal_eval(raw.replace('{}', ''))
        for file_id, raw in raw_by_file.items()
    }

    # One row per file, one column per position; shorter rows are padded with
    # missing values.  This avoids building one Series per file, which is
    # what ``Series.apply(pd.Series)`` would do.
    wide = pd.DataFrame.from_dict(parsed_by_file, orient='index')

    return (
        wide
        .unstack()      # -> Series indexed by (position, file id)
        .dropna()       # discard padding / missing predictions
        .reset_index()
        .rename(columns={'level_1': 'file_id', 'level_0': 'sent_idx', 0: 'pred'})
        .sort_values(['file_id', 'sent_idx'])
        [columns]
        .reset_index(drop=True)
    )

# Stage 1: Polnear-only Training

In [99]:
# Prediction scores file to evaluate (path is relative to the notebook's
# working directory).
polnear_only_fn = '../models_neural/quote_detection/output/first_run_annotated_scores.txt'
# Parse it into one row per (file_id, sent_idx) with the predicted value.
polnear_only_pred_df = read_pred_output_file(polnear_only_fn)

In [100]:
# Annotated data that the predictions are joined against below
# (gzip-compressed CSV; pandas infers the compression from the extension).
input_data_df = pd.read_csv('../data/our-annotated-data-full.csv.gz')

In [89]:
# Join each prediction to its annotated row (merge default: inner join, so
# rows without a match on either side are left out).  After the join
# `entry_id` carries the same value as `file_id`, so it is dropped.
polnear_only_pred_and_label_df = polnear_only_pred_df.merge(
    input_data_df,
    left_on=['file_id', 'sent_idx'],
    right_on=['entry_id', 'sent_idx'],
).drop(columns='entry_id')

In [78]:
# F1 over all joined rows except those whose sentence, once stripped and
# transliterated to ASCII, is just a double-quote character.
(
    polnear_only_pred_and_label_df
    .loc[lambda rows: rows['sentence'].str.strip().apply(unidecode.unidecode).ne('"')]
    .pipe(lambda kept: f1_score(y_true=kept['label'], y_pred=kept['pred']))
)

0.32312070986104136

In [None]:
# NOTE(review): this cell computes the same F1 as the cell directly above it
# and has no execution count; it is probably a leftover and a candidate for
# removal.
(
    polnear_only_pred_and_label_df
    .loc[lambda rows: rows['sentence'].str.strip().apply(unidecode.unidecode).ne('"')]
    .pipe(lambda kept: f1_score(y_true=kept['label'], y_pred=kept['pred']))
)

In [79]:
# Display the joined predictions/labels frame for inspection (the rendered
# output below is truncated to the first and last rows).
polnear_only_pred_and_label_df

Unnamed: 0,file_id,sent_idx,pred,label,sentence,quote_type,tagline,source_type,affiliation,role,role_status
0,doc_0,0,0.0,False,A two - day rally in global stocks looked set ...,,,,,,
1,doc_0,1,1.0,False,Tokyo stocks were down significantly at midday...,,,,,,
2,doc_0,2,1.0,False,Other markets in the Asian - Pacific region we...,,,,,,
3,doc_0,3,1.0,False,"U.S. Treasury bonds , typically seen by invest...",,,,,,
4,doc_0,4,0.0,False,Global stocks have been buoyed this week by pr...,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
6830,doc_956,51,0.0,True,"Gabriela Ramirez , 33 , said that “ there ’s s...",QUOTE,33,Named Individual,Witness,Participant,Current
6831,doc_956,52,0.0,True,“,QUOTE,,,,,
6832,doc_956,53,1.0,True,What kind of society do you live in,QUOTE,,,,,
6833,doc_956,54,0.0,True,", ” she said , “ if you just shoot with no con...",QUOTE,,,,,


# Stage 1: Our-Dataset-only Training

In [102]:
# Prediction scores file for the second run (path is relative to the
# notebook's working directory).
second_run_fn = '../models_neural/quote_detection/output/second_run_annotated_scores.txt'
# Parse it into one row per (file_id, sent_idx) with the predicted value.
second_run_pred_df = read_pred_output_file(second_run_fn)

In [108]:
# Tab-separated training table; its `t_id` column is split on '/' further
# down to recover the data split and the document id.
orig_training_df = pd.read_csv('../data/our-annotated-source-training-df.tsv', sep='\t')

In [110]:
# Peek at the first two rows to see the column layout.
orig_training_df.head(2)

Unnamed: 0,source,s,t_id,sent_idx
0,False,BANGKOK,/train/doc_902,0
1,False,—,/train/doc_902,1


In [144]:
# NOTE(review): this reads the same path that `input_data_df` is loaded from
# in an earlier cell; the two frames could probably share a single load —
# confirm nothing relies on them being separate objects.
full_data_df = pd.read_csv('../data/our-annotated-data-full.csv.gz')

In [149]:
# Build the labelled evaluation frame for the second run:
#   1. join predictions to the training table on (document id, sent_idx),
#      where the document id is the third '/'-separated piece of `t_id`;
#   2. record the second '/'-separated piece of `t_id` as `split`;
#   3. join the annotated data on (entry_id, sent_idx);
#   4. drop the helper/duplicate columns and keep only `split == 'test'`.
# Both joins use the merge default (inner).
second_run_pred_df_w_labels = (
    second_run_pred_df
    .merge(
        orig_training_df.assign(join_key=lambda src: src['t_id'].str.split('/').str[2]),
        left_on=['file_id', 'sent_idx'],
        right_on=['join_key', 'sent_idx'],
    )
    .assign(split=lambda joined: joined['t_id'].str.split('/').str[1])
    .merge(
        full_data_df,
        left_on=['file_id', 'sent_idx'],
        right_on=['entry_id', 'sent_idx'],
    )
    .drop(columns=['t_id', 'join_key', 's', 'entry_id', 'source'])
    .loc[lambda merged: merged['split'].eq('test')]
)

In [151]:
# Sentence-level F1 on the test-split rows.
f1_score(
    y_true=second_run_pred_df_w_labels['label'],
    y_pred=second_run_pred_df_w_labels['pred'],
)

0.8461137193531559

In [152]:
# Sentence-level F1 computed separately for each `quote_type` value.
# Rows whose `quote_type` is missing fall into no group (groupby default)
# and therefore do not contribute to any of these scores.
(
    second_run_pred_df_w_labels
    .groupby('quote_type')
    .apply(lambda per_type: f1_score(y_true=per_type['label'], y_pred=per_type['pred']))
)

quote_type
BACKGROUND                      0.859873
COMMUNICATION, NOT TO JOURNO    0.888889
DECLINED COMMENT                0.941176
DOCUMENT                        0.914286
LAWSUIT                         0.000000
Other: Data Analysis            1.000000
Other: LAWSUIT                  0.969697
PRESS REPORT                    0.892857
PROPOSAL                        0.500000
PUBLIC SPEECH, NOT TO JOURNO    1.000000
PUBLISHED WORK                  0.789474
QUOTE                           0.944649
STATEMENT                       0.851852
TWEET                           0.833333
VOTE/POLL                       0.666667
dtype: float64

In [171]:
def _any_truthy(values):
    """Return True if at least one non-null entry of ``values`` is truthy."""
    return any(v for v in values if pd.notnull(v))


def _first_non_null(values):
    """Return the first non-null entry of ``values`` in order, or '' if none."""
    return next((v for v in values if pd.notnull(v)), '')


def _collapse_group_lists(grouped_lists):
    """Reduce every per-group list cell of ``grouped_lists`` to one value.

    ``label`` and ``pred`` become booleans (any truthy non-null entry); every
    other column becomes its first non-null entry, or '' when all entries are
    null.  Taking the *first* entry in row order keeps the result
    reproducible: going through ``set`` would make the chosen value depend on
    string hashing, which Python randomizes per process.
    """
    collapsed = grouped_lists.copy()
    for col in grouped_lists.columns:
        reducer = _any_truthy if col in ('label', 'pred') else _first_non_null
        collapsed[col] = grouped_lists[col].apply(reducer)
    return collapsed


# One row per (file_id, head) pair; a missing `head` is grouped under the
# string 'None' instead of being dropped by groupby.  Values are first
# gathered into lists per group and then collapsed column by column, which
# avoids the deprecated element-wise `DataFrame.applymap`.
second_run_source_grouped = (
    second_run_pred_df_w_labels
    .assign(head=lambda df: df['head'].fillna('None'))
    .groupby(['file_id', 'head'])
    [['label', 'pred', 'source_type', 'affiliation', 'role', 'role_status']]
    .aggregate(list)
    .pipe(_collapse_group_lists)
)

In [173]:
# F1 at the (file_id, head) level.
f1_score(
    y_true=second_run_source_grouped['label'],
    y_pred=second_run_source_grouped['pred'],
)

0.9

In [177]:
# (file_id, head)-level F1 computed separately for each `source_type`;
# the '' group holds the rows for which no source_type value was present.
(
    second_run_source_grouped
    .groupby('source_type')
    .apply(lambda per_type: f1_score(y_true=per_type['label'], y_pred=per_type['pred']))
)

source_type
                      0.058824
Named Group           0.925926
Named Individual      1.000000
Report/Document       0.947368
Unnamed Group         0.909091
Unnamed Individual    1.000000
Vote/Poll             0.666667
dtype: float64