In [1]:
from models_neural.quote_attribution.classification_models import SourceQA
import json
import attrdict
from models_neural.src.config_helper import TransformersConfig
import pandas as pd 
import zipfile
from models_neural.quote_attribution.utils_dataset import SourceQADataModule

In [None]:
# download model 
! aws s3 cp s3://aspangher/transformer-pretrained-models/roberta-base-expanded-embeddings.zip .  --endpoint http://s3.dev.obdc.bcs.bloomberg.com
with zipfile.ZipFile('roberta-base-expanded-embeddings.zip', 'r') as zip_ref:
    zip_ref.extractall()
    
# download dataset
! aws s3 cp s3://aspangher/source-exploration/./config-Stage\ 2\:\ Quote\ Attribution.\ Our\ dataset\ only..json  . --endpoint http://s3.dev.obdc.bcs.bloomberg.com
            
# download spacy
! aws s3 cp --recursive s3://aspangher/spacy/en_core_web_lg/ en_core_web_lg/   --endpoint http://s3.dev.obdc.bcs.bloomberg.com

In [5]:
data_fn = 'our-annotated-data__stage-2.tsv'
data_df = pd.read_csv(data_fn, sep='\t', header=None)
model_path = 'trial-Stage 2: Quote Attribution. Our dataset only.__epoch=05-perplexity=0.00.ckpt'

In [8]:
config_out = 'config-%s.json' % "Stage 2: Quote Attribution. Our dataset only."
with open(config_out) as f:
    config_dict = json.load(f)

config_dict = attrdict.AttrDict(config_dict)
config_dict.pretrained_files_s3 = config_dict.pretrained_model_path
config_dict.model_type = 'roberta'

In [9]:
model = SourceQA.load_from_checkpoint(model_path, config=config_dict)

In [11]:
data_model = SourceQADataModule(
    config=config_dict,
    data_fp=data_fn,
    model_type='roberta',
    max_length_seq=2048,
    pretrained_model_path=config_dict.pretrained_model_path,
    batch_size=1,
    spacy_path='en_core_web_lg'
)

In [12]:
data_model.prepare_data()

In [13]:
data_model.setup()

100%|██████████| 335/335 [02:10<00:00,  2.57it/s]


In [14]:
dataloader = data_model.val_dataloader()

In [15]:
dataloader.batch_size

1

In [17]:
model = model.to('cuda')

In [18]:
for idx, sample in enumerate(dataloader):
    sample

In [19]:
idx

711

In [20]:
data_model.test_dataset[idx].keys()

dict_keys(['doc_idx', 'source_head', 'start_position', 'end_position', 'context', 'sentence_indicator_tokens'])

In [22]:
data_model.test_dataset[idx]['start_position']

472

In [89]:
len(data_model.train_dataset)

6400

In [90]:
len(data_model.test_dataset)

712

In [40]:
import torch
from tqdm.auto import tqdm

In [42]:
all_samples = []
for idx, sample in tqdm(enumerate(data_model.test_dataset), total=len(data_model.test_dataset)):
    sample = data_model.collate_fn([sample])

    with torch.no_grad():
        input_ids = sample['input_ids'].to('cuda')
        attention_mask=sample['attention_mask'].to('cuda')
        sentence_ids=sample['sentence_ids'].to('cuda')
        output = model.forward(input_ids=input_ids, attention_mask=attention_mask, sentence_ids=sentence_ids)

        start_logits, end_logits = output
        start, end = start_logits.argmax().to('cpu'), end_logits.argmax().to('cpu') 
    
    sample['pred_start'] = start
    sample['pred_end'] = end
    sample['start_logits'] = start_logits.to('cpu')
    sample['end_logits'] = end_logits.to('cpu')
    all_samples.append(sample)

100%|██████████| 712/712 [00:32<00:00, 22.18it/s]


In [45]:
samples_df = pd.DataFrame(all_samples)

In [58]:
start_end_df = (samples_df[['start_positions', 'end_positions', 'pred_start', 'pred_end']]
    .assign(start_positions=lambda df: df['start_positions'].str.get(0).apply(int))
    .assign(end_positions=lambda df: df['end_positions'].str.get(0).apply(int)) 
    .assign(pred_start=lambda df: df['pred_start'].apply(int)) 
    .assign(pred_end=lambda df: df['pred_end'].apply(int)) 
)

In [62]:
from sklearn.metrics import f1_score

In [87]:
start_end_df

Unnamed: 0,start_positions,end_positions,pred_start,pred_end
0,49,52,49,92
1,1387,1394,1387,22
2,1,3,27,335
3,235,239,240,239
4,277,279,157,159
...,...,...,...,...
707,363,365,857,365
708,635,640,1089,1093
709,97,101,97,101
710,614,618,295,777


In [None]:
samples_df

In [65]:
start_end_df.pipe(lambda df: df['start_positions'] ==  df['pred_start']).mean()

0.5196629213483146

In [66]:
start_end_df.pipe(lambda df: df['end_positions'] ==  df['pred_end']).mean()

0.5084269662921348

In [84]:
def evaluate_overlap(row):
    temp_pred_start, temp_pred_end = row[['pred_start', 'pred_end']]
    pred_start = min(temp_pred_start, temp_pred_end)
    pred_end = max(temp_pred_start, temp_pred_end)
    
    true_start = row['start_positions']
    true_end = row['end_positions']
    
    start_mark = max(pred_start, true_start)
    end_mark = min(pred_end, true_end)
    
    num = max(0, end_mark - start_mark)
    denom = pred_end - pred_start
    
    return num / denom

In [86]:
start_end_df.apply(evaluate_overlap, axis=1).mean()

0.43925877688822845

In [115]:
from transformers import AutoTokenizer

In [117]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base-expanded-embeddings')

In [119]:
tokenizer.decode(sample['input_ids'][:, start:end][0])

' David W. Eaton'

In [136]:
start.to('cpu')

tensor(212)

In [123]:
end

tensor(216)

In [121]:
sample

{'start_positions': tensor([212]),
 'end_positions': tensor([216]),
 'input_ids': tensor([[    0, 15545,   661, 18718,   111,  2236,    96,     5,  2625,    81,
          25752,     9,   681,     8,  1123, 12252,  2156,  4257,   747, 22884,
              5,   810,    14,     5,   609,    64,   278,   160,  3027, 20396,
            479,     2,     0,  1708,  4211,   224,    14,    11,     5,   315,
            532,  2156, 25752,   111, 26914, 20396,    32,    45,  1537,   479,
              2,     0,  1121,   896,  2156,   959,  2156,    10, 22040,     9,
          20396,    11,  6055,   624,     5,   375,   292,   107,    34,    57,
           9702,     7, 25752,  2156,    50, 29215, 34905,  2156,    11,    61,
            514,  2156,  8321,     8,  6255,    32, 22993,    23,   239,  1164,
             88,    10,   157, 20463,    11,    10, 14352,  9285,     7,  1108,
             62,     5,  3152,     8,   800,   681,     8,  1123,   479,     2,
              0,  5975,  2156,  4211,  