In [131]:
# Reload the autoreload extension and switch it to mode 2 so that edits to the
# imported project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import torch

In [80]:
from argparse import Namespace

# Settings for the data-loading / loss smoke test in the cells below.
args = Namespace(
    random_state=42,  # seed handed to train_test_split
    preprocessing=True,  # apply PIPELINE while loading the texts
    test_size=0.2,  # fraction of documents held out for validation
    num_queries=20,  # predictions generated per document
    batch_size=50,  # documents per DataLoader batch
    input_path="./input/feedback-prize-2021/",  # folder read by load_texts
)

In [81]:
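# NOTE: disabled on purpose and kept for reference only. `build_fdb_data(args)`
# appears to bundle the dataset / post-processor construction that the
# following cells perform step by step (TODO confirm in `datasets`).
# from datasets import build_fdb_data, collate_fn

# dataset, val, postprocessor, num_classes = build_fdb_data(args)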

In [82]:
import numpy as np

from datasets.processing_funcs import PIPELINE
from datasets.fbp_dataset import FBPDataset, load_texts
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split


# Load the documents and their annotations; the project preprocessing pipeline
# is applied only when it is enabled in `args`.
preprocess = [] if not args.preprocessing else PIPELINE
documents, tags = load_texts(args.input_path, preprocess)  # type: ignore

# The distinct discourse types define the label set.
label_unique = np.array(tags["discourse_type"].unique())  # type: ignore
num_classes = len(label_unique)

# Fit the encoder on the labels as a single-column 2-D array.
encoder = OrdinalEncoder().fit(label_unique[:, None])

# Split at document level so each document lands in exactly one subset.
train_idx, val_idx = train_test_split(
    documents.index,
    test_size=args.test_size,
    random_state=args.random_state,
)

train_dataset = FBPDataset(documents[train_idx], tags, encoder)  # type:ignore
val_dataset = FBPDataset(documents[val_idx], tags, encoder)  # type:ignore

100%|██████████| 15594/15594 [00:02<00:00, 5502.04it/s]


In [83]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Transpose a batch of per-sample tuples into a tuple of per-field lists.

    Args:
        batch: iterable of equally structured samples, e.g.
            ``[(doc, target, info), ...]``.

    Returns:
        A tuple with one list per sample field, e.g.
        ``([doc, ...], [target, ...], [info, ...])``. An empty batch yields
        an empty tuple.
    """
    columns = zip(*batch)
    return tuple(map(list, columns))

# Pull a single batch so the cells below have concrete inputs to work with.
dl = DataLoader(train_dataset, batch_size=args.batch_size, collate_fn=collate_fn)

first_batch = next(iter(dl))
docs, targets, infos = first_batch

In [84]:
# Synthetic model outputs used to smoke-test the criterion and post-processor.
# A dedicated, seeded generator makes the values (and therefore the losses and
# metrics displayed further down) reproducible across runs without altering
# the global torch RNG state.
prediction_rng = torch.Generator().manual_seed(args.random_state)

# Per-class multipliers: the extra last slot (index `num_classes`) is boosted
# by 10 — presumably the "no object" class; TODO confirm against the criterion.
logit_scale = torch.tensor([[1.0] * num_classes + [10.0]])
# The second box coordinate is shrunk to a tenth of its range — presumably the
# span length; TODO confirm the box format expected by the matcher.
box_scale = torch.tensor([[1.0, 0.1]])

predictions = {
    'pred_logits': torch.rand(
        [args.batch_size, args.num_queries, num_classes + 1],
        generator=prediction_rng,
    ) * logit_scale,
    'pred_boxes': torch.rand(
        [args.batch_size, args.num_queries, 2],
        generator=prediction_rng,
    ) * box_scale,
}

In [85]:
from models.matcher import HungarianMatcher
from models.criterion import CriterionDETR

# Matcher with its default settings.
matcher = HungarianMatcher()

# Every loss term contributes with the same weight.
weight_dict = dict(loss_ce=1.0, loss_bbox=1.0, loss_giou=1.0)
losses = ["labels", "boxes", "cardinality"]

criterion = CriterionDETR(
    num_classes,
    matcher=matcher,
    weight_dict=weight_dict,
    eos_coef=0.1,
    losses=losses,
)

In [86]:
# Smoke-test the loss: evaluate the criterion on the random predictions against
# the targets of the first batch and display the returned loss dictionary.
criterion(predictions, targets)

{'loss_ce': tensor(4.2919),
 'loss_bbox': tensor(0.1137),
 'loss_giou': tensor(0.7377),
 'cardinality_error': tensor(7.)}

In [87]:
from datasets.postprocess import FBPPostProcess

# Build the post-processor from the fitted label encoder, the `tags` frame and
# the class count. NOTE(review): presumably it decodes raw predictions into
# labelled spans — confirm in datasets/postprocess.py.
postprocessor = FBPPostProcess(encoder, tags, num_classes)

In [88]:
# Hand the random predictions and the batch metadata to the post-processor.
# NOTE(review): this appears to accumulate rows in `postprocessor.results`
# (inspected in the next cell) — re-running the cell may add them again.
postprocessor.add_outputs(predictions, infos)

In [89]:
# Display the prediction table built so far (id, class, predictionstring, score).
postprocessor.results

Unnamed: 0,id,class,predictionstring,score
0,97E4E42863A3,Evidence,421 422 423 424 425 426 427 428 429 430 431 43...,0.172355
1,03BA5E41C5E4,Position,77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 9...,0.183338
2,0119F710D008,Rebuttal,0 1 2 3 4 5 6 7 8,0.175442
3,0119F710D008,Concluding Statement,194 195 196 197 198 199 200 201 202 203 204 20...,0.192177
4,F00B4D036D97,Claim,194 195 196 197 198 199 200 201 202 203 204 20...,0.176637
...,...,...,...,...
78,325FE13C0550,Evidence,126 127 128 129 130,0.205394
79,325FE13C0550,Position,74 75 76 77 78 79 80 81 82,0.192399
80,325FE13C0550,Claim,95 96 97 98 99 100 101 102,0.200640
81,1C58F2AF08B4,Position,134 135 136 137 138 139 140 141 142 143 144 14...,0.166960


In [96]:
# Display per-class precision / recall / F1 and the macro average. The inputs
# are random predictions, so the scores are only a check that the call runs.
postprocessor.evaluate()

Unnamed: 0,precision,recall,f1
Lead,0.0,0.0,0.0
Position,0.0,0.0,0.0
Evidence,0.153834,0.018518,0.033057
Claim,0.105258,0.022988,0.037735
Concluding Statement,0.0,0.0,0.0
Counterclaim,0.0,0.0,0.0
Rebuttal,0.0,0.0,0.0
macro_avg,0.037013,0.00593,0.010113


In [61]:
import util.visualization as viz

# Pick one document of the sampled batch to inspect visually.
idx = 4

id_example = infos[idx]['id']  # document id stored in the batch metadata
doc = docs[idx]  # the document at the same batch position

In [62]:
# Highlight the selected document using the annotations in `tags`
# (presumably the ground-truth segments — confirm in util/visualization.py).
viz.highlight_segments(id_example, doc, tags)

In [63]:
# Same view, this time fed with the post-processed predictions for comparison.
viz.highlight_segments(id_example, doc, postprocessor.results)

# Trial run (PROVA): end-to-end training

In [113]:
from argparse import Namespace

# Hyper-parameters for the end-to-end training run below.
args = Namespace(
    random_state=42,
    preprocessing=True,
    test_size=0.2,
    num_queries=20,
    batch_size=50,  # documents per batch for both data loaders
    input_path="./input/feedback-prize-2021/",
    device="cpu",  # turned into a torch.device right below
    # The next six values are not read in this notebook; presumably they are
    # consumed by build_models / the criterion — TODO confirm.
    hidden_dim=10,
    set_cost_class=1,
    set_cost_bbox=1,
    set_cost_giou=1,
    bbox_loss_coef=0.5,
    giou_loss_coef=0.1,
    eos_coef=0.1,
    num_workers=2,  # worker processes per DataLoader
    start_epoch=0,  # first epoch index of the training loop
    epochs=10,  # exclusive upper bound of the training loop
    lr=0.001,  # AdamW learning rate
    weight_decay=0.1,  # AdamW weight decay
    lr_drop=1,  # StepLR step size, in epochs
    clip_max_norm=0,  # forwarded to train_one_epoch as max_norm
)

device = torch.device(args.device)

In [114]:

# NOTE(review): this import rebinds `collate_fn`, replacing the helper defined
# earlier in this notebook, and the call below likewise rebinds `postprocessor`
# and `num_classes` — earlier cells re-run after this point will see the new
# objects.
from datasets import build_fdb_data, collate_fn

# Build the train / validation datasets, the post-processor and the class
# count in a single call driven by `args`.
dataset_train, dataset_val, postprocessor, num_classes = build_fdb_data(args)

100%|██████████| 15594/15594 [00:41<00:00, 378.35it/s] 


In [132]:
from models import build_models

# Build the tokenizer, the DETR model and its loss criterion from the shared
# hyper-parameters, then move the model to the configured device.
# `model.to(device)` is the cell's last expression, so the module repr is shown.
tokenizer, model, criterion = build_models(num_classes, args)
model.to(device)

Some weights of the model checkpoint at allenai/led-base-16384 were not used when initializing LEDModel: ['lm_head.weight', 'final_logits_bias']
- This IS expected if you are initializing LEDModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LEDModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DETR(
  (transformer): Transformer(
    (model): LEDModel(
      (shared): Embedding(50265, 768, padding_idx=1)
      (encoder): LEDEncoder(
        (embed_tokens): Embedding(50265, 768, padding_idx=1)
        (embed_positions): LEDLearnedPositionalEmbedding(16384, 768)
        (layers): ModuleList(
          (0): LEDEncoderLayer(
            (self_attn): LEDEncoderAttention(
              (longformer_self_attn): LEDEncoderSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (query_global): Linear(in_features=768, out_features=768, bias=True)
                (key_global): Linear(in_features=768, out_features=768, bias=True)
                (value_global): Linear(in_features=768, out_features=768, bias=True)
              )
              (output): Linear(in_features=768, out_featu

In [133]:
import time
import datetime
from engine import train_one_epoch, evaluate

# Report the number of trainable parameters (those with requires_grad set).
n_parameters = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print("number of params:", n_parameters)

# AdamW with a step schedule that advances once per epoch.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=args.lr,
    weight_decay=args.weight_decay,
)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_drop)

# Settings shared by the training and validation loaders.
loader_options = dict(
    batch_size=args.batch_size,
    collate_fn=collate_fn,
    num_workers=args.num_workers,
)
data_loader_train = DataLoader(dataset_train, shuffle=True, **loader_options)
data_loader_val = DataLoader(
    dataset_val, shuffle=False, drop_last=False, **loader_options
)

# Arguments that are identical for the train and evaluation steps.
engine_options = dict(
    tokenizer=tokenizer,
    model=model,
    criterion=criterion,
    device=device,
)

# NOTE(review): the saved output of this cell ends in a RuntimeError (tensor
# sizes 449 vs 490 on dimension 1) during the first training step; the cause is
# not visible in this cell — look inside `train_one_epoch` / the model.
print("Start training")
start_time = time.time()
for epoch in range(args.start_epoch, args.epochs):
    train_one_epoch(
        data_loader=data_loader_train,
        optimizer=optimizer,
        epoch=epoch,
        max_norm=args.clip_max_norm,
        **engine_options,
    )
    lr_scheduler.step()

    # Start every validation pass from an empty result table.
    postprocessor.reset_results()
    evaluate(
        postprocessor=postprocessor,
        data_loader=data_loader_val,
        epoch=epoch,
        tag="Validation",
        **engine_options,
    )

# Wall-clock duration of the whole loop, truncated to whole seconds.
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print("Training time {}".format(total_time_str))

number of params: 161846021
Start training


Train Epoch    0:   0%|          | 0/250 [00:05<?, ?it/s]


RuntimeError: The size of tensor a (449) must match the size of tensor b (490) at non-singleton dimension 1