In [1]:
import os
import random
import sys

import numpy as np
import pandas as pd
import torch

In [2]:
# Make the project root and its model package importable from this notebook.
sys.path.extend(['../', '../model'])

In [3]:
from datasets import load_dataset

In [4]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
    DataCollatorForSeq2Seq, 
)

from modeling_longformerbart import LonformerBartWithDoctypeForConditionalGeneration, LongformerBartConfig

## Device

In [5]:
# Pinned to CPU for this walkthrough. To use a GPU when one is available, switch to:
#   torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

## Datasets

In [6]:
# Authenticate without embedding a secret in the notebook: use the token from the
# HF_AUTH_TOKEN environment variable when it is set and non-empty, otherwise pass
# True so the library falls back to the credentials stored by `huggingface-cli login`.
# NOTE(review): the access token that used to be hard-coded in this cell is exposed
# in the file's history and should be revoked.
law_dataset = load_dataset(
    'metamong1/summarization_law',
    use_auth_token=os.environ.get('HF_AUTH_TOKEN') or True,
)

Reusing dataset law_summarization (/opt/ml/.cache/huggingface/datasets/metamong1___law_summarization/Paper Summarization/1.2.0/b422baca30e481895dd2b572a7ff9f6c6428725e575fdafb73c0aa1d62356973)


  0%|          | 0/2 [00:00<?, ?it/s]

## Tokenizer

In [7]:
# Hub identifier of the pretrained checkpoint; reused below for the tokenizer,
# the config and the model weights.
model_checkpoint = 'hyunwoongko/kobart'

In [8]:
# Load the tokenizer published with the checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)



## Data Arguments

In [9]:
from arguments import DataTrainingArguments

In [10]:
# NOTE(review): this binds the DataTrainingArguments class itself, not an instance, so
# the attributes read later in this notebook (preprocessing_num_workers,
# overwrite_cache, ignore_pad_token_for_loss) resolve to class-level values.
# Presumably done to avoid constructor arguments/validation — confirm against arguments.py.
data_args = DataTrainingArguments

## Processing

In [11]:
from processor import preprocess_function
from functools import partial

In [12]:
# Work with the training split only.
train_dataset = law_dataset['train']

In [13]:
# Remember the raw column names so they can be dropped once the examples are tokenized.
column_names = train_dataset.column_names
print(column_names)

['doc_id', 'title', 'text', 'doc_type', 'file']


In [14]:
# Bind the tokenizer and data arguments so Dataset.map can call the preprocessor
# with only the batch of examples.
prep_fn = partial(preprocess_function, tokenizer=tokenizer, data_args=data_args)

# Map from the raw split instead of from `train_dataset` itself. The original
# `train_dataset = train_dataset.map(...)` could only run once per kernel: after the
# first run the raw columns listed in `column_names` are gone, so re-running the cell
# failed. Reading from law_dataset['train'] gives the same result on the first run
# and keeps the cell re-runnable.
train_dataset = law_dataset['train'].map(
    prep_fn,
    batched=True,
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names,  # keep only the columns produced by the preprocessor
    load_from_cache_file=not data_args.overwrite_cache,
)

Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/metamong1___law_summarization/Paper Summarization/1.2.0/b422baca30e481895dd2b572a7ff9f6c6428725e575fdafb73c0aa1d62356973/cache-2e2d80f3ce625fb8.arrow
Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/metamong1___law_summarization/Paper Summarization/1.2.0/b422baca30e481895dd2b572a7ff9f6c6428725e575fdafb73c0aa1d62356973/cache-5e3a19c3addc699e.arrow


In [15]:
# Inspect the processed dataset: its features and row count.
train_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 23730
})

## Config

In [16]:
# Start from the configuration published with the checkpoint and override the
# attention and document-type settings used by the custom architecture.
config = LongformerBartConfig.from_pretrained(
    model_checkpoint,
    num_attention_heads=16,
    # NOTE(review): 12 window entries — confirm this matches the number of layers
    # the config expects.
    attention_window=[64] * 12,
    attention_probs_dropout_prob=0.1,
    doc_type_size=3 + 4,
)

# Load the checkpoint weights into the custom model class, then move it to `device`.
# The load warning lists the checkpoint's encoder self-attention projection weights as
# unused, so those encoder weights are presumably freshly initialized — confirm that
# this is intended.
model = LonformerBartWithDoctypeForConditionalGeneration.from_pretrained(
    model_checkpoint,
    config=config,
)
model = model.to(device)

Some weights of the model checkpoint at hyunwoongko/kobart were not used when initializing LonformerBartWithDoctypeForConditionalGeneration: ['encoder.layers.5.self_attn.out_proj.weight', 'encoder.layers.2.self_attn.out_proj.bias', 'encoder.layers.4.self_attn.q_proj.bias', 'encoder.layers.4.self_attn.v_proj.weight', 'encoder.layers.0.self_attn.q_proj.weight', 'encoder.layers.5.self_attn.q_proj.bias', 'encoder.layers.2.self_attn.v_proj.bias', 'encoder.layers.1.self_attn.out_proj.weight', 'encoder.layers.2.self_attn.out_proj.weight', 'encoder.layers.3.self_attn.q_proj.bias', 'encoder.layers.3.self_attn.v_proj.weight', 'encoder.layers.3.self_attn.out_proj.weight', 'encoder.layers.2.self_attn.k_proj.bias', 'encoder.layers.1.self_attn.q_proj.weight', 'encoder.layers.0.self_attn.out_proj.weight', 'encoder.layers.4.self_attn.k_proj.bias', 'encoder.layers.4.self_attn.out_proj.bias', 'encoder.layers.1.self_attn.k_proj.bias', 'encoder.layers.0.self_attn.v_proj.bias', 'encoder.layers.4.self_attn.

## Data Collator

In [17]:
from transformers import DataCollatorForSeq2Seq

In [18]:
# Choose the value used to pad label sequences: -100 when padding positions should be
# ignored by the loss, otherwise the tokenizer's regular pad token id.
if data_args.ignore_pad_token_for_loss:
    label_pad_token_id = -100
else:
    label_pad_token_id = tokenizer.pad_token_id
print(label_pad_token_id)

-100


In [19]:
# Collator that pads every batch on the fly. The batch inspected further down contains
# decoder_input_ids even though the dataset has no such feature, so passing the model
# here is presumably what lets the collator derive them from the labels.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)

## Pipeline

In [20]:
from torch.utils.data import DataLoader

### Data Loader

In [21]:
# Shuffle with a seeded generator so the first batch inspected below is the same on
# every Restart & Run All; without it the sampled batch (and therefore its padded
# shapes and loss) changes from run to run.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=4,
    collate_fn=data_collator,
    generator=torch.Generator().manual_seed(42),
)

In [27]:
# Pull a single batch to smoke-test the collator and the model. next(iter(...)) states
# that intent directly instead of a for-loop that breaks on its first iteration.
batch = next(iter(train_dataloader))

# Placeholder document-type ids: all zeros, one per input token. zeros_like keeps the
# shape, dtype and device of input_ids, whereas the original torch.zeros(shape)
# produced a float32 tensor, which is not usable as integer ids.
# NOTE(review): assumes the model consumes doc_type_ids as integer indices — confirm
# against modeling_longformerbart.
batch['doc_type_ids'] = torch.zeros_like(batch['input_ids'])
{k: v.shape for k, v in batch.items()}

{'attention_mask': torch.Size([4, 336]),
 'input_ids': torch.Size([4, 336]),
 'labels': torch.Size([4, 10]),
 'decoder_input_ids': torch.Size([4, 10]),
 'doc_type_ids': torch.Size([4, 336])}

### Model Outputs

In [28]:
# Move every tensor of the batch to the target device, then run one forward pass.
# NOTE(review): the saved output of this cell is
# "TypeError: forward() got an unexpected keyword argument 'doc_type_ids'", i.e. the
# model's forward signature (defined in modeling_longformerbart, not visible here) does
# not accept the key added to the batch above — reconcile the two before relying on
# the cells below.
batch = {name: tensor.to(device) for name, tensor in batch.items()}
outputs = model(**batch)

TypeError: forward() got an unexpected keyword argument 'doc_type_ids'

In [None]:
# List the fields returned by the forward pass.
outputs.keys()

odict_keys(['loss', 'logits', 'decoder_hidden_states', 'encoder_hidden_states'])

In [None]:
# Loss reported by the forward pass for this batch (the batch carries `labels`).
outputs['loss']

tensor(10.6212, grad_fn=<NllLossBackward0>)

In [None]:
# Shape of the logits returned by the forward pass.
# NOTE(review): the saved output below shows a batch of 2 with 8 positions, while the
# DataLoader above uses batch_size=4 and the inspected batch has 10 label positions —
# this output predates the current cells and should be regenerated.
outputs['logits'].shape

torch.Size([2, 8, 32000])