In [1]:
import os
import random
import sys

import numpy as np
import pandas as pd
import torch

In [2]:
# Put the parent directory and its Model/ folder on the import path so the
# project-local modules imported further down can be resolved.
sys.path.extend(['../', '../Model'])

In [3]:
from datasets import load_dataset

In [4]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
    DataCollatorForSeq2Seq, 
)

from bert_lstm import RobertaModel

## Device

In [5]:
# Pinned to CPU for this walkthrough. To use a GPU when one is available, pass
# 'cuda:0' if torch.cuda.is_available() else 'cpu' instead.
device = torch.device('cpu')

## Datasets

In [6]:
# Authenticate through the environment instead of embedding a credential in
# the notebook. Export HF_TOKEN before starting Jupyter; when it is unset (or
# empty) fall back to the token cached by `huggingface-cli login`
# (use_auth_token=True).
# NOTE: the token that was previously hardcoded in this cell has been exposed
# to anyone with access to the notebook history and should be revoked.
law_dataset = load_dataset(
    'metamong1/summarization_law',
    use_auth_token=os.environ.get('HF_TOKEN') or True,
)

Reusing dataset law_summarization (/opt/ml/.cache/huggingface/datasets/metamong1___law_summarization/Paper Summarization/1.2.0/b422baca30e481895dd2b572a7ff9f6c6428725e575fdafb73c0aa1d62356973)


  0%|          | 0/2 [00:00<?, ?it/s]

## Tokenizer

In [7]:
# Checkpoint name used to load the tokenizer in the next cell.
# NOTE(review): the Config section further down loads 'klue/roberta-base' for
# the config and weights rather than this value — confirm the mismatch is
# intended.
model_checkpoint = 'klue/roberta-large'

In [8]:
# Load the tokenizer that belongs to `model_checkpoint`, requesting the fast
# implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

## Data Arguments

In [None]:
from arguments import DataTrainingArguments

In [None]:
# Create an instance so downstream code receives an arguments object rather
# than the class itself. Reading attributes off the bare class only works for
# fields with plain class-level defaults and bypasses any initialisation logic
# the class defines.
# NOTE(review): assumes every field of DataTrainingArguments has a default —
# confirm against arguments.py.
data_args = DataTrainingArguments()

## Processing

In [None]:
from processor import preprocess_function
from functools import partial

In [None]:
# Raw (not yet tokenized) training split of the law summarization dataset.
train_dataset = law_dataset['train']

In [None]:
# Column names of the raw split; they are passed as `remove_columns` to the
# map() call below so that the raw columns are dropped after preprocessing.
column_names = train_dataset.column_names
print(column_names)

['doc_id', 'title', 'text', 'doc_type', 'file']


In [None]:
# Tokenize the raw training split.
# The input is always taken from `law_dataset` (never from `train_dataset`),
# which keeps this cell idempotent: re-running it after `train_dataset` has
# already been processed would otherwise ask map() to remove raw columns that
# no longer exist.
raw_train = law_dataset['train']
prep_fn = partial(preprocess_function, tokenizer=tokenizer, data_args=data_args)
train_dataset = raw_train.map(
    prep_fn,
    batched=True,
    num_proc=data_args.preprocessing_num_workers,
    # Drop every raw column so only the features produced by prep_fn remain.
    remove_columns=raw_train.column_names,
    load_from_cache_file=not data_args.overwrite_cache,
)

Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/metamong1___law_summarization/Paper Summarization/1.2.0/b422baca30e481895dd2b572a7ff9f6c6428725e575fdafb73c0aa1d62356973/cache-754bd2bea88ca415.arrow
Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/metamong1___law_summarization/Paper Summarization/1.2.0/b422baca30e481895dd2b572a7ff9f6c6428725e575fdafb73c0aa1d62356973/cache-4c103939d6dd8cc1.arrow


In [None]:
# Show the processed dataset summary (feature names and number of rows).
train_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
    num_rows: 23730
})

## Config

In [None]:
# Checkpoint that provides both the configuration and the pretrained weights.
# NOTE(review): this is 'klue/roberta-base' while `model_checkpoint` (used for
# the tokenizer) is 'klue/roberta-large' — confirm the mismatch is intended.
model_name = 'klue/roberta-base'

config = AutoConfig.from_pretrained(model_name)
# Use the tokenizer's CLS token id as the decoder start token.
config.decoder_start_token_id = tokenizer.cls_token_id

model = RobertaModel.from_pretrained(model_name, config=config)
model = model.to(device)

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.lstm_head.bias_hh_l5', 'roberta.lstm_head.bias_ih_l9', 'roberta.lstm_head.weight_ih_l7', 'roberta.lstm_head.bias_ih_l3', 'roberta.lstm

## Data Collator

In [None]:
from transformers import DataCollatorForSeq2Seq

In [None]:
# Value used to pad the labels: -100 when `ignore_pad_token_for_loss` is set,
# otherwise the tokenizer's own pad token id.
if data_args.ignore_pad_token_for_loss:
    label_pad_token_id = -100
else:
    label_pad_token_id = tokenizer.pad_token_id
print(label_pad_token_id)

-100


In [None]:
# Collator that batches tokenized examples. It is given the model as well as
# the tokenizer, pads labels with `label_pad_token_id`, and pads lengths to a
# multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
    model=model,
)

## Pipeline

In [None]:
from torch.utils.data import DataLoader

### Data Loader

In [None]:
# Small shuffled loader used only to inspect a batch and run one forward pass.
# The shuffle is driven by a seeded generator so that Restart & Run All draws
# the same first batch every time instead of a different one on each run.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=data_collator,
    generator=torch.Generator().manual_seed(42),
)

In [None]:
# Pull a single batch to inspect what the collator produces. next(iter(...))
# states the intent directly and raises StopIteration on an empty loader
# instead of leaving `batch` unbound (or stale from an earlier run).
batch = next(iter(train_dataloader))

# Tensor shape of every field in the collated batch.
{name: tensor.shape for name, tensor in batch.items()}

{'attention_mask': torch.Size([2, 376]),
 'input_ids': torch.Size([2, 376]),
 'labels': torch.Size([2, 8]),
 'token_type_ids': torch.Size([2, 376]),
 'decoder_input_ids': torch.Size([2, 8])}

In [None]:
# Decoder inputs of the first example in the batch (compare with its labels in
# the next cell).
batch['decoder_input_ids'][0]

tensor([    0,     0,  6953,  4962,  5450,  2170,  3618, 13519])

In [None]:
# Labels of the same first example.
batch['labels'][0]

tensor([    0,  6953,  4962,  5450,  2170,  3618, 13519,     2])

### Model Outputs

In [None]:
# Move every tensor of the batch onto the target device, then run a single
# forward pass with the batch fields passed as keyword arguments.
batch = {name: tensor.to(device) for name, tensor in batch.items()}
outputs = model(**batch)

In [None]:
# List the fields returned by the forward pass.
outputs.keys()

odict_keys(['loss', 'logits', 'decoder_hidden_states', 'encoder_hidden_states'])

In [None]:
# Loss value returned for this batch.
outputs['loss']

tensor(10.6212, grad_fn=<NllLossBackward0>)

In [None]:
# Shape of the returned logits tensor.
outputs['logits'].shape

torch.Size([2, 8, 32000])