In [2]:
import argparse
import logging
import os
import math
from dataclasses import dataclass, field
import copy
from typing import Dict, List, Optional, Tuple

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn import functional as F
from torch import Tensor

from transformers.models.mbart.modeling_mbart import MBartLearnedPositionalEmbedding
from transformers.models.bart.modeling_bart import BartLearnedPositionalEmbedding
from transformers import MBartForConditionalGeneration, MBartConfig, MBart50Tokenizer
from transformers import PreTrainedTokenizerFast
from transformers.models.bart.modeling_bart import shift_tokens_right
from transformers.models.longformer.modeling_longformer import LongformerSelfAttention
from transformers.models.bart.modeling_bart import BartLearnedPositionalEmbedding

import warnings
# Install a filter that ignores every Python warning.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings(action='ignore')

# Module-level logger; the root logger is configured at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class LongformerSelfAttentionForMBart(nn.Module):
    """Self-attention module backed by ``LongformerSelfAttention``.

    Wraps a ``LongformerSelfAttention`` layer followed by a linear output
    projection. ``forward`` accepts the keyword arguments listed in its
    signature and always returns the 3-tuple declared there.
    """

    def __init__(self, config, layer_id):
        """Build the wrapped attention layer and the output projection.

        Args:
            config: model configuration; ``config.d_model`` is read for the
                projection size and the whole object is handed to
                ``LongformerSelfAttention``.
            layer_id: index of the layer, forwarded to
                ``LongformerSelfAttention``.
        """
        super().__init__()
        self.embed_dim = config.d_model
        self.longformer_self_attn = LongformerSelfAttention(
            config, layer_id=layer_id)
        self.output = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Apply Longformer self-attention and the output projection.

        Args:
            hidden_states: 3-dimensional tensor, unpacked as
                (batch, sequence length, embedding size).
            key_value_states: accepted for signature compatibility; unused.
            past_key_value: accepted for signature compatibility; unused.
            attention_mask: mask that is squeezed on dim 1 and then indexed
                with ``[:, 0]``; in the result, negative entries mark masked
                positions, positive entries mark global-attention positions
                and zeros mean ordinary local attention.
                Assumes the incoming shape is (batch, 1, tgt_len, src_len)
                — TODO confirm against the caller. When ``None``, an
                all-zero mask (nothing masked, no global attention) is used.
            layer_head_mask: accepted but ignored; ``None`` is always
                forwarded to the wrapped layer.
            output_attentions: forwarded to the wrapped layer.

        Returns:
            ``(attn_output, attn_weights, None)``. ``attn_weights`` is the
            second item returned by the wrapped layer when it returns more
            than one item, otherwise ``None``.
        """
        bsz, tgt_len, _ = hidden_states.size()

        if attention_mask is None:
            # No mask supplied: every position uses plain local attention.
            attention_mask = hidden_states.new_zeros((bsz, tgt_len))
        else:
            attention_mask = attention_mask.squeeze(dim=1)
            attention_mask = attention_mask[:, 0]

        is_index_masked = attention_mask < 0
        is_index_global_attn = attention_mask > 0
        is_global_attn = is_index_global_attn.flatten().any().item()

        outputs = self.longformer_self_attn(
            hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=None,
            is_index_masked=is_index_masked,
            is_index_global_attn=is_index_global_attn,
            is_global_attn=is_global_attn,
            output_attentions=output_attentions,
        )

        attn_output = self.output(outputs[0])

        # Always return three items. The previous conditional expression
        # produced a 2-tuple whenever the wrapped layer returned attention
        # weights, which contradicts the declared return type.
        attn_weights = outputs[1] if len(outputs) > 1 else None
        return attn_output, attn_weights, None


class LongformerEncoderDecoderForConditionalGeneration(MBartForConditionalGeneration):
    """MBart conditional-generation model with Longformer encoder self-attention.

    Unless ``config.attention_mode`` is ``'n2'``, the constructor replaces the
    encoder and decoder positional embeddings and swaps the self-attention of
    every encoder layer for ``LongformerSelfAttentionForMBart``.
    """

    def __init__(self, config):
        super().__init__(config)

        # 'n2' selects regular self-attention: keep what the parent built.
        if config.attention_mode == 'n2':
            return

        encoder = self.model.encoder
        decoder = self.model.decoder
        hidden_size = config.d_model

        # New positional embedding tables sized from the config limits.
        encoder.embed_positions = MBartLearnedPositionalEmbedding(
            config.max_encoder_position_embeddings, hidden_size)
        decoder.embed_positions = MBartLearnedPositionalEmbedding(
            config.max_decoder_position_embeddings, hidden_size)

        # Only the encoder layers receive the Longformer attention wrapper.
        for layer_idx in range(len(encoder.layers)):
            encoder.layers[layer_idx].self_attn = LongformerSelfAttentionForMBart(
                config, layer_id=layer_idx)


class LongformerEncoderDecoderConfig(MBartConfig):
    """``MBartConfig`` extended with Longformer attention settings."""

    def __init__(self, attention_window: List[int] = None, attention_dilation: List[int] = None,
                 autoregressive: bool = False, attention_mode: str = 'sliding_chunks',
                 gradient_checkpointing: bool = False, **kwargs):
        """
        Args:
            attention_window: one window size per layer. The window size is
                the number of attended positions on each side, so an
                effective window of 512 is `attention_window=[256]*num_layers`
                (256 on each side).
            attention_dilation: one dilation value per layer; `1` means no
                dilation.
            autoregressive: use autoregressive attention instead of attending
                to both sides.
            attention_mode: 'n2' for regular n^2 self-attention, 'tvm' for the
                TVM implementation of Longformer self-attention,
                'sliding_chunks' for another implementation of Longformer
                self-attention.
            gradient_checkpointing: stored on the config as given.
            **kwargs: forwarded unchanged to ``MBartConfig``.
        """
        super().__init__(**kwargs)

        supported_modes = ('tvm', 'sliding_chunks', 'n2')
        longformer_settings = (
            ('attention_window', attention_window),
            ('attention_dilation', attention_dilation),
            ('autoregressive', autoregressive),
            ('attention_mode', attention_mode),
            ('gradient_checkpointing', gradient_checkpointing),
        )
        for setting_name, setting_value in longformer_settings:
            setattr(self, setting_name, setting_value)

        assert self.attention_mode in supported_modes

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# The path to the model may be different on your machine; adjust it here.
# Tokenizer and model are both loaded from this one checkpoint directory.
CHECKPOINT_DIR = "../results/mbart-long/checkpoint-1100"

tokenizer = MBart50Tokenizer.from_pretrained(CHECKPOINT_DIR)

model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained(CHECKPOINT_DIR)

In [6]:
# define function for summarization
# Number of input ids fed to the model: longer inputs are truncated to this
# length, shorter ones are right-padded up to it.
max_seq_len = 10240


def summarize(text, max_len):
    """Summarize ``text`` with the module-level ``model`` and ``tokenizer``.

    The text is tokenized and wrapped in ``<s>`` ... ``</s>``. Inputs of
    ``max_seq_len`` ids or more keep their first ``max_seq_len - 1`` ids and
    get the EOS id appended; shorter inputs are right-padded with the pad id
    up to ``max_seq_len``. Generation uses 5 beams, forbids repeated 3-grams
    and bans the unknown token.

    Args:
        text: the document to summarize.
        max_len: forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
    """
    context_tokens = ['<s>'] + tokenizer.tokenize(text) + ['</s>']
    input_ids = tokenizer.convert_tokens_to_ids(context_tokens)

    print(f"diploma tokenized length: {len(input_ids)}")

    if len(input_ids) >= max_seq_len:
        input_ids = input_ids[:max_seq_len - 1] + [tokenizer.eos_token_id]

    num_real_tokens = len(input_ids)
    num_padding = max_seq_len - num_real_tokens
    input_ids = input_ids + [tokenizer.pad_token_id] * num_padding
    # Explicit mask: 1 for real tokens, 0 for padding, so the padded
    # positions do not have to be inferred from the pad id.
    attention_mask = [1] * num_real_tokens + [0] * num_padding

    # NOTE(review): gradient checkpointing is presumably only relevant while
    # training; these assignments are kept to preserve existing behavior —
    # confirm whether they can be dropped for inference.
    model.model.encoder.config.gradient_checkpointing = True
    model.model.decoder.config.gradient_checkpointing = True

    # Build the inputs on the model's device so generation also works when
    # the model has been moved off the CPU.
    device = model.device
    res_ids = model.generate(torch.tensor([input_ids], device=device),
                             attention_mask=torch.tensor([attention_mask], device=device),
                             max_length=max_len,
                             num_beams=5,
                             no_repeat_ngram_size=3,
                             eos_token_id=tokenizer.eos_token_id,
                             bad_words_ids=[[tokenizer.unk_token_id]])
    res = tokenizer.batch_decode(res_ids.tolist(), skip_special_tokens=True)[0]

    return res

In [7]:
# Read the whole document to summarize. The encoding is passed explicitly so
# the non-ASCII characters decode the same way on every platform.
# assumes diploma.txt is stored as UTF-8 — TODO confirm
with open('./diploma.txt', encoding='utf-8') as f:
    diploma = f.read()
print(diploma)

1. UVOD

Povzemanje daljših besedil predstavlja eno izmed težjih in manj raziskanih nalog na prodročju obdelave naravnega jezika. Delno je to zaradi tega, ker je težko definirati, kaj točno predstavlja dober pozvetek. 
Zaradi teh razlogov ne obstaja veliko tovrstnih modelov. Za razvoj modela sem se odločil, ker še ne obstaja slovenski povzemalnik dolgih besedil. 
V diplomski nalogi sem izde- lal nevronski model za povzemanje daljših besedil.  Opišem razvoj globoke nevronske mreže, ki sprejme besedilo, ga obdela in vrne povzetek.  
Opišem kako uporabiti nevronsko mrežo arhitekture Longformer, da osnoven model prilagodimo za obdelavo daljših besedil. 
Ključna sprememba je v računanju pozornosti, kjer preidemo iz kvadratične v linearno pozornost, kar omogoča procesiranje daljših vhodov. 
Uporabimo KAS 2.0 učno množico [11] za povzemanje in Macocu [1] učno množico za vnaprejšno učenje.   
Model evalviram kvantitativno in kvalitativno. Rezultate modela na metriki ROUGE[5] primerjam z rezult

In [8]:
# NOTE(review): the second argument is forwarded to generate() as max_length
# (the cap on the generated summary), while max_seq_len is the input
# padding/truncation length — presumably a separate, smaller summary limit
# was intended; confirm.
summarize(diploma, max_seq_len)

diploma tokenized length: 6259


'V diplomski nalogi opišem razvoj globoke nevronske mreže, ki sprejme besedilo, ga obdela in vrne povzetek. Opišem kako uporabiti nevronsko mrežo arhitekture Longformer, da osnoven model prilagodimo za obdelavo daljših besedil. Ključna sprememba je v računanju pozornosti, kjer preidemo iz kvadratične v linearno pozornost. Uporabimo KAS 2.0 učno množico [11] za povzetek in Macocu [1] učni množico za vnaprejšno učenje. Model evalviram kvantitativno in kvalitativno. Rezultate modela na metriki ROUGE[5] primerjam z rezultati referenčnega (angl. baseline) modela.'