In [1]:
!pip install transformers -q
!pip install SentencePiece -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# Directory passed to create_long_model() as `save_model_to` by main(); the
# converted model and tokenizer are written there. The value looks like a
# placeholder -- set it to a real path before running the conversion cell.
current_directory = "path/to/save/model"

In [6]:
from typing import List, Optional, Tuple, Dict
from torch import nn, Tensor

from transformers import  PegasusForConditionalGeneration,LongformerSelfAttention,AutoConfig,PegasusTokenizerFast


class LongformerEncoderDecoderForConditionalGeneration(PegasusForConditionalGeneration):
    """Pegasus conditional-generation model with Longformer self-attention in the encoder.

    After the regular Pegasus construction, the ``self_attn`` module of every
    encoder layer is replaced by a ``LongformerSelfAttentionForArman`` built
    from the same ``config`` and the layer's index.
    """

    def __init__(self, config):
        super().__init__(config)
        encoder_layers = self.model.encoder.layers
        # Walk the encoder by index so each attention module is told which
        # layer it belongs to.
        for layer_id in range(len(encoder_layers)):
            encoder_layers[layer_id].self_attn = LongformerSelfAttentionForArman(config, layer_id=layer_id)
                

class LongformerSelfAttentionForArman(LongformerSelfAttention):
    """``LongformerSelfAttention`` behind a wider, more permissive ``forward`` signature.

    ``forward`` accepts ``head_mask``, ``encoder_hidden_states``,
    ``encoder_attention_mask`` and any additional keyword arguments, but
    discards them: only ``hidden_states``, ``attention_mask`` and
    ``output_attentions`` are handed to the parent implementation, whose
    return value is passed back unchanged.
    """

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=False,
        **kwargs,
    ):
        # NOTE(review): depending on the installed transformers release, the
        # parent forward may expect further mask arguments that are not
        # supplied here -- confirm against the version in use.
        forwarded = {
            "attention_mask": attention_mask,
            "output_attentions": output_attentions,
        }
        return super().forward(hidden_states, **forwarded)


In [15]:
import argparse
import logging
import os
import copy
import torch
import tensorflow as tf

from transformers import AutoTokenizer,LongformerSelfAttention
from transformers import AutoModelForSeq2SeqLM

import logging
# Logger for this cell; the root logger is configured at INFO so that the
# logger.info(...) call in create_long_model is not filtered out.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)



def _tile_position_embeddings(source_weight, max_pos):
    """Return a ``(max_pos, embed_size)`` tensor built by repeating ``source_weight`` row-wise.

    The last repetition is truncated when ``max_pos`` is not a multiple of the
    number of source rows, so every row of the result is initialised and no
    slice assignment has mismatched shapes.
    """
    source_len, embed_size = source_weight.shape
    if source_len == 0:
        raise ValueError("cannot tile an empty position-embedding matrix")

    tiled = source_weight.new_empty(max_pos, embed_size)
    # Pure data copy: keep it out of autograd regardless of the source's flags.
    with torch.no_grad():
        start = 0
        while start < max_pos:
            rows = min(source_len, max_pos - start)
            tiled[start:start + rows] = source_weight[:rows]
            start += rows
    return tiled


def create_long_model(save_model_to, attention_window, max_pos,
                      base_model='alireza7/ARMAN-MSR-persian-base'):
    """Convert a seq2seq checkpoint so its encoder uses Longformer self-attention.

    Steps performed:
      1. Load the model and tokenizer named by ``base_model``.
      2. Raise the tokenizer/config position limits to ``max_pos`` and replace
         the encoder position embeddings with a ``max_pos``-row matrix obtained
         by repeating the existing rows.
      3. Replace every encoder layer's ``self_attn`` with a
         ``LongformerSelfAttention`` that reuses the layer's q/k/v/out
         projections (the global q/k/v projections start as deep copies).
      4. Save the model and tokenizer to ``save_model_to``.

    Args:
        save_model_to: Directory the converted model and tokenizer are saved to.
        attention_window: Attention window size, applied to every layer.
        max_pos: New maximum number of encoder positions; must be larger than
            the checkpoint's current number of encoder position embeddings.
        base_model: Checkpoint name or path given to ``from_pretrained``.

    Returns:
        Tuple ``(model, tokenizer)`` of the converted objects.

    Raises:
        AssertionError: If ``max_pos`` is not larger than the current number of
            encoder position embeddings.
    """
    model = AutoModelForSeq2SeqLM.from_pretrained(base_model)
    tokenizer = AutoTokenizer.from_pretrained(base_model, model_max_length=max_pos)

    config = model.config

    config.attention_probs_dropout_prob = config.attention_dropout
    config.architectures = ['LongformerEncoderDecoderForConditionalGeneration', ]

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    encoder_positions = model.model.encoder.embed_positions.weight
    current_max_pos = encoder_positions.shape[0]
    # Validate before touching the config's position limits.
    assert max_pos > current_max_pos, (
        f"max_pos ({max_pos}) must exceed the current number of encoder "
        f"positions ({current_max_pos})"
    )

    config.max_position_embeddings = max_pos
    config.max_encoder_position_embeddings = max_pos
    config.max_decoder_position_embeddings = 512  # can be different
    print("max_encoder_position_embeddings: ", config.max_encoder_position_embeddings)
    print("max_decoder_position_embeddings: ", config.max_decoder_position_embeddings)

    # Initialise the larger matrix by copying the existing position embeddings
    # over and over (the final copy is truncated if max_pos is not a multiple).
    new_encoder_pos_embed = _tile_position_embeddings(encoder_positions, max_pos)
    # NOTE(review): this swaps the encoder's positional-embedding module for a
    # plain nn.Embedding; confirm the encoder's forward pass calls
    # embed_positions in a way nn.Embedding accepts.
    model.model.encoder.embed_positions = torch.nn.Embedding.from_pretrained(new_encoder_pos_embed)

    print(model.model.encoder.layers)

    config.attention_window = [attention_window] * config.num_hidden_layers
    config.attention_dilation = [1] * config.num_hidden_layers

    # Replace each encoder layer's self-attention module with a
    # `LongformerSelfAttention` that reuses the layer's existing projections.
    for i, layer in enumerate(model.model.encoder.layers):
        longformer_self_attn_for_pegasus = LongformerSelfAttention(config, layer_id=i)

        # Local (windowed) attention shares the original projection modules.
        longformer_self_attn_for_pegasus.query = layer.self_attn.q_proj
        longformer_self_attn_for_pegasus.key = layer.self_attn.k_proj
        longformer_self_attn_for_pegasus.value = layer.self_attn.v_proj

        # Global attention gets independent copies of the same projections.
        longformer_self_attn_for_pegasus.query_global = copy.deepcopy(layer.self_attn.q_proj)
        longformer_self_attn_for_pegasus.key_global = copy.deepcopy(layer.self_attn.k_proj)
        longformer_self_attn_for_pegasus.value_global = copy.deepcopy(layer.self_attn.v_proj)

        longformer_self_attn_for_pegasus.output = layer.self_attn.out_proj
        layer.self_attn = longformer_self_attn_for_pegasus

    print("OK")
    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer


def main():
    """Run the conversion with a 512 attention window and 8192 positions, saving to ``current_directory``."""
    create_long_model(
        save_model_to=current_directory,
        attention_window=512,
        max_pos=8192,
    )


    

# Entry point: runs the conversion when this cell/script is executed directly
# (the captured output below this cell shows the guard was true in the notebook).
if __name__ == "__main__":
    main()


max_encoder_position_embeddings:  8192
max_decoder_position_embeddings:  512
ModuleList(
  (0-11): 12 x PegasusEncoderLayer(
    (self_attn): PegasusAttention(
      (k_proj): Linear(in_features=768, out_features=768, bias=True)
      (v_proj): Linear(in_features=768, out_features=768, bias=True)
      (q_proj): Linear(in_features=768, out_features=768, bias=True)
      (out_proj): Linear(in_features=768, out_features=768, bias=True)
    )
    (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (activation_fn): ReLU()
    (fc1): Linear(in_features=768, out_features=3072, bias=True)
    (fc2): Linear(in_features=3072, out_features=768, bias=True)
    (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
)
OK
