In [None]:
!pip install lightning evaluate rouge_score

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from transformers import AutoTokenizer
import lightning.pytorch as pl
from lightning.pytorch.loggers import CSVLogger
import evaluate
import random
import math

### **BART-Model**

In [3]:
class FFN(nn.Module):
  def __init__(self,d_model,d_ff,dropout=0.1):
    super().__init__()
    self.lin1 = nn.Linear(d_model,d_ff)
    self.lin2 = nn.Linear(d_ff,d_model)
    self.dropout = nn.Dropout(dropout)
  def forward(self,x):
    return self.lin2(self.dropout(F.gelu(self.lin1(x))))

class SelfAttn(nn.Module):
  def __init__(self,d_model,n_heads,dropout=0.1):
    super().__init__()
    self.attn = nn.MultiheadAttention(d_model,n_heads,dropout=dropout,batch_first=True)
    self.ln = nn.LayerNorm(d_model)
  def forward(self,x,attn_mask=None,key_padding_mask=None):
    h,_ = self.attn(x,x,x,attn_mask=attn_mask,key_padding_mask=key_padding_mask,need_weights=False)
    return self.ln(x+h)

class CrossAttn(nn.Module):
  def __init__(self,d_model,n_heads,dropout=0.1):
    super().__init__()
    self.attn = nn.MultiheadAttention(d_model,n_heads,dropout=dropout,batch_first=True)
    self.ln = nn.LayerNorm(d_model)
  def forward(self,x,mem,attn_mask=None,key_padding_mask=None,mem_padding_mask=None):
    h,_ = self.attn(x,mem,mem,attn_mask=attn_mask,key_padding_mask=key_padding_mask)
    return self.ln(x+h)

class EncoderLayer(nn.Module):
  def __init__(self,d_model,n_heads,d_ff,dropout=0.1):
    super().__init__()
    self.self_attn = SelfAttn(d_model,n_heads,dropout)
    self.ffn = FFN(d_model,d_ff,dropout)
    self.ln = nn.LayerNorm(d_model)
  def forward(self,x,key_padding_mask=None):
    x = self.self_attn(x,key_padding_mask=key_padding_mask)
    return self.ln(x+self.ffn(x))

class DecoderLayer(nn.Module):
  def __init__(self,d_model,n_heads,d_ff,dropout=0.1):
    super().__init__()
    self.self_attn = SelfAttn(d_model,n_heads,dropout)
    self.cross_attn = CrossAttn(d_model,n_heads,dropout)
    self.ffn = FFN(d_model,d_ff,dropout)
    self.ln = nn.LayerNorm(d_model)
  def forward(self,x,mem,tgt_key_padding_mask=None,mem_key_padding_mask=None,causal_mask=None):
    x = self.self_attn(x,attn_mask=causal_mask,key_padding_mask=tgt_key_padding_mask)
    x = self.cross_attn(x,mem,mem_padding_mask=mem_key_padding_mask)
    return self.ln(x+self.ffn(x))

In [4]:
class MiniBART(nn.Module):
  def __init__(self,vocab_size,d_model=256,n_heads=4,d_ff=1024,num_enc=3,num_dec=3,max_len=512):
    super().__init__()
    self.tok_emb = nn.Embedding(vocab_size,d_model)
    self.pos_emb = nn.Embedding(max_len,d_model)
    self.enc_layers = nn.ModuleList([EncoderLayer(d_model,n_heads,d_ff) for _ in range(num_enc)])
    self.dec_layers = nn.ModuleList([DecoderLayer(d_model,n_heads,d_ff) for _ in range(num_dec)])
    self.lm_head = nn.Linear(d_model,vocab_size,bias=False)

  def forward(self,src_ids,tgt_ids):

    def add_pos(x):
      b,L = x.shape
      pos = torch.arange(L,device=x.device).unsqueeze(0).expand(b,L)
      return self.tok_emb(x) + self.pos_emb(pos)

    src = add_pos(src_ids)
    tgt = add_pos(tgt_ids)

    # encoder
    mem = src
    for layer in self.enc_layers:
      mem = layer(mem)

    # causal mask for decoder self-attn
    L = tgt.size(1)
    causal = torch.triu(torch.ones(L,L,device=tgt.device)*float("-inf"),diagonal=1)

    out = tgt
    for layer in self.dec_layers:
      out = layer(out,mem,causal_mask=causal)

    logits = self.lm_head(out)
    return logits

In [5]:
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
MASK_ID = tokenizer.mask_token_id
PAD_ID = tokenizer.pad_token_id
BOS_ID = tokenizer.bos_token_id
EOS_ID = tokenizer.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

## **PRETRAIN**

#### dataset

In [6]:
MAX_INPUT = 512
MAX_TARGET = 128

# دیتاست،متن خام
dataset = load_dataset("wikitext", "wikitext-103-v1")
train_txt = dataset["train"]["text"]
val_txt = dataset["validation"]["text"]

README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/722k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/156M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/156M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/655k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

#### methods

In [7]:
# حذف خطوط خالی
def clean_lines(lines):
  out = []
  for line in lines:
    line = line.strip()
    if len(line) > 0:
      out.append(line)
  return out

In [8]:
train_txt = clean_lines(train_txt)
val_txt = clean_lines(val_txt)

In [9]:
# ابزار جمله بندی خیلی ساده
def split_sentences(x):
  out = []
  s = []
  for tok in x.split():
    s.append(tok)
    if tok.endswith(('.','!','?','."',"!'","?'")):
      out.append(" ".join(s))
      s = []
  if s:
    out.append(" ".join(s))
  return out if out else [x]

In [10]:
# نمونه گیری طول با توزیع پواسون
def simple_span_len(lam=3):
  L=0
  p = math.exp(-lam)
  F = p
  u = random.random()
  while u > F:
    L += 1
    p *= lam/L
    F += p
  return max(L,1)

In [11]:
def text_infilling_token_ids(ids,mask_ratio=0.3,lam=3):
  L = len(ids)
  num_to_mask = max(1,int(mask_ratio*L))
  masked = ids[:]
  covered = set()
  while len(covered) < num_to_mask:
    start = random.randrange(0,L)
    if start in covered:
      continue
    span_len = simple_span_len(lam)
    end = min(L,start+span_len)
    for i in range(start,end):
      if i not in covered:
        masked[i] = MASK_ID
        covered.add(i)
  return masked

In [12]:
def sentence_permute(text):
  sents = split_sentences(text)
  random.shuffle(sents)
  return " ".join(sents)

In [13]:
def apply_noise(text,do_sentperm=True,mask_ratio = 0.3,lam=3):
  if do_sentperm:
    text = sentence_permute(text)

  ids = tokenizer(text,truncation=True,max_length=MAX_INPUT,add_special_tokens=False)["input_ids"]
  if len(ids) == 0:
    return None

  noisy_ids = text_infilling_token_ids(ids,mask_ratio=mask_ratio,lam=lam)

  def pack(arr,max_len):
    arr = [BOS_ID] + arr[:max_len-2] + [EOS_ID]
    attn = [1]*len(arr)
    if len(arr) < max_len:
      pad_len = max_len - len(arr)
      arr += [PAD_ID]*pad_len
      attn += [0]*pad_len
    return arr, attn

  noisy_imp, noisy_attn = pack(noisy_ids,MAX_INPUT)
  clean_tgt, _  = pack(ids,MAX_TARGET)

  return {
      "input_ids":noisy_imp,
      "attention_mask":noisy_attn,
      "labels":clean_tgt
  }

In [14]:
def make_tensor_dataset(texts, n_limits=None):
  rows = []
  count = 0
  for t in texts:
    ex = apply_noise(t,do_sentperm=True,mask_ratio=0.3,lam=3)
    if ex is not None:
      rows.append(ex)
      count += 1
      if n_limits is not None and count >= n_limits:
          break

  input_ids = torch.tensor([r["input_ids"] for r in rows], dtype=torch.long)
  attention_mask = torch.tensor([r["attention_mask"] for r in rows], dtype=torch.long)
  labels = torch.tensor([r["labels"] for r in rows], dtype=torch.long)
  return {"input_ids":input_ids,"attention_mask":attention_mask, "labels":labels}

In [15]:
train_ids = make_tensor_dataset(train_txt,n_limits=150000)
val_ids = make_tensor_dataset(val_txt,n_limits=10000)

#### lightning

In [17]:
rouge = evaluate.load("rouge")

class LitMiniBART(pl.LightningModule):
  def __init__(self,vocab_size,d_model=256,n_heads=4,d_ff=1024,num_enc=3,num_dec=3):
    super().__init__()
    self.model = MiniBART(vocab_size,d_model,n_heads,d_ff,num_enc,num_dec)
    self.loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    self.val_preds = []
    self.val_labels = []

  def forward(self,src_ids,tgt_ids):
    return self.model(src_ids,tgt_ids)

  def training_step(self,batch,batch_idx):
    logits = self(batch["input_ids"],batch["labels"])
    shift_logits = logits[:,:-1].contiguous()
    shift_labels = batch["labels"][:,1:].contiguous()
    loss = self.loss_fn(shift_logits.view(-1,shift_logits.size(-1)),shift_labels.view(-1))
    self.log("train_loss",loss,prog_bar=True)
    return loss

  def validation_step(self,batch,batch_idx):
    logits = self(batch["input_ids"],batch["labels"])
    shift_logits = logits[:,:-1].contiguous()
    shift_labels = batch["labels"][:,1:].contiguous()
    loss = self.loss_fn(shift_logits.view(-1,shift_logits.size(-1)),shift_labels.view(-1))
    self.log("val_loss",loss,prog_bar=True)

    generated_ids = torch.argmax(logits,dim=-1)
    preds = tokenizer.batch_decode(generated_ids,skip_special_tokens=True)
    labels = tokenizer.batch_decode(batch["labels"],skip_special_tokens=True)
    self.val_preds.extend(preds)
    self.val_labels.extend(labels)

  def on_validation_epoch_end(self) :
    if len(self.val_preds) > 0:
      results = rouge.compute(predictions=self.val_preds,references=self.val_labels,use_stemmer=True)
      self.log_dict({f"val_{k}":v for k,v in results.items()},prog_bar=True)
      self.val_preds = []
      self.val_labels = []

  def configure_optimizers(self) :
    opt = torch.optim.AdamW(self.parameters(),lr=5e-4,betas=(0.9,0.98),weight_decay=0.01)
    sch = torch.optim.lr_scheduler.LinearLR(opt, start_factor=0.1, total_iters=500)
    return {"optimizer":opt,"lr_scheduler":{"scheduler":sch,"interval":"step"}}


#### dataloader , train

In [18]:
def to_dataset_obj(d):
  return TensorDataset(d["input_ids"],d["attention_mask"],d["labels"])

class WrappedSet(torch.utils.data.Dataset):
  def __init__(self,d):
    self.input_ids = d["input_ids"]
    self.attention_mask = d["attention_mask"]
    self.labels = d["labels"]

  def __len__(self): return self.input_ids.size(0)
  def __getitem__(self,idx):
    return {
        "input_ids":self.input_ids[idx],
        "attention_mask":self.attention_mask[idx],
        "labels":self.labels[idx]
    }


In [19]:
train_wrapped = WrappedSet(train_ids)
val_wrapped = WrappedSet(val_ids)

train_loader = DataLoader(train_wrapped,batch_size=4,shuffle=True,num_workers=2)
val_loader = DataLoader(val_wrapped,batch_size=4,shuffle=False,num_workers=2)

logger = CSVLogger("logs", name="minibart_denoise")
lit = LitMiniBART(vocab_size=tokenizer.vocab_size)

trainer = pl.Trainer(
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    precision=16 if torch.cuda.is_available() else 32,
    max_epochs=2,
    gradient_clip_val=1.0,
    logger=logger,
    log_every_n_steps=50
)

/usr/local/lib/python3.12/dist-packages/lightning/fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:lightning.pytorch.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.util

In [20]:
trainer.fit(lit,train_loader,val_loader)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.12/dist-packages/lightning/pytorch/utilities/model_summary/model_summary.py:231: Precision 16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.
INFO: 
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | MiniBART         | 31.4 M | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
31.4 M    Trainable params
0         Non-trainable params
31.4 M    Total params
125.585   Total estimated model params size (MB)
79        Modules in train mode
0         Modules in eval mode
INFO:lightning.pytorch.callbacks.model_summary:
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | MiniBART         |

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=2` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


## **TRAIN**

### **Dataset**

In [24]:
cnn_dmail = load_dataset("cnn_dailymail","3.0.0")

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [25]:
MAX_INPUT = 512
MAX_TARGET = 128

def prepprocess(batch):
  inputs = tokenizer(
      batch["article"],max_length=MAX_INPUT,padding="max_length",truncation=True
  )
  targets = tokenizer(
      batch["highlights"],max_length=MAX_TARGET,padding="max_length",truncation=True
  )
  inputs["labels"] = targets["input_ids"]
  return inputs

In [27]:
train_cnn_dmail = cnn_dmail["train"].select(range(80000)) #.select(range(80000))
val_cnn_dmail = cnn_dmail["validation"].select(range(6000)) #.select(range(6000))

tokenized_train_cnn_dmail = train_cnn_dmail.map(prepprocess,batched=True,remove_columns=train_cnn_dmail.column_names)
tokenized_val_cnn_dmail = val_cnn_dmail.map(prepprocess,batched=True,remove_columns=val_cnn_dmail.column_names)

train_cnn_dmail = tokenized_train_cnn_dmail.with_format("torch")
val_cnn_dmail = tokenized_val_cnn_dmail.with_format("torch")

#train_loader = DataLoader(train_data,batch_size=4,shuffle=True)
#val_loader = DataLoader(val_data,batch_size=4,shuffle=False)

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

### **train**

In [28]:
#train_cnn_dmail_ = train_cnn_dmail.select(range(80000))
#val_cnn_dmail_ = val_cnn_dmail.select(range(6000))

train_loader_cnn_dmail = DataLoader(train_cnn_dmail,batch_size=4,shuffle=True)
val_loader_cnn_dmail = DataLoader(val_cnn_dmail,batch_size=4,shuffle=False)

trainer_cnndmail = pl.Trainer(
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    precision=16 if torch.cuda.is_available() else 32,
    max_epochs=2,
    gradient_clip_val=1.0,
    logger=logger,
    log_every_n_steps=50
)

/usr/local/lib/python3.12/dist-packages/lightning/fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:lightning.pytorch.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.util

In [29]:
trainer_cnndmail.fit(lit,train_loader_cnn_dmail,val_loader_cnn_dmail)

/usr/local/lib/python3.12/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:701: Checkpoint directory logs/minibart_denoise/version_0/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.12/dist-packages/lightning/pytorch/utilities/model_summary/model_summary.py:231: Precision 16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.
INFO: 
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | MiniBART         | 31.4 M | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
31.4 M    Trainable params
0         Non-trainable params
31.4 M    Total params
125.585   Total estimated model params size (MB)
79        Modules in train mode
0         Modules in eval mode

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=2` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


In [36]:
from google.colab import drive
drive.mount('/content/drive')

save_path = "/content/drive/MyDrive/Labratory_11_miniBAR_pretrain.pth"
torch.save(lit.model.state_dict(),save_path)
tokenizer_path = "/content/drive/MyDrive/Labratory_11_miniBAR_pretrain_tokenizer.json"
tokenizer.save_pretrained(tokenizer_path)

Mounted at /content/drive


('/content/drive/MyDrive/Labratory_11_miniBAR_pretrain_tokenizer.json/tokenizer_config.json',
 '/content/drive/MyDrive/Labratory_11_miniBAR_pretrain_tokenizer.json/special_tokens_map.json',
 '/content/drive/MyDrive/Labratory_11_miniBAR_pretrain_tokenizer.json/vocab.json',
 '/content/drive/MyDrive/Labratory_11_miniBAR_pretrain_tokenizer.json/merges.txt',
 '/content/drive/MyDrive/Labratory_11_miniBAR_pretrain_tokenizer.json/added_tokens.json',
 '/content/drive/MyDrive/Labratory_11_miniBAR_pretrain_tokenizer.json/tokenizer.json')

### **Test**

In [40]:
def greedy_decode(model,src_text,max_len=64):
  model.eval()
  src_ids = tokenizer(src_text,return_tensors="pt",truncation=True, padding="max_length",max_length=MAX_INPUT)["input_ids"].to(model.device)
  tgt_ids = torch.tensor([[tokenizer.bos_token_id]]).to(model.device)


  for _ in range(max_len):
    logits = model(src_ids,tgt_ids)
    next_token = logits[:,-1].argmax(-1).unsqueeze(0)
    tgt_ids = torch.cat([tgt_ids, next_token],dim=1)
    if next_token.item() == tokenizer.eos_token_id:
      break
  return tokenizer.decode(tgt_ids.squeeze(),skip_special_tokens=True)

In [39]:
print(greedy_decode(lit, cnn_dmail["test"][9]["article"]))

NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW:NEW says, "It's not toNEW:NEW's,NEW's't "It's not toNEW:NEW'sNEW:NEW:NEW's not toNEW's not toNEW, says, says
