In [None]:
# Install the third-party packages for this notebook.
# No versions are pinned, so each run resolves whatever is current at install time.
!pip install transformers
!pip install GPUtil
!pip install rouge
!pip install datasets
!pip install rouge_score


Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 23.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 58.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 60.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 77.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    F

In [None]:
from transformers import BertTokenizer, BertModel, BertConfig
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, SequentialSampler
from nltk.tokenize import sent_tokenize
from torch.nn import functional as F
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda
import os
import re
import datasets
import torch
from torch.utils.data import Dataset, DataLoader, SequentialSampler
import pandas as pd
from io import open
import unicodedata
import string
import numpy as np

import time
import random
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
# Later cells read this module-level `device` directly.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Mount Google Drive inside the Colab runtime so its files are reachable as local paths.
from google.colab import drive
drive.mount('/content/drive')
# Base folder on Drive: the dataset CSVs are read from here and checkpoints are written here.
root_dir = '/content/drive/MyDrive/CS6120Dataset'

Mounted at /content/drive


Preprocessing and Dataset

In [None]:
def pad_art(flat_list):
    """Trim an encoded article to whole sentences and right-pad it to 512 ids.

    The list is cut just after the last ``tokenizer.sep_token_id`` found within
    the first 512 positions; when no separator lies in that window the first
    512 ids are kept. The remainder is filled with ``tokenizer.pad_token_id``.

    Returns:
        A tensor of shape ``(1, 512)``.
    """
    sep_id = tokenizer.sep_token_id
    cut = 512  # fallback when the window holds no separator
    # Scan backwards so the first hit is the last separator inside the window.
    for pos in range(min(len(flat_list), 512) - 1, -1, -1):
        if flat_list[pos] == sep_id:
            cut = pos + 1
            break
    kept = flat_list[:cut]
    filler = [tokenizer.pad_token_id] * (512 - len(kept))
    return torch.tensor([kept + filler])

def pad_sum(encoded_sum):
    """Right-pad an encoded summary and wrap it in a batch dimension.

    The ids are filled with ``tokenizer.pad_token_id`` up to 512 entries and one
    further pad id is always appended, so a summary of at most 512 ids yields a
    ``(1, 513)`` tensor. Longer summaries are not truncated; they only receive
    the single trailing pad id.
    """
    pad_id = tokenizer.pad_token_id
    shortfall = max(512 - len(encoded_sum), 0)
    # +1: the unconditional extra pad position at the end.
    return torch.tensor([encoded_sum + [pad_id] * (shortfall + 1)])

def pad_seg(seg_embs):
    """Right-pad a list of segment ids to width 512.

    The list is wrapped in a batch dimension and extended on the right with
    ``tokenizer.pad_token_id`` (the fill value used here for segment ids too).

    Returns:
        A tensor of shape ``(1, 512)`` for inputs of at most 512 ids.
    """
    seg_row = torch.tensor([seg_embs])
    missing = 512 - seg_row.shape[1]
    return F.pad(seg_row, (0, missing), 'constant', tokenizer.pad_token_id)

def mask(padded_tensor):
    """Build an attention mask for a padded id tensor.

    Returns a tensor with the same shape and dtype as ``padded_tensor`` holding
    1 where the id differs from ``tokenizer.pad_token_id`` and 0 elsewhere.
    """
    is_token = padded_tensor != tokenizer.pad_token_id
    return is_token.to(dtype=padded_tensor.dtype)



class CNNBertDataset(Dataset):
    """Pre-tokenised article/summary pairs read from ``{root_dir}/{type_}.csv``.

    The CSV must provide ``src``, ``tgt`` and ``segs`` columns, each cell holding
    the text of a Python list literal (for example ``"[101, 2023, 102]"``).
    """

    def __init__(self,root_dir, type_, transform = None, n=None):
        """Load one split.

        Args:
            root_dir: folder that contains the split CSV files.
            type_: split name; the file ``{type_}.csv`` is loaded.
            transform: stored on the instance; this class never applies it.
            n: when truthy, keep only the first ``n`` rows.
        """
        super().__init__()
        self.root_dir = root_dir
        self.dataset = pd.read_csv(f'{root_dir}/{type_}.csv')
        if n:
            self.dataset = self.dataset[:n]
        self.transform = transform

    def __len__(self):
        """Number of rows kept from the CSV."""
        return self.dataset.shape[0]

    def __getitem__(self, idx):
        """Return ``(padded_art, padded_sum, seg_art, mask_art, mask_sum)`` for row ``idx``.

        The article tensors are squeezed; the summary tensors keep the leading
        dimension added by ``pad_sum``.
        """
        # literal_eval only accepts Python literals, so a crafted CSV cell cannot
        # execute code the way the previous eval() call would have allowed.
        import ast

        row = self.dataset.iloc[idx]  # one positional lookup instead of three
        art_ = ast.literal_eval(row['src'])
        sum_ = ast.literal_eval(row['tgt'])
        seg_ = ast.literal_eval(row['segs'])

        padded_art = pad_art(art_).squeeze()
        seg_art = pad_seg(seg_).squeeze()
        mask_art = mask(padded_art).squeeze()

        padded_sum = pad_sum(sum_)
        mask_sum = mask(padded_sum)

        return padded_art, padded_sum, seg_art, mask_art, mask_sum

Model architecture

In [None]:
class AttnDecoderforgetGRU2(nn.Module):
  """One-step GRU decoder with an attention-style read of the encoder states and a forget gate.

  The GRU input and GRU state are both ``hidden_size * 3`` wide. ``softmax``,
  ``gru4`` and ``resize`` are created but never used by ``forward``.
  """

  def __init__(self,input_size, hidden_size, n_layers, output_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    wide = hidden_size * 3  # shared width of the GRU input and the GRU state
    self.attn = nn.Linear(input_size*4, hidden_size)
    self.softmax = nn.Softmax()
    # Creation order is kept (gru1..gru4) so seeded initialisation is unchanged.
    for stage in range(1, 5):
      setattr(self, f'gru{stage}', nn.GRU(wide, wide, n_layers, batch_first=True, dropout=0.2))
    self.fc1 = nn.Linear(wide, hidden_size*2)
    self.fc2 = nn.Linear(wide, hidden_size*2)
    self.fc3 = nn.Linear(wide, hidden_size)
    self.forget = nn.Linear(wide, hidden_size)
    self.resize = nn.Linear(hidden_size, hidden_size*2)
    self.out = nn.Linear(hidden_size, output_size)

  def forward(self, input_, prev_hidden, encoder_hidden,encoder_h):
    """Run one decoding step.

    Args:
      input_: 2-D features of the current input token.
      prev_hidden: 2-D decoder state (``hidden_size * 3`` wide, see ``initHidden``).
      encoder_hidden: 2-D encoder summary; it is gated and returned for the next step.
      encoder_h: 3-D encoder states. They are multiplied with a ``hidden_size``-wide
        vector along axis 1, so ``encoder_h.shape[1]`` must equal ``hidden_size``.

    Returns:
      ``(logits, new_hidden, gated_encoder_hidden)`` where ``logits`` keeps a
      length-1 step axis and ``new_hidden`` is the last GRU layer's state.
    """
    # Projected (input, state) pair used directly as weights; no softmax is applied.
    scores = self.attn(torch.cat((input_, prev_hidden), dim=1))
    attended = torch.bmm(scores.unsqueeze(1), encoder_h)
    features = F.relu(torch.cat((attended, input_.unsqueeze(1)), dim=2)).squeeze(1)

    # Forget gate: scale the encoder summary by (1 - sigmoid(...)).
    keep = 1 - torch.sigmoid(self.forget(torch.cat((encoder_hidden, features), dim=1)))
    encoder_hidden = encoder_hidden * keep

    # Every GRU layer starts from the same 2-D state.
    state = prev_hidden.repeat(self.n_layers,1,1)

    # Stages 1 and 2: GRU step, projection, then drop the length-1 step axis.
    for gru, fc in ((self.gru1, self.fc1), (self.gru2, self.fc2)):
      step_in = torch.cat((features, encoder_hidden), dim=1).unsqueeze(1)
      features, state = gru(step_in, state)
      features = F.relu(fc(features)).squeeze(1)

    # Stage 3 keeps the step axis so the logits come out 3-D.
    step_in = torch.cat((features, encoder_hidden), dim=1).unsqueeze(1)
    features, state = self.gru3(step_in, state)
    features = F.relu(self.fc3(features))

    last_layer_state = state.permute(1,0,2)[:,-1].squeeze(1)
    return self.out(features), last_layer_state, encoder_hidden

  def initHidden(self, batch_size):
    """Zero decoder state of shape ``(batch_size, hidden_size * 3)`` on ``device``."""
    return torch.zeros(batch_size,self.hidden_size*3, device=device)

In [None]:
class AttnDecoderforgetGRU(nn.Module):
  """One-step GRU decoder that gates the encoder summary with a forget gate.

  ``attn``, ``softmax`` and ``gru4`` take no part in ``forward``. They are
  still created so that the parameter names in ``state_dict()`` stay the same
  for checkpoints saved from this class.
  """

  def __init__(self,input_size, hidden_size, n_layers, output_size):
    """
    Args:
      input_size: only sizes the unused ``attn`` layer.
      hidden_size: width of the input features, encoder summary and GRU state.
      n_layers: number of stacked layers inside each GRU.
      output_size: number of output classes (vocabulary size in this notebook).
    """
    super().__init__()
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.attn = nn.Linear(input_size*2, hidden_size)
    self.softmax = nn.Softmax()
    self.gru1 =  nn.GRU(hidden_size*2, hidden_size, n_layers, batch_first =True, dropout=0.2)
    self.gru2 =  nn.GRU(hidden_size*2, hidden_size, n_layers, batch_first =True, dropout=0.2)
    self.gru3 =  nn.GRU(hidden_size*2, hidden_size, n_layers, batch_first =True, dropout=0.2)
    self.gru4 =  nn.GRU(hidden_size*2, hidden_size, n_layers, batch_first =True, dropout=0.2)
    self.fc1 = nn.Linear(hidden_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, hidden_size)
    self.fc3 = nn.Linear(hidden_size, hidden_size)
    self.forget = nn.Linear(hidden_size*2, hidden_size)
    self.out = nn.Linear(hidden_size, output_size)


  def forward(self, input_, prev_hidden, encoder_hidden):
    """Run one decoding step.

    Args:
      input_: 2-D features of the current input token, ``hidden_size`` wide.
      prev_hidden: 2-D decoder state, ``hidden_size`` wide (see ``initHidden``).
      encoder_hidden: 2-D encoder summary, ``hidden_size`` wide.

    Returns:
      ``(logits, new_hidden, gated_encoder_hidden)``; ``logits`` keeps a
      length-1 step axis and ``new_hidden`` is the last GRU layer's state.
    """
    # Forget gate: scale the encoder summary by (1 - sigmoid(...)).
    # torch.sigmoid is the same function as the F.sigmoid alias used before.
    gate = torch.sigmoid(self.forget(torch.cat((encoder_hidden, input_), dim=1)))
    encoder_hidden = encoder_hidden*(1 - gate)

    # The former `context = relu(attn(...))` line was removed: its result was
    # never read, so it only cost a matrix multiply on every step.

    # Every GRU layer starts from the same 2-D state.
    prev_hidden = prev_hidden.repeat(self.n_layers,1,1)

    out, prev_hidden = self.gru1(torch.cat((input_,encoder_hidden), dim=1).unsqueeze(1),prev_hidden)
    out = F.relu(self.fc1(out)).squeeze(1)

    out, prev_hidden = self.gru2(torch.cat((out,encoder_hidden), dim=1).unsqueeze(1),prev_hidden)
    out = F.relu(self.fc2(out)).squeeze(1)

    out, prev_hidden = self.gru3(torch.cat((out,encoder_hidden), dim=1).unsqueeze(1),prev_hidden)
    # No squeeze here: the logits keep the length-1 step axis.
    out = F.relu(self.fc3(out))

    return self.out(out), prev_hidden.permute(1,0,2)[:,-1].squeeze(1), encoder_hidden

  def initHidden(self, batch_size):
    """Zero decoder state of shape ``(batch_size, hidden_size)`` on ``device``."""
    return torch.zeros(batch_size,self.hidden_size, device=device)

Loading dataset

In [None]:
BATCH_SIZE = 4

# Each split is read from <root_dir>/dataset_750/<split>.csv and capped at its first 4 rows.
train_data = CNNBertDataset(root_dir + '/dataset_750', 'train', n=4)
val_data = CNNBertDataset(root_dir + '/dataset_750', 'val', n=4)
test_data = CNNBertDataset(root_dir + '/dataset_750', 'test', n=4)

# Every loader (validation and test included) shuffles its split and uses pinned host memory.
train_dataloader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
test_dataloader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_dataloader = DataLoader(dataset=val_data, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)

Runner function that trains the model when it is in train mode and otherwise generates evaluation results


In [None]:
def attnRunnerForgetGRU(encoder, decoder, dataloader, criterion,optimizer, startingword, vocab_size, train =True, MAX_LEN = 30, teacher_forcing=False):
  """Run one pass over ``dataloader``, training the decoder or only evaluating it.

  Args:
    encoder: callable returning a mapping with ``'pooler_output'`` and
      ``'last_hidden_state'``; it is always called without gradient tracking.
    decoder: module with ``initHidden(batch_size)`` and a three-argument
      ``forward(input_, prev_hidden, encoder_output)``.
    dataloader: yields ``(padded_art, padded_sum, seg_art, mask_art, mask_sum)``.
    criterion: called as ``criterion(logits, target)``.
    optimizer: used only when ``train`` is true.
    startingword: token id fed as the first decoder input.
    vocab_size: size of the decoder's last output axis.
    train: when true, back-propagate and step the optimizer after every batch.
    MAX_LEN: ``MAX_LEN - 1`` tokens are generated per summary.
    teacher_forcing: probability (training only) of feeding the reference
      token rather than the predicted one as the next input.

  Returns:
    ``(epoch_loss, epoch_acc)``: the criterion value averaged over batches, and
    the fraction of target positions (padding included) whose arg-max
    prediction matches. ``(0.0, 0.0)`` when the loader yields no batch.
  """
  if train:
    decoder.train()
  else:
    decoder.eval()

  loss_sum = 0.0
  n_batches = 0
  n_correct = 0
  n_positions = 0

  for batch in dataloader:
    padded_art, padded_sum, seg_art, mask_art, _mask_sum = batch
    this_batch_size = padded_art.shape[0]
    if train:
      optimizer.zero_grad()

    with torch.no_grad():
      encoder_ =  encoder( input_ids = padded_art.to(device),
                                attention_mask = mask_art.to(device),
                                token_type_ids = seg_art.to(device))
      # Use the caller's start token; the old code read a notebook global here.
      start_tensor = torch.tensor([[startingword]]).repeat(this_batch_size,1).to(device)
      input_ = encoder(start_tensor)['pooler_output']

    encoder_output = encoder_['pooler_output']
    summ_ = torch.empty((this_batch_size, 0, vocab_size)).to(device)
    prev_hidden = decoder.initHidden(this_batch_size)

    # Evaluation needs no autograd graph; skipping it saves memory.
    with torch.set_grad_enabled(train):
      for di in range(1,MAX_LEN):
        out, prev_hidden, encoder_output = decoder(input_, prev_hidden, encoder_output)
        next_token = out.max(-1)[1]  # greedy choice

        summ_ = torch.cat((summ_,out), dim=1)
        with torch.no_grad():
          if train and random.random()<teacher_forcing:
            # Teacher forcing: feed the reference token for this position.
            input_ = encoder(padded_sum[:,:,di].reshape(-1).unsqueeze(dim=1).to(device))['pooler_output']
          else:
            input_ = encoder(next_token)['pooler_output']

      # Position 0 of the reference is skipped, so step di is scored against token di.
      target = padded_sum[:,:,1:MAX_LEN].squeeze(1).to(device)
      loss = criterion(summ_,target  )

    loss_sum += loss.item()
    n_batches += 1
    n_correct += (summ_.max(-1)[1] == target).sum().item()
    n_positions += target.numel()

    if train:
      loss.backward()
      optimizer.step()

  if n_batches == 0:
    return 0.0, 0.0
  # The loss is averaged over batches. Dividing the summed batch losses by the
  # token count, as before, shrank the reported value by the tokens per batch.
  epoch_loss = loss_sum/n_batches
  epoch_acc = n_correct/n_positions
  return epoch_loss, epoch_acc

Wrapper function for training and testing


In [None]:

def trainer(encoder, decoder, train_dataloader,val_dataloader,test_dataloader, startingword, optimizer, criterion, vocab_size,
             EPOCHS = 10, MAX_LEN = 30, checkpoint = 5, model_location = './',teacher_forcing = 0.5, runner=None):
  """Train for ``EPOCHS`` epochs, validate after each one, then score the test split.

  ``runner`` does the per-epoch work and must be supplied (the ``None`` default
  is not callable). It is called with ``train=True`` plus ``teacher_forcing``
  for the training loader and with ``train=False`` for validation and test.

  The decoder's ``state_dict`` is written to ``model_location`` whenever
  ``epoch % checkpoint == 0`` and once more after the last epoch. The per-epoch
  metrics are saved to ``model_location + '_result.csv'``.

  Returns:
    A DataFrame with columns ``epoch, tr_loss, tr_acc, val_loss, val_acc``.
  """
  def evaluate(loader):
    # Evaluation pass: no teacher_forcing argument, so the runner's default applies.
    return runner(encoder, decoder, loader, criterion,optimizer, startingword, vocab_size, train =False, MAX_LEN = MAX_LEN)

  def save_decoder():
    torch.save(decoder.state_dict(), model_location)

  history = []
  for epoch in range(EPOCHS):
    start = time.time()
    tr_loss, tr_acc = runner(encoder, decoder, train_dataloader, criterion,optimizer, startingword, vocab_size,
                             train =True, MAX_LEN = MAX_LEN, teacher_forcing = teacher_forcing)
    tr_time = time.time() - start

    # Restart the clock so the validation time is reported separately.
    start = time.time()
    val_loss, val_acc = evaluate(val_dataloader)
    print(f'epoch {epoch}, train s/it: {tr_time : .2f} val s/it: {time.time()-start: .2f}, tr_loss: {tr_loss: .6f}, tr_acc: {tr_acc :.6f}, val_loss: {val_loss: .6f}, val_acc: {val_acc: .6f}')

    if epoch%checkpoint == 0:
      save_decoder()
    history.append((epoch, tr_loss,tr_acc, val_loss, val_acc))

  # Final weights are always saved, whatever the checkpoint interval.
  save_decoder()

  test_loss, test_acc = evaluate(test_dataloader)
  print(f'testing_loss: {test_loss: .6f}, testing_acc: {test_acc :.6f}')

  result_df = pd.DataFrame(history, columns =['epoch', 'tr_loss','tr_acc', 'val_loss', 'val_acc'])
  result_df.to_csv(model_location+'_result.csv')
  return result_df


In [None]:
# Pretrained BERT checkpoint used purely as a frozen feature extractor.
encoder = BertModel.from_pretrained('google/bert_uncased_L-4_H-512_A-8').to(device)
# `do_lower_case` is the tokenizer's option name; the previous `do_lower` is not a BertTokenizer argument.
tokenizer = BertTokenizer.from_pretrained('google/bert_uncased_L-4_H-512_A-8', do_lower_case=True)
encoder_config = BertConfig.from_pretrained("google/bert_uncased_L-4_H-512_A-8")
# Token id fed to the decoder as the first input of every generated summary.
starting_word = tokenizer.vocab['[unused0]']
# Freeze the encoder. The attribute is `requires_grad`; assigning the misspelt
# `require_grad` only created an unused attribute and froze nothing.
for params in encoder.parameters():
    params.requires_grad = False

Downloading:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/111M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bert_uncased_L-4_H-512_A-8 were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

pd.read_csv

Loss function that combines cross-entropy with a ROUGE-based term

In [None]:
class RougeLoss(nn.CrossEntropyLoss):
  """Cross-entropy blended with a ROUGE-1 term.

  The value returned is ``(1 - alpha) * CE - alpha * log(rouge1_f + 1e-5)``,
  where CE ignores target id 0. The ROUGE score is computed from arg-max
  token ids, so it is a constant with respect to the model parameters: it
  shifts the loss value but contributes no gradient.
  """

  def __init__(self, alpha=0.25, metric=None):
    """
    Args:
      alpha: weight of the ROUGE term; the cross-entropy gets ``1 - alpha``.
      metric: object exposing ``compute(predictions=..., references=...)`` whose
        result supports ``['rouge1'].mid.fmeasure``. When omitted, it is
        resolved on first use (see ``_rouge_metric``).
    """
    super().__init__(ignore_index=0)
    self.alpha = alpha
    self.metric = metric

  def _rouge_metric(self):
    """Return the metric, resolving it lazily when none was passed in."""
    if self.metric is None:
      # Prefer a notebook-level `rouge` object, which the original code required.
      self.metric = globals().get('rouge')
    if self.metric is None:
      # NOTE(review): presumably the Hugging Face `datasets` ROUGE metric is
      # what the missing global referred to - confirm.
      self.metric = datasets.load_metric('rouge')
    return self.metric

  def forward(self, predicted, actual):
    """Compute the blended loss.

    Args:
      predicted: logits with the class axis last; it is moved to axis 1 for
        the cross-entropy call.
      actual: target token ids.
    """
    loss = super().forward(predicted.permute(0,2,1),actual)
    pred = predicted.max(-1)[1]
    # NOTE(review): token-id tensors are handed to the metric as-is, as in the
    # original code; confirm the metric scores them the way you intend.
    rouge_output = self._rouge_metric().compute(predictions=pred, references=actual )
    # Constant on the loss's device and dtype. It carries no gradient, so the
    # earlier `requires_grad=True` had no effect on training.
    rouge_f = loss.new_tensor(rouge_output['rouge1'].mid.fmeasure)
    return  (1- self.alpha) * loss - self.alpha * torch.log(rouge_f + 0.00001)

In [None]:
# Decoder sized to the encoder: 512-wide features, 2 GRU layers, one logit per vocabulary entry.
decoder = AttnDecoderforgetGRU(
    input_size=512,
    hidden_size=512,
    n_layers=2,
    output_size=encoder_config.vocab_size,
).to(device)
# Only the decoder's parameters are handed to the optimizer.
optimizer = optim.Adam(params=decoder.parameters(), lr=1e-3)
criterion = RougeLoss()

In [None]:
# Train for 40 epochs with MAX_LEN=10, saving the decoder to Drive every 4th epoch.
result = trainer(
    encoder=encoder,
    decoder=decoder,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    test_dataloader=test_dataloader,
    startingword=starting_word,
    optimizer=optimizer,
    criterion=criterion,
    vocab_size=encoder_config.vocab_size,
    EPOCHS=40,
    MAX_LEN=10,
    checkpoint=4,
    model_location=root_dir+'/attndecoder_FRGT_SAMPLE_GRU_4_1',
    teacher_forcing=0.3,
    runner=attnRunnerForgetGRU,
)



epoch 0, train s/it:  0.25 val s/it:  0.26, tr_loss:  0.366596, tr_acc: 0.000000, val_loss:  0.311565, val_acc:  0.027778
epoch 1, train s/it:  0.23 val s/it:  0.25, tr_loss:  0.292960, tr_acc: 0.111111, val_loss:  0.313113, val_acc:  0.027778
epoch 2, train s/it:  0.24 val s/it:  0.26, tr_loss:  0.248704, tr_acc: 0.083333, val_loss:  0.324183, val_acc:  0.027778
epoch 3, train s/it:  0.25 val s/it:  0.25, tr_loss:  0.187549, tr_acc: 0.083333, val_loss:  0.380074, val_acc:  0.027778
epoch 4, train s/it:  0.23 val s/it:  0.25, tr_loss:  0.148647, tr_acc: 0.083333, val_loss:  0.474050, val_acc:  0.000000
epoch 5, train s/it:  0.23 val s/it:  0.24, tr_loss:  0.127905, tr_acc: 0.083333, val_loss:  0.502847, val_acc:  0.000000
epoch 6, train s/it:  0.26 val s/it:  0.28, tr_loss:  0.112877, tr_acc: 0.138889, val_loss:  0.529170, val_acc:  0.000000
epoch 7, train s/it:  0.27 val s/it:  0.27, tr_loss:  0.105203, tr_acc: 0.166667, val_loss:  0.550532, val_acc:  0.000000
epoch 8, train s/it:  0.

In [None]:
#Loading the old model
# Rebuild the architecture first, then restore previously saved weights from Drive.
decoder =  AttnDecoderforgetGRU(512,512,2,encoder_config.vocab_size).to(device)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only runtime too.
decoder.load_state_dict(torch.load(root_dir+'/attndecoder_FRGT_GRU_2', map_location=device))
decoder.eval()

AttnDecoderforgetGRU(
  (attn): Linear(in_features=1024, out_features=512, bias=True)
  (softmax): Softmax(dim=None)
  (gru1): GRU(1024, 512, num_layers=2, batch_first=True, dropout=0.2)
  (gru2): GRU(1024, 512, num_layers=2, batch_first=True, dropout=0.2)
  (gru3): GRU(1024, 512, num_layers=2, batch_first=True, dropout=0.2)
  (gru4): GRU(1024, 512, num_layers=2, batch_first=True, dropout=0.2)
  (fc1): Linear(in_features=512, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=512, bias=True)
  (forget): Linear(in_features=1024, out_features=512, bias=True)
  (out): Linear(in_features=512, out_features=30522, bias=True)
)

Summarizer function

In [None]:
def attnSummerizerForgetGRU(encoder, decoder, dataloader, tokenizer, startingword,  train =False, MAX_LEN = 30, teacher_forcing=False):
  """Greedily decode a summary for every example in ``dataloader``.

  Args:
    encoder: callable returning a mapping with ``'pooler_output'`` and
      ``'last_hidden_state'``.
    decoder: module with ``initHidden(batch_size)``. Its ``forward`` may take
      three arguments, or four when it also consumes the encoder's per-token
      states; the arity is detected from its signature.
    dataloader: yields ``(padded_art, padded_sum, seg_art, mask_art, mask_sum)``.
    tokenizer: object with ``batch_decode``.
    startingword: token id fed as the first decoder input.
    train, teacher_forcing: accepted for signature compatibility; unused.
    MAX_LEN: ``MAX_LEN - 1`` tokens are generated per example.

  Returns:
    A DataFrame with ``actual`` (decoded reference positions ``1..MAX_LEN-1``)
    and ``predicted`` (decoded generated ids) columns.
  """
  import inspect

  decoder.eval()
  # Attention decoders take the encoder's per-token states as a fourth argument.
  wants_token_states = len(inspect.signature(decoder.forward).parameters) >= 4

  actual = []
  predicted = []
  # Pure inference: no autograd graph is needed anywhere below.
  with torch.no_grad():
    for batch in dataloader:
      padded_art, padded_sum, seg_art, mask_art, _mask_sum = batch
      this_batch_size = padded_art.shape[0]

      encoder_ =  encoder( input_ids = padded_art.to(device),
                                attention_mask = mask_art.to(device),
                                token_type_ids = seg_art.to(device))
      # Use the caller's start token; the old code read a notebook global here.
      start_tensor = torch.tensor([[startingword]]).repeat(this_batch_size,1).to(device)
      input_ = encoder(start_tensor)['pooler_output']

      encoder_output = encoder_['pooler_output']
      encoder_hidden = encoder_['last_hidden_state']
      # Integer buffer: token ids must stay integral for decoding.
      summ_ = torch.empty((this_batch_size, 0), dtype=torch.long, device=device)
      prev_hidden = decoder.initHidden(this_batch_size)

      for _ in range(1,MAX_LEN):
        if wants_token_states:
          out, prev_hidden, encoder_output = decoder(input_, prev_hidden, encoder_output,encoder_hidden)
        else:
          out, prev_hidden, encoder_output = decoder(input_, prev_hidden, encoder_output)
        next_token = out.max(-1)[1]  # greedy choice
        summ_ = torch.cat((summ_,next_token),1)
        input_ = encoder(next_token)['pooler_output']

      target = padded_sum[:,:,1:MAX_LEN].squeeze(1)
      predicted.extend(tokenizer.batch_decode(summ_))
      actual.extend(tokenizer.batch_decode(target))
  return pd.DataFrame({'actual':actual,'predicted':predicted})
  

In [None]:
# NOTE: despite its name, train_df holds summaries generated for the *test* loader.
train_df = attnSummerizerForgetGRU(
    encoder, decoder, test_dataloader, tokenizer, starting_word, MAX_LEN=10
)
# Show the first generated summary, then its reference, on separate lines.
print(train_df.iloc[0]['predicted'], train_df.iloc[0]['actual'], sep='\n')

the final season of of of of of of
edward archbold was among those in a contest




isn courageous courageous possesses possesses光 possesses possesses光 possesses光 possesses光 bender possesses光 possesses光 bender possesses光 possesses光 bender possesses光 possesses光 bender possesses光 possesses光 bender possesses光 possesses光 bender possesses光 possesses光 bender possesses光 possesses光 bender possesses光 possesses光 bender possesses光 possesses光 bender possesses光 possesses光 bender possesses光 possesses光 bender possesses光 possesses光 bender possesses光 possesses光 bender
the confederations cup tournament takes place in south africa this month [unused2] eight teams are involved in the tournament including spain, italy and brazil [unused2] entry is limited to regional champions, the world cup holders and the hosts [unused1] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [None]:
# Print the encoder's module tree to inspect its layers and their sizes.
print(encoder)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 512, padding_idx=0)
    (position_embeddings): Embedding(512, 512)
    (token_type_embeddings): Embedding(2, 512)
    (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=512, out_features=512, bias=True)
            (key): Linear(in_features=512, out_features=512, bias=True)
            (value): Linear(in_features=512, out_features=512, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=512, out_features=512, bias=True)
            (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
def attnSummerizerForgetGRUGenerator(encoder, decoder, tokenizer, startingword, src, segs, MAX_LEN = 30):
  """Greedily generate a summary for a single pre-tokenised article.

  Args:
    encoder: callable returning a mapping with ``'pooler_output'`` and
      ``'last_hidden_state'``.
    decoder: module with ``initHidden(batch_size)``. Its ``forward`` may take
      three arguments, or four when it also consumes the encoder's per-token
      states; the arity is detected from its signature.
    tokenizer: object with ``batch_decode``.
    startingword: token id fed as the first decoder input.
    src: list of article token ids (padded here via ``pad_art``).
    segs: list of segment ids (padded here via ``pad_seg``).
    MAX_LEN: ``MAX_LEN - 1`` tokens are generated.

  Returns:
    A one-element list holding the decoded summary.
  """
  import inspect

  decoder.eval()
  # Attention decoders take the encoder's per-token states as a fourth argument.
  wants_token_states = len(inspect.signature(decoder.forward).parameters) >= 4

  padded_art = pad_art(src).squeeze(1)
  seg_art = pad_seg(segs).squeeze(1)
  mask_art = mask(padded_art).squeeze(1)

  # Pure inference: no autograd graph is needed anywhere below.
  with torch.no_grad():
    encoder_ =  encoder( input_ids = padded_art.to(device),
                              attention_mask = mask_art.to(device),
                              token_type_ids = seg_art.to(device))
    # Use the caller's start token; the old code read a notebook global here.
    start_tensor = torch.tensor([[startingword]]).to(device)
    input_ = encoder(start_tensor)['pooler_output']

    encoder_output = encoder_['pooler_output']
    encoder_hidden = encoder_['last_hidden_state']
    # Integer buffer: token ids must stay integral for decoding.
    summ_ = torch.empty((1, 0), dtype=torch.long, device=device)
    prev_hidden = decoder.initHidden(1)

    for _ in range(1,MAX_LEN):
      if wants_token_states:
        out, prev_hidden, encoder_output = decoder(input_, prev_hidden, encoder_output, encoder_hidden)
      else:
        out, prev_hidden, encoder_output = decoder(input_, prev_hidden, encoder_output)
      next_token = out.max(-1)[1]  # greedy choice
      summ_ = torch.cat((summ_,next_token),1)
      input_ = encoder(next_token)['pooler_output']

  return list(tokenizer.batch_decode(summ_))

In [None]:
# Generate a summary for a short hand-written article (the reference summary is left empty).
# NOTE(review): `clean_message` and `RawDocTokenize` are neither defined nor imported in the
# visible part of this notebook - presumably they come from a preprocessing module; confirm
# before running on a fresh kernel.
art = 'This food tastes very bad. It smells too'
summ = ''
raw_article0, raw_summary0 = clean_message(art), clean_message(summ)
rdt = RawDocTokenize()

preprcessed1 = rdt.get_tokenized_output(raw_article0, raw_summary0, padding=False)

# Unpack the preprocessing output; only `src` and `segs` are passed to the generator below.
clss = preprcessed1['clss']
segs = preprcessed1['segs']
src =preprcessed1['src']
article = preprcessed1['article']
tgt = preprcessed1['tgt']
summary = preprcessed1['summary']
attnSummerizerForgetGRUGenerator(encoder, decoder, tokenizer, starting_word, src, segs, MAX_LEN = 30)



['not cat dog.. [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1]']