In [None]:
!pip install transformers
!pip install datasets
!pip install contractions
!pip install nltk
!pip3 install pickle5

In [None]:
import torch
from torch import Tensor
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from timeit import default_timer as timer

import os
import pickle5 as pickle
import random
import math

import sys
sys.path.insert(0, '/content/') # change path to folder where library is placed
import libraryForBuildingDatasetOptimized as l4bdOptimized

Emptying GPU cache and checking its RAM available

In [None]:
torch.cuda.empty_cache()

Mounting Google Drive where PreprocessedDataset is loaded
If training locally not needed

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# HERE WE DEFINE GLOBAL VARIABLES
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

PATH_TO_CSV_OF_TOPENGLISHFREQWORDS = '/content/topEnglishWords.csv'
PATH_TO_DATASET = '/content/drive/MyDrive/LFN_project/PreprocessedDataset'
FINAL_VOCABULARY_SIZE = 9998 # special characters, UNK tags and tokens like [START] [EOS] exluded --> will be ca 30136
MAX_NUMBER_OF_TAGS_FOR_UNK_WORDS = 100 # make sure number of UNK tags is equal to the one with which the article was preprocessed

MAX_NUMBER_OF_WORDS_IN_SENTENCE = 50
MAX_NUMBER_OF_SENTENCES_AE = 5

BATCH_SIZE = 16 # adjust according to GPU available 

NUMBER_EXPECTED_FEATURES_INPUT = 770 # 768 from BERT embedding + 2 added by us for UNK tags
NUMBER_EXPECTED_FEATURES_OUTPUT = FINAL_VOCABULARY_SIZE + 138 # 10136

In [None]:
NUMBER_EXPECTED_FEATURES_OUTPUT

10136

# Data preparation

OUR final vocabulary has [PAD] token at index 0 to indicate padding tokens used to have all sequences of fixed length (transformers work with fixed size input).


[START] token to indicate start of Summary;


[SEP] token to indicate end of sentence but followed by another sentence


[EOS] to indicate EOS and stop summary production.


Has then a list of [UNKidx] and [UNK4BOTHDIC] tokens.


The embeddings for the input are precomputed using BERT and adding 2 positions to encode UNK words


In [None]:
finalVocabularyAsDict = l4bdOptimized.loadTargetVocabulary(FINAL_VOCABULARY_SIZE, MAX_NUMBER_OF_TAGS_FOR_UNK_WORDS, path=PATH_TO_CSV_OF_TOPENGLISHFREQWORDS) 

In [None]:
print(finalVocabularyAsDict)



In [None]:
print(len(finalVocabularyAsDict))

10136


# NOW THE PICKLE IS A TRIPLE CONTAINING ALSO HIGHLIGHTS


In [6]:
class PreprocessedDatasetForAE(Dataset):
    def __init__(self, pathToFolder, trainValidationOrTest):
        self.pathToFolder = os.path.join(pathToFolder, trainValidationOrTest) # eg './PreprocessedDataset' , 'train'
        self.finalVocabularyAsDict = l4bdOptimized.loadTargetVocabulary(FINAL_VOCABULARY_SIZE, MAX_NUMBER_OF_TAGS_FOR_UNK_WORDS, path=PATH_TO_CSV_OF_TOPENGLISHFREQWORDS) 
        self.maxNumberOfWordsInSentence = MAX_NUMBER_OF_WORDS_IN_SENTENCE
        self.maxNumberOfSentences = MAX_NUMBER_OF_SENTENCES_AE
        self.trainValidationOrTest = trainValidationOrTest

    def __len__(self):
        if '.DS_Store' in os.listdir(self.pathToFolder):
            return len(os.listdir(self.pathToFolder)) -1
        else:
            return len(os.listdir(self.pathToFolder))

    def __getitem__(self, idx):
        # load the preprocessed article from pickle file (for each idx there is a file with source article preprocessed)
        preprocessedDatasetReloaded = None
        pathToArticle = os.path.join(self.pathToFolder, str(idx) + '.pickle')
        with open(pathToArticle, 'rb') as handle:
            preprocessedDatasetReloaded = pickle.load(handle)

        # it is a tuple containing data and the originalArticle as string
        data, originalArticle, highlights = preprocessedDatasetReloaded #, highlightsOriginal if NEW dataset
        # unwrap data dictionary  
        wordsListWithTAGSList = data['wordsListWithTAGSList']
        wordLevelEmbeddingsAsTensorList = data['wordLevelEmbeddingsAsTensorList']
        unknownWordsDictionaryForAnArticle = data['unknownWordsDictionaryForAnArticle']

        
        # with p=0.5 we pick some random sentences
        numberOfSentences = 5
        sentencesToPick = [random.randint(0, len(wordLevelEmbeddingsAsTensorList)-1) for i in range(numberOfSentences)] # eg [0, 23, 52]
        sentencesToPick.sort() # ordering sentences (important for UNK tags order)

        # we have a list of tensors where each one is the embedding of the picked sentences, we don't stack them as will be fed separately to encoder
        inputSentencesTensorsList = [wordLevelEmbeddingsAsTensorList[i].type(torch.float32) if i < len(wordLevelEmbeddingsAsTensorList) else torch.zeros((MAX_NUMBER_OF_WORDS_IN_SENTENCE, NUMBER_EXPECTED_FEATURES_INPUT)).type(torch.float32) for i in sentencesToPick] # converting from float16 back to float32
        
        # we now want to pick the same sentences from the article, concatenate them and preprocess them as target
        expandedContractionsArticle = l4bdOptimized.expandContractions(originalArticle)
        sentencesInOriginalArticleList = l4bdOptimized.splitSentences(expandedContractionsArticle)
        originalSentencesPickedToBeTarget = [sentencesInOriginalArticleList[i] if i < len(sentencesInOriginalArticleList) else  ' ' for i in sentencesToPick]
        targetSentence = ''
        for sent in originalSentencesPickedToBeTarget:
            targetSentence += sent
            # place space when concatenating them
            if sent[-1] == '.':
                targetSentence += ' '

        # if corrupted article with 0 sentences minimum sentence is [START] [EOS]
        # START added in preprocessHighlights function, EOS replaces '.' so we set targetSentence = '.'
        if len(targetSentence) <= 5:
          targetSentence = ' '

        # unknownWordsDictionaryForAnArticle has to be the same produced when preprocessing source article
        targetSentencesEmbedded = l4bdOptimized.preprocessHighlights(targetSentence, unknownWordsDictionaryForAnArticle, self.finalVocabularyAsDict)

        # to enable batching and speed up computation we cannot keep inputSentencesTensorsList even if we would like to keep input sentences separated to be fed indivifually to the encoder
        # moreover we need to have them all to same dimension

        # MAX length to be decided 
        # for now we decide that max # words is 50
        # we pad all tensors to have same shape[0] st each tensor 50 x 770 or cut if dim0 greater than max allowed
        # we need to have like a mask [1, 1, 1, 1, 0] so we can understand which are padding elements and which are not
        # during forward pass we can split them again splitting/grouping by 50

        listOfPaddedTensors = []
        whichArePaddingWordsMaskList = []
        for tensor in inputSentencesTensorsList:
            # if dim0 greater than self.maxNumberOfWordsInSentence we truncate
            if tensor.shape[0] > self.maxNumberOfWordsInSentence:
                tensorPadded = tensor[:self.maxNumberOfWordsInSentence, :]
                listOfPaddedTensors.append(tensorPadded)
                whichArePaddingWordsMaskList.append(torch.ones(tensorPadded.shape[0])) # all ones
            else:
                paddingBottom = self.maxNumberOfWordsInSentence - tensor.shape[0]
                padding = torch.nn.ZeroPad2d((0,0,0,paddingBottom))
                tensorPadded = padding(tensor)
                listOfPaddedTensors.append(tensorPadded)
                whichArePaddingWordsMaskList.append(torch.cat((torch.ones(tensor.shape[0]), torch.zeros(paddingBottom)))) # 0 if it is padding

        inputSentencesAsTensor = torch.cat(listOfPaddedTensors)
        # we now pad with all ZEROs st all input instances has same length
        # since we decided to have in input at most 5 sentences each tensor will be padded to be 5*MaxNumWords x 770
        inputSentencesAsTensor = torch.cat((inputSentencesAsTensor, torch.zeros((self.maxNumberOfSentences * self.maxNumberOfWordsInSentence - inputSentencesAsTensor.shape[0], NUMBER_EXPECTED_FEATURES_INPUT))))
        # same for whichArePaddingWordsMask
        whichArePaddingWordsMask = torch.cat(whichArePaddingWordsMaskList)
        #print(whichArePaddingWordsMask)
        whichArePaddingWordsMask = torch.cat((whichArePaddingWordsMask, torch.zeros(self.maxNumberOfSentences * self.maxNumberOfWordsInSentence - whichArePaddingWordsMask.shape[0])))

        # as we padded for input we need to pad also target sentences
        # the target is obtained by concatenating all sentences and embedding them in a single matrix of dimension (#words, finalVocabularyLen)
        # we need to pad along dim=0, we may chose as dim0 = #self.maxNumberOfWordsInSentence * self.maxNumberOfSentences
        if targetSentencesEmbedded.shape[0] > self.maxNumberOfSentences * self.maxNumberOfWordsInSentence:
            # if too long we retain maxNRows -1 and add last the embedding of EOS token
            targetSentencesEmbeddedPadded = targetSentencesEmbedded[0: (self.maxNumberOfSentences * self.maxNumberOfWordsInSentence) -1, :]
            targetSentencesEmbeddedPadded = torch.cat((targetSentencesEmbeddedPadded, torch.unsqueeze(targetSentencesEmbedded[-1, :], dim=0))) # retrieving EOS embedding from last position of targetSentencesEmbedded
            #print(targetSentencesEmbeddedPadded.shape)
            #print(torch.unsqueeze(targetSentencesEmbedded[-1, :], dim=0).shape)
            whichArePaddingWordsTargetSentence = torch.ones(targetSentencesEmbeddedPadded.shape[0]) # all ones since all valid
        else:
            paddingBottom = self.maxNumberOfWordsInSentence * self.maxNumberOfSentences - targetSentencesEmbedded.shape[0]
            padding = torch.nn.ZeroPad2d((0,0,0,paddingBottom))
            targetSentencesEmbeddedPadded = padding(targetSentencesEmbedded)
            whichArePaddingWordsTargetSentence = torch.cat((torch.ones(targetSentencesEmbedded.shape[0]), torch.zeros(paddingBottom)))
            

        # returning also a tensor with 6 values [articleNumber, sentence1, sent2, sent3, sent4, sent5]
        # then load article, produce dictionary of UNKtags, predict and convert back using dictionary
        articleIdx_sentencesPicked = torch.tensor([idx, *sentencesToPick])

        return inputSentencesAsTensor.type(torch.float32), whichArePaddingWordsMask.type(torch.int), targetSentencesEmbeddedPadded.type(torch.float32), whichArePaddingWordsTargetSentence.type(torch.int), articleIdx_sentencesPicked.type(torch.int)

Testing Dataset class and Data loader

In [None]:
testDataset = PreprocessedDatasetForAE(PATH_TO_DATASET, 'test') 
testDataLoader = DataLoader(testDataset, batch_size=BATCH_SIZE, shuffle=True, drop_last = True)

In [None]:
# to test Data loader
iterationOutput = next(iter(testDataLoader))
print(len(iterationOutput))

inputSentencesAsTensor, whichArePaddingWordsMask, targetSentencesEmbeddedPadded, whichArePaddingWordsTargetSent, articleIdx_sentencesPicked = iterationOutput

print(inputSentencesAsTensor.shape)
print(whichArePaddingWordsMask.shape)
print(targetSentencesEmbeddedPadded.shape)
print(whichArePaddingWordsTargetSent.shape)

print(inputSentencesAsTensor.type())
print(whichArePaddingWordsMask.type())

print(targetSentencesEmbeddedPadded.type())
print(whichArePaddingWordsTargetSent.type())

print(articleIdx_sentencesPicked.type())
print(articleIdx_sentencesPicked)


5
torch.Size([16, 250, 770])
torch.Size([16, 250])
torch.Size([16, 250, 10136])
torch.Size([16, 250])
torch.FloatTensor
torch.IntTensor
torch.FloatTensor
torch.IntTensor
torch.IntTensor
tensor([[378,   1,   6,  13,  19,  20],
        [278,   7,  10,  19,  19,  26],
        [295,   1,  21,  28,  32,  37],
        [279,  12,  15,  18,  19,  23],
        [165,   8,   8,  10,  13,  15],
        [176,   4,   6,   9,  14,  15],
        [ 48,   7,  15,  16,  18,  21],
        [107,   3,  28,  29,  43,  57],
        [ 23,   2,   3,  12,  14,  15],
        [ 24,  14,  32,  34,  37,  40],
        [ 68,  13,  15,  19,  20,  34],
        [464,   0,   1,   5,   7,   8],
        [ 84,  11,  19,  20,  20,  21],
        [ 46,   8,  16,  24,  30,  38],
        [418,   0,   6,  12,  40,  44],
        [318,   1,   1,   8,  16,  23]], dtype=torch.int32)


## MODEL DEFINITION AND HELPER CLASSES

POSITIONAL ENCODING helper class and functions to create masks

## MODEL DEFINITION

In [7]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float, max_len: int):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        # x: Tensor, shape [batch_size, seq_len , embedding_dim]
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)


def generate_square_subsequent_mask(sz, DEVICE):
    # take upper triangular part of matrix of all 1 (output still a matrix but lower triangular part are zeros)
    # transpose st 0 are now on the upper triangular part
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    # replace 0 in upper triangular part with -infinity and initialize lower triangular part to 0
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_masks_for_input(inputEmbeddingsForSingleSentenceBatched, whichArePaddingMaskBatched, DEVICE):
    batch_size = inputEmbeddingsForSingleSentenceBatched.shape[0]
    src_seq_len = inputEmbeddingsForSingleSentenceBatched.shape[1] 
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)
    src_padding_mask = (whichArePaddingMaskBatched == 0).to(DEVICE) 
    return src_mask, src_padding_mask


def create_masks_for_output(embeddingForTargetSentenceBatched, whichArePaddingMaskBatched, DEVICE):
    batch_size = embeddingForTargetSentenceBatched.shape[0]
    tgt_seq_len = embeddingForTargetSentenceBatched.shape[1]
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len, DEVICE)
    tgt_padding_mask = (whichArePaddingMaskBatched == 0).to(DEVICE)
    return tgt_mask, tgt_padding_mask


def create_memory_key_pad_mask(whichArePaddingWordsMaskSplittedBatch):
    batch_size = whichArePaddingWordsMaskSplittedBatch.shape[0]
    # we sum over last dimension 
    # if count !=0 we mark it as valid sentence
    # in practice during training always all valid (all False)
    memory_key_pad_mask = torch.sum(whichArePaddingWordsMaskSplittedBatch, dim=-1)
    return (memory_key_pad_mask == 0)


class AETransformer(nn.Module):

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 source_emb_size: int,
                 nheadEncoder: int,
                 target_emb_size : int,
                 target_emb_size_reduced : int,
                 nheadDecoder: int,
                 dropout: float = 0.2, 
                 device : int = 0):
        super(AETransformer, self).__init__()
        self.DEVICE = device

        # the encoder stack of layers works on a single sentence representation at the time (BATCH_SIZE x MAX_NUM_WORDS=50 x source_emb_size=770)
        self.encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=source_emb_size, 
                                                                        nhead=nheadEncoder,
                                                                        dim_feedforward=512,
                                                                        dropout=0.2,
                                                                        activation='relu',
                                                                        layer_norm_eps = 1e-5,
                                                                        batch_first = True,
                                                                        norm_first = False),
                                            num_layers=num_encoder_layers)
                                            #enable_nested_tensor = False)
        
        # NB: since the decoder takes in input both the encoder hidden representations and the target (autoregressive)
        # we have to reduce the one-hot encoding to smaller vectors, will be = to target_emb_size_reduced = source_emb_size
        # where target_emb_size_reduced is the reduced learned target representation (from onehot of len(finalVoc) to smaller encoding)
        self.reduceTargetEmbDimNN = nn.Sequential(nn.Linear(target_emb_size, target_emb_size_reduced), nn.ReLU())

        self.decoder = nn.TransformerDecoder(nn.TransformerDecoderLayer(d_model=target_emb_size_reduced,
                                                                        nhead=nheadDecoder,
                                                                        dim_feedforward=512,
                                                                        dropout=0.2,
                                                                        activation='relu',
                                                                        layer_norm_eps = 1e-5,
                                                                        batch_first = True,
                                                                        norm_first = False),
                                            num_layers=num_decoder_layers)
        
        # positional_encoding_sourceSentence works on a single sentence representation at the time of length MAX_NUMBER_OF_WORDS_IN_SENTENCE
        self.positional_encoding_sourceSentence = PositionalEncoding(source_emb_size, 0.2, MAX_NUMBER_OF_WORDS_IN_SENTENCE)
        # positional_encoding_targetSentence works on entire target sentence (obtained concatenating the sentences in input)
        self.positional_encoding_targetSentence = PositionalEncoding(target_emb_size_reduced, 0.2, MAX_NUMBER_OF_WORDS_IN_SENTENCE * MAX_NUMBER_OF_SENTENCES_AE)

        # from target_emb_size_reduced (decoder output dim) back to one hot vector len(finalVoc)
        self.generator = nn.Linear(target_emb_size_reduced, target_emb_size) # NB no need for softmax or other non linear functions as the crossentropy loss in pytorch already includes computing softmax


    # inputSentencesAsTensorsSplittedBatch has dimension (BATCH_DIM, MAX_N_SEQ=5, MAX_N_WORDS=50, INPUTDIM=770)
    # during forward we pass to the encoder (BATCH_DIM, MAX_N_WORDS=50, INPUTDIM=770) for each of the 5 sequences
    # the whichArePaddingWordsMaskSplittedBatch will allow to create src_key_padding mask to not let attend to padding tokens
    def forward(self,
                inputSentencesAsTensorsSplittedBatch: Tensor,
                whichArePaddingWordsMaskSplittedBatch: Tensor,
                targetSentencesEmbeddedPaddedBatch : Tensor, 
                whichArePaddingWordsTargetSentenceBatch : Tensor,
                memory_key_pad_mask: Tensor):
        
        # list of 1 tensor for each sentence (BATCHDIM, 50, 770)
        sentencesList = [ inputSentencesAsTensorsSplittedBatch[:, i, :, :] for i in range(MAX_NUMBER_OF_SENTENCES_AE) ]
        inputMasksList = [ whichArePaddingWordsMaskSplittedBatch[:, i, :] for i in range(MAX_NUMBER_OF_SENTENCES_AE) ]

        # here we append hidden representation from the encoder to stack them vertically
        encodedSentenceSUM = []
        # in order in the final model to allow to use a varible node-representations from GNN output (eg top k degree nodes) (but always <=5) 
        # we always stack vertically 5 node-representations but mask the "not-true ones"
        for sentence, mask in zip(sentencesList, inputMasksList):
            # create src_mask, key_padding mask
            src_mask, src_padding_mask = create_masks_for_input(sentence, mask, self.DEVICE)
            # add positional encoding
            sentence = self.positional_encoding_sourceSentence(sentence)  
            # pass throw encoder
            encodedSentence = self.encoder(sentence, src_mask, src_padding_mask)
            # now encodedSentence has shape (BATCH_SIZE, MAX_NUM_WORDS, 770) (has has to be input for any other encoder layer when stacking)
            # we want the output of the encoder module to have dimension (BATCH_SIZE, 770) 
            # each sentence represented through a matrix is now repr. through a vector
            # what we can collapse matrix in 1 vector by taking mean along dim=0
            encodedSentence_collapsed = torch.mean(encodedSentence, 1) # collapsing dim 1
            encodedSentenceSUM.append(encodedSentence_collapsed) # stacking representations vertically (BATCH_SIZE, NUMBER OF SENTENCES, 770)
            
        encodedSentenceSUM = (torch.stack(encodedSentenceSUM)).to(self.DEVICE, torch.float32)
        encodedSentenceSUM = torch.permute(encodedSentenceSUM, (1, 0, 2)) # permuting to have batch at dim 0

        # reducing target dimension, position encoding of target, create target masks and decode
        targetSentencesEmbeddedPaddedBatch_reduced = self.reduceTargetEmbDimNN(targetSentencesEmbeddedPaddedBatch)
        targetSentencesEmbeddedPaddedBatch_reduced_withPosEnc = self.positional_encoding_targetSentence(targetSentencesEmbeddedPaddedBatch_reduced)
        tgt_mask, tgt_padding_mask = create_masks_for_output(targetSentencesEmbeddedPaddedBatch_reduced, whichArePaddingWordsTargetSentenceBatch, self.DEVICE)

        decoderOut = self.decoder(targetSentencesEmbeddedPaddedBatch_reduced_withPosEnc, encodedSentenceSUM, tgt_mask, memory_mask=None, tgt_key_padding_mask=tgt_padding_mask, memory_key_padding_mask=memory_key_pad_mask)
        
        # from target_emb_size_reduced back to len(finalVoc)
        return self.generator(decoderOut)


    # for the AE task given all sentences in one single tensor similar to what does in forward splits them and computes hidden embedding summing 
    # this is not what the GNN model will use but useful for testing AE
    def encodeSentences(self, inputSentencesAsTensorsSplittedBatch: Tensor, whichArePaddingWordsMaskSplittedBatch: Tensor):
        # list of 1 tensor for each sentence (BATCHDIM, 50, 770)
        sentencesList = [ inputSentencesAsTensorsSplittedBatch[:, i, :, :] for i in range(MAX_NUMBER_OF_SENTENCES_AE) ]
        inputMasksList = [ whichArePaddingWordsMaskSplittedBatch[:, i, :] for i in range(MAX_NUMBER_OF_SENTENCES_AE) ]
        encodedSentenceSUM = []
        for sentence, mask in zip(sentencesList, inputMasksList):
            # create src_mask, key_padding mask
            src_mask, src_padding_mask = create_masks_for_input(sentence, mask, self.DEVICE)
            # add positional encoding
            sentence = self.positional_encoding_sourceSentence(sentence)  
            # pass throw encoder
            encodedSentence = self.encoder(sentence, src_mask, src_padding_mask)
            # now encodedSentence has shape (BATCH_SIZE, MAX_NUM_WORDS, 770) (has has to be input for any other encoder layer when stacking
            encodedSentence_collapsed = torch.mean(encodedSentence, 1) # collapsing dim 1
            encodedSentenceSUM.append(encodedSentence_collapsed)
        encodedSentenceSUM = torch.stack(encodedSentenceSUM).to(self.DEVICE, torch.float32)
        encodedSentenceSUM = torch.permute(encodedSentenceSUM, (1, 0, 2))
        return encodedSentenceSUM


    # to encode single sentence: this is what will be used to get CONCEPT(nodei)
    # remember to work always with BATCH_DIM at first dimension
    def encodeSingleSentence(self, sentenceAsMatrix: Tensor, whichWordsArePadding: Tensor):
        src_mask, src_padding_mask = create_masks_for_input(sentenceAsMatrix, whichWordsArePadding, self.DEVICE)
        # add positional encoding
        sentence = self.positional_encoding_sourceSentence(sentence)  
        # pass throw encoder
        encodedSentence = self.encoder(sentence, src_mask, src_padding_mask)
        # now encodedSentence has shape (BATCH_SIZE, MAX_NUM_WORDS, 770) (has has to be input for any other encoder layer when stacking)
        # we want the output of the encoder module to have dimension (BATCH_SIZE, 770) 
        encodedSentence_collapsed = torch.mean(encodedSentence, 1) # collapsing dim 1
        return encodedSentence_collapsed # this represents sentence representation for the node-sentence, input to GNN


    # decode from 5 node-sentence representations (after GNN) stacked vertically
    def decodeFromEcodedSentenceSUM_afterGNN(self, tgt: Tensor, tgt_mask: Tensor, encodedSentenceSUM_afterGNN: Tensor, memory_key_pad_mask: Tensor):
        tgt_reduced = self.reduceTargetEmbDimNN(tgt)
        tgt_reduced_withPosEnc = self.positional_encoding_targetSentence(tgt_reduced)
        decoderOut = self.decoder(tgt_reduced_withPosEnc, encodedSentenceSUM_afterGNN, tgt_mask, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask = memory_key_pad_mask)
        return decoderOut

NB: since the encoder works stackin N encoder layers the output dim of the encoder will be (source_emb_size x 50 x 770)(has to be the same as the input to be fed potentially into another encoder layer)
From matrix representing a sentence (50 x 770) to a single vector embedding but of same "width" (770) we need to perform some manipulation (eg mean to collapse 1 dimension)
        

## MODEL INSTANTIATION from tar checkpoint

In [8]:
checkpoint = torch.load('/content/drive/MyDrive/LFN_project/EncDec_moreComplex/transfPretrainCP_multiGPU_more_x2_Complex.tar', map_location=DEVICE)

In [9]:
NUM_ENCODER_LAYERS = 4
NUM_DECODER_LAYERS = 4
SOURCE_EMB_SIZE = NUMBER_EXPECTED_FEATURES_INPUT #(770)
NHEAD_ENCODER = 11 # SOURCE_EMB_SIZE // NHEAD_ENCODER = 0 MUST BE DIVISIBLE! 
TARGET_EMB_SIZE = NUMBER_EXPECTED_FEATURES_OUTPUT # (10136)
TARGET_EMB_SIZE_REDUCED = NUMBER_EXPECTED_FEATURES_INPUT 
N_HEAD_DECODER = 11 # TARGET_EMB_SIZE_REDUCED // N_HEAD_DECODER = 0 MUST BE DIVISIBLE!
DROPOUT = 0.3

# model instantiation
transformer = AETransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, SOURCE_EMB_SIZE, NHEAD_ENCODER, TARGET_EMB_SIZE, TARGET_EMB_SIZE_REDUCED, N_HEAD_DECODER, DROPOUT, DEVICE)

transformer.load_state_dict(checkpoint['model_state_dict'])

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0) # IF PAD does not contribute to loss ([PAD] token is class 0 since first element in finalVoc)


In [10]:
total_params = sum(
	param.numel() for param in transformer.parameters()
)
print('Transformer total number of parameters: {}'.format(total_params))


Transformer total number of parameters: 50465402


In [None]:
# Print model's state_dict
print("Model's state_dict:")
for param_tensor in transformer.state_dict():
    print(param_tensor, "\t", transformer.state_dict()[param_tensor].size())

In [11]:
def unpack_inputSentencesAsTensorBatch_whichArePaddingWordsMaskBatch(inputSentencesAsTensorBatch, whichArePaddingWordsMaskBatch):
  inputSentencesAsTensorsSplittedBatchList = []
  whichArePaddingWordsMaskSplittedBatchList = []
  # unbatching
  for inputSentencesAsTensor, whichArePaddingWordsMask in zip(inputSentencesAsTensorBatch, whichArePaddingWordsMaskBatch):
    inputSetOfSentencesSplittedList = []
    whichArePaddingWordsMaskSplittedList = []
    for i in range(0, inputSentencesAsTensor.shape[0], MAX_NUMBER_OF_WORDS_IN_SENTENCE):
      inputSentence = inputSentencesAsTensor[i : i+MAX_NUMBER_OF_WORDS_IN_SENTENCE]
      inputSetOfSentencesSplittedList.append(inputSentence)
      inputSentenceMask = whichArePaddingWordsMask[i : i+MAX_NUMBER_OF_WORDS_IN_SENTENCE]
      whichArePaddingWordsMaskSplittedList.append(inputSentenceMask)

    inputSetOfSentencesSplitted = torch.stack(inputSetOfSentencesSplittedList)
    inputSentencesAsTensorsSplittedBatchList.append(inputSetOfSentencesSplitted)

    whichArePaddingWordsMaskSplitted = torch.stack(whichArePaddingWordsMaskSplittedList)
    whichArePaddingWordsMaskSplittedBatchList.append(whichArePaddingWordsMaskSplitted)
  
  # batching back
  inputSentencesAsTensorsSplittedBatch = torch.stack(inputSentencesAsTensorsSplittedBatchList)
  whichArePaddingWordsMaskSplittedBatch = torch.stack(whichArePaddingWordsMaskSplittedBatchList)

  return inputSentencesAsTensorsSplittedBatch, whichArePaddingWordsMaskSplittedBatch


# GREEDY DECODING

Testing on a test instance: we pick 5 sentences from an article, encode them using model.encodeSentences function. And then computing in an autoregressive way the target sentence (concatenation of the 5 in input) in the finalVocabularyEmbedding.

NB: here we work with encodeSentences function, all vectors in encodedSentenceSUM (encoder output) are considered valid and memory_padding_key all valid

When we will deploy in the GNN model:
1.   encoder encodes single sentence at a time (use encodeSingleSentence function) (remember to provide with batch=1 at first dimension and to conver to DEVICE); encoder not trained in the GNN
2.   Apply GNN using these vector representations
3.   Stack vertically final node representations if degree>k (in any case not more than 5), if less provide memory_key_padding_mask
4. Pass through decoder using decodeFromEcodedSentenceSUM_afterGNN function (decoder trained also in GNN model)




In [12]:
# function to generate output sentence using greedy algorithm
# we give in input source article preprocessed, pick 5 sentences, pass them through the model and see what it generates
def greedy_decode(model):
    # set model to eval mode 
    model.eval()
    # for now we simply try using source article 0 in Test folder
    testDataset = PreprocessedDatasetForAE(PATH_TO_DATASET, 'test')
    # we don't shuffle to mantain order, and batchsize only 1
    testDataLoader = DataLoader(testDataset, batch_size=1, shuffle=True, drop_last = True)

    # picking a single source article (in this case idx=0), note it will be already in Batched form
    inputSentencesAsTensorBatch, whichArePaddingWordsMaskBatch, targetSentencesEmbeddedPaddedBatch, whichArePaddingWordsTargetSentenceBatch, articleIdx_sentencesPicked = next(iter(testDataLoader))

    # splitting before forwarding such that each sentence is passed to the encoder separately
    # unpacking to provide sentences separately through the encoder
    inputSentencesAsTensorsSplittedBatch, whichArePaddingWordsMaskSplittedBatch = unpack_inputSentencesAsTensorBatch_whichArePaddingWordsMaskBatch(inputSentencesAsTensorBatch, whichArePaddingWordsMaskBatch) 
    # torch.Size([1, 5, 50, 770]), # torch.Size([1, 5, 50])

    # converting tensor to DEVICE (to allocate it on GPU)
    inputSentencesAsTensorsSplittedBatch = inputSentencesAsTensorsSplittedBatch.to(DEVICE)
    whichArePaddingWordsMaskSplittedBatch = whichArePaddingWordsMaskSplittedBatch.to(DEVICE)

    #print('inputSentencesAsTensorsSplittedBatch shape: {}'.format(inputSentencesAsTensorsSplittedBatch.shape))
    #print('whichArePaddingWordsMaskSplittedBatch shape: {}'.format(whichArePaddingWordsMaskSplittedBatch.shape))
    #print(targetSentencesEmbeddedPaddedBatch.shape)
    #print(whichArePaddingWordsTargetSentenceBatch.shape)

    finalVocabularyAsDict = l4bdOptimized.loadTargetVocabulary(FINAL_VOCABULARY_SIZE, MAX_NUMBER_OF_TAGS_FOR_UNK_WORDS, path=PATH_TO_CSV_OF_TOPENGLISHFREQWORDS) 
    max_len = MAX_NUMBER_OF_SENTENCES_AE * MAX_NUMBER_OF_WORDS_IN_SENTENCE
    # we start with ys being the onehot encoding in finalVocabulary of [START] token
    ys = (nn.functional.one_hot(torch.tensor([list(finalVocabularyAsDict).index('[START]')]), num_classes=len(finalVocabularyAsDict))[0]).type(torch.float32).to(DEVICE)
    ys = torch.unsqueeze(ys, dim=0)
    ys = torch.unsqueeze(ys, dim=0)
    # now ys has shape torch.Size([1, 1, 30136]) that is one only element in the batch, 1 token in the sequence, one-hot encoding for [START]
    #print('ys initial shape: {}'.format(ys.shape))
  
    encodedSentenceSUM = model.encodeSentences(inputSentencesAsTensorsSplittedBatch, whichArePaddingWordsMaskSplittedBatch)
    # generating memory_key_padding_mask: since here all 5 stacked representations are valid we pass all False
    memory_key_pad_mask = torch.Tensor([False, False, False, False, False])
    memory_key_pad_mask = torch.unsqueeze(memory_key_pad_mask, dim=0).to(DEVICE)
    #print('encodedSentenceSUM out from encoder stacking vertically hidden representations: {}'.format(encodedSentenceSUM.shape))

    for i in range(MAX_NUMBER_OF_WORDS_IN_SENTENCE * MAX_NUMBER_OF_SENTENCES_AE -1):
      tgt_mask = (generate_square_subsequent_mask(ys.shape[1], DEVICE).type(torch.bool)).to(DEVICE)
      output_decoder = model.decodeFromEcodedSentenceSUM_afterGNN(ys, tgt_mask, encodedSentenceSUM, memory_key_pad_mask)
      #print('output_decoder shape: {}'.format(output_decoder.shape)) # (1=batch, numberOfWordsInTgt, 770)

      # from decoder_output to probabilities through generator, from batch to single probab vector
      output_decoder_lastToken = output_decoder[:, -1, :] # torch.Size([1, 770]) # -1 to pick last word
      #print('output_decoder_lastToken shape: {}'.format(output_decoder_lastToken.shape))
      prob = model.generator(output_decoder_lastToken) #torch.Size([1, 30136])
      #print('prob shape : {}'.format(prob.shape))
      # if we use CrossEntropy prob are unnormalized
      # or we change generator applying a Logsoftmax and then using NLL
      # or we can apply now here softmax?
      softmaxLayer = torch.nn.Softmax(dim=1)
      next_word_idx = torch.argmax(softmaxLayer(prob))
      next_word_idx = next_word_idx.item() 
      #print(next_word_idx)
      # now we append to ys being contructed the next_word as one-hot final vocabulary embedding
      next_word_embedded = (nn.functional.one_hot(torch.tensor([next_word_idx]), num_classes=len(finalVocabularyAsDict))[0]).type(torch.float32).to(DEVICE)
      next_word_embedded = torch.unsqueeze(next_word_embedded, dim=0)
      next_word_embedded = torch.unsqueeze(next_word_embedded, dim=0)
      ys = torch.cat((ys, next_word_embedded), dim=1)
      #print(ys.shape)
      
      if next_word_idx == list(finalVocabularyAsDict).index('[EOS]'):
        break

    # convert ys to string
    # TODO: load UNKDICTIONARYFORARTICLE to retrieve back UNK tags 
    article_idx = articleIdx_sentencesPicked[0][0].item()
    sentencesPicked = articleIdx_sentencesPicked[0][1:]
    #print(article_idx)
    #print(sentencesPicked)

    preprocessedDatasetReloaded = None
    pathToArticle = os.path.join(os.path.join(PATH_TO_DATASET, 'test'), str(article_idx) + '.pickle')
    with open(pathToArticle, 'rb') as handle:
        preprocessedDatasetReloaded = pickle.load(handle)
    # it is a tuple containing data and the originalArticle as string
    data, originalArticle, highlights = preprocessedDatasetReloaded

    unknownWordsDictionaryForAnArticle = data['unknownWordsDictionaryForAnArticle']
    # retrieving 5 original sentences picked for comparison
    expandedContractionsArticle = l4bdOptimized.expandContractions(originalArticle)
    sentencesInOriginalArticleList = l4bdOptimized.splitSentences(expandedContractionsArticle)
    originalSentencesPickedToBeTarget = [sentencesInOriginalArticleList[i] if i < len(sentencesInOriginalArticleList) else  ' ' for i in sentencesPicked]
    
    convertedSentences = l4bdOptimized.fromFinalVocabularyEncodingBackToWords(ys[0], finalVocabularyAsDict, unknownWordsDictionaryForAnArticle)
    
    splitted_convertedSentences = l4bdOptimized.splitSentences(convertedSentences)

    return splitted_convertedSentences, originalSentencesPickedToBeTarget


In [21]:
predictedSentences, groudTruth = greedy_decode(transformer)

print("\nGROUND TRUTH SENTENCES:\n")
for sent in groudTruth:
  print(sent)

print("\nPREDICTED SENTENCES:\n")
for sent in predictedSentences:
  print(sent)


GROUND TRUTH SENTENCES:

On a mobile site, which was still active, the network said it was "hacked by an Islamist group."
However, by late morning, a number of pages on the network's website had messages saying they were under maintenance.
The outage began around 8:45 p.m. Paris time (2:45 p.m.
TV5Monde offers round-the-clock entertainment and news programming that reaches 260 million homes worldwide, according to the Ministry of Culture and Communications.
It functions under a partnership among the governments of France, Canada and Switzerland, as well as the Wallonia-Brussels Federation.

PREDICTED SENTENCES:

however , by late morning , a number of pages on the network UNK website had messages saying they were under maintenance .
on a mobile site , which was still active , the network said it was " UNK by an al French group .  "
the UNK began around UNK UNK time ( UNK UNK .
it operates under a partnership between the governments of france , france and france , as well as the UNK pa

In [18]:
!pip install rouge-metric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge-metric
  Downloading rouge_metric-1.0.1-py3-none-any.whl (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m151.7/151.7 KB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rouge-metric
Successfully installed rouge-metric-1.0.1


In [22]:
# order of sentences may be not same
# we try to order them based simply on sentence length
predictedSentences.sort(key=lambda x: len(x), reverse=True)
print(predictedSentences)

groudTruth.sort(key=lambda x: len(x), reverse=True)
print(groudTruth)

['French offers UNK programming and news programming that reaches ISIS million people worldwide , according to the ministry of health and services .', 'however , by late morning , a number of pages on the network UNK website had messages saying they were under maintenance .', 'it operates under a partnership between the governments of france , france and france , as well as the UNK partnership .', 'on a mobile site , which was still active , the network said it was " UNK by an al French group .  "', 'the UNK began around UNK UNK time ( UNK UNK .']
['TV5Monde offers round-the-clock entertainment and news programming that reaches 260 million homes worldwide, according to the Ministry of Culture and Communications.', 'It functions under a partnership among the governments of France, Canada and Switzerland, as well as the Wallonia-Brussels Federation.', "However, by late morning, a number of pages on the network's website had messages saying they were under maintenance.", 'On a mobile site

In [23]:
from rouge_metric import PyRouge

# from list to single sentence with \n instead of '.'
prediction = ''
for sentence in predictedSentences:
  tokens = sentence.split()
  for tok in tokens:
    if tok != '.':
      prediction += tok.lower()
    else:
      prediction += '\n'
    prediction += ' '

print(prediction)

# from list to single sentence with \n instead of '.'
gt = ''
for sentence in groudTruth:
  tokens = sentence.split()
  for tok in tokens:
    if tok[-1] == '.':
      gt += tok[:-1]
      gt += '\n'
    else:
      if len(tok) > 1 and tok[-2] == '.':
        gt += tok[:-2]
        gt += '\n'
      else:
        gt += tok.lower()
    gt += ' '

print(gt)

# Load summary results
hypotheses = [prediction]
references = [[gt]]

# Evaluate document-wise ROUGE scores
rouge = PyRouge(rouge_n=(1, 2, 4), rouge_l=True, rouge_w=True,
                rouge_w_weight=1.2, rouge_s=True, rouge_su=True, skip_gap=4)
scores = rouge.evaluate(hypotheses, references)
print(scores)

french offers unk programming and news programming that reaches isis million people worldwide , according to the ministry of health and services 
 however , by late morning , a number of pages on the network unk website had messages saying they were under maintenance 
 it operates under a partnership between the governments of france , france and france , as well as the unk partnership 
 on a mobile site , which was still active , the network said it was " unk by an al french group 
 " the unk began around unk unk time ( unk unk 
 
tv5monde offers round-the-clock entertainment and news programming that reaches 260 million homes worldwide, according to the ministry of culture and Communications
 it functions under a partnership among the governments of france, canada and switzerland, as well as the wallonia-brussels Federation
 however, by late morning, a number of pages on the network's website had messages saying they were under maintenance
 on a mobile site, which was still active, t

ROUGE METRICS OVER ALL DATASET