In [0]:
%%capture
!pip install transformers
from transformers import XLNetTokenizer
import glob, os
from multiprocessing import Pool
import random
import torch
from torch.utils.data import Dataset, DataLoader, Sampler
import transformers
import math
import numpy as np
import random

# This function processes news articles gathered and preprocessed by the XSum data processor:
# https://github.com/EdinburghNLP/XSum
#
# The pre-processor generates a large number of files, each of which corresponds to a single article. Each file is
# plain text with the following format:
# [XSUM]URL[XSUM]
# <URL where article originates from>
# [XSUM]INTRODUCTION[XSUM]
# <Summary of the article>
# [XSUM]RESTBODY[XSUM]
# <Article text>

# Boilerplate lines (social-share widget labels and similar) that are removed from the article body.
# Every entry ends with a newline, so only occurrences followed by a line break are removed.
JUNK_HEADER_TEXT = ["Share this with\n",
                    "Email\n",
                    "FaceBook\n",
                    "Facebook\n",
                    "Messenger\n",
                    "Twitter\n",
                    "Pinterest\n",
                    "WhatsApp\n",
                    "LinkedIn\n",
                    "Linkedin\n",
                    "Copy this link\n",
                    "These are external links and will open in a new window\n"]

# Processes the contents of an XSUM file and returns a dict: {'text', 'summary'}
def map_read_files(filepath):
    """Read one XSum article file and return its cleaned summary and body text.

    The file is split on the literal marker "[XSUM]"; for a well-formed file the summary is
    part 4 and the article body is part 6 of that split.

    Args:
        filepath: Path of a UTF-8 encoded XSum article file.

    Returns:
        {"summary": str, "text": str} for an acceptable article, or None when the article is
        rejected: the file does not contain the expected sections, the cleaned body is shorter
        than 1024 characters, the summary is shorter than 30 characters, or the body has more
        than 1500 space-separated words.
    """
    SUMMARY_INDEX = 4
    TEXT_INDEX = 6

    with open(filepath, encoding="utf-8") as file:
        content = file.read()

    splitted = content.split("[XSUM]")
    # A malformed or truncated file yields too few parts. Reject it like any other bad example
    # instead of letting an IndexError abort the whole (possibly multi-process) run.
    if len(splitted) <= TEXT_INDEX:
        return None

    summary = splitted[SUMMARY_INDEX].strip()
    text = splitted[TEXT_INDEX]
    for junk in JUNK_HEADER_TEXT:
        text = text.replace(junk, "").strip()

    # Don't accept content with too small of text content or title content. Often these are very bad examples.
    if len(text) < 1024:
        return None
    if len(summary) < 30:
        return None

    # Very long articles are skipped as well.
    if len(text.split(" ")) > 1500:
        return None

    return {"summary": summary, "text": text}

# Shared tokenizer instance used by map_tokenize_news().
tok = XLNetTokenizer.from_pretrained("xlnet-base-cased")

def map_tokenize_news(processed):
    """Tokenize one processed article into id tensors.

    Args:
        processed: dict with the string entries "text" (article body) and "summary".

    Returns:
        {"text": LongTensor, "target": LongTensor} holding the token ids of the body and of
        the summary respectively. Both are encoded without special tokens, without a length
        limit and without padding.
    """
    # The same encoding options apply to both fields.
    encode_options = dict(add_special_tokens=False, max_length=None, pad_to_max_length=False)

    body_ids = tok.encode(processed["text"], **encode_options)
    summary_ids = tok.encode(processed["summary"], **encode_options)

    as_long_tensor = lambda ids: torch.tensor(ids, dtype=torch.long)
    return {"text": as_long_tensor(body_ids), "target": as_long_tensor(summary_ids)}

In [0]:
# Mount Google Drive at /content/gdrive (Colab only). Later cells read model checkpoints from
# paths below this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# This class implements a Dataset which is capable of feeding a model which expects a stream of
# text with a generative target.
#
# It is unique because it supports extremely long sequences by "chunking" those sequences into several
# parts. Data retrieved from this set should then be fed into the model one chunk at a time until all chunks
# are exhausted. The target text of the language model will be found in the last chunk, so you should perform a
# backward() pass on the loss from that pass.
#
# To facilitate this, this dataset sorts the data it returns by chunk size. It is strongly recommended that you
# do not randomize this data without batching first.
#
# Clients should use the get_dataloader() method to retrieve a dataloader from this class. This
# custom dataloader will guarantee that retrieved batches have the same internal chunk length.
#
# An example where this might be used is text summarization, where long text is fed in and
# the generative target is a summary of that text.
class ChunkedTextDataset(Dataset):
    """Dataset that serves "<bos> text <sep> target" sequences split into fixed-size chunks.

    Each item is built from a tokenized {"text", "target"} pair. The sequence is padded to a
    multiple of `max_chunk_len` and split into chunks, together with matching attention masks,
    permutation masks and target mappings. The raw data is sorted by text length so that items
    with the same number of chunks are adjacent.

    Args:
        data_file: Path to a torch-saved list of dicts with tokenized {"text", "target"} tensors.
        tokenizer: Tokenizer used to produce `data_file`; only its bos/eos/sep/pad token ids are read.
        max_chunk_len: Sequence length of each chunk.
        max_gen_len: Fixed length of the target; longer targets are truncated, shorter ones padded.
        pad_left: Whether sequence padding goes on the left (True) or on the right (False).
        max_text_len: Maximum number of text tokens kept per item; longer texts are truncated.
    """

    # Number of special tokens process_element() adds around the text: <bos> and <sep>.
    _NUM_SPECIAL_TOKENS = 2

    def __init__(
        self,
        data_file: str,
        tokenizer: "transformers.PreTrainedTokenizer",
        max_chunk_len=192,
        max_gen_len=64,
        pad_left=False,
        max_text_len=1300,
    ):
        self.tokenizer = tokenizer
        self.max_chunk_len = max_chunk_len
        self.max_gen_len = max_gen_len
        self.pad_left = pad_left
        self.max_text_len = max_text_len
        # NOTE(review): torch.load unpickles the file; only load data files from a trusted source.
        self.raw_data = torch.load(data_file)
        # Sort all entries by text length so equal chunk counts end up next to each other.
        self.raw_data.sort(key=lambda x: x["text"].shape[0])

    def _sequence_len(self, text_len):
        """Length of the unpadded sequence process_element() builds for a text of `text_len` tokens."""
        return min(text_len, self.max_text_len) + self.max_gen_len + self._NUM_SPECIAL_TOKENS

    def process_element(self, text, target):
        """Build the chunked model inputs for one (text, target) pair.

        Args:
            text: 1-D LongTensor of text token ids.
            target: 1-D LongTensor of target token ids (without <eos>).

        Returns:
            dict with
              "input_ids": tuple of chunks, each of shape (max_chunk_len,),
              "attention_masks": tuple of chunks, each of shape (max_chunk_len,); 0 marks sequence padding,
              "permutation_masks": tuple of chunks, each of shape (max_chunk_len, max_chunk_len),
              "target_mappings": tuple of chunks, each of shape (max_gen_len, max_chunk_len),
              "labels": the target ids, <eos>-terminated and padded/truncated to max_gen_len.
        """
        # Single-element id tensors for the special tokens, reused below.
        bos_token_tensor = torch.tensor([self.tokenizer.bos_token_id], dtype=torch.long)
        eos_token_tensor = torch.tensor([self.tokenizer.eos_token_id], dtype=torch.long)
        sep_token_tensor = torch.tensor([self.tokenizer.sep_token_id], dtype=torch.long)
        # Cap the text length (no-op for shorter texts).
        text = text[: self.max_text_len]
        with torch.no_grad():
            # Terminate the target with <eos>.
            target = torch.cat([target, eos_token_tensor])
            # Truncate off the end of the target if it is too long.
            if target.shape[0] > self.max_gen_len:
                target = target[: self.max_gen_len]
                # Keep the <eos> token post-truncation.
                target[-1] = self.tokenizer.eos_token_id

            # Pad the target to max_gen_len. This is done to force the model to predict the actual
            # end of the sequence.
            pads_needed = self.max_gen_len - target.shape[0]
            if pads_needed > 0:
                target_padding = torch.full((pads_needed,), self.tokenizer.pad_token_id, dtype=torch.long)
                target = torch.cat([target, target_padding])
            target_len = self.max_gen_len

            # Straight up append the target to the text, with the special tokens in between:
            # "<bos> text <sep> target"
            text = torch.cat([bos_token_tensor, text, sep_token_tensor, target])

            # Attention mask: 1 = attend, 0 = ignore. The pads inside the target stay at 1 because
            # we want the model to predict them.
            attention_mask = torch.ones(text.shape[0], dtype=torch.float)

            # The permutation mask is all 0s for the text columns: every row may attend to the text.
            # The target columns start as all 1s (blocked) so text rows never attend to the target.
            text_perm_mask = torch.zeros(
                (self.max_chunk_len, text.shape[0] - target_len), dtype=torch.float
            )
            target_perm_mask = torch.ones(
                (self.max_chunk_len, target_len), dtype=torch.float
            )
            # Sequential generation: the row t_index positions from the end may attend to every
            # target column except the last t_index ones, i.e. only to earlier target tokens.
            # (t_index == 0 would address an empty slice, so the loop starts at 1.)
            # NOTE(review): rows are counted from the end of a chunk, so this lines up with the
            # target only when the target sits at the end of the last chunk (pad_left=True) - confirm
            # before relying on pad_left=False.
            for t_index in range(1, target_len):
                target_perm_mask[-t_index][-target_len:-t_index] = 0.0
            perm_mask = torch.cat([text_perm_mask, target_perm_mask], dim=-1)

            # Target mapping: an identity matrix over the target positions, shifted to the end of
            # the sequence, with zeros for every text position. This makes the labels just the
            # target text.
            target_mapping = torch.eye(target_len, dtype=torch.float)
            target_mapping_shift = torch.zeros((target_len, text.shape[0] - target_len), dtype=torch.float)
            target_mapping = torch.cat([target_mapping_shift, target_mapping], dim=-1)

            # Chunk all inputs so that none exceeds max_chunk_len. First work out the padded length.
            num_chunks = math.ceil(text.shape[0] / self.max_chunk_len)
            final_text_seq_len = num_chunks * self.max_chunk_len

            # torch.chunk needs an exact multiple of max_chunk_len; pad everything up to it.
            padding_needed = final_text_seq_len - text.shape[0]
            padding_tensor = torch.full(
                (padding_needed,),
                fill_value=self.tokenizer.pad_token_id,
                dtype=torch.long,
            )
            att_padding_tensor = torch.zeros(padding_needed, dtype=torch.float)
            perm_padding_tensor = torch.zeros((self.max_chunk_len, padding_needed), dtype=torch.float)
            tar_map_padding_tensor = torch.zeros((target_len, padding_needed), dtype=torch.float)
            if self.pad_left:
                text = torch.cat([padding_tensor, text], dim=0)
                attention_mask = torch.cat([att_padding_tensor, attention_mask], dim=0)
                perm_mask = torch.cat([perm_padding_tensor, perm_mask], dim=-1)
                target_mapping = torch.cat([tar_map_padding_tensor, target_mapping], dim=-1)
            else:
                text = torch.cat([text, padding_tensor], dim=0)
                attention_mask = torch.cat([attention_mask, att_padding_tensor], dim=0)
                perm_mask = torch.cat([perm_mask, perm_padding_tensor], dim=-1)
                target_mapping = torch.cat([target_mapping, tar_map_padding_tensor], dim=-1)

            return {
                "input_ids": torch.chunk(text, chunks=num_chunks),
                "attention_masks": torch.chunk(attention_mask, chunks=num_chunks),
                "permutation_masks": torch.chunk(perm_mask, chunks=num_chunks, dim=-1),
                "target_mappings": torch.chunk(target_mapping, chunks=num_chunks, dim=-1),
                "labels": target,
            }

    def num_chunks_for_index(self, i):
        """Number of chunks process_element() produces for item `i`.

        Uses the same length arithmetic as process_element() (text truncation, <bos>, <sep> and a
        max_gen_len target) so that batches grouped by this value really share a chunk count.
        """
        text_len = self.raw_data[i]["text"].shape[0]
        return math.ceil(self._sequence_len(text_len) / self.max_chunk_len)

    def __getitem__(self, index):
        entry = self.raw_data[index]
        return self.process_element(entry["text"], entry["target"])

    def __len__(self):
        return len(self.raw_data)

    # Returns a batched dataloader for this dataset. See the documentation for ChunkedTextBatchSampler below for some
    # caveats on this.
    def get_dataloader(self, batch_sz: int, random=True, num_workers=1):
        """Return a DataLoader whose batches only contain items with the same chunk count.

        Args:
            batch_sz: Number of items per batch.
            random: Forwarded to ChunkedTextBatchSampler; False yields batches in dataset order.
            num_workers: Forwarded to DataLoader.
        """
        return DataLoader(
            self,
            batch_sampler=ChunkedTextBatchSampler(self, batch_sz, random=random),
            # Ordering is entirely up to the batch sampler.
            shuffle=False,
            num_workers=num_workers,
        )


# This sampler will only return batches with the same chunk size.
#
# In order to achieve random=True with the above restriction, it "cheats". It does this by selecting a random permutation
# that "should" achieve coverage across the dataset when combined with the batch size, but almost certainly some elements
# will get skipped.
class ChunkedTextBatchSampler(Sampler):
    """Batch sampler that only yields batches whose items all have the same chunk count.

    It relies on the dataset being ordered so that items with equal chunk counts are adjacent;
    every batch is a run of consecutive indices.

    Args:
        chunked_text_set: Dataset providing __len__() and num_chunks_for_index(i).
        batch_sz: Number of indices per batch.
        drop_last: Sequential mode only. When False, runs of fewer than batch_sz items that are
            left over at a chunk-count boundary or at the end are yielded as smaller batches.
        random: When True, batches start at randomly chosen indices (batches may overlap and some
            items may never be visited). When False, batches are produced in dataset order.
    """

    def __init__(
        self,
        chunked_text_set: "ChunkedTextDataset",
        batch_sz: int,
        drop_last=True,
        random=True,
    ):
        self.chunked_text_set = chunked_text_set
        self.batch_sz = batch_sz
        self.drop_last = drop_last
        self.random = random

    def __iter__(self):
        if self.random:
            yield from self._iter_random()
        else:
            yield from self._iter_sequential()

    def _iter_random(self):
        """Yield up to len(self) full batches starting at randomly ordered indices."""
        num_items = len(self.chunked_text_set)
        # Only start from indices that leave room for a full batch. The +1 keeps the last valid
        # start (num_items - batch_sz) reachable; max() covers datasets smaller than one batch.
        num_starts = max(num_items - self.batch_sz + 1, 0)
        max_batches = len(self)
        yielded = 0
        for start in np.random.permutation(num_starts):
            if yielded == max_batches:
                break
            start = int(start)  # plain Python ints for the DataLoader
            chunk_sz = self.chunked_text_set.num_chunks_for_index(start)
            batch = []
            for offset in range(self.batch_sz):
                if chunk_sz != self.chunked_text_set.num_chunks_for_index(start + offset):
                    break
                batch.append(start + offset)
            # Starts whose run crosses a chunk-count boundary are skipped.
            if len(batch) == self.batch_sz:
                yielded += 1
                yield batch

    def _iter_sequential(self):
        """Yield batches in dataset order, never mixing chunk counts within a batch."""
        batch = []
        batch_chunk_sz = None
        for idx in range(len(self.chunked_text_set)):
            chunk_sz_idx = self.chunked_text_set.num_chunks_for_index(idx)
            if chunk_sz_idx != batch_chunk_sz:
                # Chunk count changed: flush or drop whatever was collected so far.
                if batch and not self.drop_last:
                    yield batch
                batch = []
                batch_chunk_sz = chunk_sz_idx
            batch.append(idx)
            if len(batch) == self.batch_sz:
                yield batch
                batch = []
        # Leftover items at the very end.
        if batch and not self.drop_last:
            yield batch

    def __len__(self):
        # This is possibly (likely) not accurate, but it shouldn't matter.
        return int(len(self.chunked_text_set) / self.batch_sz)

In [0]:
!ls

gdrive	sample_data  test.pt


In [0]:
# Load the tokenizer and build the chunked test dataset plus its dataloader.
import os
from transformers import XLNetTokenizer

# Folder containing the tokenized data files; an empty string resolves to the working directory.
input_folder = ""

# Settings for chunking and the model. The dataset below reads "max_seq_len" (chunk length) and
# "predict_len" (fixed target length); see the ChunkedTextDataset docs for their meaning.
chunked_model_config = dict(
    max_seq_len=256,
    model_name="xlnet-base-cased",
    predict_len=32,
    batch_size=4,
    starting_lr=2e-5,
    output_dir="storage/outputs",
    mem_len=1024,
)

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

# Test split, padded on the left so that the target ends up at the end of the last chunk.
test_set = ChunkedTextDataset(
    data_file=os.path.join(input_folder, "test.pt"),
    tokenizer=tokenizer,
    max_chunk_len=chunked_model_config["max_seq_len"],
    max_gen_len=chunked_model_config["predict_len"],
    pad_left=True,
)
# One item per batch, loaded in the main process.
test_loader = test_set.get_dataloader(batch_sz=1, num_workers=0)

In [0]:
# Build an XLNet LM-head model from the pretrained "xlnet-base-cased" weights, overriding the
# config's mem_len with the value from chunked_model_config.
print("Loading model...")
config = transformers.XLNetConfig.from_pretrained("xlnet-base-cased")
config.mem_len = chunked_model_config["mem_len"]
model = transformers.XLNetLMHeadModel.from_pretrained("xlnet-base-cased", config=config)

# Use the first CUDA device when one is available, otherwise the CPU, and move the model there.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model=model.to(device)

Loading model...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




In [0]:
# Restore the fine-tuned weights stored under 'model_state_dict' in the checkpoint on Google Drive.
# map_location=device lets a checkpoint saved from a GPU load on whatever device the model was
# moved to above (including the CPU fallback).
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load(
    '/content/gdrive/My Drive/docsummary/xlnet/model4.pth', map_location=device
)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [0]:
# Generate summaries for randomly drawn test articles and store them, together with the prompt
# and the reference summary, in <model_dir>/gen_results.json.
import torch
import transformers
import os
import json
import random
from tqdm.auto import tqdm
model_dir='/content/gdrive/My Drive/docsummary/xlnet/chkpt4-8000'
data_file='test.pt'
number_to_generate=50
# Prefer the GPU but fall back to the CPU when CUDA is unavailable, as the model-loading cell does.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
do_sampling=False

# Load the chunk config. This is saved alongside the model if it was trained using train_xlnet_lang_modeler
# and saves some of the configuration options used to train the model which are also useful for generation.
with open(os.path.join(model_dir, "chunk_config.json"), "r") as chunk_cfg:
    chunk_config = json.load(chunk_cfg)
chunk_seq_len = chunk_config["max_seq_len"]
target_pred_max_len = chunk_config["predict_len"]

# Load the tokenizer, config and model.
# NOTE(review): this rebinds `tokenizer`, `config`, `model` and `test_set` from earlier cells, so
# the weights restored from model4.pth above are not the ones used here - confirm this is intended.
tokenizer = transformers.XLNetTokenizer.from_pretrained(chunk_config["model_name"])
config = transformers.XLNetConfig.from_pretrained(chunk_config["model_name"])
config.mem_len = chunk_config["mem_len"]
model = transformers.XLNetLMHeadModel.from_pretrained(model_dir, config=config)

# Load the dataset for the file specified above. This class is the same one used during training to perform chunking
# and some preprocessing.
test_set = ChunkedTextDataset(
    data_file,
    tokenizer,
    chunk_config["max_seq_len"],
    chunk_config["predict_len"],
    pad_left=True,
)

#random.seed(12345)
device = torch.device(device)
model.to(device)
model.eval()
# NOTE: this disables autograd globally, i.e. also for any cell executed after this one.
torch.set_grad_enabled(False)
results = {"meta": chunk_config, "results": []}

for i in tqdm(range(number_to_generate)):
    # Items are drawn with replacement, so the same article can be picked more than once.
    chunked_data = test_set[random.randint(0, len(test_set) - 1)]
    full_text = []

    # Start by establishing mems state - achieved by doing forward passes on all chunks but the last one.
    mems = None
    num_chunks = len(chunked_data["input_ids"])
    for c in range(num_chunks - 1):
        full_text.extend(chunked_data["input_ids"][c].tolist())

        model_inputs = {
            "input_ids": chunked_data["input_ids"][c].unsqueeze(0).to(device),
            "attention_mask": chunked_data["attention_masks"][c]
            .unsqueeze(0)
            .to(device),
            "perm_mask": chunked_data["permutation_masks"][c].unsqueeze(0).to(device),
            "target_mapping": chunked_data["target_mappings"][c]
            .unsqueeze(0)
            .to(device),
        }
        if mems is not None:
            model_inputs["mems"] = mems

        # NOTE(review): unpacking assumes the model returns a (logits, mems) tuple - this depends
        # on the installed transformers version; confirm.
        logits, mems = model.forward(**model_inputs)

    # Now get the input IDs minus the target* for the last chunk. This will serve as the "prompt" for the model.
    text_len = chunk_seq_len - target_pred_max_len
    prompt_inputs = chunked_data["input_ids"][-1][0:text_len]
    # Record plain ints, like for the earlier chunks, rather than one device tensor per token.
    full_text.extend(prompt_inputs.tolist())
    prompt_inputs = prompt_inputs.to(device)
    prompt_inputs = prompt_inputs.unsqueeze(dim=0)  # generate() expects batched inputs.

    # Use the transformers generate function to do the actual generation now.
    if do_sampling:
        genned_results = model.generate(
            prompt_inputs,
            max_length=chunk_seq_len,
            do_sample=True,
            num_beams=4,
            temperature=0.7,
            top_k=0,
            top_p=0.9,
            repetition_penalty=5,
            eos_token_id=tokenizer.eos_token_id,
            num_return_sequences=5,
            mems=mems,
        )
    else:
        genned_results = model.generate(
            prompt_inputs,
            max_length=chunk_seq_len,
            do_sample=False,
            num_beams=12,
            repetition_penalty=5,
            eos_token_id=tokenizer.eos_token_id,
            num_return_sequences=2,
            mems=mems,
        )

    # Append results here.
    seqs, _ = genned_results.shape
    genned_texts = []
    prompt = tokenizer.decode(prompt_inputs[0])
    print(
        "\n------------------------------------------------------------------------------"
    )
    print("PROMPT: `%s`" % (prompt))
    for s in range(seqs):
        # Everything after the prompt is the generated continuation.
        genned_texts.append(tokenizer.decode(genned_results[s][text_len:]))
        print("GENERATED: `%s`" % (genned_texts[-1]))
    result = {
        "prompt": tokenizer.decode(full_text),
        "generated": genned_texts,
        "actual": tokenizer.decode(chunked_data["labels"]),
    }
    print(
        "------------------------------------------------------------------------------"
    )
    results["results"].append(result)

# Record the results.
with open(os.path.join(model_dir, "gen_results.json"), "w", encoding="utf-8") as result_file:
    json.dump(results, result_file, sort_keys=True, indent=4)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


------------------------------------------------------------------------------
PROMPT: `said there were millions of workers who knew their next pay packet would not be enough. But Chancellor Philip Hammond said the UK economy "continued to confound the commentators with robust growth". Mr Corbyn said: "This budget has done nothing to tackle low pay... and nothing to make a fair economy that truly works for everyone." "It is built on unfairness," he said. Mr Corbyn accused the government of "cutting services and the living standards of the many, to continue to fund the tax cuts of the few". He calculated there would be a £70bn tax giveaway for "those who need it the least". As an example, he said that instead of using £10m to set up a children's funeral fund, the government was cutting support for bereaved families. He said there was a "crisis in job security" and "many people don't know if they will be working day to day", criticising the chancellor for failing to ban zero hour contra