<a href="https://colab.research.google.com/github/asking28/finsummary/blob/master/t5_summarization_github.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install pytorch-lightning
!pip install tensorboardX

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration


In [None]:

import glob
import logging

import time

import argparse
import json
import os
from os.path import join, exists
from datetime import timedelta
import pickle as pkl
from toolz import compose
import torch
from torch import optim
from torch.nn import functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
import math
from torch import nn
from itertools import starmap
from toolz import curry, reduce
from torch.utils.data import Dataset
from torch.nn import init
import random
from collections import defaultdict
import re
from toolz.sandbox import unzip
from toolz import curry, concat, compose
from toolz import curried
import torch.multiprocessing as mp
from os.path import basename
import nltk
import gensim
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import tensorboardX
from time import time
from glob import glob
from torch.nn.utils import clip_grad_norm_
from transformers import get_constant_schedule_with_warmup
import pytorch_lightning as pl
import numpy as np
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

In [None]:
!nvidia-smi

In [None]:
nltk.download('punkt')

In [None]:
# Punkt sentence splitter (English); used by CnnDmDataset.__getitem__ to split
# summaries and reports into sentences. Requires the 'punkt' download above.
nltk_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
def encode_sentences(tokenizer, sentences, max_length, pad_to_max_length=True, return_tensors="pt"):
    """
    Encode each sentence on its own with ``tokenizer.batch_encode_plus``.

    Every sentence is wrapped in a one-element batch, so the result is a list
    holding one tokenizer output per input sentence, in input order. The
    ``max_length``, ``pad_to_max_length`` and ``return_tensors`` arguments are
    forwarded to the tokenizer unchanged.
    """
    def _encode_one(sentence):
        # A batch of exactly one text per call keeps one output object per sentence.
        return tokenizer.batch_encode_plus(
            [sentence],
            max_length=max_length,
            pad_to_max_length=pad_to_max_length,
            return_tensors=return_tensors,
        )

    return [_encode_one(sentence) for sentence in sentences]

In [None]:
import sys
class CnnDmDataset(Dataset):
    """Dataset over the JSON files in ``<path>/<split>/extraction``.

    Each item combines three files found under ``<path>/<split>``:

    * ``extraction/<idx>.json`` - must provide ``filename`` and ``extracted_labels``;
    * ``gold_summaries/<filename>`` - the summary text named by the JSON;
    * ``annual_reports/<prefix>.txt`` - the report, where ``prefix`` is the part
      of ``idx`` before the first underscore.
    """

    def __init__(self, split: str, path: str, tokenizer, source_length, target_length) -> None:
        """Index the extraction files of ``split`` below ``path``.

        ``tokenizer`` is handed to ``encode_sentences``; ``source_length`` and
        ``target_length`` are the ``max_length`` values used for report and
        summary sentences respectively.
        """
        assert split in ['training', 'validation', 'test']
        self._data_path = join(path, split)
        # Sorted so that index -> file is reproducible; os.listdir order is
        # filesystem-dependent.
        self._json_files = sorted(
            name for name in os.listdir(join(self._data_path, 'extraction'))
            if name.endswith('json')
        )
        # The original called an undefined `_count_data` helper here; the
        # number of examples is simply the number of extraction files.
        self._n_data = len(self._json_files)
        self._idx_list = [name.split('.')[0] for name in self._json_files]
        self.tokenizer = tokenizer
        self.source_length = source_length
        self.target_length = target_length

    def __len__(self) -> int:
        return len(self._json_files)

    @staticmethod
    def _read_sentences(file_path):
        """Read a UTF-8 text file and return its sentences with newlines turned into spaces."""
        with open(file_path, encoding='utf8') as f:
            text = f.read()
        return [sent.replace('\n', ' ') for sent in nltk_tokenizer.tokenize(text)]

    def __getitem__(self, i: int):
        """Return the extraction JSON dict for item ``i`` with two added keys.

        ``'report'`` holds the encoded report sentences selected by
        ``extracted_labels`` and ``'summary'`` holds the encoded summary
        sentences (both as returned by ``encode_sentences``).

        Raises:
            RuntimeError: if the extraction JSON cannot be read or parsed.
        """
        idx = self._idx_list[i]
        json_path = join(self._data_path, 'extraction', '{}.json'.format(idx))
        try:
            with open(json_path, 'r') as f:
                js = json.load(f)
        except (OSError, ValueError) as err:
            # Surface the offending path but keep the original cause, instead of
            # swallowing every exception and terminating the interpreter.
            raise RuntimeError('Failed to read extraction file: {}'.format(json_path)) from err

        abs_sentences = self._read_sentences(
            join(self._data_path, 'gold_summaries', js['filename'])
        )
        ext_fname = idx.split('_')[0] + '.txt'
        ext_sentences = self._read_sentences(
            join(self._data_path, 'annual_reports', ext_fname)
        )
        # extracted_labels are indices into the report's sentence list.
        matching_report_sentences = [ext_sentences[label] for label in js['extracted_labels']]

        js['summary'] = encode_sentences(self.tokenizer, abs_sentences, self.target_length)
        # Report (source) sentences use source_length; previously target_length
        # was used for both and source_length was ignored.
        js['report'] = encode_sentences(self.tokenizer, matching_report_sentences, self.source_length)
        return js




In [None]:
# Batch size used by the standalone DataLoader cell at the end of the notebook.
BUCKET_SIZE = 100
# Dataset root on the mounted Drive; MatchDataset passes it to CnnDmDataset,
# which reads from <DATA_DIR>/<split>/...
DATA_DIR = r"/content/drive/My Drive/finsummary/Data"
# NOTE(review): not referenced by any other code visible in this notebook.
max_length=100

In [None]:
# Module-level 't5-small' tokenizer. NOTE(review): the classes below receive
# their tokenizer as an argument, so this global appears unused elsewhere.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [None]:
class MatchDataset(CnnDmDataset):
    """CnnDmDataset rooted at DATA_DIR that yields stacked tensors.

    Each item is ``(input_ids, attention_mask, target_ids)``: the first two
    come from the encoded report sentences, the last from the encoded
    summary sentences.
    """

    def __init__(self, split, t5_tokenizer, source_length, target_length):
        super().__init__(split, DATA_DIR, t5_tokenizer, source_length, target_length)

    def __getitem__(self, i):
        example = super().__getitem__(i)
        report_encodings = example['report']
        summary_encodings = example['summary']

        def _stacked(encodings, field):
            # Each encoding holds a one-sentence batch; squeeze it, then stack the sentences.
            return torch.stack([enc[field].squeeze() for enc in encodings])

        return (
            _stacked(report_encodings, "input_ids"),
            _stacked(report_encodings, "attention_mask"),
            _stacked(summary_encodings, "input_ids"),
        )

In [None]:
def set_seed(args: argparse.Namespace):
    """
    Seed Python's ``random``, NumPy and PyTorch from ``args.seed``.

    When ``args.n_gpu`` is positive, all CUDA generators are seeded as well.
    """
    seed = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if args.n_gpu <= 0:
        return
    torch.cuda.manual_seed_all(seed)

In [None]:

def coll_fn(data):
    """Collate dataset items into one flat batch of sentences.

    ``data`` is a sequence of ``(input_ids, attention_mask, target_ids)``
    tuples of tensors. For each of the three fields the per-item tensors are
    concatenated along dimension 0, so the rows of every item end up in one
    tensor (in item order).

    Returns:
        Tuple ``(input_ids, attention_mask, target_ids)`` of concatenated tensors.
    """
    input_ids, input_attention_mask, target_ids = zip(*data)
    # A single torch.cat per field gives the same result as unbinding every row
    # in Python and re-stacking them, without the per-row overhead.
    input_id_list = torch.cat(input_ids, dim=0)
    input_mask_list = torch.cat(input_attention_mask, dim=0)
    target_ids_list = torch.cat(target_ids, dim=0)
    return input_id_list, input_mask_list, target_ids_list


In [None]:
class T5Module(pl.LightningModule):
    """LightningModule wrapping a pretrained LM-head model and its tokenizer.

    The model, tokenizer and config are loaded from
    ``hparams.model_name_or_path``; training and validation data come from
    ``MatchDataset``.
    """

    def __init__(self, hparams: argparse.Namespace, **config_kwargs):
        "Initialize a model."

        super().__init__()
        self.hparams = hparams
        # An empty cache_dir string means "use the library default".
        cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
        # Read the config file of the T5 model (T5Config)
        # AutoConfig allows you to read the configuration for a specified model (e.g. in this case, t5-base)
        # Reference: https://huggingface.co/transformers/model_doc/auto.html#autoconfig
        self.config = AutoConfig.from_pretrained(self.hparams.model_name_or_path)
        # Read the tokenizer of the T5 model (T5Tokenizer)
        # AutoTokenizer allows you to read the tokenizer for a specified model (e.g. in this case, t5-base)
        # Reference: https://huggingface.co/transformers/model_doc/t5.html#t5tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.hparams.model_name_or_path,
            cache_dir=cache_dir,
        )
        # Read the model file for the pre-trained T5 model (T5ForConditionalGeneration)
        # AutoModelWithLMHead allows you to read any of the language modelling models from the transformers library (e.g. in this case, t5-base)
        # Automodels reference: https://huggingface.co/transformers/model_doc/auto.html#automodel
        self.model = AutoModelWithLMHead.from_pretrained(
            self.hparams.model_name_or_path,
            from_tf=bool(".ckpt" in self.hparams.model_name_or_path),  # Checkpoint is a TF format
            config=self.config,
            cache_dir=cache_dir,
        )

        # Save dataset params
        self.dataset_kwargs: dict = dict(
            data_dir=self.hparams.data_dir,
            max_source_length=self.hparams.max_source_length,
            max_target_length=self.hparams.max_target_length,
        )

    def forward(
        self,
        input_ids,  # Indices of input sequence tokens in the vocabulary.
        attention_mask=None,  # Mask to avoid performing attention on padding token indices
        decoder_input_ids=None,  # T5 uses the pad_token_id as the starting token for decoder_input_ids generation.
        lm_labels=None,  # Labels for the LM loss. Note: loss is returned when lm_labels is provided.
    ):
        """
        Forward all arguments to the wrapped model and return its outputs.

        ``_step`` reads the loss from the first element of the returned outputs.
        """
        # Details on how to use this in the Hugging Face T5 docs: https://huggingface.co/transformers/model_doc/t5.html
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            lm_labels=lm_labels,
        )

    def get_dataloader(self, type_path: str, batch_size: int, shuffle: bool = False) -> DataLoader:
        """Build a DataLoader over the ``type_path`` split of MatchDataset, collated by ``coll_fn``."""
        dataset = MatchDataset(
            type_path, self.tokenizer, self.hparams.max_source_length, self.hparams.max_target_length
        )
        dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=coll_fn, shuffle=shuffle)
        return dataloader

    def train_dataloader(self) -> DataLoader:
        """Return the shuffled training DataLoader and create the LR scheduler.

        Uses ``self.opt``, so ``configure_optimizers`` must have run first.
        """
        dataloader = self.get_dataloader("training", batch_size=self.hparams.train_batch_size, shuffle=True)
        # Total number of optimizer updates across all epochs.
        t_total = (
            (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
            // self.hparams.gradient_accumulation_steps
            * float(self.hparams.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self) -> DataLoader:
        """Return the (unshuffled) validation DataLoader."""
        return self.get_dataloader("validation", batch_size=self.hparams.eval_batch_size)

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        # Weight decay will not be applied to "bias" and "LayerNorm.weight" parameters
        no_decay = ["bias", "LayerNorm.weight"]

        # Group parameters to those that will and will not have weight decay applied
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        # Use AdamW as an optimizer
        # Intro here: https://www.fast.ai/2018/07/02/adam-weight-decay/
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        # Kept on the module so train_dataloader can attach the scheduler to it.
        self.opt = optimizer
        return [optimizer]

    def _step(self, batch, return_text=False):
        """
        Runs forward pass and calculates loss per batch. Applied for training_step, and validation_step

        Returns the loss, or ``(loss, target_text)`` when ``return_text`` is true,
        where ``target_text`` is the decoded decoder input of every row.
        """
        pad_token_id = self.tokenizer.pad_token_id
        source_ids, source_mask, y = batch
        # Decoder input is the target without its last token; labels are the
        # target shifted left by one position.
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone()
        # Change pad_token_id to -100
        lm_labels[y[:, 1:] == pad_token_id] = -100
        # Run forward pass and calculate loss
        outputs = self(source_ids, attention_mask=source_mask, decoder_input_ids=y_ids, lm_labels=lm_labels,)
        # Only get loss from the output since that's all we need to apply our optimizer
        loss = outputs[0]
        if return_text:
            target_text = [self.tokenizer.decode(ids) for ids in y_ids]
            return loss, target_text
        else:
            return loss

    def training_step(self, batch, batch_idx):
        """
        Runs forward pass, calculates loss, and returns loss (and logs) in a dict
        """
        loss = self._step(batch)

        # Notice that each training step loss is recorded on tensorboard, which makes sense since we're tracking loss per batch
        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
        """
        Adjust weights based on calculated gradients + learning rate scheduler, and refresh gradients
        Reference for optimizer_step: https://pytorch-lightning.readthedocs.io/en/latest/optimizers.html
        """
        if self.trainer.use_tpu:
            # `xm` is bound as a module global by generic_train when TPU cores are requested.
            xm.optimizer_step(optimizer)
        else:
            # Adjust weights based on calculated gradients
            optimizer.step()

        # Refresh gradients (to zero)
        optimizer.zero_grad()
        # Update the learning rate scheduler
        self.lr_scheduler.step()

    # Step during validation

    def validation_step(self, batch, batch_idx):
        """
        Runs forward pass, calculates loss, and returns loss in a dict
        """
        # Only the loss is reported, so skip decoding the targets to text.
        loss = self._step(batch)

        # preds = self.test_step(batch, batch_idx)
        # preds_text = preds["preds"]
        # Track jaccard score to get validation accuracy
        # jaccard_score = [jaccard(p, t) for p, t in zip(preds_text, target_text)]

        return {"val_loss": loss}

    def test_step(self, batch, batch_idx):
        """
        Generates predictions for a test batch and returns them as decoded text.

        Returns a dict ``{"preds": [...]}`` with one decoded string per source row.
        The targets in the batch are ignored.
        """
        # The batch is already padded to fixed width by the dataset, so it is
        # unpacked directly (the original referenced an undefined T5Dataset helper).
        source_ids, source_mask, _ = batch
        # NOTE: the following kwargs get more speed and lower quality summaries than those in evaluate_cnn.py
        # Generate reference: https://github.com/huggingface/transformers/blob/3e0f06210646a440509efa718b30d18322d6a830/src/transformers/modeling_utils.py#L769
        generated_ids = self.model.generate(
            input_ids=source_ids,
            attention_mask=source_mask,
            num_beams=1,
            max_length=80,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
            use_cache=True,
        )
        preds = [
            self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for g in generated_ids
        ]

        return {"preds": preds}

    # Show loss after validation

    def validation_end(self, outputs):
        """
        Calculate average loss for all the validation batches
        """
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        # jaccard_scores = sum([x["jaccard_score"] for x in outputs], [])
        # avg_jaccard_score = np.mean(jaccard_scores)
        tensorboard_logs = {"val_loss": avg_loss}
        return {"avg_val_loss": avg_loss, "log": tensorboard_logs}

    @staticmethod
    def add_model_specific_args(parser, root_dir):
        """Register model, optimizer and data options on ``parser`` and return it.

        ``root_dir`` is accepted but not used.
        """
        parser.add_argument(
            "--model_name_or_path",
            default=None,
            type=str,
            required=True,
            help="Path to pretrained model or model identifier from huggingface.co/models",
        )
        parser.add_argument(
            "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
        )
        parser.add_argument(
            "--tokenizer_name",
            default="",
            type=str,
            help="Pretrained tokenizer name or path if not the same as model_name",
        )
        parser.add_argument(
            "--cache_dir",
            default="",
            type=str,
            help="Where do you want to store the pre-trained models downloaded from s3",
        )
        parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
        parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
        parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
        parser.add_argument("--warmup_steps", default=5, type=int, help="Linear warmup over warmup_steps.")
        parser.add_argument(
            "--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform."
        )

        parser.add_argument("--train_batch_size", default=1, type=int)
        parser.add_argument("--eval_batch_size", default=1, type=int)

        parser.add_argument(
            "--max_source_length",
            default=50,
            type=int,
            help="The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded.",
        )
        parser.add_argument(
            "--max_target_length",
            default=50,
            type=int,
            help="The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded.",
        )

        # NOTE(review): required=True means the default below is never applied.
        parser.add_argument(
            "--data_dir",
            default=DATA_DIR,
            type=str,
            required=True,
            help="The input data dir. Should contain the dataset files for the text generation task.",
        )
        return parser


In [None]:
class LoggingCallback(pl.Callback):
    """Callback that writes the trainer's callback metrics to the module logger after validation."""

    def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        logger.info("***** Validation results *****")
        if not pl_module.is_logger():
            return
        metrics = trainer.callback_metrics
        skipped = ("log", "progress_bar")
        # Log results in sorted key order, leaving out the nested dict entries.
        for name in (k for k in sorted(metrics) if k not in skipped):
            logger.info("{} = {}\n".format(name, str(metrics[name])))


def add_generic_args(parser, root_dir):
    """Register the model-independent training options on ``parser``.

    The parser is modified in place and nothing is returned. ``root_dir`` is
    accepted but not used.
    """
    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        ("--output_dir", dict(
            default=None,
            type=str,
            required=True,
            help="The output directory where the model predictions and checkpoints will be written.",
        )),
        ("--fp16", dict(
            action="store_true",
            help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
        )),
        ("--fp16_opt_level", dict(
            type=str,
            default="O1",
            help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
            "See details at https://nvidia.github.io/apex/amp.html",
        )),
        ("--n_gpu", dict(type=int, default=1)),
        ("--n_tpu_cores", dict(type=int, default=0)),
        ("--max_grad_norm", dict(default=1.0, type=float, help="Max gradient norm.")),
        ("--do_train", dict(action="store_true", help="Whether to run training.")),
        ("--do_predict", dict(action="store_true", help="Whether to run predictions on the test set.")),
        ("--gradient_accumulation_steps", dict(
            type=int,
            default=1,
            help="Number of updates steps to accumulate before performing a backward/update pass.",
        )),
        ("--seed", dict(type=int, default=42, help="random seed for initialization")),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)


In [None]:
def generic_train(model: T5Module, args: argparse.Namespace):
    """Seed the RNGs, build a ``pl.Trainer`` from ``args`` and optionally fit ``model``.

    Raises ``ValueError`` when training is requested and ``args.output_dir``
    already exists with content. Returns the trainer whether or not
    ``args.do_train`` is set.
    """
    set_seed(args)

    output_dir_in_use = os.path.exists(args.output_dir) and os.listdir(args.output_dir)
    if output_dir_in_use and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))

    # Can take out checkpoint saving after each epoch to save memory
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filepath=args.output_dir,
        prefix="checkpoint",
        monitor="val_loss",
        mode="min",
        save_top_k=5,
    )

    train_params = {
        "accumulate_grad_batches": args.gradient_accumulation_steps,
        "gpus": args.n_gpu,
        "max_epochs": args.num_train_epochs,
        "early_stop_callback": False,
        "gradient_clip_val": args.max_grad_norm,
        "checkpoint_callback": checkpoint_callback,
        "callbacks": [LoggingCallback()],
    }

    if args.fp16:
        train_params.update(use_amp=args.fp16, amp_level=args.fp16_opt_level)

    if args.n_tpu_cores > 0:
        # Bind `xm` at module level so T5Module.optimizer_step can reach it.
        global xm
        import torch_xla.core.xla_model as xm

        train_params.update(num_tpu_cores=args.n_tpu_cores, gpus=0)

    if args.n_gpu > 1:
        train_params["distributed_backend"] = "ddp"

    trainer = pl.Trainer(**train_params)
    if args.do_train:
        trainer.fit(model)
    return trainer

In [None]:

# Emit INFO-level records from the root logger.
logging.basicConfig(level = logging.INFO)

# Module-level logger; LoggingCallback.on_validation_end writes through it.
logger = logging.getLogger(__name__)

def main(args):
    """Build a T5Module from ``args``, train it and save the resulting model.

    The model is written to ``args.output_dir`` and the tokenizer files to the
    current working directory. When ``args.do_predict`` is set,
    ``trainer.test`` is run afterwards.

    Returns:
        The trainer created by ``generic_train``.
    """
    # If output_dir not provided, a timestamped folder is created under the results directory
    if not args.output_dir:
        # Imported locally: the module-level name `time` is rebound to the
        # time() function by `from time import time`, so `time.strftime` fails.
        from time import strftime

        # Neither parser helper in this file defines `task`; fall back to a fixed label.
        run_label = getattr(args, "task", "t5")
        args.output_dir = os.path.join(
            "/content/drive/My Drive/finsummary/results",
            "{}_{}".format(run_label, strftime('%Y%m%d_%H%M%S')),
        )
        os.makedirs(args.output_dir)
    model = T5Module(args)
    trainer = generic_train(model, args)

    # Save the last model as model.bin
    #checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "checkpointepoch=*.ckpt"), recursive=True)))
    #model = model.load_from_checkpoint(checkpoints[-1])
    model.model.save_pretrained(args.output_dir)
    # Save tokenizer files
    # NOTE(review): these go to the working directory, not args.output_dir - confirm this is intended.
    model.tokenizer.save_pretrained('./')

    # Optionally, predict on dev set and write to output_dir
    if args.do_predict:
        # See https://github.com/huggingface/transformers/issues/3159
        # pl use this format to create a checkpoint:
        # https://github.com/PyTorchLightning/pytorch-lightning/blob/master\
        # /pytorch_lightning/callbacks/model_checkpoint.py#L169
        trainer.test(model)
    return trainer


In [None]:
# Command-line style arguments for this run. The trailing backslashes join the
# lines into one string, which is split on whitespace before parsing.
# NOTE(review): MatchDataset reads from the DATA_DIR constant, so --data_dir
# does not appear to change where the data is loaded from.
ARGS_STR = """
--data_dir=./ \
--model_name_or_path=t5-small \
--learning_rate=3e-5 \
--train_batch_size=1 \
--output_dir=output/ \
--do_train \
--n_gpu=1 \
--num_train_epochs 1 \
--max_source_length 80 \
"""
# Optional flags, currently disabled:
#--eval_batch_size=3 \
#--do_predict \

# Build the parser from the generic and model-specific options, parse
# ARGS_STR and launch training.
parser = argparse.ArgumentParser()
add_generic_args(parser, os.getcwd())
parser = T5Module.add_model_specific_args(parser, os.getcwd())
args = parser.parse_args(ARGS_STR.split())
trainer = main(args)

In [None]:
# Standalone training DataLoader built outside the Lightning module, using a
# 't5-base' tokenizer and max lengths of 100 for both source and target.
tokenizer = AutoTokenizer.from_pretrained('t5-base',
            cache_dir="/root/.cache/torch/transformers/",
        )
# Batches BUCKET_SIZE dataset items at a time; coll_fn merges the sentences of
# all items in a batch into single tensors.
train_loader = DataLoader(
    MatchDataset('training',tokenizer,100,100), batch_size=BUCKET_SIZE,
    shuffle=True,
    num_workers=4,
    collate_fn=coll_fn
)

In [None]:
# data=train_loader.dataset[3]

In [None]:
# len(data)

In [None]:
# source_lists, target_lists = data

In [None]:
# ten_list=[]
# for x in source_lists:
#   ten_list.append(x['input_ids'][0])

In [None]:
# torch.stack(ten_list)

In [None]:
# coll_fn(data)