In [None]:
# Query the NVIDIA driver to see which GPU (if any) this runtime exposes.
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
!pip install --quiet transformers==4.5.0
#!pip install --quiet pytorch-lightenining==1.2.7
!pip install torchtext==0.8.0 torch==1.7.1 pytorch-lightning==1.2.2
!pip install datasets

[K     |████████████████████████████████| 2.1 MB 5.4 MB/s 
[K     |████████████████████████████████| 895 kB 36.7 MB/s 
[K     |████████████████████████████████| 3.3 MB 34.0 MB/s 
[?25hCollecting torchtext==0.8.0
  Downloading torchtext-0.8.0-cp37-cp37m-manylinux1_x86_64.whl (6.9 MB)
[K     |████████████████████████████████| 6.9 MB 4.7 MB/s 
[?25hCollecting torch==1.7.1
  Downloading torch-1.7.1-cp37-cp37m-manylinux1_x86_64.whl (776.8 MB)
[K     |██                              | 48.0 MB 1.9 MB/s eta 0:06:17

In [None]:
import json
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset,DataLoader
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap
import datasets
from transformers import(
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)
from tqdm.auto import tqdm

In [None]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

%matplotlib inline
%config InlineBackend.figure_formats='retina'
sns.set(style='whitegrid',palette='muted',font_scale=1.2)
rcParams['figure.figsize']=16,10

In [None]:

# Fix the random seeds so runs are repeatable.
pl.seed_everything(42)
# Load the three XSum splits in one pass instead of three copy-pasted calls.
# NOTE: the following cell reassigns these same names from CSV files.
train_data, val_data, test_data = (
    datasets.load_dataset("xsum", split=split_name)
    for split_name in ("train", "validation", "test")
)



In [None]:
# Fix the random seeds so runs are repeatable.
pl.seed_everything(42)
# Read each split from CSV with identical parser settings; malformed rows are
# skipped (error_bad_lines=False).
# NOTE(review): `test_data` is read from val_data.csv, the same file as
# `val_data` — confirm whether a separate test CSV was intended.
train_data, val_data, test_data = (
    pd.read_csv(csv_path, encoding="latin-1", engine='python', error_bad_lines=False)
    for csv_path in (
        '/content/train_data.csv',
        '/content/val_data.csv',
        '/content/val_data.csv',
    )
)

In [None]:
# Keep only the "document" and "summary" columns of every split.
train_data, test_data, val_data = (
    frame[["document", "summary"]]
    for frame in (train_data, test_data, val_data)
)

In [None]:
# Show the first training row: first column, then second column.
first_row = train_data.iloc[0]
print('full: ', first_row.iloc[0], '\n')
print('summary:', first_row.iloc[1])

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
class NewSummaryDataset(Dataset):
  """Map-style dataset that tokenizes (document, summary) pairs.

  Args:
    data: pandas DataFrame with "document" and "summary" columns; rows are
      fetched positionally through ``.iloc``.
    tokenizer: tokenizer used to encode both the document and the summary.
    text_max_token_len: documents are padded/truncated to this many tokens.
    summary_max_token_len: summaries are padded/truncated to this many tokens.
  """

  def __init__(
      self,
      data,
      tokenizer: "T5Tokenizer",
      text_max_token_len: int = 512,
      summary_max_token_len: int = 128
  ):
    self.tokenizer = tokenizer
    self.data = data
    self.text_max_token_len = text_max_token_len
    self.summary_max_token_len = summary_max_token_len

  def __len__(self):
    """Return the number of rows in the underlying frame."""
    return len(self.data)

  def _encode(self, text, max_length: int):
    """Tokenize ``text`` to exactly ``max_length`` tokens as "pt" tensors."""
    # Always use the tokenizer handed to this instance, never a notebook global.
    return self.tokenizer(
        text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )

  def __getitem__(self, index: int):
    """Return the tokenized sample at position ``index``.

    Returns:
      dict with the raw ``text`` and ``summary`` strings, the flattened
      ``text_input_ids`` / ``text_attention_mask`` of the document, the
      flattened ``labels`` (summary token ids with padding replaced by -100)
      and the summary's ``labels_attention_mask``.
    """
    data_row = self.data.iloc[index]
    text = data_row["document"]
    summary = data_row["summary"]

    text_encoding = self._encode(text, self.text_max_token_len)
    # Labels must come from the reference summary, not from the document.
    summary_encoding = self._encode(summary, self.summary_max_token_len)

    # Work on a copy so the encoding's own tensor is left untouched, then mask
    # padding positions with -100 so the loss ignores them.
    labels = summary_encoding["input_ids"].clone()
    labels[labels == self.tokenizer.pad_token_id] = -100

    return dict(
        text=text,
        summary=summary,
        text_input_ids=text_encoding["input_ids"].flatten(),
        text_attention_mask=text_encoding["attention_mask"].flatten(),
        labels=labels.flatten(),
        labels_attention_mask=summary_encoding["attention_mask"].flatten()
    )

In [None]:
class NewSummaryDataModule(pl.LightningDataModule):
  """Lightning data module that wraps the train/test frames in DataLoaders.

  Args:
    train_data: frame used to build the training dataset.
    test_data: frame used to build the dataset served by both the validation
      and the test loaders.
    tokenizer: tokenizer forwarded to ``NewSummaryDataset``.
    batch_size: number of samples per batch for every loader. It is the 4th
      positional parameter because that is how this notebook constructs the
      module; previously the value was read from a global and the 4th
      positional argument silently became ``text_max_token_len``.
    text_max_token_len: maximum document length in tokens.
    summary_max_token_len: maximum summary length in tokens.
  """

  def __init__(
      self,
      train_data,
      test_data,
      tokenizer: T5Tokenizer,
      batch_size: int = 8,
      text_max_token_len: int = 512,
      summary_max_token_len: int = 128
  ):
    super().__init__()
    self.train_df = train_data
    self.test_df = test_data

    self.batch_size = batch_size
    self.tokenizer = tokenizer
    self.text_max_token_len = text_max_token_len
    self.summary_max_token_len = summary_max_token_len

  def _make_dataset(self, frame):
    """Wrap ``frame`` in a NewSummaryDataset using this module's settings."""
    return NewSummaryDataset(
        frame,
        self.tokenizer,
        self.text_max_token_len,
        self.summary_max_token_len
    )

  def _make_loader(self, dataset, shuffle: bool):
    """Build a DataLoader over ``dataset`` with the shared batch settings."""
    return DataLoader(
        dataset,
        batch_size=self.batch_size,
        shuffle=shuffle,
        num_workers=2
    )

  def setup(self, stage=None):
    """Create the train and test datasets from the stored frames."""
    self.train_dataset = self._make_dataset(self.train_df)
    self.test_dataset = self._make_dataset(self.test_df)

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return self._make_loader(self.train_dataset, shuffle=True)

  def val_dataloader(self):
    """Loader over the test dataset, in fixed order.

    Evaluation batches are not shuffled so metrics are comparable run to run.
    """
    return self._make_loader(self.test_dataset, shuffle=False)

  def test_dataloader(self):
    """Loader over the test dataset, in fixed order."""
    return self._make_loader(self.test_dataset, shuffle=False)

In [None]:
# Name of the pretrained checkpoint; NewSummaryModule loads its weights from the same name.
Model_name="t5-base"
# Tokenizer for that checkpoint (T5Tokenizer is imported as an alias of T5TokenizerFast).
tokenizer=T5Tokenizer.from_pretrained(Model_name)

In [None]:
# Unfinished draft of a token-count analysis, kept as comments so the cell no
# longer evaluates to (and echoes) a stray string literal:
#   text_token_counts,summary_token_counts=[],[]
#   for i, row in train_dataloadertext

In [None]:
# Training hyper-parameters: number of epochs and samples per batch.
epoch, batch_size = 4, 10
# Build the data module; `batch_size` is passed as the 4th positional argument.
data_module = NewSummaryDataModule(train_data, test_data, tokenizer, batch_size)
print(data_module)

In [None]:
class NewSummaryModule(pl.LightningModule):
  """LightningModule that fine-tunes a pretrained T5 model for summarization."""

  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(Model_name, return_dict=True)

  def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
    """Run the wrapped model and return ``(loss, logits)``."""
    output = self.model(
        input_ids,
        attention_mask=attention_mask,
        labels=labels,
        decoder_attention_mask=decoder_attention_mask
    )
    return output.loss, output.logits

  def _shared_step(self, batch, metric_name):
    """Compute the loss for ``batch`` and log it under ``metric_name``."""
    loss, _ = self(
        input_ids=batch["text_input_ids"],
        attention_mask=batch["text_attention_mask"],
        decoder_attention_mask=batch["labels_attention_mask"],
        labels=batch["labels"]
    )
    self.log(metric_name, loss, prog_bar=True, logger=True)
    return loss

  def training_step(self, batch, batch_idx):
    """Return the training loss for one batch (logged as "train_loss")."""
    return self._shared_step(batch, "train_loss")

  def validation_step(self, batch, batch_idx):
    """Return the validation loss for one batch (logged as "val_loss").

    Lightning only invokes the hook named ``validation_step``; under the old
    name ``val_step`` validation never ran and "val_loss" was never logged.
    """
    return self._shared_step(batch, "val_loss")

  # Backward-compatible alias for code that still calls the old method name.
  val_step = validation_step

  def test_step(self, batch, batch_idx):
    """Return the test loss for one batch (logged as "test_loss")."""
    return self._shared_step(batch, "test_loss")

  def configure_optimizers(self):
    """Optimize all parameters with AdamW at a learning rate of 1e-4."""
    return AdamW(self.parameters(), lr=0.0001)

In [None]:
# Instantiate the Lightning module; its constructor loads the pretrained weights.
model=NewSummaryModule()

In [None]:
#%load_ext tensorboard 
#%tensorboard --logdir ./lightning_logs

In [None]:
# Keep the single best checkpoint (lowest "val_loss") plus the most recent one.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best_checkpoint",
    save_last=True,
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

logger = TensorBoardLogger("lightning_logs", name="news-summary")

trainer = pl.Trainer(
    logger=logger,
    # Register the checkpoint through `callbacks`; handing a ModelCheckpoint
    # instance to `checkpoint_callback=` is deprecated.
    callbacks=[checkpoint_callback],
    max_epochs=epoch,
    # Use one GPU when CUDA is usable, otherwise fall back to CPU instead of
    # failing on runtimes without an NVIDIA driver.
    gpus=1 if torch.cuda.is_available() else None,
    progress_bar_refresh_rate=30
)

In [None]:
# Run training: fit `model` on the loaders supplied by `data_module`.
trainer.fit(model,data_module)

In [None]:
def predict_text(text):
  """Generate a summary for ``text`` with the notebook's model and tokenizer.

  Args:
    text: the document to summarize.

  Returns:
    The decoded summary as a single string.
  """
  # Encode the document the same way the training dataset does (512 tokens is
  # the dataset's default document length).
  text_encoding = tokenizer(
      text,
      max_length=512,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      return_tensors="pt"
  )
  # `generate` lives on the wrapped transformers model, not on the Lightning
  # module; fall back to `model` itself when it is already the bare model.
  generator = getattr(model, "model", model)
  # Inputs must sit on the same device as the model weights.
  device = next(generator.parameters()).device
  generated_ids = generator.generate(
      input_ids=text_encoding["input_ids"].to(device),
      attention_mask=text_encoding["attention_mask"].to(device),
      max_length=150,
      num_beams=2,
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True
  )
  preds = [
      tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
      for gen_id in generated_ids
  ]
  return "".join(preds)
# Summarize the first two test documents and show each generated summary.
# (`raw_datasets` is never defined in this notebook; `test_data` is the frame
# holding the "document" column.)
for document in test_data["document"].head(2):
  print(predict_text(document))