In [None]:
# Query the NVIDIA GPU status of this runtime through the `nvidia-smi` CLI.
!nvidia-smi

In [None]:
!pip install - -quiet transformers == 4.5.0
!pip install - -quiet pytorch-lightning == 1.2.7

In [None]:
import json
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)
from tqdm.auto import tqdm


In [None]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc


In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
rcParams['figure.figsize'] = 16, 10


In [None]:
# Fix the global random seed (42) through PyTorch Lightning's seed helper.
pl.seed_everything(42)


In [None]:
# dataset


In [None]:
# unzip dataset
# NOTE(review): this command names no archive, and `- q` looks like a
# whitespace-mangled `-q` flag — confirm the intended archive path and flag
# before relying on this cell.
!unzip - q


In [None]:
# Read the news-summary CSV, decoding the file as Latin-1, and preview it.
df = pd.read_csv(
    'data/news_summary.csv',
    encoding='latin-1',
)
df.head()


In [None]:
# Keep only the 'text' and 'ctext' columns (all rows), then show the first row.
df = df.loc[:, ['text', 'ctext']]
df.head(1)


In [None]:
# Rename the columns by position: the first column becomes 'summary' and the
# second becomes 'complete_news'. This mutates `df` in place, so the cell
# relies on `df` holding exactly two columns in that order.
df.columns = ['summary', 'complete_news']


In [None]:
# (rows, columns) of the frame at this point, before missing rows are dropped below.
df.shape


In [None]:
# Count the missing values in each column.
df.isna().sum()


In [None]:
# Discard every row that contains at least one missing value, then preview.
df = df.dropna(axis=0, how='any')
df.head()


In [None]:
# (rows, columns) of the frame after the rows with missing values were dropped.
df.shape


In [None]:
# Hold out 10% of the rows for evaluation. An explicit random_state makes the
# split identical every time this cell executes, including re-runs that do not
# re-seed the global NumPy generator first.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
print(train_df.shape)
print(test_df.shape)


In [None]:
# docs


In [None]:
class NewsSummaryDataset(Dataset):
    """Dataset pairing each news article with its reference summary.

    Every item tokenizes the ``complete_news`` column as the model input and
    the ``summary`` column as the target, each padded/truncated to a fixed
    number of tokens.

    Args:
        data: Frame providing the ``complete_news`` and ``summary`` columns.
        tokenizer: Tokenizer called on the raw text; it must return a mapping
            with ``input_ids`` and ``attention_mask`` tensors.
        c_news_max_token_length: Token budget for the article text.
        summary_max_token_length: Token budget for the summary.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        c_news_max_token_length: int = 512,
        summary_max_token_length: int = 128
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.c_news_max_token_length = c_news_max_token_length
        # Attribute name kept as in the original for backward compatibility.
        self.summary_max_token_len = summary_max_token_length

    def __len__(self):
        """Return the number of rows (article/summary pairs)."""
        return len(self.data)

    def _encode(self, text, max_length: int):
        """Tokenize ``text`` to exactly ``max_length`` tokens as 'pt' tensors."""
        # Use the tokenizer handed to the constructor, not a module-level
        # global, so the dataset works with whichever tokenizer it was given.
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

    def __getitem__(self, index: int):
        """Return the raw strings and encoded tensors for row ``index``.

        Returns:
            dict with keys ``text``, ``summary``, ``text_input_ids``,
            ``text_attention_mask``, ``labels`` and ``labels_attention_mask``;
            the tensor values are flattened to one dimension.
        """
        data_row = self.data.iloc[index]
        text = data_row['complete_news']
        summary = data_row['summary']

        text_encoding = self._encode(text, self.c_news_max_token_length)
        summary_encoding = self._encode(summary, self.summary_max_token_len)

        # Take the padding id from the tokenizer; fall back to 0 (the value
        # the original code hard-coded) when the tokenizer does not define one.
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is None:
            pad_token_id = 0

        # Replace padding positions in the target with -100 (presumably so the
        # loss skips them). masked_fill returns a new tensor, leaving the
        # tokenizer output untouched.
        summary_ids = summary_encoding['input_ids']
        labels = summary_ids.masked_fill(summary_ids == pad_token_id, -100)

        return dict(
            text=text,
            summary=summary,
            text_input_ids=text_encoding['input_ids'].flatten(),
            text_attention_mask=text_encoding['attention_mask'].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding['attention_mask'].flatten()
        )


In [None]:
class NewsSummaryDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train and test news frames.

    NOTE: the validation loader serves the same dataset as the test loader
    (both are built from ``test_df``); there is no separate validation split.

    Args:
        train_df: Frame used to build the training dataset.
        test_df: Frame used to build the validation/test dataset.
        tokenizer: Tokenizer forwarded to ``NewsSummaryDataset``.
        batch_size: Batch size shared by all loaders.
        c_news_max_token_length: Token budget for the article text.
        summary_max_token_length: Token budget for the summary.
        num_workers: Worker processes per ``DataLoader`` (default 2, the value
            that was previously hard-coded).
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        c_news_max_token_length: int = 512,
        summary_max_token_length: int = 128,
        num_workers: int = 2
    ):

        super().__init__()

        self.train_df = train_df
        self.test_df = test_df

        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.c_news_max_token_length = c_news_max_token_length
        self.summary_max_token_length = summary_max_token_length
        self.num_workers = num_workers

    def _build_dataset(self, frame: pd.DataFrame):
        """Wrap ``frame`` in a NewsSummaryDataset using this module's settings."""
        return NewsSummaryDataset(
            frame,
            self.tokenizer,
            self.c_news_max_token_length,
            self.summary_max_token_length
        )

    def _build_loader(self, dataset, shuffle: bool):
        """Create a DataLoader over ``dataset`` with the shared batch/worker config."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers
        )

    def setup(self, stage=None):
        """Build the train and test datasets; ``stage`` is accepted but unused."""
        self.train_dataset = self._build_dataset(self.train_df)
        self.test_dataset = self._build_dataset(self.test_df)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._build_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the test dataset (reused for validation)."""
        return self._build_loader(self.test_dataset, shuffle=False)

    def test_dataloader(self):
        """Unshuffled loader over the test dataset."""
        return self._build_loader(self.test_dataset, shuffle=False)


In [None]:
# Name of the pretrained checkpoint handed to `from_pretrained` below.
MODEL = 't5-base'

# Load the tokenizer for that checkpoint (`T5Tokenizer` is this notebook's
# import alias for `T5TokenizerFast`).
tokenizer = T5Tokenizer.from_pretrained(MODEL)
