In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch.nn.utils.prune as prune
from transformers import T5ForConditionalGeneration, T5Tokenizer


2024-05-06 08:13:37.464006: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-06 08:13:37.464097: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-06 08:13:37.640004: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import pandas as pd
from torch.utils.data import Dataset

class SummarizationDataset(Dataset):
    def __init__(self, tokenizer, file_path, max_length=512, use_percentage=10):
        self.dataframe = pd.read_csv(file_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Sample a percentage of the data if use_percentage is less than 100
        if use_percentage < 100:
            self.dataframe = self.dataframe.sample(frac=use_percentage / 100.0, random_state=42).reset_index(drop=True)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        article_text = row['article']
        highlights_text = row['highlights']
        
        input_text = f"summarize: {article_text}"
        source_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        target_encoding = self.tokenizer(
            highlights_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': source_encoding['input_ids'].squeeze(),
            'attention_mask': source_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }


In [3]:
path_of_csv_file= '/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv'

In [4]:
test_data_path = '/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv'

In [5]:
model = T5ForConditionalGeneration.from_pretrained('t5-small')

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Load dataset
train_dataset = SummarizationDataset(tokenizer, path_of_csv_file,max_length =512, use_percentage = 10)


In [8]:
test_dataset = SummarizationDataset(tokenizer, test_data_path, max_length=512)

In [9]:
from transformers import T5ForConditionalGeneration, T5Config

In [13]:
from transformers import T5ForConditionalGeneration
from torch.quantization import QuantStub, DeQuantStub, prepare_qat, convert

# class QuantizedT5(T5ForConditionalGeneration):
#     def __init__(self, config):
#         super().__init__(config)
#         self.quant = QuantStub()
#         self.dequant = DeQuantStub()

#     def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, labels=None):
#         input_ids = self.quant(input_ids)
#         outputs = super().forward(input_ids, attention_mask=attention_mask,
#                                   decoder_input_ids=decoder_input_ids, labels=labels)
#         output = self.dequant(outputs.logits)
#         return output

# # # Initialize the model
# # model = QuantizedT5.from_pretrained('t5-small')

# # # Set the model to training mode
# # model.train()

# # # Prepare the model for QAT
# # model = prepare_qat(model, inplace=True)


In [14]:
class QuantizedT5(T5ForConditionalGeneration):
    def __init__(self, config):
        super().__init__(config)
        self.quant = QuantStub()
        self.dequant = DeQuantStub()

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, labels=None, **kwargs):
        input_ids = self.quant(input_ids)
        # Pass along the additional kwargs to the superclass forward method
        outputs = super().forward(input_ids, attention_mask=attention_mask,
                                  decoder_input_ids=decoder_input_ids, labels=labels, **kwargs)
        logits = self.dequant(outputs.logits)
        return logits


In [15]:
model = QuantizedT5(T5Config.from_pretrained('t5-small'))

In [16]:
from transformers import Trainer

In [17]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
#     
#     fp16 = True,  # enable mixed precision it help speed up training
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
#     evaluation_strategy="no",
#     evaluation_strategy="steps",
    save_strategy="epoch",
#     load_best_model_at_end=True,
)


In [18]:
from torch.utils.data import random_split

In [19]:
# Define a split ratio for training and validation
train_size = int(0.9 * len(train_dataset))  # 90% for training
eval_size = len(train_dataset) - train_size  # 10% for evaluation

In [20]:
# Split the dataset
train_subset, eval_subset = random_split(train_dataset, [train_size, eval_size])


In [21]:
# Assume `training_args` as you defined previously
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [22]:
import wandb
wandb.init(mode="disabled")




In [23]:
 # Train and fine-tune with QAT
trainer.train()

Epoch,Training Loss,Validation Loss
1,-2.2779,-2.285426


TrainOutput(global_step=3230, training_loss=-1.4252931433796145, metrics={'train_runtime': 1635.6526, 'train_samples_per_second': 15.797, 'train_steps_per_second': 1.975, 'total_flos': 3497096808235008.0, 'train_loss': -1.4252931433796145, 'epoch': 1.0})

In [24]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=0a4d74f5860b4751beec61fa1a5b4173e1920649ad8cf3d361cd99f47a55e587
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [25]:
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False,num_workers = 4)


In [26]:
from datasets import load_metric


In [27]:
def compute_rouge_scores(model, dataloader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    predictions = []
    references = []
    rouge = load_metric('rouge', trust_remote_code=True)

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=150,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )

        decoded_preds = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        decoded_refs = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

        predictions.extend(decoded_preds)
        references.extend(decoded_refs)

    result = rouge.compute(predictions=predictions, references=references)
    return result



In [28]:
# Now call the function
rouge_scores = compute_rouge_scores(model, test_dataloader)
print("ROUGE Scores:", rouge_scores)


  rouge = load_metric('rouge', trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

TypeError: QuantizedT5.forward() missing 1 required positional argument: 'input_ids'

In [None]:
def simplified_rouge_scores(rouge_results):
    # Extract only mid F1 scores for ROUGE-1, ROUGE-2, and ROUGE-L
    simplified_scores = {
        'rouge1_fmeasure': rouge_results['rouge1'].mid.fmeasure,
        'rouge2_fmeasure': rouge_results['rouge2'].mid.fmeasure,
        'rougeL_fmeasure': rouge_results['rougeL'].mid.fmeasure
    }
    return simplified_scores

# Assuming rouge_scores is the output from your previous compute_rouge_scores function
simplified_scores = simplified_rouge_scores(rouge_scores)
print("Simplified ROUGE Scores:", simplified_scores)
