In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch.nn.utils.prune as prune


2024-05-05 07:51:44.644752: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-05 07:51:44.644806: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-05 07:51:44.646347: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import pandas as pd
from torch.utils.data import Dataset

class SummarizationDataset(Dataset):
    def __init__(self, tokenizer, file_path, max_length=512, use_percentage=10):
        self.dataframe = pd.read_csv(file_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Sample a percentage of the data if use_percentage is less than 100
        if use_percentage < 100:
            self.dataframe = self.dataframe.sample(frac=use_percentage / 100.0, random_state=42).reset_index(drop=True)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        article_text = row['article']
        highlights_text = row['highlights']
        
        input_text = f"summarize: {article_text}"
        source_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        target_encoding = self.tokenizer(
            highlights_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': source_encoding['input_ids'].squeeze(),
            'attention_mask': source_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }


In [3]:
# # Load tokenizer and model
# tokenizer = T5Tokenizer.from_pretrained('t5-small')
# model = T5ForConditionalGeneration.from_pretrained('t5-small')

In [4]:
from transformers import Trainer, TrainingArguments
import torch

In [5]:
class DistillationTrainer(Trainer):
    def __init__(self, teacher_model, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        self.teacher_model.eval()  # Set the teacher model to evaluation mode
        self.teacher_model.to(self.args.device)  # Move teacher model to the same device as the student model

    def compute_loss(self, model, inputs, return_outputs=False):
        # Generate outputs using the student model
        outputs_student = model(**inputs)
        # Generate outputs using the teacher model
        with torch.no_grad():
            inputs["decoder_input_ids"] = inputs["labels"]  # To ensure teacher uses correct inputs
            outputs_teacher = self.teacher_model(**inputs)

        # Compute distillation loss: Mean Squared Error (MSE) between logits
        loss_fn = torch.nn.MSELoss()
        distillation_loss = loss_fn(outputs_student.logits, outputs_teacher.logits)

        # Optionally, add task-specific loss (e.g., cross-entropy), you might adjust the weights for these losses
        if return_outputs:
            return (distillation_loss, outputs_student)
        return distillation_loss

In [6]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [7]:
# Load models and tokenizer
teacher_model = T5ForConditionalGeneration.from_pretrained('t5-base')
student_model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
path_of_csv_file= '/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv'

In [9]:
test_data_path = '/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv'

In [10]:
# Load dataset
train_dataset = SummarizationDataset(tokenizer, path_of_csv_file,max_length =512, use_percentage = 10)


In [11]:
test_dataset = SummarizationDataset(tokenizer, test_data_path, max_length=512)

In [23]:
# Define a split ratio for training and validation
train_size = int(0.9 * len(train_dataset))  # 90% for training
eval_size = len(train_dataset) - train_size  # 10% for evaluation

In [25]:
from torch.utils.data import random_split

In [26]:
# Split the dataset
train_subset, eval_subset = random_split(train_dataset, [train_size, eval_size])


In [12]:
import wandb
wandb.init(mode="disabled")




In [28]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16 = True,  # enable mixed precision it help speed up training
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
#     evaluation_strategy="no",
#     evaluation_strategy="steps",
    save_strategy="epoch",
#     load_best_model_at_end=True,
)


In [29]:
from transformers import EarlyStoppingCallback

In [31]:
trainer = DistillationTrainer(
    teacher_model=teacher_model,
    model=student_model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    tokenizer=tokenizer
)

In [32]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,7.1225,5.236245


TrainOutput(global_step=3230, training_loss=7.543378134334788, metrics={'train_runtime': 4627.296, 'train_samples_per_second': 5.584, 'train_steps_per_second': 0.698, 'total_flos': 3497096808235008.0, 'train_loss': 7.543378134334788, 'epoch': 1.0})

In [55]:
model = student_model

In [57]:
pip install rouge_score

Note: you may need to restart the kernel to use updated packages.


In [58]:
from torch.utils.data import Dataset, DataLoader

In [59]:
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False,num_workers = 4)


In [60]:
def compute_rouge_scores(model, dataloader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    predictions = []
    references = []
    rouge = load_metric('rouge', trust_remote_code=True)

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=150,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )

        decoded_preds = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        decoded_refs = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

        predictions.extend(decoded_preds)
        references.extend(decoded_refs)

    result = rouge.compute(predictions=predictions, references=references)
    return result



In [61]:
# Now call the function
rouge_scores = compute_rouge_scores(model, test_dataloader)
print("ROUGE Scores:", rouge_scores)


ROUGE Scores: {'rouge1': AggregateScore(low=Score(precision=0.37860150801855524, recall=0.3644438133539695, fmeasure=0.3605036498310606), mid=Score(precision=0.38729855112042405, recall=0.3728883465202554, fmeasure=0.36790665112815457), high=Score(precision=0.39663106258088404, recall=0.38131312970547937, fmeasure=0.3758778879222655)), 'rouge2': AggregateScore(low=Score(precision=0.1647436770638004, recall=0.15732297463493133, fmeasure=0.15600281403315705), mid=Score(precision=0.17298847894482808, recall=0.16545175618833638, fmeasure=0.16371361312004643), high=Score(precision=0.18202116402648422, recall=0.17317626596454702, fmeasure=0.17123023513487295)), 'rougeL': AggregateScore(low=Score(precision=0.2655018279037113, recall=0.2582814640040355, fmeasure=0.2538265806336129), mid=Score(precision=0.2733984387066724, recall=0.26584152146761664, fmeasure=0.26099577120687967), high=Score(precision=0.2817875474313138, recall=0.27304500160586664, fmeasure=0.26804751092522766)), 'rougeLsum': A

In [62]:
def simplified_rouge_scores(rouge_results):
    # Extract only mid F1 scores for ROUGE-1, ROUGE-2, and ROUGE-L
    simplified_scores = {
        'rouge1_fmeasure': rouge_results['rouge1'].mid.fmeasure,
        'rouge2_fmeasure': rouge_results['rouge2'].mid.fmeasure,
        'rougeL_fmeasure': rouge_results['rougeL'].mid.fmeasure
    }
    return simplified_scores

# Assuming rouge_scores is the output from your previous compute_rouge_scores function
simplified_scores = simplified_rouge_scores(rouge_scores)
print("Simplified ROUGE Scores:", simplified_scores)


Simplified ROUGE Scores: {'rouge1_fmeasure': 0.36790665112815457, 'rouge2_fmeasure': 0.16371361312004643, 'rougeL_fmeasure': 0.26099577120687967}
