In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch.nn.utils.prune as prune


2024-05-02 04:52:52.759275: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-02 04:52:52.759371: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-02 04:52:52.933022: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import pandas as pd
from torch.utils.data import Dataset

class SummarizationDataset(Dataset):
    def __init__(self, tokenizer, file_path, max_length=512, use_percentage=10):
        self.dataframe = pd.read_csv(file_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Sample a percentage of the data if use_percentage is less than 100
        if use_percentage < 100:
            self.dataframe = self.dataframe.sample(frac=use_percentage / 100.0, random_state=42).reset_index(drop=True)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        article_text = row['article']
        highlights_text = row['highlights']
        
        input_text = f"summarize: {article_text}"
        source_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        target_encoding = self.tokenizer(
            highlights_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': source_encoding['input_ids'].squeeze(),
            'attention_mask': source_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }


In [3]:
# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [4]:
def apply_pruning(model):
    # Iterate over all modules and prune the linear layers found in the encoder and decoder
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            # Applying unstructured L1 pruning
            prune.l1_unstructured(module, name='weight', amount=0.2)
            # To make the pruning permanent, you might typically call prune.remove, but it is better to do it after training


In [5]:
# Pruning before training
apply_pruning(model)

In [6]:
path_of_csv_file= '/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv'

In [7]:
test_data_path = '/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv'

In [8]:
# Load dataset
train_dataset = SummarizationDataset(tokenizer, path_of_csv_file,max_length =512, use_percentage = 10)


In [9]:
test_dataset = SummarizationDataset(tokenizer, test_data_path, max_length=512)

In [16]:
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=4,   # Batch size for training
    per_device_eval_batch_size=4,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # Evaluate at the end of each epoch
    save_strategy="epoch"            # Save the model at the end of each epoch
)


In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [18]:
import wandb
wandb.init(mode="disabled")




In [19]:
# train the model 
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.3053,0.295893
2,0.2706,0.294397
3,0.2849,0.293791




TrainOutput(global_step=10767, training_loss=0.4340459548896697, metrics={'train_runtime': 5870.0362, 'train_samples_per_second': 14.673, 'train_steps_per_second': 1.834, 'total_flos': 1.1657395386187776e+16, 'train_loss': 0.4340459548896697, 'epoch': 3.0})

In [27]:
from datasets import load_metric

In [25]:
pip install rouge_score


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=f3304724dcc39a4b720906b3944144a078408571ef751ce1e3d54846735812c6
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [28]:
 rouge = load_metric('rouge')


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [31]:
def compute_rouge_scores(model, tokenizer, dataset):
    # Determine the device to use, prefer GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Ensure model is on the right device
    
    model.eval()
    predictions = []
    references = []
    
    for item in dataset:
        with torch.no_grad():
            # Ensure input tensors are on the same device as the model
            input_ids = item['input_ids'].unsqueeze(0).to(device)
            attention_mask = item['attention_mask'].unsqueeze(0).to(device)
            
            output = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=150,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )
            
            decoded_pred = tokenizer.decode(output[0], skip_special_tokens=True)
            decoded_ref = tokenizer.decode(item['labels'], skip_special_tokens=True)
            
            predictions.append(decoded_pred)
            references.append(decoded_ref)
    
    # Use the ROUGE metric to compute scores
    rouge = load_metric('rouge', trust_remote_code=True)
    result = rouge.compute(predictions=predictions, references=references)
    return result


In [32]:
rouge_scores = compute_rouge_scores(model, tokenizer, test_dataset)
print("ROUGE Scores:", rouge_scores)


ROUGE Scores: {'rouge1': AggregateScore(low=Score(precision=0.3833178797696526, recall=0.4128924727064259, fmeasure=0.3861500212442018), mid=Score(precision=0.3914560345232244, recall=0.42153068870032934, fmeasure=0.39341562083191584), high=Score(precision=0.4001189942708382, recall=0.43013691234513945, fmeasure=0.40094289357063784)), 'rouge2': AggregateScore(low=Score(precision=0.17398937305010279, recall=0.18689932114767902, fmeasure=0.17485570169413817), mid=Score(precision=0.18140134357642762, recall=0.19466531000103598, fmeasure=0.18171760210706495), high=Score(precision=0.18901059819320015, recall=0.20238142247338914, fmeasure=0.18883279345111356)), 'rougeL': AggregateScore(low=Score(precision=0.2693763043488119, recall=0.2922786870337757, fmeasure=0.27215409409571073), mid=Score(precision=0.27671012792470284, recall=0.3004676294856073, fmeasure=0.2792645447937093), high=Score(precision=0.2843663165748895, recall=0.3085978865124499, fmeasure=0.2866285472270553)), 'rougeLsum': Agg

In [34]:
def simplified_rouge_scores(rouge_results):
    # Extract only mid F1 scores for ROUGE-1, ROUGE-2, and ROUGE-L
    simplified_scores = {
        'rouge1_fmeasure': rouge_results['rouge1'].mid.fmeasure,
        'rouge2_fmeasure': rouge_results['rouge2'].mid.fmeasure,
        'rougeL_fmeasure': rouge_results['rougeL'].mid.fmeasure
    }
    return simplified_scores

# Assuming rouge_scores is the output from your previous compute_rouge_scores function
simplified_scores = simplified_rouge_scores(rouge_scores)
print("Simplified ROUGE Scores:", simplified_scores)


Simplified ROUGE Scores: {'rouge1_fmeasure': 0.39341562083191584, 'rouge2_fmeasure': 0.18171760210706495, 'rougeL_fmeasure': 0.2792645447937093}
