In [1]:
# pip install -r requirements.txt

In [2]:
!pip install --upgrade accelerate
!pip install transformers accelerate

Collecting accelerate
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.1.1-py3-none-any.whl (333 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.1.1


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline,set_seed
import matplotlib.pyplot as plt
from datasets import load_dataset,load_from_disk
import pandas as pd
import torch
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from tqdm import tqdm
import re

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\www58\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Choose the compute device: 'cuda' when torch can see a GPU, otherwise 'cpu'.
has_gpu = torch.cuda.is_available()
device = 'cuda' if has_gpu else 'cpu'
if has_gpu:
    # Only the GPU case is announced; the CPU fallback is silent.
    print('GPU')



In [9]:
import os
import requests
import zipfile

In [None]:
import os
import zipfile

import requests

# URL of the zipped dataset archive to download.
file_url = 'https://github.com/krishnaik06/datasets/raw/refs/heads/main/summarizer-data.zip'

# Local destination of the archive; the parent directory is created on demand.
path_dir = os.path.dirname('data/data.zip')
os.makedirs(path_dir, exist_ok=True)
file_path = os.path.join(path_dir, 'data.zip')  # Full file path

# Fetch the archive. The timeout keeps the cell from hanging indefinitely, and
# raise_for_status() aborts on an HTTP error instead of silently saving an
# error page as data.zip.
response = requests.get(file_url, timeout=60)
response.raise_for_status()
print(response.status_code)  # 200 when the request was successful

# Persist the downloaded bytes.
with open(file_path, 'wb') as archive_file:
    archive_file.write(response.content)

# Extract the archive that was just written (instead of a machine-specific
# absolute path) into the current working directory.
# NOTE(review): presumably this unpacks the `samsum_dataset` folder that a
# later cell loads with load_from_disk — confirm against the archive contents.
with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall()


200


In [21]:
# Load the dataset saved on disk under 'samsum_dataset' and display its splits.
data = load_from_disk(
    'samsum_dataset',
)
data

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [22]:
# Number of rows in each split, in the dataset dict's iteration order.
split_len = [len(data[split]) for split in data]
print(f'Split lengths: {split_len}')
print(f'Features: {data["train"].column_names}')

# First training row, transposed so each feature is shown on its own line.
print(f'Overview of data:\n{pd.DataFrame(data["train"][:1]).T}')

# Show one reference summary. The value is interpolated directly so it is
# printed as plain text (not wrapped in a set literal), and the label
# describes what is actually shown.
print(f'Sample summary: {data["train"][1]["summary"]}')

print('*' * 60)

Split lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']
Overview of data:
                                                          0
id                                                 13818513
dialogue  Amanda: I baked  cookies. Do you want some?\r\...
summary   Amanda baked cookies and will bring Jerry some...
Number of Examples in Dataset:  {'Olivia and Olivier are voting for liberals in this election. '}
************************************************************


In [23]:
# Print a visual separator: 60 stars, the marker 'ds', then 60 plus signs.
stars, pluses = '*' * 60, '+' * 60
print(stars, 'ds', pluses)

************************************************************ ds ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


In [24]:
# Load the tokenizer published with the 'google/pegasus-xsum' checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'google/pegasus-xsum',
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [25]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of dialogue/summary pairs for seq2seq training.

    Uses the module-level ``tokenizer``.

    Args:
        example_batch: Mapping with a 'dialogue' entry (source texts) and a
            'summary' entry (target texts) for the batch.

    Returns:
        dict with:
            'input_ids': token ids of the dialogues (truncated to 1024 tokens),
            'attention_mask': attention mask matching 'input_ids',
            'labels': token ids of the summaries (truncated to 128 tokens).
    """
    input_encodings = tokenizer(
        example_batch['dialogue'], max_length=1024, truncation=True
    )

    # `text_target=` tokenizes the summaries as targets; it replaces the
    # deprecated `with tokenizer.as_target_tokenizer():` context manager.
    target_encodings = tokenizer(
        text_target=example_batch['summary'], max_length=128, truncation=True
    )

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
    }

In [26]:
# Apply the tokenization function to every split, one batch at a time.
data_pt = data.map(
    convert_examples_to_features,
    batched=True,
)

Map: 100%|██████████| 14732/14732 [00:08<00:00, 1686.71 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 1849.93 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 1886.47 examples/s]


In [27]:
# Inspect the tokenized train split (columns and row count).
data_pt['train']

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

In [28]:
# Inspect the tokenized test split (columns and row count).
data_pt['test']

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 819
})

In [None]:
# Load the pretrained seq2seq model from the 'google/pegasus-xsum' checkpoint.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(
    'google/pegasus-xsum',
)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator used by the trainer to assemble batches; it is given both the
# tokenizer and the model.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model_pegasus,
)

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration for the fine-tuning run.
trainer_arg = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=1,
    warmup_steps=500,
    # Batch size 1 per device with 16 accumulation steps: gradients from
    # 16 examples per device are accumulated before each optimizer update.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    weight_decay=0.01,
    logging_steps=10,
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument.
    eval_strategy='steps',
    eval_steps=500,
    # Step counts are integers; the same value as the former float 1e6.
    # NOTE(review): presumably set this large to avoid intermediate
    # checkpoints during the run — confirm.
    save_steps=1_000_000,
)



In [None]:
# Build the trainer. The model is fitted on the train split and evaluated on
# the validation split, so the test split stays held out for final scoring.
# For a quick smoke run, subsample the train split (e.g. with `.select(...)`)
# rather than training on the test data.
trainer = Trainer(
    model=model_pegasus,
    args=trainer_arg,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=data_pt['train'],
    eval_dataset=data_pt['validation'],
)

  trainer=Trainer(model=model_pegasus,args=trainer_arg,


In [None]:
# Start the fine-tuning run configured on `trainer`.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marpanchakraborty500[0m ([33marpanchakraborty500-me[0m). Use [1m`wandb login --relogin`[0m to force relogin


RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling cublasLtMatmul with transpose_mat1 1 transpose_mat2 0 m 1024 n 43 k 1024 mat1_ld 1024 mat2_ld 1024 result_ld 1024 abcType 0 computeType 68 scaleType 0