In [1]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch

In [2]:
# Checkpoint identifier shared by the tokenizer and the model so the two
# always come from the same pretrained release.
model_name = "facebook/bart-large-cnn"

# Tokenizer first, then the seq2seq model, both resolved from `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



In [3]:
# Sanity check: show which concrete classes the Auto* factories resolved to,
# one per line (model first, tokenizer second).
print(type(model), type(tokenizer), sep="\n")

<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>
<class 'transformers.models.bart.tokenization_bart_fast.BartTokenizerFast'>


In [4]:
from datasets import load_dataset

# Load the train / validation / test splits from their preprocessed CSV files.
# Each file is loaded on its own and read back through the 'train' key, so the
# three names below are plain datasets rather than dataset dictionaries.
train_dataset, val_dataset, test_dataset = (
    load_dataset('csv', data_files=csv_path)['train']
    for csv_path in (
        'data/preprocessed_train.csv',
        'data/preprocessed_validation.csv',
        'data/preprocessed_test.csv',
    )
)

# Preprocess function
def preprocess_function(examples, max_input_length=1024, max_target_length=150):
    """Tokenize a batch of article/abstract pairs for seq2seq fine-tuning.

    Relies on the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping with an "article" and an "abstract" column
            (lists of strings), as handed over by ``Dataset.map`` with
            ``batched=True``.
        max_input_length: Fixed token length articles are truncated/padded to.
        max_target_length: Fixed token length abstracts are truncated/padded to.

    Returns:
        The tokenizer output for the articles with an extra "labels" entry:
        the abstract token ids, where every padding position is -100.
    """
    inputs = examples["article"]
    targets = examples["abstract"]

    # Pad to a fixed length instead of "longest in this batch": with a batched
    # map, the longest item differs from one map batch to the next, so rows
    # could end up with different lengths and fail to stack into one tensor
    # when no padding collator is used at training time.
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        targets, max_length=max_target_length, truncation=True, padding="max_length"
    )

    # Padding must not contribute to the loss: replace pad ids in the labels
    # with -100, the index the cross-entropy loss ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize every split with the same preprocessing function. `batched=True`
# makes `map` hand whole batches of rows to `preprocess_function` at once.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Persist the tokenized splits so later runs can reuse them instead of
# tokenizing again.
for _tokenized_split, _output_dir in (
    (tokenized_train, 'data/tokenized_train'),
    (tokenized_val, 'data/tokenized_val'),
    (tokenized_test, 'data/tokenized_test'),
):
    _tokenized_split.save_to_disk(_output_dir)



Map:   0%|          | 0/6658 [00:00<?, ? examples/s]

Saving the dataset (0/6 shards):   0%|          | 0/117232 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6633 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6658 [00:00<?, ? examples/s]

In [5]:
# Fine-tuning configuration, grouped by concern. Values are unchanged; only
# the ordering of the keyword arguments differs.
training_args = TrainingArguments(
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluation: run once per epoch
    eval_strategy="epoch",
    # Checkpoints: written under output_dir, keeping only the 2 most recent
    output_dir="./results",
    save_steps=500,
    save_total_limit=2,
    # Logging
    logging_dir="./logs",
    logging_steps=10,
)


In [6]:
# Build the Trainer: it fine-tunes `model` on the tokenized training split and
# evaluates on the tokenized validation split, using `training_args`.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_val,
    train_dataset=tokenized_train,
)
