In [1]:
pip install peft transformers datasets torch accelerate





In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model


In [3]:
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [4]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration

lora_config = LoraConfig(
    r=1,  # Integer rank value
    lora_alpha=16,  # Scaling factor
    target_modules=["q_proj"],  # Reduced target modules
    lora_dropout=0.2,  # Increased dropout rate
    bias="none",  # No bias fine-tuning
    task_type="SEQ_2_SEQ_LM"  # Task type
)

# Apply LoRA to the model
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()




The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


trainable params: 73,728 || all params: 406,364,160 || trainable%: 0.0181


In [5]:
# Load datasets
train_dataset = load_dataset('csv', data_files='data/preprocessed_train.csv')['train']
val_dataset = load_dataset('csv', data_files='data/preprocessed_validation.csv')['train']
test_dataset = load_dataset('csv', data_files='data/preprocessed_test.csv')['train']

train_dataset = train_dataset.select(range(4800))  # **4800 samples for training**
val_dataset = val_dataset.select(range(500))       # **500 samples for validation**
test_dataset = test_dataset.select(range(500))

# Preprocess function
def preprocess_function(examples):
    inputs = examples["article"]
    targets = examples["abstract"]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=150, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize datasets 
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4800 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [15]:
# Save preprocessed datasets for reuse
tokenized_train.save_to_disk('data/tokenized_train')
tokenized_val.save_to_disk('data/tokenized_val')
tokenized_test.save_to_disk('data/tokenized_test')

from transformers import TrainingArguments
!pip install tensorboard

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,             # Reduced for quicker runs
    save_steps=50,                  # Save checkpoints every 50 steps
    logging_steps=10,               # Log progress every 10 steps
    eval_strategy="steps",    # Evaluate every 'eval_steps'
    eval_steps=1000,                  # Perform evaluation every 1000 steps
    save_total_limit=2,             # Keep the last 2 checkpoints only
    remove_unused_columns=False,    # Prevent dropping unused columns
    fp16=True,                      # Enable mixed precision for faster training
    disable_tqdm=False              # Enable progress bar in Jupyter Notebook
)



Saving the dataset (0/1 shards):   0%|          | 0/4800 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]



In [16]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest")

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# **TRAIN THE MODEL**
trainer.train()

# **SAVE FINE-TUNED MODEL**
model.save_pretrained("fine_tuned_bart_small")
tokenizer.save_pretrained("fine_tuned_bart_small")


Step,Training Loss,Validation Loss
1000,2.9595,2.876822


('fine_tuned_bart_small\\tokenizer_config.json',
 'fine_tuned_bart_small\\special_tokens_map.json',
 'fine_tuned_bart_small\\vocab.json',
 'fine_tuned_bart_small\\merges.txt',
 'fine_tuned_bart_small\\added_tokens.json',
 'fine_tuned_bart_small\\tokenizer.json')

In [17]:
def summarize(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=1024).to("cuda")
    summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)