In [None]:
from huggingface_hub import login

# Hugging Face access token used for authentication.
# It is read from the HF_TOKEN environment variable so that the secret never has to be
# written into (or committed with) the notebook; if the variable is unset or empty,
# the user is asked for it through a prompt that does not echo the input.
import os
from getpass import getpass

access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Authenticate this session; add_to_git_credential=True additionally asks `login`
# to hand the token to the configured git credential helper.
login(
  token=access_token,
  add_to_git_credential=True
)

In [None]:
# Imports necessary libraries and modules for the task.
import os

import evaluate
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer

In [None]:
# Hub id of the base checkpoint to fine-tune, and the name chosen for the fine-tuned result.
model_name, new_model_name = "google/gemma-2b-it", "gemma-finetuned"

In [None]:
# Single source of truth for the dtype used for computation on top of the 4-bit weights.
# The quantization config below used to hardcode bfloat16 while this variable said
# float16 and was never read; the variable now carries the value that was actually in
# effect (bfloat16) and is passed to the config, so the two can no longer disagree.
compute_dtype = torch.bfloat16

# 4-bit NF4 quantization settings handed to `from_pretrained` when the model is loaded.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype
)

In [None]:
# Pull fixed-size slices of CNN/DailyMail 3.0.0: the first 3000 training examples and the
# first 1000 examples of each of the validation and test splits.
train_data, val_data, test_data = (
    load_dataset("cnn_dailymail", "3.0.0", split=split_spec)
    for split_spec in ("train[:3000]", "validation[:1000]", "test[:1000]")
)

In [None]:
# Turn raw article/highlights records into single-string chat transcripts for fine-tuning.
def preprocess_dataset(dataset):
    """Render every record of ``dataset`` as one chat-formatted training string.

    Args:
        dataset: Iterable of mappings providing the keys 'article' (the text to
            summarize) and 'highlights' (the reference summary).

    Returns:
        A list with one string per record, in input order. Each string is a user
        turn containing the summarization instruction and the article, followed
        by a model turn containing the reference summary.
    """
    def _render(question, answer):
        # One user turn (instruction + article) followed by one model turn (summary).
        return f"<bos><start_of_turn>user\nYou are a helpful assistant for text summarization tasks. Once I provide you with the original content, please summarize it. Here is the content: {question}<end_of_turn>\n<start_of_turn>model\n{answer}<end_of_turn>"

    return [_render(record['article'], record['highlights']) for record in dataset]

# Wrap the formatted prompts in `datasets.Dataset` objects with a single "text" column
# (the column name the trainer is configured to read via `dataset_text_field`).
# `Dataset.from_dict` builds the dataset directly from the list of strings, so no
# intermediate pandas DataFrame is needed.
train_dataset = Dataset.from_dict({"text": preprocess_dataset(train_data)})
val_dataset = Dataset.from_dict({"text": preprocess_dataset(val_data)})
test_dataset = Dataset.from_dict({"text": preprocess_dataset(test_data)})

In [None]:
# Load the base checkpoint with the quantization settings defined earlier.
# device_map={"": 0} assigns the whole model (the root module "") to device index 0,
# and the access token is forwarded for the download.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=access_token,
    device_map={"": 0},
    quantization_config=quant_config,
)

# NOTE(review): `pretraining_tp` looks carried over from a Llama fine-tuning recipe;
# confirm it has any effect for this architecture.
model.config.pretraining_tp = 1
# Turn off the key/value cache in the model config for the training run.
model.config.use_cache = False

In [None]:
# Load the tokenizer that belongs to the base checkpoint `model_name`.
# From this point on, padding is appended on the right-hand side of each sequence.
# NOTE(review): right padding is presumably chosen for the training batches; batched
# generation with a decoder-only model usually needs left padding instead, so confirm
# that any later batched `generate` call accounts for this setting.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"

In [None]:
# Hyperparameters for a short fine-tuning run.
training_params = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    # Schedule: 50 optimizer steps in total, the first 2 of them warm-up.
    max_steps=50,
    warmup_steps=2,
    learning_rate=2e-4,
    # One example per device per forward pass, gradients accumulated over 4 passes.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Paged 8-bit AdamW optimizer with fp16 mixed precision.
    optim="paged_adamw_8bit",
    fp16=True,
    # Log at every step.
    logging_steps=1,
)

# LoRA settings for PEFT (Parameter-Efficient Fine-Tuning): rank-8 adapters are attached
# to the listed attention and MLP projection modules of a causal language model.
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Assemble the supervised fine-tuning trainer from the model, tokenizer, LoRA config and
# training arguments defined above. Each example is read from the "text" column and
# sequence packing is disabled.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_params,
    peft_config=peft_params,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    packing=False,
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Persist the trained model and its tokenizer side by side in one directory.
save_path = "YOUR_PATH"
for artifact in (trainer.model, trainer.tokenizer):
    artifact.save_pretrained(save_path)

In [None]:
# Single-example sanity check: summarize one piece of text with the fine-tuned model.
device = torch.cuda.current_device()

original_content = "YOUR_ORIGINAL_CONTENT"

prompt = "You are a helpful assistant for text summarization tasks. Once I provide you with the original content, please summarize it."
input_text = f"<bos><start_of_turn>user\n{prompt} Here is the content: {original_content}<end_of_turn>\n<start_of_turn>model\n"

# Keep the attention mask next to the ids so `generate` does not have to guess it.
encoded_prompt = tokenizer(input_text, return_tensors="pt")
input_ids = encoded_prompt.input_ids.to(device)
attention_mask = encoded_prompt.attention_mask.to(device)

# The training template closes every answer with <end_of_turn>, so generation must be
# allowed to stop there as well as at the tokenizer's regular EOS token; otherwise it
# keeps going until max_length.
stop_token_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<end_of_turn>")]

with torch.no_grad():
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=2048,
        early_stopping=True,
        do_sample=True,
        num_beams=3,
        temperature=0.9,
        top_k=5,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=stop_token_ids,
    )

# Decode only the tokens produced after the prompt. Splitting the full decoded text on
# "\nmodel" returned the wrong span whenever the article itself contained that text and
# raised IndexError when the marker was missing.
prompt_length = input_ids.shape[1]
generated_summary = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

print(f"generated_summary:{generated_summary}")

In [None]:
# Reference summaries and generation prompts for every example of the test slice.
# `prompt` and `device` come from the single-example inference cell above.
actual_summaries = [data['highlights'] for data in test_data]

input_texts = []
for data in test_data:
    question = data['article']
    input_text = f"<bos><start_of_turn>user\n{prompt} Here is the content: {question}<end_of_turn>\n<start_of_turn>model\n"
    input_texts.append(input_text)

# Set the batch size
batch_size = 1

# The training template closes every answer with <end_of_turn>, so generation must be
# allowed to stop there as well as at the tokenizer's regular EOS token.
stop_token_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<end_of_turn>")]

generated_summaries = []

# Decoder-only models continue from the last position of the input, so batched prompts
# must be padded on the LEFT and the attention mask must be passed; right-padded prompts
# without a mask make the model continue after pad tokens. The tokenizer's padding side
# is switched only for this loop and restored afterwards, even if generation fails.
original_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
try:
    with torch.no_grad():
        for start in range(0, len(input_texts), batch_size):
            # Tokenize per batch: prompts are padded only to the longest prompt of the
            # batch rather than to the longest prompt of the whole test slice. No
            # truncation is applied because cutting the prompt on the right would remove
            # the trailing "<start_of_turn>model\n" that cues the answer.
            encoded_batch = tokenizer(input_texts[start:start + batch_size], return_tensors="pt", padding=True)
            batch_input_ids = encoded_batch.input_ids.to(device)
            batch_attention_mask = encoded_batch.attention_mask.to(device)

            output_ids = model.generate(
                batch_input_ids,
                attention_mask=batch_attention_mask,
                max_length=3000,
                early_stopping=True,
                do_sample=True,
                num_beams=2,
                temperature=0.9,
                top_k=5,
                top_p=0.9,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=stop_token_ids,
            )

            # With left padding every prompt in the batch ends at the same column, so
            # everything after that column is newly generated text. Decoding only that
            # part avoids the fragile split on "\nmodel".
            new_token_ids = output_ids[:, batch_input_ids.shape[1]:]
            generated_summaries.extend(tokenizer.batch_decode(new_token_ids, skip_special_tokens=True))
finally:
    tokenizer.padding_side = original_padding_side

In [None]:
# Load the three evaluation metrics (ROUGE, BLEU and BERTScore), in that order.
rouge_metric, bleu_metric, bertscore_metric = (
    evaluate.load(metric_id, trust_remote_code=True)
    for metric_id in ('rouge', 'bleu', 'bertscore')
)

def calculate_metrics(actual_summaries, generated_summaries):
    """Score generated summaries against references with ROUGE, BLEU and BERTScore.

    Args:
        actual_summaries: Reference summaries.
        generated_summaries: Model outputs, aligned one-to-one with the references.

    Returns:
        A 3-tuple ``(rouge_scores, bleu_scores, bertscore_scores)`` holding whatever
        each module-level metric object returns from ``compute``.
    """
    # Every metric receives the same prediction/reference pairing.
    shared_inputs = {"predictions": generated_summaries, "references": actual_summaries}

    return (
        rouge_metric.compute(**shared_inputs, use_stemmer=True),
        bleu_metric.compute(**shared_inputs),
        bertscore_metric.compute(**shared_inputs, lang='en'),
    )

# Calculate the evaluation metrics
rouge_scores, bleu_scores, bertscore_scores = calculate_metrics(actual_summaries, generated_summaries)

# BERTScore reports one F1 value per prediction, while the ROUGE and BLEU figures below
# are aggregated over the whole test slice. Average the per-example F1 values so the
# table shows a corpus-level number too; indexing [0] reported only the first example.
bertscore_f1_values = list(bertscore_scores['f1'])
if bertscore_f1_values:
    mean_bertscore_f1 = sum(bertscore_f1_values) / len(bertscore_f1_values)
else:
    # Nothing was scored; show NaN instead of failing on a division by zero.
    mean_bertscore_f1 = float('nan')

# Store the results in a one-row DataFrame
metrics_df = pd.DataFrame({
    'ROUGE-1': [rouge_scores['rouge1']],
    'ROUGE-2': [rouge_scores['rouge2']],
    'ROUGE-L': [rouge_scores['rougeL']],
    'BLEU': [bleu_scores['bleu']],
    'BERTScore': [mean_bertscore_f1]
})

print(metrics_df)