# Lightweight Fine-Tuning Project

## Loading and Evaluating a Foundation Model

In [2]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
#necessary imports
!pip install peft
import numpy as np
from datasets import load_dataset, load_metric
from peft import LoftQConfig, LoraConfig, PeftModel, TaskType, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    DataCollatorWithPadding,
    GPT2ForSequenceClassification,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)







In [3]:
# Work on the first 10,000 training reviews only, to keep processing fast.
raw_subset = load_dataset("amazon_polarity", split="train[:10000]")

# Fixed seed so the 80/20 train/test partition is the same on every run.
dataset = raw_subset.train_test_split(test_size=0.2, shuffle=True, seed=23)

splits = ["train", "test"]

# The tokenizer has no pad token assigned here, so the end-of-sequence token
# is reused for padding.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/260M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/258M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/254M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400000 [00:00<?, ? examples/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [6]:

# Tokenize the review text of every split.
#
# The Trainer in this notebook is configured with remove_unused_columns=False,
# so every column still present after `map` is handed to the data collator.
# Raw string columns cannot be turned into tensors, therefore everything except
# `label` (and the tokenizer outputs added by `map`) is dropped here.
def tokenize_batch(batch):
    """Tokenize a batch of examples, truncating/padding `content` to 512 tokens."""
    return tokenizer(
        batch["content"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )


tokenized_dataset = {}
for split in splits:
    raw_columns = [name for name in dataset[split].column_names if name != "label"]
    tokenized_dataset[split] = dataset[split].map(
        tokenize_batch,
        batched=True,
        remove_columns=raw_columns,
    )

tokenized_dataset["train"]

NameError: name 'tokenized_datasets' is not defined

In [None]:
# GPT-2 with a two-way classification head. GPT-2 is not the best choice for
# sentiment analysis; it is used here for exercise purposes.
id_to_label = {0: "bad review", 1: "good review"}
label_to_id = {"bad review": 0, "good review": 1}

model = GPT2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id_to_label,
    label2id=label_to_id,
    pad_token_id=tokenizer.eos_token_id,  # pad with the EOS token id
)

# Mark every weight as trainable.
for weight in model.parameters():
    weight.requires_grad_(True)

In [None]:
# Shared Trainer configuration used by every Trainer in this notebook.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir='./logs',
    # Schedule: two epochs, evaluating at the end of each one.
    num_train_epochs=2,
    evaluation_strategy="epoch",
    # Optimisation settings.
    per_device_train_batch_size=128,
    weight_decay=0.01,
    # Keep all dataset columns instead of letting the Trainer prune them.
    remove_unused_columns=False,
)

def compute_metrics(eval_pred):
    """Return classification accuracy for a `(logits, labels)` pair.

    The predicted class of each example is the index of its largest logit
    along the last axis; accuracy is the fraction of predictions equal to
    the corresponding label.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    hit_rate = np.mean(predicted == gold)
    return {"accuracy": hit_rate}

In [None]:
# Train the GPT-2 classifier and score it on the held-out split.
# Accuracy is computed by `compute_metrics`, so no separate metric object is
# loaded here.

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"]
)

# Run the training loop configured by `training_args`.
trainer.train()

# Evaluate the model on tokenized_dataset["test"]
evaluation_results = trainer.evaluate()

print(evaluation_results)

In [None]:
# Show the first ten raw (untokenized) training examples.
train_split = dataset['train']
for row_index in range(10):
    example = train_split[row_index]
    print(example)

## Performing PEFT

In [None]:

# LoRA hyperparameters for sequence classification.
config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    # Names of the modules that receive LoRA adapters.
    target_modules=['c_attn', 'c_proj'],
    r=8,  # rank of the low-rank update matrices
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the classifier loaded earlier with the LoRA adapters.
lora_model = get_peft_model(model, config)





In [None]:
# Trainer for the LoRA-wrapped model; it reuses the shared training arguments,
# tokenizer, datasets and accuracy metric.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Run training for the number of epochs set in `training_args`.
trainer.train()

# Write the trained PEFT model to the "gpt-lora" directory.
lora_model.save_pretrained("gpt-lora")

## Performing Inference with a PEFT model

In [None]:
# Compare the original pre-trained model with the saved LoRA adapter.
#
# The previous version evaluated the already-trained LoRA trainer as the
# "before" result, and tried to load the adapter directory directly as a
# GPT2ForSequenceClassification checkpoint (which loses the pad-token and label
# configuration needed for batched evaluation).

def _build_eval_trainer(model_to_score):
    """Return a fresh Trainer that evaluates `model_to_score` on the test split.

    A new Trainer is built per model so each model goes through the Trainer's
    own setup instead of being swapped into an existing instance.
    """
    return Trainer(
        model=model_to_score,
        args=training_args,
        tokenizer=tokenizer,
        eval_dataset=tokenized_dataset["test"],
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )


# Evaluate the original pre-trained model: reload GPT-2 from `model_name` with
# the same classification-head configuration used earlier in the notebook.
base_model = GPT2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "bad review", 1: "good review"},
    label2id={"bad review": 0, "good review": 1},
    pad_token_id=tokenizer.eos_token_id,
)
evaluation_results_before_finetuning = _build_eval_trainer(base_model).evaluate()

# Load the saved PEFT model weights on top of the freshly loaded base model.
# NOTE(review): an adapter is only meaningful on the base weights it was
# trained against. If `model` was fully fine-tuned before LoRA was attached
# (the earlier training cell does this), those updated base weights are not
# stored in "gpt-lora", so the score here may differ from evaluating
# `lora_model` in memory - confirm which comparison is intended.
peft_model = PeftModel.from_pretrained(base_model, "gpt-lora")

# Evaluate the PEFT model after fine-tuning
trainer = _build_eval_trainer(peft_model)
evaluation_results_after_finetuning = trainer.evaluate()

# Compare the results
print("Results before fine-tuning: ", evaluation_results_before_finetuning)
print("Results after fine-tuning: ", evaluation_results_after_finetuning)