In [3]:
pip install transformers peft bitsandbytes trl deepeval datasets torch rouge_score bert_score

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig
from trl import SFTTrainer

import numpy as np
from rouge_score import rouge_scorer
from bert_score import score as bert_score
#from moverscore_v2 import get_idf_dict, word_mover_score

# hugging-face token = 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

In [5]:
#################################
### Setup Quantization Config ###
#################################
compute_dtype = getattr(torch, "float16")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [16]:
#######################
### Load Base Model ###
#######################
base_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
llama_3 = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"":0}
)

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


RuntimeError: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend

In [14]:
######################
### Load Tokenizer ###
######################
tokenizer = AutoTokenizer.from_pretrained(
  base_model, 
  trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct.
401 Client Error. (Request ID: Root=1-67428ad7-7de6e5991e7947216cb6b55a;0470d565-f232-40ca-b7eb-cb4890a49239)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in.

In [15]:
####################
### Load Dataset ###
####################
train_dataset = load_dataset("csv", data_files="./Datasets/train.csv", split="train") #, streaming=True

FileNotFoundError: Unable to find '/home/zhy542/Downloads/./Datasets/train.csv'

In [6]:
###########################
### Sample from Dataset ###
###########################
print(next(iter(train_dataset)))

{'text': "(Jan 30, 2020  7:00 PM) Fred Silverman, who steered programming for each of the Big Three broadcast networks and brought All in the Family, Roots, Hawaii Five-O and other hit series and miniseries to television, died Thursday. He was 82. Silverman, who had been battling cancer, died at his home in the Pacific Palisades area of Los Angeles, the AP reports. Silverman's gift for picking winners prompted Time magazine to dub him  The Man with the Golden Gut  in a 1977 profile. As ABC's entertainment chief, Silverman turned the network’s fortunes around with shows including Roots, Rich Man, Poor Man and Charlie’s Angels. He had already brought success to CBS with an overhaul that included replacing country-themed series such as Green Acres with what advertisers considered more upscale and urban fare, including The Mary Tyler Moore Show, The Bob Newhart Show and Mannix. He couldn't repeat that success when he moved to NBC.", 'summary': 'Fred Silverman Put a Series of Hits on All 3 

In [None]:
#########################################
### Load LoRA Configurations for PEFT ###
#########################################
peft_config = LoraConfig(
    lora_alpha = 16,
    lora_dropout= 0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [15]:
##############################
### Set Training Arguments ###
##############################
training_arguments = TrainingArguments(
    output_dir="/home/zhy542/tuning_results",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,    
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant"
)


##########################
### Set SFT Parameters ###
##########################
trainer = SFTTrainer(
    model=llama_3,
    train_dataset=train_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=1024, # was set to None
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

NameError: name 'llama_3' is not defined

In [10]:
#######################
### Fine-Tune Model ###
#######################
trainer.train()

Step,Training Loss
25,2.3288
50,2.145
75,2.2066
100,2.1321
125,2.1949
150,2.0723
175,2.1822
200,2.0873
225,2.0805
250,2.1127


TrainOutput(global_step=9711, training_loss=2.046107460205028, metrics={'train_runtime': 14502.4488, 'train_samples_per_second': 1.339, 'train_steps_per_second': 0.67, 'total_flos': 2.1255014040639898e+17, 'train_loss': 2.046107460205028, 'epoch': 1.0})

In [12]:
##################
### Save Model ###
##################
new_model = "tuned-llama-3.1-8b"
trainer.model.save_pretrained(new_model)
trainer.processing_class.save_pretrained(new_model)

('tuned-llama-3.1-8b/tokenizer_config.json',
 'tuned-llama-3.1-8b/special_tokens_map.json',
 'tuned-llama-3.1-8b/tokenizer.json')

In [12]:
#################
### Try Model ###
#################
device = "cuda" if torch.cuda.is_available() else "cpu"
prompt = "What is a large language model?"
pipe = pipeline(
  task="text-generation", 
  model="/home/zhy542/tuned-llama-3.1-8b", 
  tokenizer=tokenizer, 
  max_length=200,
  device = 'cpu'
)
result = pipe(f"[s][INST] {prompt} [/INST]")
print(result[0]['generated_text'])

NameError: name 'tokenizer' is not defined

In [None]:
#################
### Evaluate Model ###
#################


# Sample ground truth (reference) and model-generated (prediction) headlines
references = [
    "Breaking news: Major earthquake hits California",
    "Scientists discover a new species of deep-sea fish"
]
predictions = [
    "Major earthquake strikes California",
    "New deep-sea fish species discovered"
]

# Accuracy
def calculate_accuracy(references, predictions):
    return np.mean([ref == pred for ref, pred in zip(references, predictions)])

# ROUGE Scores
def calculate_rouge(references, predictions):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]
    averaged_scores = {
        metric: np.mean([score[metric].fmeasure for score in scores])
        for metric in ["rouge1", "rouge2", "rougeL"]
    }
    return averaged_scores

# BERTScore
def calculate_bertscore(references, predictions):
    P, R, F1 = bert_score(predictions, references, lang="en", verbose=True)
    return {
        "precision": np.mean(P),
        "recall": np.mean(R),
        "f1": np.mean(F1)
    }

# MoverScore
def calculate_moverscore(references, predictions):
    idf_dict_ref = get_idf_dict(references)
    idf_dict_pred = get_idf_dict(predictions)
    scores = word_mover_score(references, predictions, idf_dict_ref, idf_dict_pred,
                              stop_words=[], n_gram=1, remove_subwords=True, batch_size=8)
    return np.mean(scores)

# Evaluate
accuracy = calculate_accuracy(references, predictions)
rouge_scores = calculate_rouge(references, predictions)
bertscore_results = calculate_bertscore(references, predictions)
moverscore_result = calculate_moverscore(references, predictions)

# Print results
print(f"Accuracy: {accuracy:.2f}")
print(f"ROUGE Scores: {rouge_scores}")
print(f"BERTScore: {bertscore_results}")
print(f"MoverScore: {moverscore_result:.2f}")

