In [4]:
!pip install transformers peft bitsandbytes trl deepeval datasets torch



In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig
from trl import SFTTrainer

# hugging-face token = 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

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#################################
### Setup Quantization Config ###
#################################
compute_dtype = getattr(torch, "float16")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [3]:
#######################
### Load Base Model ###
#######################
base_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
llama_3 = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"":0}
)

Loading checkpoint shards: 100%|██████████████████| 4/4 [00:47<00:00, 11.80s/it]


In [4]:
######################
### Load Tokenizer ###
######################
tokenizer = AutoTokenizer.from_pretrained(
  base_model, 
  trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [5]:
####################
### Load Dataset ###
####################
train_dataset = load_dataset("csv", data_files="./Datasets/train.csv", split="train") #, streaming=True

In [6]:
###########################
### Sample from Dataset ###
###########################
print(next(iter(train_dataset)))

{'text': "(Jan 30, 2020  7:00 PM) Fred Silverman, who steered programming for each of the Big Three broadcast networks and brought All in the Family, Roots, Hawaii Five-O and other hit series and miniseries to television, died Thursday. He was 82. Silverman, who had been battling cancer, died at his home in the Pacific Palisades area of Los Angeles, the AP reports. Silverman's gift for picking winners prompted Time magazine to dub him  The Man with the Golden Gut  in a 1977 profile. As ABC's entertainment chief, Silverman turned the network’s fortunes around with shows including Roots, Rich Man, Poor Man and Charlie’s Angels. He had already brought success to CBS with an overhaul that included replacing country-themed series such as Green Acres with what advertisers considered more upscale and urban fare, including The Mary Tyler Moore Show, The Bob Newhart Show and Mannix. He couldn't repeat that success when he moved to NBC.", 'summary': 'Fred Silverman Put a Series of Hits on All 3 

In [7]:
#########################################
### Load LoRA Configurations for PEFT ###
#########################################
peft_config = LoraConfig(
    lora_alpha = 16,
    lora_dropout= 0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [9]:
##############################
### Set Training Arguments ###
##############################
training_arguments = TrainingArguments(
    output_dir="/work/gpq253/tuning_results",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,    
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant"
)


##########################
### Set SFT Parameters ###
##########################
trainer = SFTTrainer(
    model=llama_3,
    train_dataset=train_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=1024, # was set to None
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [10]:
#######################
### Fine-Tune Model ###
#######################
trainer.train()

Step,Training Loss
25,2.3288
50,2.145
75,2.2066
100,2.1321
125,2.1949
150,2.0723
175,2.1822
200,2.0873
225,2.0805
250,2.1127


TrainOutput(global_step=9711, training_loss=2.046107460205028, metrics={'train_runtime': 14502.4488, 'train_samples_per_second': 1.339, 'train_steps_per_second': 0.67, 'total_flos': 2.1255014040639898e+17, 'train_loss': 2.046107460205028, 'epoch': 1.0})

In [12]:
##################
### Save Model ###
##################
new_model = "tuned-llama-3.1-8b"
trainer.model.save_pretrained(new_model)
trainer.processing_class.save_pretrained(new_model)

('tuned-llama-3.1-8b/tokenizer_config.json',
 'tuned-llama-3.1-8b/special_tokens_map.json',
 'tuned-llama-3.1-8b/tokenizer.json')

In [15]:
#################
### Try Model ###
#################
device = "cuda" if torch.cuda.is_available() else "cpu"
prompt = "What is a large language model?"
pipe = pipeline(
  task="text-generation", 
  model="/home/gpq253/tuned-llama-3.1-8b", 
  tokenizer=tokenizer, 
  max_length=200,
  device = 'cpu'
)
result = pipe(f"[s][INST] {prompt} [/INST]")
print(result[0]['generated_text'])

Loading checkpoint shards: 100%|██████████████████| 4/4 [00:38<00:00,  9.70s/it]


[s][INST] What is a large language model? [/INST] [s][INST] A large language model is a type of artificial intelligence (AI) model that is trained on a massive dataset of text, allowing it to generate human-like language and understand the nuances of language. These models are designed to learn patterns and relationships in language, enabling them to respond to a wide range of questions, topics, and styles. [/INST] [s][INST] Large language models are typically trained on a large corpus of text data, which can be sourced from various places, such as books, articles, and online content. The model learns to predict the next word in a sequence, given the context of the previous words. This process is repeated millions of times, allowing the model to develop a deep understanding of language. [/INST] [s][INST] The key characteristics of large language models include: [/INST] [s][INST] 1. **Large training datasets**: These models
