In [None]:
!pip install datasets
!pip install transformers -U
!pip install accelerate -U
!pip install trl
!pip install bitsandbytes       # for quantization
!pip install peft               # to allow us to use LoRA

Collecting bitsandbytes
  Using cached bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Using cached bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.42.0
Collecting peft
  Downloading peft-0.17.1-py3-none-any.whl.metadata (14 kB)
Downloading peft-0.17.1-py3-none-any.whl (504 kB)
Installing collected packages: peft
Successfully installed peft-0.17.1


In [2]:
import torch
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# The device string must be exactly 'cuda'; a misspelling only fails on machines that have a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from datasets import load_dataset

# Hub identifier of the dataset used for fine-tuning.
DATASET_NAME = "ChrisHayduk/Llama-2-SQL-Dataset"
# Pull the whole dataset (all of its splits) into this runtime.
dataset = load_dataset(path=DATASET_NAME)

Generating train split: 100%|██████████| 70719/70719 [00:00<00:00, 582998.84 examples/s]
Generating eval split: 100%|██████████| 7858/7858 [00:00<00:00, 485789.00 examples/s]


In [4]:
# Summarize what was loaded: each split with its column names and row count.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 70719
    })
    eval: Dataset({
        features: ['input', 'output'],
        num_rows: 7858
    })
})


In [8]:
# Show the first training example: its prompt first, then the expected response on the next line.
print(dataset["train"][0]["input"], dataset["train"][0]["output"], sep="\n")

Below is an instruction that describes a SQL generation task, paired with an input that provides further context about the available table schemas. Write SQL code that appropriately answers the request.

### Instruction:
What is the release date of Milk and Money?

### Input:
CREATE TABLE table_name_50 (release_date VARCHAR, title VARCHAR)

### Response: 
SELECT release_date FROM table_name_50 WHERE title = "milk and money"


In [9]:
SEED = 42                       # fixed seed so the same subset is drawn on every Restart & Run All
NUM_TRAINING_EXAMPLES = 1000    # size of the subset used for fine-tuning

full_training_data = dataset["train"]
# Randomize the order first so that taking the leading rows yields a random sample
# rather than inheriting any bias or ordering the dataset's creator may have used.
shuffled = full_training_data.shuffle(seed=SEED)
# Keep only the first NUM_TRAINING_EXAMPLES shuffled rows; the shuffle above is what
# keeps this subset representative of the entire dataset.
training_dataset = shuffled.select(range(NUM_TRAINING_EXAMPLES))
# Author's rationale: with a very large pretrained model, about 1000 examples is enough
# for fine-tuning, because the quality of the fine-tuning data matters far more than
# the number of examples.

In [10]:
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig

# Quantization settings that are passed to the model loader.
quantization_config = BitsAndBytesConfig(
    # Data type used for the actual computations (not for storing the weights);
    # kept at a higher precision than the 4-bit storage format.
    bnb_4bit_compute_dtype="float16",
    # Quantization data type for the stored weights: NF4 (4-bit NormalFloat).
    bnb_4bit_quant_type="nf4",
    # Load the model in 4-bit format.
    load_in_4bit=True,
)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [None]:
import transformers
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

MODEL_NAME = "NousResearch/Llama-2-7b-hf"

# Load the causal language model with the quantization settings defined earlier;
# device_map="auto" leaves the placement of the weights to the loader.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",
)
model.config.use_cache = True       # speeds up generation

# trust_remote_code is deliberately left at its default (False): enabling it allows
# Python code shipped inside the Hub repository to run locally, and nothing in this
# notebook relies on a custom tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token and pad on the right.
# NOTE(review): presumably needed because this checkpoint's tokenizer defines no pad
# token of its own; confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

ImportError: The installed version of bitsandbytes (<0.43.1) requires CUDA, but CUDA is not available. You may need to install PyTorch with CUDA support or upgrade bitsandbytes to >=0.43.1.

In [None]:
def construct_datapoint(x):
    """Turn one prompt/response example into a single tokenized training sequence.

    Args:
        x: One dataset row with string fields "input" (the prompt) and
            "output" (the expected response).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the text
        ``input + output + EOS``.
    """
    # Terminate the text with the tokenizer's EOS token so every training sequence
    # marks where the response ends; without it nothing in the data tells the model
    # when to stop generating.
    combined = x["input"] + x["output"] + tokenizer.eos_token
    # No padding is requested here: this function is mapped over the dataset one
    # example at a time, and padding=True ("pad to the longest sequence in the batch")
    # adds nothing to a single sequence. Padding belongs at batch-collation time.
    return tokenizer(combined)

training_dataset = training_dataset.map(construct_datapoint)    # apply construct_datapoint to every single element of the training dataset
# NOTE(review): the original note credited "Rust parallel processing"; map() is called here without batched=True or num_proc, so confirm whether any parallelism actually applies.
# We didn't need this step for GPT-2 because the dataset used there was already in the format we wanted (every element was one long string holding the prompt and response together).

# The dataset we're given has an input/output (prompt/response) format, but language models are not actually trained on separate fields.
# Instead we pass in one block of text, and the model learns to predict the next token, with the training examples embedded within that text.
# So for every data point we concatenate the input and the output into a single piece of text to pass to the model for training.