## OThers


In [8]:
#delete models form cache

# https://huggingface.co/docs/huggingface_hub/v0.28.1/guides/manage-cache#clean-cache-from-the-terminal
# to get revisions: 

# from huggingface_hub import scan_cache_dir

# delete_strategy = scan_cache_dir().delete_revisions(
#     "81fd1d6e7847c99f5862c9fb81387956d99ec7aa"
#     "e2983b237dccf3ab4937c97fa717319a9ca1a96d",
#     "d04e592bb4f6aa9cfee91e2e20afa771667e1d4b",
# )
# print("Will free: " + delete_strategy.expected_freed_size_str)

# delete_strategy.execute()

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW
from transformers import pipeline
import torch

model_id = "meta-llama/Llama-3.1-8B-Instruct"
device = 'mps'

#tokenizer is used to convert from text to tokens (a number representation of a word/subword)
#each model has its own tokenizer
#padding_side = left means, pad to the left when string is smaller than set length for ear sample
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')
#we need to explicitly set the pad_token. We set it to the same as end-of-sentence token here.
tokenizer.pad_token = tokenizer.eos_token

#loading the model.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=device)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.31s/it]


In [2]:
#this is a pipeline in transformers. It executes the model for inference.
#will return a text generation pipeline
generation_pipeline = pipeline(task="text-generation", model= model, tokenizer=tokenizer)
generation_pipeline("Cricket is a ", max_new_tokens = 25, temperature=0.01)

Device set to use mps


[{'generated_text': "Cricket is a 3-year-old male cat who was surrendered to the shelter due to his owner's allergies. He is a beautiful, sleek black"}]

In [3]:
input_prompt = [
    "Hi there! ssup?",
    "Hello, how are you? What is going on?"
]

#tokenized = tokenizer(input_prompt, return_tensors='pt').to(device) ## all inputs should be the same size after tokenization
tokenized = tokenizer(input_prompt, padding=True, return_tensors='pt').to(device)

print(tokenized["input_ids"].shape)

torch.Size([2, 12])


#inserted a padding token in the beginning (padding side passed to the tokenizer)

In [4]:
print(tokenized["input_ids"])

tensor([[128009, 128009, 128009, 128009, 128009, 128000,  13347,   1070,      0,
          11107,    455,     30],
        [128000,   9906,     11,   1268,    527,    499,     30,   3639,    374,
           2133,    389,     30]], device='mps:0')


In [5]:
tokenizer.batch_decode(tokenized["input_ids"])

['<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|begin_of_text|>Hi there! ssup?',
 '<|begin_of_text|>Hello, how are you? What is going on?']

In [6]:
tokenized.keys()

dict_keys(['input_ids', 'attention_mask'])

what is this attention mask. Tells what all tokens to give attention to (remove attention for padded text)

In [7]:
tokenized["attention_mask"]

tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='mps:0')

# Chat template

In [9]:
prompt_template = [
    {
        "role": "system",
        "content": "You are an useful AI assistant who makes a joke everytime"
    },
    {
        "role": "user",
        "content": "Who am I?"
    },

]
tokenizer.pad_token = tokenizer.eos_token

tokenized = tokenizer.apply_chat_template(
    prompt_template,
    add_generation_prompt = True,
    tokenize = True,
    padding = True,
    return_tensors = "pt"
).to(device)

print(tokenized)

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1627,  10263,    220,   2366,     19,    271,   2675,    527,
            459,   5505,  15592,  18328,    889,   3727,    264,  22380,  90456,
         128009, 128006,    882, 128007,    271,  15546,   1097,    358,     30,
         128009, 128006,  78191, 128007,    271]], device='mps:0')


In [10]:
out = model.generate(tokenized, max_new_tokens=100)
decoded = tokenizer.batch_decode(out)
print(decoded[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are an useful AI assistant who makes a joke everytime<|eot_id|><|start_header_id|>user<|end_header_id|>

Who am I?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

You're a mystery, but don't worry, I won't have to "Google" all day to figure it out. Seriously though, I don't know who you are, but I'm here to help you discover yourself or answer any questions you might have. What's on your mind?<|eot_id|>


In [11]:
#when we want the output to start with a given sequence
prompt_template_2 = [
    {
        "role": "system",
        "content": "You are an useful AI assistant who makes a joke everytime"
    },
    {
        "role": "user",
        "content": "Who am I?"
    },
    {
        "role": "assistant",
        "content": "Hola! "
    },

]

tokenizer.pad_token = tokenizer.eos_token

tokenized = tokenizer.apply_chat_template(
    prompt_template_2,
    continue_final_message = True, #newly added, removed add_generation_prompt
    tokenize = True,
    padding = True,
    return_tensors = "pt"
).to(device)

print(tokenized)

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1627,  10263,    220,   2366,     19,    271,   2675,    527,
            459,   5505,  15592,  18328,    889,   3727,    264,  22380,  90456,
         128009, 128006,    882, 128007,    271,  15546,   1097,    358,     30,
         128009, 128006,  78191, 128007,    271,  69112,      0]],
       device='mps:0')


In [12]:
out = model.generate(tokenized, max_new_tokens=100)
decoded = tokenizer.batch_decode(out)
print(decoded[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are an useful AI assistant who makes a joke everytime<|eot_id|><|start_header_id|>user<|end_header_id|>

Who am I?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hola! I'm not a mind reader, but I can try to guess. You're a curious human who's asking questions, so I'm going to take a wild guess and say... YOU'RE A GENIUS! (Just kidding, but seriously, I don't know, and that's okay!)

On a more serious note, I'd love to chat and get to know you better. What brings you here today?<|eot_id|>


# Looking at finetuning data (Amazon dataset)

In [3]:
import pandas as pd
import polars as pl
meta_elec = pd.read_json("Cell_Phones_and_Accessories.jsonl", lines = True, nrows = 100000)

In [4]:
meta_elec_pl = pl.DataFrame(meta_elec)
meta_elec_pl

rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
i64,str,str,list[struct[4]],str,str,str,datetime[ns],i64,bool
4,"""No white background! It’s clea…","""I bought this bc I thought it …","[{""IMAGE"",""https://images-na.ssl-images-amazon.com/images/I/B1PrCo7ZjXS._SL1600_.jpg"",""https://images-na.ssl-images-amazon.com/images/I/B1PrCo7ZjXS._SL800_.jpg"",""https://images-na.ssl-images-amazon.com/images/I/B1PrCo7ZjXS._SL256_.jpg""}]","""B08L6L3X1S""","""B08L6L3X1S""","""AFKZENTNBQ7A7V7UXW5JJI6UGRYQ""",2021-01-30 22:07:31.196,0,true
5,"""Awesome! Great price! Works …","""Perfect. How pissed am I that …",[],"""B079BPGF6C""","""B079BPGF6C""","""AFKZENTNBQ7A7V7UXW5JJI6UGRYQ""",2018-08-16 18:18:37.349,2,true
5,"""Worked but took an hour to ins…","""Overall very happy with the en…","[{""IMAGE"",""https://m.media-amazon.com/images/I/B1+g-o0qHKS._SL1600_.jpg"",""https://m.media-amazon.com/images/I/B1+g-o0qHKS._SL800_.jpg"",""https://m.media-amazon.com/images/I/B1+g-o0qHKS._SL256_.jpg""}]","""B088DR7Z5B""","""B0BBGGC8F2""","""AGCI7FAH4GL5FI65HYLKWTMFZ2CQ""",2021-08-17 21:21:44.798,3,true
4,"""Decent""","""Lasted about 9 months then the…","[{""IMAGE"",""https://images-na.ssl-images-amazon.com/images/I/71RgHzZnX3L.jpg"",""https://images-na.ssl-images-amazon.com/images/I/71RgHzZnX3L._SL800_.jpg"",""https://images-na.ssl-images-amazon.com/images/I/71RgHzZnX3L._SL256_.jpg""}]","""B07XRDHDNQ""","""B07XRDHDNQ""","""AGCI7FAH4GL5FI65HYLKWTMFZ2CQ""",2020-05-26 05:14:42.910,0,true
5,"""LOVE IT!""","""LOVE THIS CASE! Works better t…",[],"""B00A8ZDL9Y""","""B00A8ZDL9Y""","""AGCI7FAH4GL5FI65HYLKWTMFZ2CQ""",2014-08-25 19:23:08,0,true
…,…,…,…,…,…,…,…,…,…
2,"""I would not buy again. I hope…","""It kind of works, but is diffi…",[],"""B0078XKTTY""","""B0078XKTTY""","""AEGTK2K2VEAX7NWK2DLHZBZIYSIQ""",2014-07-09 19:16:00,0,true
5,"""Highly recommend""","""Great product. Base is wide e…",[],"""B08HCPHP9W""","""B08KXNGLKK""","""AFEUZ4RUAKTWJRJIWZ4FFFTVGT6Q""",2020-12-01 17:52:42.954,0,true
5,"""Five Stars""","""Wife loves it.""",[],"""B01ISS2F3U""","""B07CK19G7X""","""AFZEWW6S4NFQEXQU24MGWSCX2Z6A""",2017-02-22 00:24:01,0,true
4,"""Good product but ring attachme…","""There was no fingerprint reade…",[],"""B082TQ42Y8""","""B082TQ42Y8""","""AH7TDENCUVDOYHIMVPLRBS3G2AKA""",2022-09-22 22:24:14.068,0,true


In [5]:
import json

# Assuming meta_elec is a pandas DataFrame with the relevant columns

for index, row in meta_elec.iterrows():
    rating, text, title = row['rating'], row['text'], row['title']
    
    # Create the dictionary for each line of JSONL
    jsonl_data_format_input = {
        'System prompt': 'Given the Rating and Title, you are required to generate the review',
        'Rating': rating,
        'Title': title,
        'Review': text
    }
    
    # Open file in append mode and write the dictionary as a JSON line
    with open("train.jsonl", "a") as f:
        json.dump(jsonl_data_format_input, f)
        f.write("\n")


# Fine-tuning

In [16]:
prompt_template = [
    {
        "role": "system",
        "content": "You are a review rating predictor. Based on user's review provided, you predict the rating the user would give out of 5. Return only the number of the rating between 0 and 5."
    },
    {
        "role": "user",
        "content": "I bought this bc I thought it had the nice white background. Turns out it’s clear & since my phone is blue it doesn’t look anything like this.  If I had known that I would have purchased something else. It works ok."
    },
    {
        "role": "assistant",
        "content": "Rating is:"
    },

]

tokenized = tokenizer.apply_chat_template(
    prompt_template,
    continue_final_message = True, #newly added, removed add_generation_prompt
    tokenize = True,
    padding = True,
    return_tensors = "pt"
).to(device)

In [17]:
out = model.generate(tokenized, max_new_tokens=100)
decoded = tokenizer.batch_decode(out)
print(decoded[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a review rating predictor. Based on user's review provided, you predict the rating the user would give out of 5. Return only the number of the rating between 0 and 5.<|eot_id|><|start_header_id|>user<|end_header_id|>

I bought this bc I thought it had the nice white background. Turns out it’s clear & since my phone is blue it doesn’t look anything like this.  If I had known that I would have purchased something else. It works ok.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Rating is: 2<|eot_id|>


# Generating the next word - Inferencing

So there is this causal attention things during inference. We need to give the input tokens, then one token is given as output. We feed the (input + one output token) as the next input to get token2 and then (inp + token1 + token2) -> token3 ...

## Training on a sequence

In [18]:
sentence = ["Hi there!", "Who are you? What's up?"]
tokenized = tokenizer(sentence, return_tensors="pt", padding=True)["input_ids"]

In [19]:
print(tokenized)
print(tokenizer.batch_decode(tokenized))

tensor([[128009, 128009, 128009, 128009, 128009, 128000,  13347,   1070,      0],
        [128000,  15546,    527,    499,     30,   3639,    596,    709,     30]])
['<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|begin_of_text|>Hi there!', "<|begin_of_text|>Who are you? What's up?"]


In [20]:
input_ids = tokenized[:, :-1]
target_ids = tokenized[:, 1:]

print("Input ids: ",input_ids)
print("Target ids: ",target_ids)

Input ids:  tensor([[128009, 128009, 128009, 128009, 128009, 128000,  13347,   1070],
        [128000,  15546,    527,    499,     30,   3639,    596,    709]])
Target ids:  tensor([[128009, 128009, 128009, 128009, 128000,  13347,   1070,      0],
        [ 15546,    527,    499,     30,   3639,    596,    709,     30]])


In [21]:
prompt_template = [
    {
        "role": "system",
        "content": "You are a review rating predictor. Based on user's review provided, you predict the rating the user would give out of 5. Return only the number of the rating between 0 and 5."
    },
    {
        "role": "user",
        "content": "I bought this bc I thought it had the nice white background. Turns out it’s clear & since my phone is blue it doesn’t look anything like this.  If I had known that I would have purchased something else. It works ok."
    },
    {
        "role": "assistant",
        "content": "Rating is:"
    },

]

answer = "4.0"

chat_template = tokenizer.apply_chat_template(prompt_template, continue_final_message=True, tokenize = True)
print(chat_template)

[128000, 128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 1627, 10263, 220, 2366, 19, 271, 2675, 527, 264, 3477, 10959, 62254, 13, 20817, 389, 1217, 596, 3477, 3984, 11, 499, 7168, 279, 10959, 279, 1217, 1053, 3041, 704, 315, 220, 20, 13, 3494, 1193, 279, 1396, 315, 279, 10959, 1990, 220, 15, 323, 220, 20, 13, 128009, 128006, 882, 128007, 271, 40, 11021, 420, 18399, 358, 3463, 433, 1047, 279, 6555, 4251, 4092, 13, 58334, 704, 433, 753, 2867, 612, 2533, 856, 4641, 374, 6437, 433, 3250, 1431, 1427, 4205, 1093, 420, 13, 220, 1442, 358, 1047, 3967, 430, 358, 1053, 617, 15075, 2555, 775, 13, 1102, 4375, 5509, 13, 128009, 128006, 78191, 128007, 271, 22940, 374, 25]


In [22]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")

Loading checkpoint shards: 100%|██████████| 4/4 [01:16<00:00, 19.19s/it]


In [27]:
# Set up LoRA configuration
lora_config = LoraConfig(
    r=8,  # rank of the low-rank adaptation
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05
)

# Prepare SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=meta_elec_pl
)

# Fine-tune
trainer.train()

AttributeError: 'DataFrame' object has no attribute 'column_names'

In [37]:
train_data = pd.read_json("train.jsonl", lines=True)

ValueError: No ':' found when decoding object value