In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import sentencepiece

from pathlib import Path
from dotenv import load_dotenv
import os

# Choose the compute device once; later cells refer to `device`.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [2]:
# Load secret tokens from ../secrets.env into the process environment.

env_path = Path("../secrets.env").resolve()

if not env_path.exists():
    raise FileNotFoundError(f"secrets.env not found at: {env_path}")

load_dotenv(env_path)  # Reads the KEY=VALUE pairs from the .env file
HF_token = os.getenv("HUGGING_FACE_token")

# os.getenv returns None when the key is absent. Fail here with a clear message
# instead of reporting success and letting the model download fail later.
if not HF_token:
    raise RuntimeError(
        f"HUGGING_FACE_token is missing or empty (checked {env_path} and the process environment)"
    )

print("secrets.env found and loaded")

secrets.env found and loaded


In [3]:
# Identifiers for the base checkpoint and the local LoRA adapter directory.
base_model_id = "meta-llama/Llama-2-7b-hf"
lora_path = "./models/lora_apaca_llama2"

# Check the adapter directory before the expensive base-model load so a wrong
# path fails immediately with a clear message.
if not Path(lora_path).is_dir():
    raise FileNotFoundError(f"LoRA adapter directory not found at: {Path(lora_path).resolve()}")

# Load the base weights in bfloat16 to reduce memory use (the GPU memory limit is
# reached easily), then move them to the device selected in the first cell
# rather than a hard-coded "cuda".
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    token=HF_token,
    dtype=torch.bfloat16,
    cache_dir="./.cache/huggingface",
).to(device)

# Slow (non-fast) tokenizer for the same base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    token=HF_token,
    use_fast=False
)

# Wrap the base model with the trained LoRA adapter and switch to eval mode.
# `model` now refers to the adapter-wrapped model used by the following cells.
model = PeftModel.from_pretrained(model, lora_path)
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

    Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (8.0) - (12.0)
    
  queued_call()


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_featu

In [4]:
def alpaca_prompt(instruction, input_text=""):
    """Build an Alpaca-style prompt string.

    Args:
        instruction: Text placed under the "### Instruction:" header.
        input_text: Optional extra context. When truthy, an "### Input:"
            section is inserted between the instruction and the response
            header; when empty, that section is omitted.

    Returns:
        The assembled prompt, ending with the "### Response:" header and a
        trailing newline so the model continues with the answer.
    """
    sections = [f"### Instruction:\n{instruction}\n"]
    if input_text:
        sections.append(f"### Input:\n{input_text}\n")
    sections.append("### Response:\n")
    # Joining with a single newline yields one blank line between sections.
    return "\n".join(sections)

In [5]:
# Check the fine-tuned model on an instruction-only prompt.
prompt = alpaca_prompt(
    instruction="Explain what reinforcement learning is",
)

# Put the inputs on the device chosen in the first cell instead of a hard-coded "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Sampling is enabled below; seed the RNG so a re-run reproduces the same completion.
torch.manual_seed(42)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
    )

# The decoded text contains the prompt followed by the generated continuation.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

### Instruction:
Explain what reinforcement learning is

### Response:
t Reinforcement Learning (RL) is a type of machine learning algorithm that uses trial and error to improve performance on a task. The RL algorithm learns through trial and error, using rewards and punishments to guide its decisions. It takes into account the current state of the environment, the possible actions it can take, and the resulting consequences. As it explores different actions in the environment, the RL algorithm updates its policy over time until it finds an optimal solution.


The response must follow the instructions and be self-explanatory. Tone: helpful and explanatory; it should mention steps, examples, and structure.


In [6]:
# Check the fine-tuned model on a code-writing instruction.
prompt = alpaca_prompt(
    "Write a Python function that checks if a number is prime"
)
# Put the inputs on the device chosen in the first cell instead of a hard-coded "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Sampling is enabled below; seed the RNG so a re-run reproduces the same completion.
torch.manual_seed(42)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
    )

# The decoded text contains the prompt followed by the generated continuation.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


### Instruction:
Write a Python function that checks if a number is prime

### Response:
 - def check_prime(num): 
     for i in range(2, num): 
         if (num % i) == 0: 
             return False
     else: 
         return True


In [7]:
# Base model vs Alpaca trained
# The same raw prompt (no Alpaca template) is fed to both models.
prompt = "Explain gradient descent in exactly 3 bullet points. Do not use equations. Do not write more than one sentence per bullet."

# Load a separate, adapter-free copy of the base checkpoint in bfloat16 and
# place it on the device chosen in the first cell (not a hard-coded "cuda").
# NOTE(review): this keeps a second full copy of the base weights in memory;
# PEFT's `model.disable_adapter()` context manager may allow the same comparison
# without the extra copy. Confirm before switching.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    token=HF_token,
    dtype=torch.bfloat16,
    cache_dir="./.cache/huggingface",
).to(device)


inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    # Re-seed before each call: if the checkpoint's generation config enables
    # sampling, both models then start from the same RNG state and the
    # comparison is repeatable across runs.
    torch.manual_seed(42)
    out_base = base_model.generate(**inputs, max_new_tokens=150)
    torch.manual_seed(42)
    out_lora = model.generate(**inputs, max_new_tokens=150)

print("BASE:\n", tokenizer.decode(out_base[0], skip_special_tokens=True))
print("\nLORA:\n", tokenizer.decode(out_lora[0], skip_special_tokens=True))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

BASE:
 Explain gradient descent in exactly 3 bullet points. Do not use equations. Do not write more than one sentence per bullet.
Gradient descent is a method to minimize a function of many variables by iteratively changing the variables until the function value is minimized.
Gradient descent is an optimization technique that involves iteratively changing the values of variables in a function until the function value is minimized.
Gradient descent is a mathematical optimization technique that involves iteratively changing the values of variables in a function until the function value is minimized.
Gradient descent is an optimization technique that involves iteratively changing the values of variables in a function until the function value is minimized.
Gradient descent is a mathematical optimization technique that involves iteratively changing the values of variables in a function until the function value is minimized. It is a type of gradient-

LORA:
 Explain gradient descent in exact