### Setup

In [7]:
from openai import OpenAI, AsyncOpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from dotenv import load_dotenv
import torch as t
import asyncio
import os
from peft import LoraConfig, TaskType, get_peft_model


In [2]:
# Populate the process environment from a local .env file, then authenticate
# with the Hugging Face Hub. Indexing os.environ (rather than .get) makes a
# missing HF_TOKEN fail right here with a KeyError instead of later on.
load_dotenv()
login(token=os.environ["HF_TOKEN"])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [16]:
# One constant for the checkpoint id so the tokenizer and the weights are
# always loaded from the same repository.
MODEL_ID = 'meta-llama/Llama-3.1-8B-Instruct'

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Weights are loaded in bfloat16; device placement is delegated to the
# loader via device_map='auto'.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=t.bfloat16,
    device_map='auto',
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### LoRA

In [17]:
# Show the module tree of the base model as loaded, before any adapter is attached.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_

In [19]:
# LoRA hyper-parameters for fine-tuning the causal language model.
# No target_modules are given, so the library's defaults for this
# architecture decide which layers receive adapters.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # causal language modelling objective
    r=8,                           # rank of the low-rank A/B matrices
    lora_alpha=32,                 # LoRA scaling hyper-parameter
    lora_dropout=0.1,              # dropout probability on the adapter path
    inference_mode=False,          # False: the adapter is meant to be trained
)

In [20]:
# Attach the LoRA adapter described by `lora_config` to `model` under the
# name "lora_1". The model object itself is modified (no reassignment).
# NOTE(review): because the adapter name is then already registered, this cell
# is presumably not safe to re-run on the same kernel without reloading the
# base model first — confirm against the installed transformers/peft versions.
model.add_adapter(lora_config, adapter_name="lora_1")


In [21]:
# Show the module tree again to see which layers now carry the "lora_1" adapter.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (lora_1): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (lora_1): Linear(in_features=4096, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (lora_1): Linear(in_features=8, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=1024, bias

### Async Test

In [None]:
async def task1():
    """Print a start marker, suspend for 2 seconds, then print an end marker.

    The suspension uses ``asyncio.sleep``, so other coroutines scheduled on
    the same event loop can run while this one waits. Returns ``None``.
    """
    label = "Task 1"
    print(f"{label}: Start")
    await asyncio.sleep(2)
    print(f"{label}: End")

async def task2():
    """Print a start marker, suspend for 1 second, then print an end marker.

    The suspension uses ``asyncio.sleep``, so other coroutines scheduled on
    the same event loop can run while this one waits. Returns ``None``.
    """
    label = "Task 2"
    print(f"{label}: Start")
    await asyncio.sleep(1)
    print(f"{label}: End")

# Run both coroutines concurrently and display the list of their results.
# Top-level `await` is used directly, so no main() wrapper is needed in the
# notebook. Neither coroutine returns a value, so this evaluates to
# [None, None]; re-run the cell if the saved output shows anything else.
await asyncio.gather(task1(), task2())

Task 1: Start
Task 2: Start
Task 2: End
Task 1: End


[None, 'hi']

In [23]:
async def chat_async(prompt='hello world', model_name='gpt-4.1-mini'):
    """Send one prompt to the OpenAI Responses API without blocking the event loop.

    Args:
        prompt: Text passed as the request's ``input``. Defaults to
            ``'hello world'``, the value that used to be hard-coded.
        model_name: Name of the OpenAI model to query. Defaults to
            ``'gpt-4.1-mini'``, the value that used to be hard-coded. Named
            ``model_name`` so it is not confused with the notebook-level
            ``model`` variable.

    Returns:
        The text of the model's reply as a ``str``.
    """
    # No credentials are passed explicitly, so the SDK falls back to its own
    # environment-based configuration. The context manager closes the
    # client's HTTP resources once the request is done instead of leaking
    # them on every call.
    async with AsyncOpenAI() as client:
        response = await client.responses.create(
            model=model_name,
            input=prompt,
        )

    # `output_text` is the SDK's aggregate of the text parts of the response;
    # it avoids assuming that the last output item is a message whose first
    # content part is text, which `output[-1].content[0].text` relied on.
    return response.output_text

def chat_reg(prompt='hello world', model_name='gpt-4.1-mini'):
    """Send one prompt to the OpenAI Responses API and block until it replies.

    Synchronous counterpart of the async chat helper.

    Args:
        prompt: Text passed as the request's ``input``. Defaults to
            ``'hello world'``, the value that used to be hard-coded.
        model_name: Name of the OpenAI model to query. Defaults to
            ``'gpt-4.1-mini'``, the value that used to be hard-coded. Named
            ``model_name`` so it is not confused with the notebook-level
            ``model`` variable.

    Returns:
        The text of the model's reply as a ``str``.
    """
    # No credentials are passed explicitly, so the SDK falls back to its own
    # environment-based configuration. The context manager closes the
    # client's HTTP resources once the request is done instead of leaking
    # them on every call.
    with OpenAI() as client:
        response = client.responses.create(
            model=model_name,
            input=prompt,
        )

    # `output_text` is the SDK's aggregate of the text parts of the response;
    # it avoids assuming that the last output item is a message whose first
    # content part is text, which `output[-1].content[0].text` relied on.
    return response.output_text

In [27]:
# Smoke-test the async helper with a top-level `await`; this issues a real
# request to the OpenAI API and displays the returned text.
await chat_async()

'Hello! How can I assist you today?'