In [1]:
import torch, pynvml

total_gpus = torch.cuda.device_count()
largest_vram = 0
gpu_index = 0

def get_memory_free_MiB(gpu_index):
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(int(gpu_index))
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return mem_info.free // 1024 ** 2

for i in range(total_gpus):
    new_vram = get_memory_free_MiB(i)
    if new_vram > largest_vram:
        largest_vram = new_vram
        gpu_index = i
    print(f'GPU {i}: {torch.cuda.get_device_name(i)}')
    print(f'available memory of GPU {i}: {new_vram} MiB \n')

print(f'GPU {gpu_index} has the largest available VRAM: {largest_vram} MiB')
torch.cuda.set_device(gpu_index)
print(f'current cuda device is set to: {torch.cuda.current_device()}')

GPU 0: NVIDIA GeForce RTX 2080 Ti
available memory of GPU 0: 11003 MiB 

GPU 1: NVIDIA GeForce RTX 2080 Ti
available memory of GPU 1: 11000 MiB 

GPU 2: NVIDIA GeForce RTX 2080 Ti
available memory of GPU 2: 11000 MiB 

GPU 3: NVIDIA GeForce RTX 2080 Ti
available memory of GPU 3: 11000 MiB 

GPU 0 has the largest available VRAM: 11003 MiB
current cuda device is set to: 0


In [2]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

ADAPTER_PATH = 'models/gpt-neo-lora-checkpoint-final'

# Load the LoRA adapter config
peft_config = PeftConfig.from_pretrained(ADAPTER_PATH)

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

# Merge LoRA adapter
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
model.eval().to("cuda")

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoForCausalLM(
      (transformer): GPTNeoModel(
        (wte): Embedding(50257, 2048)
        (wpe): Embedding(2048, 2048)
        (drop): Dropout(p=0.0, inplace=False)
        (h): ModuleList(
          (0-23): 24 x GPTNeoBlock(
            (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (attn): GPTNeoAttention(
              (attention): GPTNeoSelfAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
                (resid_dropout): Dropout(p=0.0, inplace=False)
                (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_

In [5]:
prompt = "### Prompt:\nHow do I install a text editor?\n### Response:\n"

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id
    )

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

### Prompt:
How do I install a text editor?
### Response:
You can install any text editor you want.


In [28]:
def chat(prompt, temperature):
    model.eval()
    input_text = f"### Prompt:\n{prompt}\n### Response:\n"
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=temperature,
            top_p=0.9,
            top_k=64,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)



In [29]:
response = chat('How do I develop applications for Android OS?', 0.7)
response

'### Prompt:\nHow do I develop applications for Android OS?\n### Response:\nYou have to install android-sdk, then you can do.'