In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import BitsAndBytesConfig
import torch
import gc

  from .autonotebook import tqdm as notebook_tqdm
2026-01-27 17:48:13.647737: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-27 17:48:14.037540: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-01-27 17:48:15.410358: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [None]:
class ModelLoad():
    """Load a causal language model with bitsandbytes quantization and query it.

    Nothing is loaded at construction time. Call ``load_in4bit`` or
    ``load_in8bit`` to load the tokenizer and model and build the pipeline,
    ``response`` to generate an answer, and ``clean`` to drop the references
    again.
    """

    def __init__(self, path : str):
        """Store the checkpoint location; loading is deferred to a ``load_*`` call.

        Args:
            path: Checkpoint location handed to ``from_pretrained``.
        """
        self.path = path
        # All three stay None until a load_* method succeeds, and are reset by clean().
        self.tokenizer = None
        self.model = None
        self.pipe = None

    def _load(self, method, **quantization_kwargs):
        """Load tokenizer and quantized model once and wrap them in a pipeline.

        Args:
            method: Task name passed as first argument to ``pipeline``.
            **quantization_kwargs: Keyword arguments for ``BitsAndBytesConfig``.
        """
        # A model is already loaded: keep it. Call clean() first to switch
        # to a different quantization.
        if self.model is not None:
            return

        # Build everything in locals and commit at the end, so that a failure
        # part-way through never leaves `model` set while `pipe` is still None
        # (which would make every later load_* call a silent no-op).
        tokenizer = AutoTokenizer.from_pretrained(self.path, local_files_only=True)
        model = AutoModelForCausalLM.from_pretrained(
            self.path,
            device_map=0,
            quantization_config=BitsAndBytesConfig(**quantization_kwargs),
        )
        pipe = pipeline(method, tokenizer=tokenizer, model=model, device_map=0)

        self.tokenizer = tokenizer
        self.model = model
        self.pipe = pipe

    def load_in4bit(self, method):
        """Load the model in 4-bit NF4 with double quantization and fp16 compute.

        Does nothing if a model is already loaded.

        Args:
            method: Pipeline task name, e.g. ``"text-generation"``.
        """
        self._load(
            method,
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )

    def load_in8bit(self, method):
        """Load the model in 8-bit.

        Does nothing if a model is already loaded.

        Args:
            method: Pipeline task name, e.g. ``"text-generation"``.
        """
        self._load(method, load_in_8bit=True)

    def response(self, query: str, max_new_tokens: int = 512) -> str:
        """Generate an answer to ``query`` with the loaded pipeline.

        Args:
            query: The question inserted into the prompt template.
            max_new_tokens: Upper bound on generated tokens.

        Returns:
            The generated text with the prompt removed and surrounding
            whitespace stripped.

        Raises:
            RuntimeError: If no pipeline has been loaded yet.
        """
        if self.pipe is None:
            raise RuntimeError("Model not loaded. Call load_in4bit() or load_in8bit() first.")

        prompt = (
            "You are a knowledgeable assistant. "
            "Answer the question clearly and concisely.\n\n"
            f"Question: {query}\n"
            "Answer:"
        )

        output = self.pipe(
            prompt,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=self.tokenizer.eos_token_id,
        )[0]["generated_text"]

        # Remove the prompt by its known prefix. Splitting on the last
        # "Answer:" would cut off any reply that itself contains "Answer:".
        if output.startswith(prompt):
            answer = output[len(prompt):]
        else:
            # The pipeline did not echo the prompt verbatim; fall back to the marker.
            answer = output.split("Answer:")[-1]

        return answer.strip()

    def clean(self):
        """Drop the pipeline, model and tokenizer references and free cached GPU memory."""
        self.pipe = None
        self.model = None
        self.tokenizer = None

        # Collect first so the released tensors are actually freed before
        # asking the CUDA allocator to return its cached blocks.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [4]:
# Local snapshot directory of the Meta-Llama-3-8B-Instruct checkpoint.
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
path = "/home/ak/Projects/Models/TextGeneration/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/8afb486c1db24fe5011ec46dfbe5b5dccdb575c2"

In [5]:
# Create the wrapper only; no tokenizer or model is loaded until a load_* call.
llmLLama = ModelLoad(path)

In [7]:
# Load the tokenizer and the 4-bit quantized model, then build a text-generation pipeline.
llmLLama.load_in4bit("text-generation")

Loading checkpoint shards: 100%|██████████| 4/4 [00:18<00:00,  4.51s/it]
Device set to use cuda:0


In [18]:
# Ask one question; generation uses sampling (do_sample=True), so the output varies between runs.
question = "Generate me a YAML object taking 2 inputs"
print(llmLLama.response(query=question))

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Here is a YAML object that takes 2 inputs:

```
name: 'My YAML Object'
description: 'This YAML object takes two inputs: input1 and input2'
inputs:
  - name: input1
    type: string
  - name: input2
    type: integer
```



Would you like me to generate another YAML object or help you with something else? Let me know!


In [19]:
# Drop the pipeline, model and tokenizer references, run gc and empty the CUDA cache.
llmLLama.clean()