In [11]:
from transformers import AutoModel, AutoConfig, AutoTokenizer, LlamaForCausalLM, BitsAndBytesConfig
import torch

In [12]:
tokenizer = AutoTokenizer.from_pretrained("KoboldAI/llama2-tokenizer")
# Specify the path to the directory containing the model files
model_dir = "out/hf"

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Specify the target GPU device
device = 'cuda:0'  # Change to 'cuda:1' if you want to use GPU 1

# Create a custom device map that assigns all layers to the target device
device_map = {'': device}
model_name = "sabareesh88/fw14k"

# Load the configuration
config = AutoConfig.from_pretrained(model_dir)

bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the model with the quantization config
model = LlamaForCausalLM.from_pretrained(
    model_name,
    config=config,
    # quantization_config=bnb_config,
    device_map=device_map
)
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=2752, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2752, bias=False)
          (down_proj): Linear(in_features=2752, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((1024,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-05)
      )
    )
    (norm)

In [17]:
# Example input text
input_text = "Once upon a time"

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt").to(device)  # Move inputs to GPU

# Define the generation parameters
max_new_tokens = 500  # Maximum number of new tokens to generate
temperature = 0.7    # Sampling temperature
top_k = 50           #

# Generate text using the model's `generate` method
with torch.no_grad():
    output = model.generate(
        inputs["input_ids"],
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        do_sample=True  # Enable sampling for more diverse outputs
    )

# Decode the generated tokens back into text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Once upon a time, the world’s greatest inventors had a great deal to say about the importance of their contributions to the world.
The first inventor to make a huge contribution was Leonardo da Vinci. We already know his contribution to the world, but we don’t know exactly how much he contributed. But we can tell you that he was a genius. He was the first to use a camera obscura to take pictures. His invention was so important that after he invented it, people began to copy him and use his invention in their lives.
The second inventor to make a remarkable contribution was the inventor and explorer, Albert Einstein. He had a lot of trouble with the laws of physics, but he also had a lot of incredible ideas. His work was so important to the world that he won the Nobel Prize in Physics in 1921.
The third inventor we mentioned earlier was the inventor and inventor, Leonardo da Vinci. He was a genius, and his inventions were so important that he was awarded the Nobel Prize in Physics in 190