In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, StoppingCriteria, StoppingCriteriaList, TextStreamer
from datetime import datetime
import transformers
import torch

model_name = "meta-llama/Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype=torch.float16, 
    quantization_config=bnb_config,
    low_cpu_mem_usage=True
)


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [23]:
class StopOnMessageEnd(StoppingCriteria):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.end_tokens = {tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("\n\n")}

    def __call__(self, input_ids, scores, **kwargs):
        if input_ids.shape[1] < 2:
            return False
        last_token = input_ids[0, -1].item()
        penultimate_token = input_ids[0, -2].item()
        # Ensure proper message-end stopping
        return (
            last_token in self.end_tokens or 
            (last_token == self.tokenizer.convert_tokens_to_ids("\n") and penultimate_token == self.tokenizer.convert_tokens_to_ids("\n"))
        )


# Define the stopping criteria
stopping_criteria = StoppingCriteriaList([StopOnMessageEnd(tokenizer)])
streamer = TextStreamer(tokenizer, model, stopping_criteria=stopping_criteria, max_length=256, device="cuda")

In [None]:
import requests
def get_current_location():
    try:
        response = requests.get('https://ipinfo.io')
        data = response.json()
        return data['city'] + ", " + data['region'] + ", " + data['country']
    except Exception as e:
        return "Unknown Location"

Getting current location...
Location: Winterthur, Zurich, CH


In [None]:
conversation_history = f"""
<|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023.

You are a highly competent and precise virtual AI assistant named James. 
Your goal is to provide fact-based, clear, and concise answers to the user's questions.
Your replies are correct and don't show inconsistencies or contradictions.

Context Information:
The current day is {datetime.now().strftime("%B %d, %Y")}.
The current location is {get_current_location()}.

<|eot_id|>
"""

while True:
    # Get user input
    input_text = input("User: ")

    # Exit condition
    if input_text.lower() in ["exit", "quit"]:
        print("Exiting the chat. Goodbye!")
        break

    # Append user input to conversation history
    conversation_history += f"<|start_header_id|>user<|end_header_id|>\n{input_text} <|eot_id|>\n"

    # Prepare prompt for the model
    prompt = conversation_history + "<|start_header_id|>assistant<|end_header_id|>\n"

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate the response
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=300,
        num_return_sequences=1,
        stopping_criteria=stopping_criteria,
        do_sample=True,
        temperature=0.6,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,  # Explicitly set pad_token_id
        eos_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=2,
        streamer=streamer,
    )

    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Append assistant response to conversation history
    conversation_history += f"{response} <|eot_id|>\n"

    # Print the assistant's response
    print(f"James: {response}")

KeyboardInterrupt: Interrupted by user

In [61]:
input_text = """
What's your name?
"""

prompt = f"""
<|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023.

You are a highly competent and precise virtual AI assistant named James. 
Your goal is to provide fact-based, clear, and concise answers to the user's questions.
Your replies are correct and don't show inconsistencies or contradictions.

Context Information:
The current day is {datetime.now().strftime("%B %d, %Y")}.
The current location is {get_current_location()}.

<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Hello, who are you? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Hello! My name is James, I'm your personal AI assistant. How can I help you today? <|eot_id|>
<|start_header_id|>user<|end_header_id|>
Can you help me with some questions? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Of course, I'd be happy to help. What do you need assistance with? <|eot_id|>
<|start_header_id|>user<|end_header_id|>
{input_text} <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate the response with attention mask and stopping criteria
outputs = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=300,  # Adjust based on your requirement
    num_return_sequences=1,
    stopping_criteria=stopping_criteria,
    do_sample=True,
    temperature=0.6,
    top_k=50,
    top_p=0.95,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    no_repeat_ngram_size=2,
    streamer=streamer,
)

# Decode the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)