In [None]:
from transformers import AutoModelForCausalLM,AutoTokenizer,pipeline
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-13b-chat-hf",
    load_in_4bit=True,
)

In [None]:
import torch 
torch.cuda.empty_cache()

In [None]:
from transformers import AutoModelForCausalLM,AutoTokenizer,pipeline
import torch.nn.functional as F
import torch
class ChatBot(object):
    def __init__(self,model_id):
        self.model = AutoModelForCausalLM.from_pretrained(model_id,load_in_4bit=True,)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        chat = [
        {"role": "system", "content": "You are a helpful and honest assistant."},
        ]
        print(tokenizer.apply_chat_template(chat, tokenize=False))
    def chat(self,prompt):
        #prompt = "Hello, how are you doing today?"
        inputs = self.tokenizer(prompt,return_tensors="pt").input_ids
        with torch.no_grad():
            out = self.model.generate(inputs,max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
        #probs = F.softmax(logits['logits'],dim=-1)
        #toks = probs.argmax(dim=-1)
        return self.tokenizer.batch_decode(out)#self.tokenizer.batch_decode(toks)


In [None]:
chatbot = ChatBot("meta-llama/Llama-2-13b-chat-hf")

In [None]:
resp = chatbot.chat("Hello, please explain how a catalytic converter works.")

In [None]:
print(resp[0])

In [None]:
x = chatbot.tokenizer.batch_decode(resp)

In [None]:
len(x)
print(x[0])

In [None]:
print()

## Chat pipeline

Prompting with Llama2: https://discuss.huggingface.co/t/trying-to-understand-system-prompts-with-llama-2-and-transformers-interface/59016

Pipelines in Huggingface: https://huggingface.co/docs/transformers/en/add_new_pipeline

In [1]:
import torch
from transformers import AutoModelForCausalLM,AutoTokenizer,pipeline
class ChatBot(object):
    def __init__(self,model_id):
        model = AutoModelForCausalLM.from_pretrained(model_id,load_in_4bit=True)
        model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.chat_history = [
            {"role": "system", "content" : "You are a helpful and intelligent AI assistant who responds to user queries."}
        ]
        self.pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=self.tokenizer,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
        self.num_tokens = len(self.tokenizer(self.chat_history[0]['content']).input_ids)
    def calculate_num_tokens(self,prompt):
        self.num_tokens+=len(self.tokenizer(prompt).input_ids)
        print('num_tokens:',self.num_tokens)
    def chat(self,prompt):
        self.calculate_num_tokens(prompt)
        self.chat_history.append(
            {"role": "user", "content": prompt}
        )
        resp = self.pipe(self.chat_history,max_new_tokens=512)
        print(resp)
        self.chat_history.append(
            {"role": "system", "content" : resp[0]['generated_text'][-1]['content']}
        )
        self.calculate_num_tokens(self.chat_history[-1]["content"])
        return self.chat_history[-1]["content"]

In [7]:
dir(chatbot.pipe.model)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_assisted_decoding',
 '_auto_class',
 '_autoset_attn_implementation',
 '_backward_compatibility_gradient_checkpointing',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_beam_sample',
 '_beam_search',
 '_buffers',
 '_call_impl',
 '_check_and_enable_flash_attn_2',
 '_check_and_enable_sdpa',
 '_compiled_call_impl',
 '_constrained_beam_search',
 '_contrastive_search',
 '_convert_head_mask_to_5d',
 '_copy_lm_head_original_to_resized',
 '_create_repo',
 '_dispatch_accelerate_model',
 '_expand_inputs_for_generation',
 

In [2]:
chatbot = ChatBot("meta-llama/Llama-2-13b-chat-hf")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
response = chatbot.chat("What is a catalytic converter?")

num_tokens: 27




[{'generated_text': [{'role': 'system', 'content': 'You are a helpful and intelligent AI assistant who responds to user queries.'}, {'role': 'user', 'content': 'What is a catalytic converter?'}, {'role': 'assistant', 'content': "  Hello! I'd be happy to help you understand what a catalytic converter is.\n\nA catalytic converter is an emissions control device that is used in vehicles to reduce the amount of harmful pollutants emitted into the atmosphere. It is typically located in the exhaust system of a vehicle and is designed to convert harmful pollutants like carbon monoxide (CO), hydrocarbons (HC), and nitrogen oxides (NOx) into less harmful substances like carbon dioxide (CO2), water (H2O), and nitrogen (N2).\n\nThe catalytic converter works by using a catalyst, typically a precious metal like platinum or palladium, to facilitate a chemical reaction that converts the harmful pollutants into less harmful substances. The catalyst is coated onto a ceramic or metallic honeycomb-like st

In [4]:
print(response)

  Hello! I'd be happy to help you understand what a catalytic converter is.

A catalytic converter is an emissions control device that is used in vehicles to reduce the amount of harmful pollutants emitted into the atmosphere. It is typically located in the exhaust system of a vehicle and is designed to convert harmful pollutants like carbon monoxide (CO), hydrocarbons (HC), and nitrogen oxides (NOx) into less harmful substances like carbon dioxide (CO2), water (H2O), and nitrogen (N2).

The catalytic converter works by using a catalyst, typically a precious metal like platinum or palladium, to facilitate a chemical reaction that converts the harmful pollutants into less harmful substances. The catalyst is coated onto a ceramic or metallic honeycomb-like structure, which is located within the converter.

As exhaust gas flows through the converter, it comes into contact with the catalyst, which causes a chemical reaction to occur. This reaction converts the harmful pollutants into less 

In [5]:
response2 = chatbot.chat("Can you explain what you mean by 'catalyst'?")

num_tokens: 350
[{'generated_text': [{'role': 'system', 'content': 'You are a helpful and intelligent AI assistant who responds to user queries.'}, {'role': 'user', 'content': 'What is a catalytic converter?'}, {'role': 'system', 'content': "  Hello! I'd be happy to help you understand what a catalytic converter is.\n\nA catalytic converter is an emissions control device that is used in vehicles to reduce the amount of harmful pollutants emitted into the atmosphere. It is typically located in the exhaust system of a vehicle and is designed to convert harmful pollutants like carbon monoxide (CO), hydrocarbons (HC), and nitrogen oxides (NOx) into less harmful substances like carbon dioxide (CO2), water (H2O), and nitrogen (N2).\n\nThe catalytic converter works by using a catalyst, typically a precious metal like platinum or palladium, to facilitate a chemical reaction that converts the harmful pollutants into less harmful substances. The catalyst is coated onto a ceramic or metallic hone

In [None]:
print(response2)

In [None]:
response3 = chatbot.chat("When was the catalytic converter invented?")

In [None]:
print(response3)

In [None]:
chatbot.num_tokens

In [None]:
response4 = chatbot.chat("What other parts of a car are important for pollution control?")

In [None]:
print(response4)

In [None]:
chatbot.calculate_num_tokens()

In [None]:
import torch
from transformers import AutoModelForCausalLM,AutoTokenizer,pipeline

model_id = "meta-llama/Llama-2-13b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_id,load_in_4bit=True)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_id)


chat = [
    {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
    {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
]


pipe = pipeline("text-generation", model=model,tokenizer=tokenizer, torch_dtype=torch.bfloat16, device_map="auto")
response = pipe(chat, max_new_tokens=512)
print(response[0]['generated_text'][-1]['content'])

In [None]:
len(response)
type(response[0])
response[0].keys()
response[0]['generated_text']

## Chaotic playground:

In [None]:
import transformers
import torch

#model_id = "meta-llama/Meta-Llama-3-8B"
model_id = "SweatyCrayfish/llama-3-8b-quantized"

pipeline = transformers.pipeline("text-generation", model=model_id, load_in_4bit=True, device_map="auto",torch_dtype=torch.float32)
pipeline("Hey how are you doing today?")

In [None]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3-70B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda",
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

prompt = pipeline.tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    prompt,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0]["generated_text"][len(prompt):])


In [None]:
import torch
from transformers import AutoModelForCausalLM,AutoTokenizer

model_4bit = AutoModelForCausalLM.from_pretrained("SweatyCrayfish/llama-3-8b-quantized", load_in_4bit=True, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained("SweatyCrayfish/llama-3-8b-quantized")


In [None]:
model_4bit.state_dict().keys()

In [None]:
from transformers import AutoModelForCausalLM,AutoTokenizer
import torch 
#model_id = "Meta-Llama-3-8B.Q2_K.gguf"
model_id = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2"
#model_4bit = AutoModelForCausalLM.from_pretrained("SweatyCrayfish/llama-3-8b-quantized", load_in_4bit=True, torch_dtype=torch.float32)
#tokenizer = AutoTokenizer.from_pretrained("SweatyCrayfish/llama-3-8b-quantized")
#tokenizer = AutoTokenizer.from_pretrained("QuantFactory/Meta-Llama-3-8B-GGUF")
model_4bit = AutoModelForCausalLM.from_pretrained(model_id)


In [None]:
model_id = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2"

In [None]:
#input_ids = torch.tensor(tokenizer.encode("Hey how are you doing today?"))
prompt = "What is the Python programming language?"
inputs = tokenizer(prompt, return_tensors="pt").input_ids

In [None]:
inputs

In [None]:
y = model_4bit(inputs)

In [None]:
y.keys()

In [None]:
import torch.nn.functional as F 

out = F.softmax(y['logits'],dim=-1).argmax(dim=-1)

In [None]:
out.shape

In [None]:
dir(tokenizer)
tokenizer.batch_decode(out)

In [None]:
model_4bit

In [None]:
from transformers import AutoModelForCausalLM,AutoTokenizer,pipeline
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-13b-chat-hf",
    load_in_4bit=True,
)
#model = AutoModelForCausalLM.from_pretrained("gpt2", load_in_4bit=True, device_map="auto")
#tokenizer = AutoTokenizer.from_pretrained("gpt2")

#pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)



In [None]:
prompt = "Hello, how are you doing today?"
inputs = tokenizer(prompt,return_tensors="pt").input_ids
out = model.generate(inputs,max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)



In [None]:
out
tokenizer.batch_decode(out)

In [None]:
pipe("Hello, how are you today?")

In [None]:
model.eval()

In [None]:
from transformers import AutoModelForCausalLM,AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf")


In [None]:
type(tokenizer.default_chat_template)

In [None]:
prompt = "What is the Python programming language?"

inputs = tokenizer(prompt, return_tensors="pt").input_ids



In [None]:
inputs.shape

In [None]:
import torch.nn.functional as F
with torch.no_grad():
    logits = model(inputs)
probs = F.softmax(logits['logits'],dim=-1)
toks = probs.argmax(dim=-1)

In [None]:
toks

In [None]:
probs.sum(dim=1)

In [None]:
outputs.keys()


In [None]:
tokenizer.batch_decode(toks)

In [None]:
tokenizer.default_chat_template

In [None]:
from transformers import pipeline

pipe = pipeline("text-generation", "HuggingFaceH4/zephyr-7b-beta")
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]
print(pipe(messages, max_new_tokens=128)[0]['generated_text'][-1])  # Print the assistant's response

In [None]:
pipe = pipeline("text-generation", "HuggingFaceH4/zephyr-7b-beta")
pipe = pipeline("text-generation","meta-llama/Llama-2-13b-chat-hf")
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]
print(pipe(messages, max_new_tokens=128)[0]['generated_text'][-1])  # Print the assistant's response