# Imports

In [None]:
import torch

In [None]:
# Check whether PyTorch can see a CUDA device; later cells move the model to "cuda".
torch.cuda.is_available()

In [None]:
from transformers import pipeline

from transformers import AutoModelForCausalLM, AutoTokenizer


In [None]:
!git config --global credential.helper store

!huggingface-cli login --token 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 --add-to-git-credential

# Transformers

In [None]:


# Hub checkpoint shared by the model and its tokenizer.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.1"

# Device the model and its inputs are moved onto in the following cells.
device = "cuda"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)


In [None]:
# Convert the model to half precision and move it to `device`. Being the last
# expression of the cell, the returned module is also displayed as cell output.
model.half().to(device)

In [None]:

# Multi-turn conversation in role/content form; the last entry is the user
# turn the model is asked to answer.
messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
    {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
    {"role": "user", "content": "Do you have mayonnaise recipes?"}
]

# Render the conversation through the tokenizer's chat template, asking for
# PyTorch tensors ("pt").
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

# Move the encoded prompt to the device the model was moved to.
model_inputs = encodeds.to(device)


In [None]:

# Generate up to 1000 new tokens with sampling enabled, so the text differs
# between runs unless a seed is set.
generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
# Decode every returned sequence and print the first one.
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

In [None]:
# Show which concrete class AutoModelForCausalLM instantiated.
type(model)

In [None]:
# Show which concrete class AutoTokenizer instantiated.
type(tokenizer)

In [None]:
!wget https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/resolve/main/zephyr-7b-beta.Q4_0.gguf?ref=localhost

In [None]:
# List the working directory (all files, human-readable sizes) to check the download.
!ls -ahl

In [None]:
!mv zephyr-7b-beta.Q4_0.gguf?ref=localhost zephyr-7b-beta.Q4_0.gguf

In [None]:
import time

def measure_tokens_per_second(model, tokenizer, prompts: list, max_tokens: int = 500) -> float:
    """Measure greedy-decoding throughput of ``model`` over a list of prompts.

    Each prompt is tokenized on its own and passed to ``model.generate`` with
    sampling disabled. Only the ``generate`` call is timed; tokenization and
    the transfer of the inputs to the model's device are excluded.

    Args:
        model: Object exposing ``generate(input_ids, max_new_tokens=...,
            do_sample=...)`` and a ``device`` attribute.
        tokenizer: Callable returning a mapping with an ``'input_ids'`` entry
            when called as ``tokenizer(prompt, return_tensors="pt")``.
        prompts: Prompt strings to benchmark.
        max_tokens: Upper bound on newly generated tokens per prompt.

    Returns:
        Total generated tokens divided by total generation time in seconds,
        or ``0.0`` when nothing was timed (for example, ``prompts`` is empty).
    """
    total_tokens_generated = 0
    total_time_taken = 0.0

    for prompt in prompts:
        # Tokenize and place the inputs on the model's own device, so the
        # function does not depend on a notebook-level `device` variable.
        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs['input_ids'].to(model.device)
        prompt_length = input_ids.shape[1]

        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        output = model.generate(input_ids, max_new_tokens=max_tokens, do_sample=False)
        total_time_taken += time.perf_counter() - start_time

        # The output holds prompt + continuation; count only the continuation.
        total_tokens_generated += output.shape[1] - prompt_length

    # Avoid ZeroDivisionError when no prompt was processed.
    if total_time_taken <= 0:
        return 0.0

    return total_tokens_generated / total_time_taken

In [None]:
# Benchmark prompts, passed to the tokenizer as plain strings (no chat template).
prompts = [
    "What is your favourite condiment?",
    "Do you have any recipes for mayonnaise?",
    "Once upon a time ",
    "Once upon a time on Mars ",
    "Once upon a time on in the distance past ",
]
# Allow up to 2048 generated tokens per prompt.
average_tokens_per_second = measure_tokens_per_second(model, tokenizer, prompts, max_tokens=2048)

In [None]:
print(average_tokens_per_second)

# Result recorded from an earlier run: tokens per second = 44.72630659078358

In [None]:
# Budget of generated tokens, added to the prompt length to form max_length below.
max_tokens = 2048
messages = [
    {"role": "user", "content": "Tell me a long story"},
]

# Render the single-turn conversation through the chat template as PyTorch tensors.
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

model_inputs = encodeds.to(device)

# Greedy decoding (do_sample=False); max_length counts prompt tokens plus generated tokens.
output = model.generate(model_inputs, max_length=model_inputs.shape[1] + max_tokens, do_sample=False)

decoded = tokenizer.batch_decode(output)

# Prints the whole list returned by batch_decode; the next cell prints the first element.
print (decoded)

In [None]:

# Print the first decoded sequence on its own, so newlines are rendered.
print (decoded[0])

In [None]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

def download_model_to_folder(model_name: str, folder_path: str):
    """Fetch a model and its tokenizer, using ``folder_path`` as the cache directory.

    Args:
        model_name: Identifier passed to ``from_pretrained``.
        folder_path: Directory created if missing and passed as ``cache_dir``.

    Returns:
        None. The loaded model and tokenizer objects are not kept; the function
        only reports the target folder when both loads have finished.
    """
    # Create the target directory; an existing one is accepted as is.
    os.makedirs(folder_path, exist_ok=True)

    # Load the model first, then the tokenizer, both with the same cache_dir.
    for loader in (AutoModelForCausalLM, AutoTokenizer):
        loader.from_pretrained(model_name, cache_dir=folder_path)

    print(f"Model and tokenizer downloaded to {folder_path}")

# Example usage
# Example usage
# NOTE(review): the repository name ends in "-gguf"; confirm that it can be
# loaded through AutoModelForCausalLM.from_pretrained as this call expects.
model_name = "l3utterfly/mistral-7b-v0.1-layla-v4-chatml-gguf"
folder_path = "./models/layla"
download_model_to_folder(model_name, folder_path)

In [None]:
# NOTE(review): no visible cell downloads anything into ./models/gpt2, so this
# listing depends on state left over from an earlier session.
!ls ./models/gpt2/models--gpt2/refs/main

In [None]:
# List one snapshot directory of the gpt2 cache.
# NOTE(review): the snapshot hash is hard-coded; confirm it still exists locally.
!ls ./models/gpt2/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e

In [None]:
!wget https://huggingface.co/l3utterfly/mistral-7b-v0.1-layla-v4-chatml-gguf/resolve/main/mistral-7b-v0.1-layla-v4-chatml-Q5_K.gguf?download=true

# Llama cpp server

In [None]:
# Inspect the cache entry written under ./models/layla by the download cell.
!ls ./models/layla/models--l3utterfly--mistral-7b-v0.1-layla-v4-chatml-gguf/refs/main

In [None]:
# List the working directory to confirm the GGUF file is present.
!ls

In [None]:
pip install llama-cpp-python

In [1]:


from llama_cpp import Llama


# GLOBAL VARIABLES
# GGUF file, looked up relative to the current working directory.
my_model_path = "mistral-7b-v0.1-layla-v4-chatml-Q5_K.gguf"
# Context window handed to the model as n_ctx.
CONTEXT_SIZE = 512


# LOAD THE MODEL
# NOTE(review): the variable is named `zephyr_model`, but the file loaded here
# is the Layla Mistral GGUF named in `my_model_path`.
# n_gpu_layers=33: presumably offloads every layer to the GPU — confirm.
zephyr_model = Llama(
                    model_path=my_model_path,
                    n_ctx=CONTEXT_SIZE,
                    n_gpu_layers=33
)

llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from mistral-7b-v0.1-layla-v4-chatml-Q5_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = models
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_cou

In [2]:
def generate_text_from_prompt(
        user_prompt,
        max_tokens = 100,
        temperature = 0.3,
        top_p = 0.1,
        echo = True,
        stop = None):
    """Run ``user_prompt`` through the notebook-level ``zephyr_model``.

    Args:
        user_prompt: Prompt passed to the model as the first positional argument.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        top_p: Forwarded as ``top_p``.
        echo: Forwarded as ``echo``.
        stop: Forwarded as ``stop``.

    Returns:
        Whatever the model call returns, unchanged.
    """
    # Collect the generation options once, then forward them as keywords.
    generation_options = {
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "echo": echo,
        "stop": stop,
    }
    return zephyr_model(user_prompt, **generation_options)

In [3]:

my_prompt = "Tell me a long story"


# Uses the function defaults (max_tokens=100, echo=True).
response = generate_text_from_prompt(my_prompt)


# Print the raw completion object returned by the model call.
print(response)


llama_print_timings:        load time =     112.72 ms
llama_print_timings:      sample time =      25.73 ms /   100 runs   (    0.26 ms per token,  3886.06 tokens per second)
llama_print_timings: prompt eval time =     112.66 ms /     6 tokens (   18.78 ms per token,    53.26 tokens per second)
llama_print_timings:        eval time =     840.86 ms /    99 runs   (    8.49 ms per token,   117.74 tokens per second)
llama_print_timings:       total time =    1000.44 ms /   105 tokens


{'id': 'cmpl-6edb3409-1e5c-47e9-998a-f436d6e847c8', 'object': 'text_completion', 'created': 1718827589, 'model': 'mistral-7b-v0.1-layla-v4-chatml-Q5_K.gguf', 'choices': [{'text': 'Tell me a long story,\n\nTell me a short one.\n\nI’ll listen to you,\n\nAnd I won’t say no.\n\nI’ll listen to your stories,\n\nOf love and of hate.\n\nI’ll listen to your stories,\n\nOf joy and of fate.\n\nI’ll listen to your stories,\n\nOf life and of death.\n\nI’ll listen to your stories,\n\nOf hope and of breath.', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 6, 'completion_tokens': 100, 'total_tokens': 106}}


In [4]:
# Top-level keys of the completion object.
response.keys()

dict_keys(['id', 'object', 'created', 'model', 'choices', 'usage'])

In [9]:
def print_dict_structure(d, indent=0, parent_is_list=False):
    """
    Print a tree view of a dictionary: one line per key with the value's type name.

    Nested dictionaries are printed recursively. Lists are wrapped in ``[`` and
    ``]`` marker lines, with one line per element; dictionary elements are
    shown as ``item`` and expanded recursively.

    Args:
    d (dict): The dictionary to print.
    indent (int): Current nesting level (used by the recursive calls).
    parent_is_list (bool): True when ``d`` is an element of a list; the last
        key then keeps the "tee" connector instead of the closing "elbow".
    """
    tee, elbow = '├─', '└─'
    gutter = '|' if indent > 0 else ''

    def lead(levels):
        # Left margin: optional gutter bar followed by four spaces per level
        # (a negative level yields no spaces).
        return gutter + '    ' * levels

    entries = list(d.items())
    final_entry = len(entries) - 1

    for position, (key, value) in enumerate(entries):
        closes_branch = position == final_entry and not parent_is_list
        connector = elbow if closes_branch else tee
        print(f"{lead(indent - 1)}{connector}{key} ({type(value).__name__})")

        if isinstance(value, dict):
            print_dict_structure(value, indent + 1)
            continue
        if not isinstance(value, list):
            continue

        # Lists: opening marker, one line per element, closing marker.
        print(f"{lead(indent)}{tee}[")
        final_item = len(value) - 1
        for item_position, item in enumerate(value):
            item_connector = elbow if item_position == final_item else tee
            if isinstance(item, dict):
                print(f"{lead(indent + 1)}{item_connector}item ({type(item).__name__})")
                print_dict_structure(item, indent + 2, parent_is_list=True)
            else:
                print(f"{lead(indent + 1)}{item_connector}{item} ({type(item).__name__})")
        print(f"{lead(indent)}{elbow}]")

In [10]:
# Show the nesting and value types of the completion object.
print_dict_structure(response)

├─id (str)
├─object (str)
├─created (int)
├─model (str)
├─choices (list)
├─[
    └─item (dict)
|    ├─text (str)
|    ├─index (int)
|    ├─logprobs (NoneType)
|    ├─finish_reason (str)
└─]
└─usage (dict)
|├─prompt_tokens (int)
|├─completion_tokens (int)
|└─total_tokens (int)


In [11]:
# Number of entries under "choices".
len(response["choices"])

1

In [12]:
# Text of the first choice; it begins with the prompt because echo=True was used.
print(response["choices"][0]["text"])

Tell me a long story,

Tell me a short one.

I’ll listen to you,

And I won’t say no.

I’ll listen to your stories,

Of love and of hate.

I’ll listen to your stories,

Of joy and of fate.

I’ll listen to your stories,

Of life and of death.

I’ll listen to your stories,

Of hope and of breath.


In [15]:
# Why generation stopped for the first choice.
print(response["choices"][0]["finish_reason"])

length


In [18]:
# Generated-token count reported by the model; the benchmark below sums this field.
response["usage"]["completion_tokens"]

100

## Benchmark llama cpp

In [None]:
import time

In [None]:

def llama_measure_tokens_per_second(prompts: list, max_tokens: int = 500) -> float:
    """Measure generation throughput of ``generate_text_from_prompt`` over prompts.

    Each prompt is sent through ``generate_text_from_prompt`` and the whole
    call is timed, so the measured time covers everything that call does.
    The token count is read from ``output["usage"]["completion_tokens"]``.

    Args:
        prompts: Prompt strings to benchmark.
        max_tokens: Forwarded as ``max_tokens`` for every prompt.

    Returns:
        Total completion tokens divided by total elapsed seconds, or ``0.0``
        when nothing was timed (for example, ``prompts`` is empty).
    """
    total_tokens_generated = 0
    total_time_taken = 0.0

    for prompt in prompts:
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        output = generate_text_from_prompt(prompt, max_tokens=max_tokens)
        total_time_taken += time.perf_counter() - start_time

        total_tokens_generated += output["usage"]["completion_tokens"]

    # Avoid ZeroDivisionError when no prompt was processed.
    if total_time_taken <= 0:
        return 0.0

    return total_tokens_generated / total_time_taken

In [None]:
# Same prompts as the transformers benchmark, so the two numbers are comparable.
prompts = [
    "What is your favourite condiment?",
    "Do you have any recipes for mayonnaise?",
    "Once upon a time ",
    "Once upon a time on Mars ",
    "Once upon a time on in the distance past ",
]
# NOTE(review): the model is loaded with a context size of 512 while 2048
# tokens are requested here; confirm how requests beyond the context window
# are handled before comparing against the transformers result.
average_tokens_per_second = llama_measure_tokens_per_second(prompts, max_tokens=2048)

In [None]:
# Average tokens per second measured for the llama.cpp model.
print(average_tokens_per_second)

In [None]:

# NOTE(review): this cell repeats the chat-template cell of the Transformers
# section and relies on `tokenizer` and `device` defined there. Nothing visible
# in the llama.cpp section uses `model_inputs`; it looks like a leftover.
messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
    {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
    {"role": "user", "content": "Do you have mayonnaise recipes?"}
]

encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

model_inputs = encodeds.to(device)


# Exllama

In [25]:
import requests

In [50]:
# Location of the text-generation HTTP service.
host = "threadripper"
port = "8400"
endpoint = "generate_text"

# Request body sent as JSON to the endpoint.
payload = {
  "user_prompt": "Write me a long story set in the X-Files universe: \n",
  "max_tokens": 10000,
  "temperature": 0.3,
  "top_p": 0.1,
  "echo": True,
  "overlap" : 500
}
# NOTE: `response` now refers to an HTTP response object, not the completion
# dict that the llama.cpp cells stored under the same name.
response = requests.post(f"http://{host}:{port}/{endpoint}", json=payload)
# Fail here with the HTTP status instead of a confusing error when the JSON
# body is read in the following cells.
response.raise_for_status()

In [51]:
# "token_count" field of the service's JSON reply.
response.json()["token_count"]

1847

In [52]:
# "generated_text" field of the service's JSON reply.
print(response.json()["generated_text"])

Write me a long story set in the X-Files universe: 

I'm not sure if I can do this, but I'll try.

The year is 2015 and Mulder has been missing for over a year now. Scully has been working with the FBI ever since, trying to find him. She's been through hell and back, but she won't give up until she finds her partner.

One day, while she's investigating a case in New York City, she gets a call from an old friend of Mulder's. He tells her that he has information about where Mulder might be. Scully is hesitant at first, but eventually agrees to meet him.

When they finally meet up, the man tells Scully that he knows where Mulder is, but he can't tell her anything more until she helps him with a case. Scully reluctantly agrees and they head out to investigate.

The case turns out to be much bigger than either of them expected. It involves a group of people who are trying to take over the world by using alien technology. They've been kidnapping people and experimenting on them, turning them