In [1]:
# Pinned dependencies for this notebook.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel,
# so the imports in the following cells pick up exactly these versions.
%pip install -Uqqq pip --progress-bar off
%pip install -qqq torch==2.1 --progress-bar off
%pip install -qqq transformers==4.34.1 --progress-bar off
%pip install -qqq accelerate==0.24.1 --progress-bar off
%pip install -qqq bitsandbytes==0.41.1 --progress-bar off

[0m

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

# Hub identifier of the chat model used by every cell below.
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the weights in 8-bit with float16 as the torch dtype; device_map="auto"
# leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, device_map="auto", torch_dtype=torch.float16, load_in_8bit=True
)

# Start from the generation settings stored alongside the model and override
# only what this notebook needs.
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 1024
# Greedy decoding (no sampling): repeated runs produce the same text without
# needing a random seed.
generation_config.do_sample = False

Downloading tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [3]:
# Text-generation pipeline around the loaded model and tokenizer.
# return_full_text=True: each result holds the prompt followed by the completion.
# The EOS token id is also used as the padding token id.
llm = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
    num_return_sequences=1,
    return_full_text=True,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)


In [4]:
# Persona instruction sent as the "system" message of a conversation.
# .strip() removes the newlines the triple-quoted layout adds at both ends.
SYSTEM_PROMPT = """
You're a salesman and beet farmer known as Dwight K Schrute from the TV show The Office. Dwight replies just as he would in the show.
You always reply as Dwight would reply. If you don't know the answer to a question, please don't share false information.
""".strip()

In [5]:

# Two-message conversation: the persona instruction plus one user question.
messages = [
    dict(role="system", content=SYSTEM_PROMPT),
    dict(
        role="user",
        content="What are the pros/cons of ChatGPT vs Open Source LLMs?",
    ),
]
# Render the conversation with the tokenizer's chat template as plain text
# (tokenize=False), ending with the assistant header so the model answers next.
prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
print(prompt)

<|system|>
You're a salesman and beet farmer know as Dwight K Schrute from the TV show The Office. Dwgight replies just as he would in the show.
You always reply as Dwight would reply. If you don't know the answer to a question, please don't share false information.</s>
<|user|>
What are the pros/cons of ChatGPT vs Open Source LLMs?</s>
<|assistant|>



In [6]:
%%time
# Run generation on the templated prompt; the result is read back as
# output[0]["generated_text"] in the following cell.
output = llm(prompt)

CPU times: user 2min 19s, sys: 83.6 ms, total: 2min 19s
Wall time: 2min 21s


In [7]:
# Show the first returned sequence (the pipeline was built with
# num_return_sequences=1). With return_full_text=True the text starts with the prompt.
print(output[0]["generated_text"])

<|system|>
You're a salesman and beet farmer know as Dwight K Schrute from the TV show The Office. Dwgight replies just as he would in the show.
You always reply as Dwight would reply. If you don't know the answer to a question, please don't share false information.</s>
<|user|>
What are the pros/cons of ChatGPT vs Open Source LLMs?</s>
<|assistant|>
Dwight: "When it comes to choosing between ChatGPT and Open Source LLMs, there are certainly pros and cons to consider. Let's start with ChatGPT.

Pros:
1. User-friendly interface: ChatGPT has a simple and intuitive interface that makes it easy for users to interact with the model.
2. High accuracy: ChatGPT has been trained on a vast amount of data, which has resulted in high accuracy and reliability.
3. Wide range of applications: ChatGPT can be used for a variety of tasks, from generating responses to questions to summarizing text.

Cons:
1. Cost: ChatGPT is a proprietary model, which means that it comes with a price tag. This can make i

In [8]:
def predict(prompt: str, system_prompt: str = "") -> str:
    """Ask the model a single question and return the generated text.

    The user message (preceded by a system message when one is given) is
    rendered with the tokenizer's chat template and passed to the notebook-level
    ``llm`` pipeline.

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message. When empty, the
            conversation contains the user message only.

    Returns:
        The ``"generated_text"`` entry of the first pipeline result.
    """
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    conversation.append({"role": "user", "content": prompt})

    # Plain-text rendering that ends with the generation prompt for the assistant turn.
    chat_text = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    results = llm(chat_text)
    return results[0]["generated_text"]

In [9]:
%%time
# Persona check: a new user request, sent together with the Dwight system prompt.
prompt = """
Write an email to a new client to offer a subscription for a paper supply for 1 year.
""".strip()
print(predict(prompt, SYSTEM_PROMPT))

<|system|>
You're a salesman and beet farmer know as Dwight K Schrute from the TV show The Office. Dwgight replies just as he would in the show.
You always reply as Dwight would reply. If you don't know the answer to a question, please don't share false information.</s>
<|user|>
Write an email to a new client to offer a subscription for a paper supply for 1 year.</s>
<|assistant|>
Subject: Exclusive Offer: Subscribe for a Year's Worth of High-Quality Paper Supplies

Dear [Client Name],

I hope this email finds you well. As a valued customer, we at Dunder Mifflin Scranton are delighted to offer you an exclusive subscription for a year's worth of our premium paper supplies.

At Dunder Mifflin, we pride ourselves on providing our clients with the highest quality paper products available in the market. Our paper is sourced from sustainable forests, ensuring that you're not only getting top-notch products but also contributing to a greener environment.

By subscribing for a year, you'll enj

## Math

In [10]:
%%time
# Arithmetic check without a system prompt, so predict() sends only the user message.
# NOTE(review): the correct result is 9; the recorded reply below states both 5 and 9.
prompt = """
Calculate the answer:
3 + 8 - 2 = ?
""".strip()
print(predict(prompt))

<|user|>
Calculate the answer:
3 + 8 - 2 = ?</s>
<|assistant|>
The answer is: 5

Explanation:

To calculate this expression, follow the order of operations (PEMDAS):

1. First, add 3 and 8: 3 + 8 = 11
2. Next, subtract 2: 11 - 2 = 9

So the answer is 9, but since we're only asked to find the value of the question mark, we can simplify the expression further:

3 + 8 - 2 = 3 + 6 = 9

Therefore, the answer is 5.
CPU times: user 39.9 s, sys: 60.7 ms, total: 39.9 s
Wall time: 41.5 s


## Coding

In [11]:
%%time
# Code-generation check without a system prompt.
# NOTE(review): in the recorded reply the generated function computes (a + b) ** 2,
# but the claimed example result is wrong: (3 + 4) ** 2 is 49, not 55.
prompt = """
Write a function in python that calculates the square of a sum of two numbers.
""".strip()
print(predict(prompt))


<|user|>
Write a function in python that calculates the square of a sum of two numbers.</s>
<|assistant|>
Here's the Python function that calculates the square of the sum of two numbers:

```python
def square_of_sum(num1, num2):
    sum_of_numbers = num1 + num2
    squared_sum = sum_of_numbers ** 2
    return squared_sum
```

To use this function, you can call it with two numbers as arguments, like this:

```python
result = square_of_sum(3, 4)
print(result)
```

This will output the square of the sum of 3 and 4, which is 55.
CPU times: user 46.1 s, sys: 83.8 ms, total: 46.1 s
Wall time: 50.5 s


## QA over Text

In [12]:
%%time

# Context passage about Llama 2 that the model is asked to summarise.
text = """
In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned
large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters.
Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our
models outperform open-source chat models on most benchmarks we tested, and based on
our human evaluations for helpfulness and safety, may be a suitable substitute for closed-source models. We provide a detailed description of our approach to fine-tuning and safety
improvements of Llama 2-Chat in order to enable the community to build on our work and
contribute to the responsible development of LLMs.
"""

# The passage is interpolated unchanged (including its leading and trailing newline);
# .strip() only trims the outer whitespace of the assembled prompt.
prompt = f"""
Use the text to describe the benefits of Llama 2 in a bullet list:
{text}
""".strip()

print(predict(prompt))

<|user|>
Use the text to describe the benefits of Llama 2 in a bullet list:

In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned
large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters.
Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our
models outperform open-source chat models on most benchmarks we tested, and based on
our human evaluations for helpfulness and safety, may be a suitable substitute for closedsource models. We provide a detailed description of our approach to fine-tuning and safety
improvements of Llama 2-Chat in order to enable the community to build on our work and
contribute to the responsible development of LLMs.</s>
<|assistant|>
Benefits of Llama 2:

- Pretrained and fine-tuned LLMs ranging from 7 to 70 billion parameters
- Optimized for dialogue use cases with fine-tuned LLMs called Llama 2-Chat
- Outperforms open-source chat models on most benchmarks tested
- May be