# LLM generation at a high level

```python
query = "Capital of"

# Sketch of autoregressive generation (LLM and MAX_GENERATED_TOKENS are
# placeholders in this illustration). The running text starts from the prompt
# so the model is conditioned on it; each pass appends the next piece the
# model returns to everything produced so far.
output = query
for _ in range(MAX_GENERATED_TOKENS):
    output = output + LLM(output)
```

0. output = Capital of
1. output = Capital of <mark>France</mark>
2. output = Capital of France <mark>is</mark>
3. output = Capital of France is <mark>Paris</mark>

```
query = A chat between a curious user and an artificial intelligence assistant.
        The assistant gives helpful, detailed, and polite answers to the user's questions.
        USER: My name is Aniket
        ASSISTANT:
```

In [1]:
# Single-turn chat prompt: a system preamble, the user's message substituted
# for {input}, and a trailing assistant cue left open for the model to complete.
longchat_template = (
    "A chat between a curious user and an artificial intelligence assistant.\n"
    "The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
    "USER: {input}\n"
    "ASSISTANT:"
)

In [2]:
# Fill the template with a statement and show the exact text sent to the model.
rendered_intro = longchat_template.format(input="My name is Aniket")
print(rendered_intro)

A chat between a curious user and an artificial intelligence assistant.
The assistant gives helpful, detailed, and polite answers to the user's questions.
USER: My name is Aniket
ASSISTANT:


In [3]:
# Same template, this time filled with a question.
rendered_question = longchat_template.format(input="What is the capital of France?")
print(rendered_question)

A chat between a curious user and an artificial intelligence assistant.
The assistant gives helpful, detailed, and polite answers to the user's questions.
USER: What is the capital of France?
ASSISTANT:


In [4]:
from llm_inference import LLMInference, prepare_weights
from rich import print

In [None]:
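# Alternative checkpoint (disabled): uncomment to load Llama-2-7b-chat
# instead of the LongChat model loaded in the next cell.
# path = str(prepare_weights("meta-llama/Llama-2-7b-chat-hf"))
# model = LLMInference(checkpoint_dir=path, quantize="bnb.nf4")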

In [5]:
# Resolve the LongChat checkpoint directory, then build the inference wrapper
# from it with the "bnb.nf4" quantization setting.
weights_dir = prepare_weights("lmsys/longchat-7b-16k")
path = str(weights_dir)
model = LLMInference(checkpoint_dir=path, quantize="bnb.nf4")

You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


weights already exists at checkpoints/lmsys/longchat-7b-16k


Loading model 'checkpoints/lmsys/longchat-7b-16k/lit_model.pth' with {'org': 'lmsys', 'name': 'longchat-7b-16k', 'block_size': 16384, 'vocab_size': 32000, 'padding_multiple': 64, 'padded_vocab_size': 32000, 'n_layer': 32, 'n_head': 32, 'n_embd': 4096, 'rotary_percentage': 1.0, 'parallel_residual': False, 'bias': False, 'n_query_groups': 32, 'shared_attention_norm': False, '_norm_class': 'RMSNorm', 'norm_eps': 1e-06, '_mlp_class': 'LLaMAMLP', 'intermediate_size': 11008, 'condense_ratio': 8}


Time to instantiate model: 5.07 seconds.
Time to load the model weights: 11.70 seconds.
/home/aniket/miniconda3/envs/am/lib/python3.10/site-packages/lightning/fabric/fabric.py:943: The model passed to `Fabric.setup()` has 66 parameters on different devices (for example 'transformer.wte.weight' on cuda:0 and 'lm_head.weight' on cpu). Since `move_to_device=True`, all parameters will be moved to the new device. If this is not desired, set `Fabric.setup(..., move_to_device=False)`.


In [6]:
# Prompt used for the chat calls below: the system preamble on one line,
# then the user turn ({input}) and an open assistant cue.
longchat_template = "\n".join(
    [
        "A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
        "USER: {input}",
        "ASSISTANT:",
    ]
)

In [7]:
# Ask a factual question: build the full prompt, run it through the model,
# and print the returned text.
capital_question = "What is the capital of France?"
query = longchat_template.format(input=capital_question)
output = model.chat(query)
print(output)



Time for inference: 1.23 sec total, 5.70 tokens/sec
Memory used: 13.40 GB


In [8]:
# Send a plain statement instead of a question and print the model's reply.
introduction_prompt = longchat_template.format(input="My name is Aniket")
output = model.chat(introduction_prompt)
print(output)



Time for inference: 0.66 sec total, 18.09 tokens/sec
Memory used: 13.40 GB


In [9]:
# A longer, open-ended request.
poem_prompt = longchat_template.format(input="Write a poem on Lightning AI")
output = model.chat(poem_prompt)
print(output)



Time for inference: 8.08 sec total, 22.53 tokens/sec
Memory used: 13.40 GB


## Memory

In [10]:
# Two independent calls: each prompt is built from the template alone, so
# nothing from the first message is included in the second prompt.
# The introduction is a statement (no trailing "?"), matching the earlier
# "My name is Aniket" example.
for user_message in ("My name is Aniket", "What is my name?"):
    output = model.chat(longchat_template.format(input=user_message))
    print(output)



Time for inference: 2.02 sec total, 5.94 tokens/sec
Memory used: 13.40 GB




Time for inference: 8.57 sec total, 6.89 tokens/sec
Memory used: 13.40 GB


In [11]:
# Draft of a prompt that carries an earlier exchange under "Context:".
# NOTE: this value is reassigned by the next cell before it is ever used.
longchat_template = "\n".join(
    [
        "A chat between a curious user and an artificial intelligence assistant.",
        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
        "Context:",
        "User: My name is Aniket",
        "Assistant: Hi, Aniket how are you?",
        "",
        "USER: {input}",
        "ASSISTANT:",
    ]
)

In [12]:
# Prompt with the earlier exchange hard-coded under "Context:", so the
# follow-up question is sent together with the introduction.
longchat_template = (
    "A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
    "Context:\n"
    "USER: My name is Aniket!\n"
    "ASSISTANT: How can I help you Aniket?\n"
    "USER: {input}\n"
    "ASSISTANT:"
)

name_prompt = longchat_template.format(input="What is my name?")
output = model.chat(name_prompt)
print(output)



Time for inference: 1.37 sec total, 5.13 tokens/sec
Memory used: 13.40 GB


In [13]:
# Prompt with a {history} slot so earlier turns can be injected at call time
# instead of being hard-coded into the template.
longchat_template = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
Context:
{history}
USER: {input}
ASSISTANT:"""

# Earlier turns use the same upper-case role labels as the template
# ("USER:" / "ASSISTANT:") so the whole prompt is formatted consistently.
history = "USER: Hi, I am Aniket!\nASSISTANT: How can I help you Aniket?"

query = longchat_template.format(input="What is my name?", history=history)
output = model.chat(query)
print(output)



Time for inference: 1.35 sec total, 5.18 tokens/sec
Memory used: 13.40 GB


[PromptTemplate doc](https://python.langchain.com/docs/modules/model_io/prompts/prompt_templates/)

In [15]:
from langchain.prompts import PromptTemplate

# Raw prompt text with two placeholders: {history} for earlier turns and
# {input} for the current user message.
longchat_template = "\n".join(
    [
        "A chat between a curious user and an artificial intelligence assistant.",
        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
        "Context:",
        "{history}",
        "USER: {input}",
        "ASSISTANT:",
    ]
)

# Wrap the text in a LangChain PromptTemplate, declaring both placeholders.
longchat_prompt_template = PromptTemplate(
    template=longchat_template,
    input_variables=["input", "history"],
)

In [16]:
# Render the PromptTemplate to inspect the final prompt text. The history
# uses the same upper-case role labels as the template ("USER:" /
# "ASSISTANT:") so every turn in the prompt is formatted the same way.
print(
    longchat_prompt_template.format(
        input="What is my name?",
        history="USER: Hi, I am Aniket!\nASSISTANT: How can I help you Aniket?",
    )
)

In [17]:
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferWindowMemory

from llm_chain import LitGPTLLM


# Adapt the loaded model to LangChain's LLM interface.
llm = LitGPTLLM(model=model)


# Conversation chain whose memory fills the prompt's {history} slot.
# The memory prefixes are set to the template's own role labels
# ("USER" / "ASSISTANT") so remembered turns and the current turn are
# labelled identically. k=2 is the window size passed to the memory.
conversation = ConversationChain(
    llm=llm,
    prompt=longchat_prompt_template,
    verbose=False,
    memory=ConversationBufferWindowMemory(
        ai_prefix="ASSISTANT",
        human_prefix="USER",
        k=2,
    ),
)

In [18]:
# First turn through the chain; display only the "response" entry.
first_turn = conversation("hi, I am Aniket")
first_turn["response"]



Time for inference: 2.04 sec total, 5.87 tokens/sec
Memory used: 13.40 GB


'Hello Aniket! How can I assist you today?'

In [19]:
# Follow-up question that refers back to the previous turn.
name_turn = conversation("What is my name?")
name_turn["response"]



Time for inference: 1.38 sec total, 5.06 tokens/sec
Memory used: 13.40 GB


'Your name is Aniket.'

In [20]:
# Third turn on an unrelated topic.
timezone_turn = conversation("What is the timezone of London?")
timezone_turn["response"]



Time for inference: 2.34 sec total, 5.99 tokens/sec
Memory used: 13.40 GB


'London is in the GMT (Greenwich Mean Time) timezone.'

In [21]:
# Inspect the messages stored in the chain's memory object.
stored_messages = conversation.memory.chat_memory
print(stored_messages)

In [22]:
from langchain.memory import ConversationBufferMemory

# Rebuild the chain with ConversationBufferMemory in place of the windowed
# memory. The memory prefixes are set to the prompt template's own role
# labels ("USER" / "ASSISTANT") so remembered turns and the current turn
# are labelled identically.
conversation = ConversationChain(
    llm=llm,
    prompt=longchat_prompt_template,
    verbose=False,
    memory=ConversationBufferMemory(
        ai_prefix="ASSISTANT",
        human_prefix="USER",
    ),
)

In [23]:
# Give the chain a fact to remember and print its reply.
lightning_fact = "PyTorch Lightning is an open-source library developed by Lightning AI team."
output = conversation(lightning_fact)["response"]
print(output)



Time for inference: 38.56 sec total, 7.29 tokens/sec
Memory used: 13.40 GB


In [24]:
# Ask about the fact stated in the previous turn and print the reply.
recall_question = (
    "who developed PyTorch Lightning? "
    "just give me the name of the team or person and nothing else."
)
output = conversation(recall_question)["response"]
print(output)



Time for inference: 3.12 sec total, 6.08 tokens/sec
Memory used: 13.41 GB


* https://twitter.com/yanndubs/status/1681644889145237504?s=20