# Generation with LLMs

In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive so that paths under it
# (e.g. the HuggingFace cache folder configured in the next cell) are reachable.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Set the models folder: HuggingFace looks for models in this folder
# and downloads them into it when they are not there yet.
%env HF_HOME=/content/drive/MyDrive/HMD/cache

env: HF_HOME=/content/drive/MyDrive/HMD/cache


We will use ```Llama2-7b-chat``` and ```Llama-3-8B-Instruct```.
These models are fine-tuned (chat/instruct) versions of the corresponding base models.
Since they were prompted with specific templates during fine-tuning, we use the same templates at inference time so that the models operate under the conditions they were trained for.

In [4]:
# Short model keys mapped to their HuggingFace Hub identifiers.
MODELS = {
    "llama2": "meta-llama/Llama-2-7b-chat-hf",
    "llama3": "meta-llama/Meta-Llama-3-8B-Instruct",
}

# Prompt templates, keyed like MODELS. Each template has two positional `{}`
# placeholders, filled with `template.format(system_prompt, user_input)`:
# the first receives the system prompt, the second the user message.
# NOTE(review): both templates spell out the begin-of-sequence token themselves;
# check that the tokenizer is not adding a second one when the text is encoded.
TEMPLATES = {
    "llama2": "<s>[INST] <<SYS>>\n{}\n<</SYS>>\n\n{} [/INST]",
    # In the Llama 3 format every header (`<|end_header_id|>`) is followed by a
    # blank line, including the assistant header that opens the model's turn.
    "llama3": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
}

## Create a HuggingFace Access Token

1. Create an account on [HuggingFace](https://huggingface.co/join)
2. Log in on [HuggingFace](https://huggingface.co/login)
3. [Create a new access token](https://huggingface.co/settings/tokens)

    1. Click on "Create New Access Token"
    2. Select "Read" as Token Type
    3. Give it a name, e.g. HMD
    4. Create the token and copy it right away: you won't be able to view it again afterwards
    5. Paste it in the cell below to use it as an environment variable

In [None]:
import os
from getpass import getpass

# Ask for the access token interactively instead of writing it in the notebook
# source, so the secret is not saved (or shared) together with the notebook.
# Paste the token at the prompt, without quotes. The prompt is skipped when
# HF_TOKEN is already set in the environment.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("HuggingFace access token: ").strip()

## Download the models (only the first time)

1. Request access for [LLaMA 2](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) and [LLaMA 3](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) by following the instructions on their HuggingFace page
2. Once access has been granted, run the code below to download the models (you will need about 28GB of free space)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

def download_models(models):
    """Download the weights and the tokenizer of every model in ``models``.

    ``models`` is a mapping whose values are the identifiers handed to
    ``from_pretrained``; its keys are not used. Nothing is returned: the model
    and tokenizer objects are created only to trigger the download.
    """
    for repo_id in models.values():
        # Creating the model triggers the download of its weights.
        AutoModelForCausalLM.from_pretrained(
            repo_id, device_map="auto", torch_dtype=torch.float16
        )
        # Creating the tokenizer does the same for the tokenizer files.
        AutoTokenizer.from_pretrained(repo_id)


download_models(MODELS)

## Prompt the models

Import the required libraries and classes.

In [5]:
import torch

from typing import Tuple
from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding, PreTrainedTokenizer, PreTrainedModel

Functions for loading the models and generating responses.

In [6]:
def load_model(model_name: str, dtype) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    """Load a causal language model together with its tokenizer.

    Args:
        model_name: identifier passed to ``from_pretrained`` for both the model
            and the tokenizer.
        dtype: ``"bf16"`` selects ``torch.bfloat16``; any other value selects
            ``torch.float32``.

    Returns:
        The ``(model, tokenizer)`` pair.
    """
    weights_dtype = torch.bfloat16 if dtype == "bf16" else torch.float32
    loaded_model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="auto", torch_dtype=weights_dtype
    )
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    return loaded_model, loaded_tokenizer

def generate(
    model: PreTrainedModel,
    inputs: BatchEncoding,
    tokenizer: PreTrainedTokenizer,
    max_seq_length: int,
) -> str:
    """Run generation on ``inputs`` and return the decoded continuation.

    Args:
        model: model whose ``generate`` method is called.
        inputs: encoded prompt; its ``input_ids`` and ``attention_mask`` are used.
        tokenizer: supplies the pad token id (its EOS id) and decodes the output.
        max_seq_length: forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The text decoded from the first output sequence, after skipping as many
        leading tokens as the prompt has and dropping special tokens.
    """
    prompt_ids = inputs.input_ids
    sequences = model.generate(
        prompt_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_seq_length,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Skip the first len(prompt) tokens of the output so only what follows is decoded.
    prompt_length = len(prompt_ids[0])
    continuation = sequences[0][prompt_length:]
    return tokenizer.decode(continuation, skip_special_tokens=True)

Parameters and input for the generation.

In [None]:
# Short key of the model to use; must be a key of both MODELS and TEMPLATES.
# Kept in its own variable so that `model_name` always means the hub identifier
# and any single line of this cell can be re-run safely.
model_key = "llama3"
chat_template = TEMPLATES[model_key]
model_name = MODELS[model_key]

# "bf16" makes load_model use torch.bfloat16; any other value means torch.float32.
dtype = "bf16"
# Passed to generate(), which forwards it to model.generate as max_length.
max_seq_length = 512

Load the model and tokenizer based on the parameters.

In [None]:
# Load the selected model and its tokenizer with the dtype chosen in the parameters cell.
model, tokenizer = load_model(model_name, dtype)

### Segment the input into multiple sentences

In [None]:
# System prompt asking the model to split the user input into sentences
# according to the listed intents, and to answer with a list of sentences only.
# NOTE(review): the request_info line reads "wants to which drinks" — a verb
# such as "know" seems to be missing; left untouched here because it is prompt text.
segmentor_prompt = """
Break the user input into multiple sentences based on the following intents:
- pizza_ordering, if the user wants to order a pizza.
- request_info(drink_type), if the user wants to which drinks are available.
- out_of_domain, if the input does not match any of the above.
Only provide the sentences, without the intent in list separated by commas, as follows
["sentence1", "sentence2", ...]
Only provide the list.
"""

# Example input containing two requests: a pizza order and a question about drinks.
user_input = "I would like to order margherita pizza, please. Which kind of drinks do you have."

In [None]:
# Fill the chat template with the system prompt and the user input.
input_text = chat_template.format(segmentor_prompt, user_input)
# The chat template already spells out the model's special tokens (including
# the begin-of-sequence token), so the tokenizer must not add them a second time.
inputs = tokenizer(
    input_text, return_tensors="pt", add_special_tokens=False
).to(model.device)

# Generate a response
response = generate(model, inputs, tokenizer, max_seq_length)
print(response)



["I would like to order a margherita pizza.", "Which kind of drinks do you have?"]


In [None]:
import json

# Parse the model's reply as JSON; with the segmentor prompt the reply is
# expected to be a JSON list of sentences.
json.loads(response)

['I would like to order a margherita pizza, please.',
 'What drinks are available?']

### NLU

In [None]:
# System prompt for the NLU step: pick one intent from a fixed list and, for
# pizza_ordering, extract the pizza slots, answering with JSON only.
nlu_system_prompt = """Identify the user intent from this list: 
[pizza_ordering, pizza_delivery, drink_ordering, out_of_domain].
If the intent is pizza_ordering, extract the following slot values from the user input
- pizza_size, the size of the pizza
- pizza_type, the type of pizza
- pizza_count, the number of pizzas.
If no values are present in the user input you have to put null as the value.
Output them in a json format.
Only output the json file.
The json format is: 
{
    "intent": "intent_value", 
    "slots": {
        "slot1": "value1",
        "slot2": "value2",
        "slot3": "value3"
    }
}"""

# Example user turn: a pizza order that names the type but neither size nor count.
nlu_input = "I would like to order margherita pizza, please."

In [None]:
# Fill the chat template with the NLU system prompt and the user turn.
input_text = chat_template.format(nlu_system_prompt, nlu_input)
# The chat template already spells out the model's special tokens (including
# the begin-of-sequence token), so the tokenizer must not add them a second time.
inputs = tokenizer(
    input_text, return_tensors="pt", add_special_tokens=False
).to(model.device)

# Generate a response
response = generate(model, inputs, tokenizer, max_seq_length)
print(response)



{
    "intent": "pizza_ordering", 
    "slots": {
        "pizza_type": "margherita",
        "pizza_size": null,
        "pizza_count": null
    }
}


### DST

In [None]:
from copy import deepcopy

def clean_response(response: dict) -> dict:
    """Return a copy of ``response`` without empty entries.

    An entry is dropped when its value is ``None`` or when it is a dictionary
    that ends up empty after being cleaned recursively. Other falsy values
    (``0``, ``""``, ``False``) are kept.

    Args:
        response: (possibly nested) dictionary, e.g. a parsed NLU output.
            It is not modified.

    Returns:
        A new dictionary, independent of ``response``, holding only the
        entries that carry a value.
    """
    cleaned = {}
    for key, value in response.items():
        # Identity check: only the None singleton counts as "no value".
        if value is None:
            continue
        if isinstance(value, dict):
            nested = clean_response(value)
            # A sub-dictionary with nothing left in it is dropped as well.
            if nested:
                cleaned[key] = nested
        else:
            # Copy leaf values so the result shares no mutable object with the input.
            cleaned[key] = deepcopy(value)
    return cleaned


def update_ds(ds: dict, nlu_response: dict) -> dict:
    """Merge the values of ``nlu_response`` into the dialogue state ``ds``.

    The merge is recursive and happens in place:
    - ``None`` values are skipped, so they never erase what the state holds;
    - dictionaries are merged key by key into the matching entry of the state,
      which is created (or replaced, if it is not a dictionary) when needed;
    - every other value (strings, numbers, booleans, lists, ...) overwrites
      the entry with the same key.

    Args:
        ds: current dialogue state; it is modified in place.
        nlu_response: new values, typically the parsed NLU output.

    Returns:
        The same ``ds`` object, after the update.
    """
    for key, value in nlu_response.items():
        if value is None:
            # Nothing new for this key: keep the current state.
            continue
        if isinstance(value, dict):
            current = ds.get(key)
            # The state may lack this key or hold a non-dict placeholder (e.g. None).
            if not isinstance(current, dict):
                current = {}
            ds[key] = update_ds(current, value)
        else:
            # Any non-None scalar is a valid slot value, not only strings.
            ds[key] = value

    return ds



In [None]:
import json

# Dialogue state before the new user turn: only the pizza type is known.
dialogue_state = {
    "intent": "pizza_ordering",
    "slots": {
        "pizza_type": "margherita",
        "pizza_size": None,
        "pizza_count": None
    }
}

# Example NLU output for the new turn, as a JSON string: only the size is filled.
nlu_response = """{
    "intent": "pizza_ordering", 
    "slots": {
        "pizza_type": null,
        "pizza_size": "medium",
        "pizza_count": null
    }
}"""


# Parse the JSON string (the name now refers to the parsed dict), drop the
# empty entries and merge what is left into the dialogue state (updated in place).
nlu_response = json.loads(nlu_response)
update_ds(dialogue_state, clean_response(nlu_response))

{'intent': 'pizza_ordering',
 'slots': {'pizza_type': 'margherita',
  'pizza_size': 'medium',
  'pizza_count': None}}

### DM

In [None]:
# System prompt for the Dialogue Manager step: given the NLU output, answer
# with the next best action chosen from the two listed below.
dm_system_prompt = """You are the Dialogue Manager.
Given the output of the NLU component, you should only generate the next best action from this list:
- request_info(slot), if a slot value is missing (null) by substituting slot with the missing slot name
- confirmation(intent), if all slots have been filled"""

# Example NLU output, as a JSON string, with two slots still missing.
dm_input = """
{
    "intent": "pizza_ordering",
    "slots": {
        "pizza_size": null,
        "pizza_type": "margherita",
        "pizza_count": null
    }
}"""

In [None]:
# Fill the chat template with the DM system prompt and the NLU output.
input_text = chat_template.format(dm_system_prompt, dm_input)
# The chat template already spells out the model's special tokens (including
# the begin-of-sequence token), so the tokenizer must not add them a second time.
inputs = tokenizer(
    input_text, return_tensors="pt", add_special_tokens=False
).to(model.device)

# Generate a response
response = generate(model, inputs, tokenizer, max_seq_length)
print(response)



request_info("pizza_size")


### NLG

In [None]:
# System prompt for the NLG step: turn the next best action chosen by the
# Dialogue Manager into a polite sentence for the user.
nlg_system_prompt = """You are the NLG component: you must be very polite.
Given the next best action classified by the Dialogue Manager (DM), 
you should only generate a lexicalized response for the user.
Possible next best actions are:
- request_info(slot): generate an appropriate question to ask the user for the missing slot value 
- confirmation(intent): generate an appropriate confirmation message for the user intent"""

# Example input combining the NLU output and the DM decision in one JSON object.
# The object must be closed by its own brace, otherwise the model is handed
# malformed JSON.
nlg_input = """
{
    "NLU" : {
        "intent": "pizza_ordering",
        "slots": {
            "pizza_size": null,
            "pizza_type": "margherita",
            "pizza_count": null
        }
    },
    "DM": {
        "next_best_action": "request_info(pizza_size)"
    }
}
"""

In [None]:
# Fill the chat template with the NLG system prompt and the NLU + DM information.
input_text = chat_template.format(nlg_system_prompt, nlg_input)
# The chat template already spells out the model's special tokens (including
# the begin-of-sequence token), so the tokenizer must not add them a second time.
inputs = tokenizer(
    input_text, return_tensors="pt", add_special_tokens=False
).to(model.device)

# Generate a response
response = generate(model, inputs, tokenizer, max_seq_length)
print(response)



What size pizza would you like to order?


### Full pipeline

In [None]:
def run_stage(system_prompt: str, text: str) -> str:
    """Query the loaded model with ``text`` under ``system_prompt``.

    The two strings are inserted into ``chat_template``, the result is
    tokenized and moved to the model's device, and the reply produced by
    ``generate`` is returned.
    """
    prompt = chat_template.format(system_prompt, text)
    # The chat template already spells out the model's special tokens (including
    # the begin-of-sequence token), so the tokenizer must not add them again.
    encoded = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    ).to(model.device)
    return generate(model, encoded, tokenizer, max_seq_length)


user_input = "I would like a small margherita pizza."

# NLU -> DM -> NLG: each stage receives the raw reply of the previous one.
nlu_response = run_stage(nlu_system_prompt, user_input)
dm_response = run_stage(dm_system_prompt, nlu_response)
nlg_response = run_stage(nlg_system_prompt, dm_response)

In [None]:
# Show the user turn followed by the reply of each pipeline stage,
# one labelled entry per line.
stage_lines = [
    f"User: {user_input}",
    f"NLU: {nlu_response}",
    f"DM: {dm_response}",
    f"NLG: {nlg_response}",
]
print("\n".join(stage_lines))

User: I would like a small margherita pizza.
NLU: 

{
    "intent": "pizza_ordering", 
    "slots": {
        "pizza_size": null,
        "pizza_type": "margherita",
        "pizza_count": null
    }
}
DM: 

request_info("pizza_size")
NLG: 

What size pizza would you like to order? Would you prefer a small, medium, large, or extra-large pie?


### Parameters

Parameters for the underlying Python script:

```
usage: python -m query_model [-h] [--system-prompt SYSTEM_PROMPT] [--dtype {f32,bf16}] [--max_seq_length MAX_SEQ_LENGTH] [--return-full] [--dotenv-path DOTENV_PATH] {llama2,llama3} INPUT_TEXT

Query a specific model with a given input.

positional arguments:
  {llama2,llama3}       The model to query.
  INPUT_TEXT            The input to query the model with.

options:
  -h, --help            show this help message and exit
  --system-prompt SYSTEM_PROMPT
                        The system prompt to use for the model. (default: )
  --dtype {f32,bf16}    The data type to use for the model. (default: f32)
  --max_seq_length MAX_SEQ_LENGTH
                        The maximum sequence length to use for the model. (default: 128)
```