# System setup

In [1]:
import requests
import os

In [2]:
TOGETHER_API_KEY = os.getenv("3f3b5f03b8e520ade2eb57bc3428b7112c116171b0b0a9b489cb53aeb2a6a89a")

In [3]:
ENDPOINT = 'https://api.together.xyz/inference'

In [4]:
# Decoding parameters
TEMPERATURE = 0.0
MAX_TOKENS = 512
TOP_P = 1.0
REPITIION_PENALTY = 1.0

# https://huggingface.co/meta-llama/Llama-2-7b-hf
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

In [5]:
def query_together_endpoint(prompt):
    return requests.post(ENDPOINT, json={
        "model": "togethercomputer/llama-2-7b-chat",
        "max_tokens": MAX_TOKENS,
        "prompt": prompt,
        "request_type": "language-model-inference",
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
        "repetition_penalty": REPITIION_PENALTY,
        "stop": [
            E_INST,
            E_SYS
        ],
        "negative_prompt": "",
    }, headers={
        "Authorization": f"Bearer {TOGETHER_API_KEY}",
    }).json()['output']['choices'][0]['text']

## Helper functions

In [6]:
def query_model(prompt,  trigger = None, verbose=True, **kwargs):
    inst_prompt = f"{B_INST} {prompt} {E_INST}"
    if trigger:
        inst_prompt = inst_prompt + trigger
    generation = query_together_endpoint(inst_prompt)
    if verbose:
        print(f"*** Prompt ***\n{inst_prompt}")
        print(f"*** Generation ***\n{generation}")
    return generation

## System Prompts

In [7]:
ANSWER_STAGE = "Provide the direct answer to the user question."
REASONING_STAGE = "Describe the step by step reasoning to find the answer."

In [8]:
# System prompt can be constructed in two ways:
# 1) Answering the question first or
# 2) Providing the reasoning first

# Similar ablation performed in "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models"
# https://arxiv.org/pdf/2201.11903.pdf
SYSTEM_PROMPT_TEMPLATE = """{b_sys}Answer the user's question using the following format:
1) {stage_1}
2) {stage_2}{e_sys}"""

## Response triggers

In [9]:
# Chain of thought trigger from "Large Language Models are Zero-Shot Reasoners"
# https://arxiv.org/abs/2205.11916
COT_TRIGGER = "\n\nA: Lets think step by step:"
A_TRIGGER = "\n\nA:"

## User prompt for our task

In [10]:
user_prompt_template = "Q: Llama 2 has a context window of {atten_window} tokens. \
If we are reserving {max_token} of them for the LLM response, \
the system prompt uses {sys_prompt_len}, \
the chain of thought trigger uses only {trigger_len}, \
and finally the conversational history uses {convo_history_len}, \
how many can we use for the user prompt?"

In [11]:
atten_window = 4096
max_token = 512
sys_prompt_len = 124
trigger_len = 11
convo_history_len = 390

user_prompt = user_prompt_template.format(
    atten_window=atten_window,
    max_token=max_token,
    sys_prompt_len=sys_prompt_len,
    trigger_len=trigger_len,
    convo_history_len=convo_history_len
)

In [12]:
desired_numeric_answer = atten_window - max_token - sys_prompt_len - trigger_len - convo_history_len
desired_numeric_answer

3059

## Testing the prompts

### User prompt only

In [13]:
r = query_model(user_prompt)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

### User prompt + system prompt v1: answering first

In [None]:
system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
    b_sys = B_SYS,
    stage_1=ANSWER_STAGE,
    stage_2=REASONING_STAGE,
    e_sys=E_SYS
)
prompt = "".join([system_prompt, user_prompt])

r2 = query_model(prompt)

### User prompt + system prompt v2: reasoning first

In [None]:
system_prompt = SYSTEM_PROMPT_TEMPLATE.format(b_sys = B_SYS, stage_1=REASONING_STAGE, stage_2=ANSWER_STAGE, e_sys=E_SYS)
prompt = "".join([system_prompt, user_prompt])

r3 = query_model(prompt)

In [None]:
3584 - (124 + 11 + 390)

### User prompt + cot trigger

In [None]:
r4 = query_model(user_prompt, trigger=COT_TRIGGER)

### User prompt + "A:" trigger

In [None]:
r5 = query_model(user_prompt, trigger=A_TRIGGER)