# Llama 2 Code

### Run the following cells to install required packages


In [None]:
!pip install sentencepiece==0.1.99
!pip install transformers==4.31.0
!pip install accelerate==0.21.0
!pip install bitsandbytes==0.41.1

Collecting sentencepiece==0.1.99
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.31.0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m

### Loading Llama 2




In [None]:
import os
from getpass import getpass

from transformers import LlamaForCausalLM, LlamaTokenizer

# Single place to change the checkpoint used by both the tokenizer and the model.
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

# Never paste the access token into the notebook: read it from the HF_TOKEN
# environment variable, and fall back to an interactive prompt whose input is
# not echoed or saved with the notebook.
hf_access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID, token=hf_access_token)

# load_in_8bit=True loads the weights quantized to 8 bits; device_map="auto"
# lets the library decide where to place the model.
model = LlamaForCausalLM.from_pretrained(
    MODEL_ID,
    load_in_8bit=True,
    device_map="auto",
    token=hf_access_token,
)

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

## Llama2 Prompting with Special Tokens



$\quad$ \- The prompt format follows the reference implementation: https://github.com/facebookresearch/llama/blob/main/llama/generation.py#L212


### You can initialize the generation configuration here:

*   temperature
*   top_p
*   repetition_penalty



In [None]:
def llama2_talk(text, max_new_tokens=512, temperature=0.6, top_p=0.9, repetition_penalty=1.2):
    """Send one user message to the Llama 2 chat model and print its reply.

    The message is wrapped in the ``[INST] ... [/INST]`` markers, tokenized
    with the notebook-level ``tokenizer`` and completed by the notebook-level
    ``model`` using sampling.

    Args:
        text: The user message to send.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        repetition_penalty: Repetition penalty forwarded to ``model.generate``.

    Returns:
        None. The decoded reply is printed after the ``"Llama response: "`` prefix.
    """
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "top_p": top_p,
        "temperature": temperature,
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
    }

    B_INST, E_INST = "[INST]", "[/INST]"

    # Special format required by the Llama 2 chat model.
    prompt = f"{B_INST} {text} {E_INST}"

    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"].to(model.device)
    # Pass the mask explicitly so generate() does not have to infer it.
    attention_mask = encoded["attention_mask"].to(model.device)
    prompt_size = input_ids.shape[-1]

    generate_ids = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)

    # The generated sequence starts with the prompt tokens, so the reply begins
    # exactly at index prompt_size. (Slicing at prompt_size + 1 would drop the
    # first generated token; leading whitespace is removed by strip() instead.)
    new_token_ids = generate_ids[0][prompt_size:]
    response = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

    print("Llama response: ", response)

### Code with System Prompts (Optional)



*   System prompts provide more context about the underlying task to the model: https://huggingface.co/blog/llama2

In [None]:
def llama2_system_talk(system, text, max_new_tokens=512, temperature=0.6, top_p=0.9, repetition_penalty=1.2):
    """Send a system prompt plus one user message to Llama 2 chat and print the reply.

    The system prompt is wrapped in ``<<SYS>>`` markers and placed before the
    user message inside a single ``[INST] ... [/INST]`` block. The prompt is
    tokenized with the notebook-level ``tokenizer`` and completed by the
    notebook-level ``model`` using sampling.

    Args:
        system: System message giving the model context about the task.
        text: The user message to send.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        repetition_penalty: Repetition penalty forwarded to ``model.generate``.

    Returns:
        None. The decoded reply is printed after the ``"Llama response: "`` prefix.
    """
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "top_p": top_p,
        "temperature": temperature,
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
    }

    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    # Special format required by the Llama 2 chat model, where a system message
    # provides more context about the task. The markers already carry their own
    # newlines, so the system prompt and user text are concatenated directly
    # (no extra spaces), matching the reference layout B_SYS + system + E_SYS + user.
    prompt = f"{B_INST} {B_SYS}{system}{E_SYS}{text} {E_INST}"

    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"].to(model.device)
    # Pass the mask explicitly so generate() does not have to infer it.
    attention_mask = encoded["attention_mask"].to(model.device)
    prompt_size = input_ids.shape[-1]

    generate_ids = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)

    # The generated sequence starts with the prompt tokens, so the reply begins
    # exactly at index prompt_size. (Slicing at prompt_size + 1 would drop the
    # first generated token; leading whitespace is removed by strip() instead.)
    new_token_ids = generate_ids[0][prompt_size:]
    response = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

    print("Llama response: ", response)

# Examples:

In [None]:
# A sentence to classify, then (after a newline) the question put to the model.
input_text = (
    "The movie Openhimer is the best movie ever to come out in 2023 \n"
    " What is the sentiment of the following sentence? Is it positive, neutral or negative?"
)

llama2_talk(input_text)

Llama response:  Based on the sentence "The movie Openhimer is the best movie ever to come out in 2023," the sentiment is POSITIVE. The use of the word "best" and the exclamation point convey enthusiasm and excitement about the movie, indicating a positive sentiment.


# Examples with System Prompts (Optional):

In [None]:
# System message: gives the model its role for this task.
system_prompt = (
    "You are a helpful, respectful and honest assistant. "
    "Your role is to read the given text and identify the corresponding sentiment"
)

# User message: a sentence to classify, then (after a newline) the question.
input_text = (
    "The movie Openhimer is the best movie ever to come out in 2023 \n"
    " What is the sentiment of the following sentence? Is it positive, neutral or negative?"
)

llama2_system_talk(system_prompt, input_text)

Llama response:  Great! Based on the information provided, the sentiment of the sentence "The movie Openhimer is the best movie ever to come out in 2023" is POSITIVE.
