# LLAMA-2 utils

In [None]:
#|default_exp hf.transformers.llama

In [None]:
#|hide
from fastcore.test import *
from nbdev.showdoc import *

In [None]:
#|export

def prepare_llama2_for_training(tokenizer, model):
    """Configure a Llama-2 tokenizer/model pair in place for fine-tuning.

    The tokenizer is set to reuse its EOS token as the pad token and to pad
    on the right; on `model.config`, `pretraining_tp` is set to 1 and
    `use_cache` is switched off. Both objects are mutated; nothing is returned.
    """
    eos = tokenizer.eos_token
    tokenizer.pad_token = eos
    # Fix weird overflow issue with fp16 training
    tokenizer.padding_side = "right"

    config = model.config
    config.pretraining_tp = 1
    config.use_cache = False

def prepare_llama2_for_inference(tokenizer, model):
    """Configure a Llama-2 tokenizer/model pair in place for generation.

    The tokenizer is set to reuse its EOS token as the pad token and to pad
    on the left; `model.config.use_cache` is switched on. Both objects are
    mutated; nothing is returned.
    """
    eos = tokenizer.eos_token
    tokenizer.pad_token = eos
    tokenizer.padding_side = "left"

    config = model.config
    config.use_cache = True

In [None]:
#|export

import functools


@functools.lru_cache(maxsize=1)
def _default_llama2_tokenizer():
    """Load the fallback Llama-2 chat tokenizer once and reuse it afterwards."""
    # Imported lazily so that `transformers` is only required when the
    # caller does not supply a tokenizer of their own.
    from transformers import AutoTokenizer

    return AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")


def chat2text(example, key="messages", tokenizer=None):
    """Render the chat messages stored under `example[key]` as one prompt string.

    Args:
        example: Mapping that holds the list of chat messages under `key`.
        key: Name of the entry in `example` containing the messages.
        tokenizer: Object exposing `apply_chat_template`. When omitted, the
            "NousResearch/Llama-2-7b-chat-hf" tokenizer is loaded on first
            use and cached, so calling this function once per dataset row
            does not reload the tokenizer every time.

    Returns:
        A dict `{"text": <rendered prompt>}` with the untokenized template output.
    """
    if tokenizer is None:
        tokenizer = _default_llama2_tokenizer()
    text = tokenizer.apply_chat_template(example[key], tokenize=False)
    return {"text": text}


In [None]:
#|export

def _remove_special_tokens(text):
    """Delete every Llama-2 control marker from `text` and trim outer whitespace."""
    for token in ("</s>", "<s>[INST]", "[/INST]", "<<SYS>>", "<</SYS>>"):
        text = text.replace(token, "")
    return text.strip()


def _extract_messages_from_conv(conv):
    """Yield the chat messages contained in a single Llama-2 interaction.

    `conv` is one `<s>[INST] ... [/INST] ...` exchange with the trailing
    `</s>` already removed. Messages are yielded in system, user, assistant
    order; the system and assistant messages are skipped when empty.

    The markers are matched without requiring particular surrounding
    whitespace, so both the space-separated layout and the newline-separated
    layout (`<<SYS>>\\n...\\n<</SYS>>\\n\\n`) are parsed, and an interaction that
    ends right after `[/INST]` (no assistant reply yet) yields only the
    prompt-side messages.

    Raises:
        ValueError: if `conv` does not contain exactly one `[/INST]` marker
            or contains more than one `<</SYS>>` marker.
    """
    inst_count = conv.count("[/INST]")
    if inst_count != 1:
        raise ValueError(
            f"expected exactly one '[/INST]' marker per interaction, found {inst_count}: {conv!r}"
        )
    inp, _, out = conv.partition("[/INST]")

    if inp.count("<</SYS>>") > 1:
        raise ValueError(f"expected at most one '<</SYS>>' marker per interaction: {conv!r}")
    system_message, sys_marker, user_message = inp.partition("<</SYS>>")
    if not sys_marker:
        # No system block: everything before [/INST] is the user turn.
        system_message, user_message = "", inp

    system_message = _remove_special_tokens(system_message)
    user_message = _remove_special_tokens(user_message)
    assistant_message = _remove_special_tokens(out)

    if system_message:
        yield {'role': 'system', 'content': system_message}
    yield {'role': 'user', 'content': user_message}
    if assistant_message:
        yield {'role': 'assistant', 'content': assistant_message}


def _split_into_interactions(prompt):
    """Yield the non-empty, whitespace-trimmed pieces of `prompt` between `</s>` markers."""
    for conv in prompt.split("</s>"):
        conv = conv.strip()
        if conv:
            yield conv


def text2chat(text: str):
    """Parse a Llama-2 formatted prompt back into a list of chat messages.

    Args:
        text: Prompt made of `<s>[INST] ... [/INST] ... </s>` interactions,
            where an interaction may start with a `<<SYS>> ... <</SYS>>` block.

    Returns:
        A dict `{"messages": [...]}` whose items are
        `{"role": "system" | "user" | "assistant", "content": str}` dicts in
        the order they appear in `text`. Empty input gives an empty list.

    Raises:
        ValueError: if an interaction is malformed (see
            `_extract_messages_from_conv`).
    """
    messages = []
    for conv in _split_into_interactions(text):
        messages.extend(_extract_messages_from_conv(conv))
    return {"messages": messages}

In [None]:
#|hide
# Sample prompt with two `</s>`-terminated interactions: the first carries a
# <<SYS>> ... <</SYS>> system block, the second has only a user turn and a reply.
prompt = """<s>[INST] <<SYS>> You are a helpful assistant that extracts up to 12 entity-relation-entity triplets from given text. Use '|' as delimiter and provide one triplet per line. The entities in a triplet must be different. <</SYS>> Alison O'Donnell plays jazz and has been signed to Static Caravan Recordings and Deram Records in London. Funk is a derivative of jazz. Beef kway teow is commonly found in Singapore and Indonesia. Kway teow, beef tender loin, gula Melaka, sliced, dried black beans, garlic, dark soy sauce, lengkuas, oyster sauce, soya sauce, chilli and sesame oil are the main ingredients of Beef kway teow. [/INST] Alison O'Donnell|genre|Jazz Alison O'Donnell|record label|Deram Records Alison O'Donnell|record label|Static Caravan Recordings Beef kway teow|country|Singapore and Indonesia Beef kway teow|ingredient|Sesame oil Beef kway teow|main ingredient|Kway teow, beef tender loin, gula Melaka, sliced, dried black beans, garlic, dark soy sauce, lengkuas, oyster sauce, soya sauce, chilli and sesame oil Beef kway teow|region|Indonesia Deram Records|location|London Jazz|derivative|Funk </s><s>[INST] Abraham A. Ribicoff was born in Connecticut and died in the U.S. He was Governor of that state and his successor was John N. Dempsey. 1634: The Ram Rebellion has 512 pages. Albany, Oregon in the United States, is part of Linn County. English is spoken in the country which is home to Africans Americans and has Washington, D.C. as its capital. The OCLC number of "A Glastonbury Romance" is 76798317, and it is available in print. [/INST] 1634: The Ram Rebellion|number of pages|512 A Glastonbury Romance|media type|Print A Glastonbury Romance|oclc number|76798317 Abraham A. Ribicoff|birth place|Connecticut Abraham A. Ribicoff|death place|United States Abraham A. Ribicoff|office|Governor of Connecticut Abraham A. Ribicoff|successor|John N. 
Dempsey Albany, Oregon|country|United States Albany, Oregon|is part of|Linn County, Oregon United States|capital|Washington, D.C. United States|ethnic group|African Americans United States|language|English language </s>"""

# Last expression of the cell, so the notebook displays the parsed messages.
text2chat(prompt)

{'messages': [{'role': 'system',
   'content': "You are a helpful assistant that extracts up to 12 entity-relation-entity triplets from given text. Use '|' as delimiter and provide one triplet per line. The entities in a triplet must be different."},
  {'role': 'user',
   'content': "Alison O'Donnell plays jazz and has been signed to Static Caravan Recordings and Deram Records in London. Funk is a derivative of jazz. Beef kway teow is commonly found in Singapore and Indonesia. Kway teow, beef tender loin, gula Melaka, sliced, dried black beans, garlic, dark soy sauce, lengkuas, oyster sauce, soya sauce, chilli and sesame oil are the main ingredients of Beef kway teow."},
  {'role': 'assistant',
   'content': "Alison O'Donnell|genre|Jazz Alison O'Donnell|record label|Deram Records Alison O'Donnell|record label|Static Caravan Recordings Beef kway teow|country|Singapore and Indonesia Beef kway teow|ingredient|Sesame oil Beef kway teow|main ingredient|Kway teow, beef tender loin, gula Mela

In [None]:
#|hide
# Runs nbdev's export step; presumably this writes the `#|export` cells to the
# module named by `#|default_exp` above — confirm against the nbdev docs.
import nbdev; nbdev.nbdev_export()