<a href="https://colab.research.google.com/github/aartikushal/IC-LLAMA-ASSIGNMENT/blob/main/LLAMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1) Install dependencies**

In [8]:
!pip install transformers torch accelerate



In [None]:
# Store a Hugging Face access token locally; the tokenizer cell below asks
# from_pretrained to authenticate when it downloads the model files.
!huggingface-cli login

**2) Log in to Hugging Face**

In [11]:
from transformers import AutoTokenizer
import transformers
import torch

# Hub id of the checkpoint to use; the base (non-chat) variant is
# "meta-llama/Llama-2-7b-hf".
model = "meta-llama/Llama-2-7b-chat-hf"

# token=True makes from_pretrained use the locally stored Hugging Face
# credentials. It replaces the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(model, token=True)



In [None]:
from transformers import pipeline

# Text-generation pipeline for the checkpoint named by `model`, loaded with
# float16 weights and device_map="auto".
llama_pipeline = pipeline(
    task="text-generation",
    model=model,
    device_map="auto",
    torch_dtype=torch.float16,
)

**4) Chat helper function**

In [None]:
def generate_llama_response(prompt: str, max_new_tokens: int = 256) -> None:
    """
    Generate a response from the Llama model and print it.

    Parameters:
        prompt (str): The user's input/question for the model.
        max_new_tokens (int): Upper bound on the number of tokens generated
            after the prompt. Defaults to 256.

    Returns:
        None: Prints the model's response.
    """
    # Budget the generation with max_new_tokens rather than max_length:
    # max_length also counts the prompt tokens, so a long prompt would leave
    # little or no room for the answer.
    sequences = llama_pipeline(
        prompt,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
    )
    # generated_text holds the prompt followed by the model's continuation.
    print("Chatbot:", sequences[0]['generated_text'])



# First smoke test: ask the model for TV show recommendations.
prompt = (
    'I liked "Breaking Bad" and "Band of Brothers". '
    "Do you have any recommendations of other shows I might like?\n"
)
generate_llama_response(prompt)

**5) Quick test**

In [None]:
# Adjacent string literals are concatenated as-is, so every sentence carries
# its own trailing space; without it the sentences run together
# ("...with it.Based on that...").
prompt = (
    "I'm a programmer and Python is my favorite language because of its "
    "simple syntax and variety of applications I can build with it. "
    "Based on that, what language should I learn next? "
    "Give me 5 recommendations"
)
generate_llama_response(prompt)

In [None]:
# Misspelled words in the prompt are fed to the model verbatim, so keep the
# question well-formed.
prompt = 'How can I make a quick breakfast using non-starchy veggies?\n'
generate_llama_response(prompt)

In [None]:
# Console chat loop: keep answering until the user types an exit word.
while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted cell: end the chat cleanly instead
        # of surfacing a traceback.
        print("\nChatbot: Goodbye!")
        break
    if not user_input:
        # Nothing typed; ask again rather than sending an empty prompt.
        continue
    if user_input.lower() in {"bye", "quit", "exit"}:
        print("Chatbot: Goodbye!")
        break
    generate_llama_response(user_input)


**6) Gradio Chat UI**

In [6]:

# Default settings for the chat UI. MAX_TOKENS, TEMPERATURE and TOP_P are
# overwritten by the UI's respond() handler on every submitted message.
# System message that format_prompt() places at the start of every prompt.
SYSTEM_PROMPT = "You are a helpful, concise assistant."
# Passed as max_new_tokens by generate_chat_response().
MAX_TOKENS = 512
# Passed as temperature by generate_chat_response().
TEMPERATURE = 0.7
# Passed as top_p by generate_chat_response().
TOP_P = 0.95

def format_prompt(history, user_message, system_prompt=None):
    """
    Build a LLaMA-2 chat prompt from the conversation so far.

    The layout follows the LLaMA-2 chat convention:

        <s>[INST] <<SYS>>\\n{system}\\n<</SYS>>\\n\\n{user} [/INST] {answer} </s><s>[INST] {user} [/INST]

    Parameters:
        history: Iterable of (user_text, assistant_text) pairs for the
            earlier turns, oldest first. May be empty or None.
        user_message (str): The new user message to answer.
        system_prompt (str | None): System message for the conversation.
            Defaults to the module-level SYSTEM_PROMPT.

    Returns:
        str: The prompt text, ending right after the final "[/INST]".
    """
    if system_prompt is None:
        system_prompt = SYSTEM_PROMPT

    # The format needs strictly alternating user/assistant turns, so only
    # complete exchanges are replayed. A turn with a missing side would
    # otherwise shift text into the wrong role's slot.
    completed_turns = [(u, a) for u, a in (history or []) if u and a]

    # The system block is followed by a blank line before the first user turn.
    parts = [f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"]
    for u, a in completed_turns:
        parts.append(f"{u} [/INST] {a} </s><s>[INST] ")
    # No trailing space after the last [/INST]: the model's continuation
    # supplies its own leading whitespace.
    parts.append(f"{user_message} [/INST]")
    return "".join(parts)

def generate_chat_response(history, user_message):
    """
    Generate the assistant's reply for the chat UI.

    Parameters:
        history: Earlier (user_text, assistant_text) turns, forwarded to
            format_prompt().
        user_message (str): The new user message to answer.

    Returns:
        str: The model's reply with surrounding whitespace removed.
    """
    prompt = format_prompt(history, user_message)

    generation_kwargs = {
        "max_new_tokens": MAX_TOKENS,
        "eos_token_id": tokenizer.eos_token_id,
        # Ask the pipeline for the continuation only, so the prompt does not
        # have to be sliced off the result by character count.
        "return_full_text": False,
    }
    if TEMPERATURE > 0:
        generation_kwargs.update(
            do_sample=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
        )
    else:
        # Sampling needs a strictly positive temperature; a setting of 0
        # (allowed by the UI slider) is served with greedy decoding instead.
        generation_kwargs["do_sample"] = False

    out = llama_pipeline(prompt, **generation_kwargs)
    return out[0]["generated_text"].strip()

In [7]:
import gradio as gr

# Chat UI: a chat window, a message box with a Clear button, and sliders for
# the generation settings.
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 LLaMA-2 7B Chatbot")
    chatbot = gr.Chatbot(height=400)
    with gr.Row():
        msg = gr.Textbox(placeholder="Type a message...", scale=8)
        clear = gr.Button("Clear", scale=1)
    temp = gr.Slider(0.0, 1.5, value=TEMPERATURE, step=0.05, label="Temperature")
    top_p = gr.Slider(0.1, 1.0, value=TOP_P, step=0.05, label="Top-p")
    max_new = gr.Slider(64, 1024, value=MAX_TOKENS, step=32, label="Max new tokens")

    def respond(user_message, chat_history, temperature, top_p, max_new_tokens):
        """
        Handle a submitted message.

        Copies the slider values into the module-level generation settings,
        asks the model for a reply and appends the exchange to the history.

        Returns:
            tuple: ("", updated_history) - the empty string clears the
            textbox, the history refreshes the chat window.
        """
        global TEMPERATURE, TOP_P, MAX_TOKENS
        chat_history = chat_history or []
        # Ignore empty submissions: they would trigger a generation for a
        # blank prompt and store an empty user turn in the history.
        if not user_message or not user_message.strip():
            return "", chat_history
        TEMPERATURE = float(temperature)
        TOP_P = float(top_p)
        MAX_TOKENS = int(max_new_tokens)
        bot_message = generate_chat_response(chat_history, user_message)
        chat_history = chat_history + [(user_message, bot_message)]
        return "", chat_history

    msg.submit(respond, [msg, chatbot, temp, top_p, max_new], [msg, chatbot])
    # Returning None for the chatbot output resets the conversation.
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()


  chatbot = gr.Chatbot(height=400)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bfc538ac4a1b244e5e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


