# Llibraries installation

In [None]:
pip install -q datasets sentence-transformers faiss-cpu accelerate bitsandbytes gradio spaces

In [None]:
#Restart Kernel - colab doesn't seem to detect accelerate without restarting (Runtime -> Restart session)

# Dataset

## Scraping a web page with documentation

In [None]:
import requests
from bs4 import BeautifulSoup
import re

urls = ['https://tokeru.com/cgwiki/HoudiniGettingStarted.html', 'https://tokeru.com/cgwiki/Houdini.html', 'https://tokeru.com/cgwiki/HoudiniChops.html', 'https://tokeru.com/cgwiki/HoudiniCops.html', 'https://tokeru.com/cgwiki/HoudiniCrowd.html', 'https://tokeru.com/cgwiki/HoudiniDops.html', 'https://tokeru.com/cgwiki/HoudiniFAQ.html', 'https://tokeru.com/cgwiki/HoudiniHair.html', 'https://tokeru.com/cgwiki/HoudiniHDA.html', 'https://tokeru.com/cgwiki/HoudiniKinefx.html', 'https://tokeru.com/cgwiki/Houdini_Lighting_Shading.html', 'https://tokeru.com/cgwiki/HoudiniLops.html', 'https://tokeru.com/cgwiki/HoudiniPython.html', 'https://tokeru.com/cgwiki/HoudiniTops.html', 'https://tokeru.com/cgwiki/HoudiniUserInterfaceTips.html', 'https://tokeru.com/cgwiki/HoudiniVellum.html', 'https://tokeru.com/cgwiki/HoudiniVex1.html', 'https://tokeru.com/cgwiki/HoudiniVex2.html', 'https://tokeru.com/cgwiki/HoudiniVex3.html', 'https://tokeru.com/cgwiki/HoudiniVolumes.html', 'https://tokeru.com/cgwiki/Houdini_Vops.html']


def scrape_webpage(url):
    # Fetch the webpage content
    response = requests.get(url)
    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the first occurrence of h
    first_heading = soup.find(['h1','h2', 'h3', 'h4'])
    if not first_heading:
        return ["No h1-4 tags found"]

    # Collect parts starting from the first h
    parts = []
    current_part = [first_heading.text]  # start with the first heading text

    # Iterate over next siblings of the first heading
    for sibling in first_heading.next_siblings:
        if sibling.name in ['h1','h2', 'h3', 'h4']:
            # When a new h is found, store the current part and start a new one
            parts.append(' '.join(current_part))
            current_part = [sibling.text]  # start new part with the heading text
        elif sibling.name:
            # Add text from other elements
            current_part.append(sibling.get_text(strip=True))

    # Append the last part collected
    if current_part:
        parts.append(' '.join(current_part))

    return parts

# Call the function and print the result
parts = []

for url in urls:
    parts += scrape_webpage(url)




#Cleaning
def clean_text(text):
    # Regular expression to identify URLs
    url_pattern = r'(https?://[^\s]+)'
    # Find all URLs using regex
    urls = re.findall(url_pattern, text)

    # Create a dictionary to replace non-ASCII characters in the rest of the text while preserving URLs
    preserved_urls = {url: f"URL_{i}" for i, url in enumerate(urls)}

    # Replace URLs with placeholders to avoid accidental modification
    for url, placeholder in preserved_urls.items():
        text = text.replace(url, placeholder)

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Restore URLs from placeholders
    for placeholder, url in preserved_urls.items():
        text = text.replace(placeholder, url)

    return text

for i in range(len(parts)):
    parts[i] = clean_text(parts[i])



from datasets import Dataset, DatasetDict

def create_dataset(text_segments):
    # Create a Dataset from the list of text segments
    dataset = Dataset.from_dict({'text': text_segments})

    # Create a DatasetDict with only a training set
    dataset_dict = DatasetDict({
        'train': dataset
    })

    return dataset_dict

dataset = create_dataset(parts)

dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 729
    })
})

In [None]:
# Show column names and example entries
print(dataset['train'][0])  # Display the first entry in the training set

{'text': "Getting Started  So you've decided to take a look at Houdini? Good for you! If you'restillnot sure and you're a competent Maya user, maybe skim theMayaToHoudinipage, it might convince you. This site assumes you can navigate around Houdini and know the basics. If you don't, don't worry, the core UI is very simple, and shouldn't take more than an hour to learn. If that's you, then you probably want to do these steps first: Download Houdini Apprentice. It's free, and gives you access to everything you need.Check out the Sidefx Learning Paths, clear quickstart guides to show you the basics. If you like video tutorials, but want to play long, maybe watch me blather on in a webinar I recorded for sidefx. There's a download link in the description to get the same file I'm demonstrating with, gives a bit of an insight to how Houdini is used (skip to about 10 mins in if you don't want to hear my life story): URL_0 Once you can move the viewport and get into a SOP network, you can star

## Embedding

In [None]:
from sentence_transformers import SentenceTransformer
ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/113k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [None]:
def embed(batch):
    """
    adds a column to the dataset called 'embeddings'
    """
    information = batch["text"]
    return {"embeddings" : ST.encode(information)}

dataset = dataset.map(embed,batched=True,batch_size=16)

Map:   0%|          | 0/729 [00:00<?, ? examples/s]

In [None]:
# # Save the dataset to disk
# dataset.save_to_disk('saved_dataset')

In [None]:
# # Load the dataset from disk
# from datasets import load_from_disk
# dataset = load_from_disk('saved_dataset')

In [None]:
#Add index
data = dataset["train"]
data = data.add_faiss_index("embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
def search(query: str, k: int = 3 ):
    """a function that embeds a new query and returns the most probable results"""
    embedded_query = ST.encode(query) # embed new query
    scores, retrieved_examples = data.get_nearest_examples( # retrieve results
        "embeddings", embedded_query, # compare our new embedded query with the dataset embeddings
        k=k # get only top k results
    )
    return scores, retrieved_examples

# Load Model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_id = "NousResearch/Meta-Llama-3-8B-Instruct"

# use quantization to lower GPU usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [None]:
# Adding system prompt to deliver a short answer first, followed by a detailed one.
SYS_PROMPT = """You are an assistant for answering questions.
# You are given the extracted parts of a long document and a question. Firstly provide a short summarized answer. Afterwards provide a detailed answer.
# If you don't know the answer, just say "I do not know." Don't make up an answer."""

# Retriever

In [None]:
def format_prompt(prompt,retrieved_documents,k):
  """using the retrieved documents we will prompt the model to generate our responses"""
  PROMPT = f"Question:{prompt}\nContext:"
  for idx in range(k) :
    PROMPT+= f"{retrieved_documents['text'][idx]}\n"
  return PROMPT

def generate(formatted_prompt):
  formatted_prompt = formatted_prompt[:8000]
  messages = [{"role":"system","content":SYS_PROMPT},{"role":"user","content":formatted_prompt}]
  # tell the model to generate
  input_ids = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt"
  ).to(model.device)
  outputs = model.generate(
      input_ids,
      max_new_tokens=1024,
      eos_token_id=terminators,
      do_sample=True,
      temperature=0.3,
      top_p=0.8,
  )
  response = outputs[0][input_ids.shape[-1]:]
  return tokenizer.decode(response, skip_special_tokens=True)

def rag_chatbot(prompt:str,k:int=2):
  scores , retrieved_documents = search(prompt, k)
  formatted_prompt = format_prompt(prompt,retrieved_documents,k)
  return generate(formatted_prompt)

# App

In [None]:
import gradio as gr
import os
import spaces
import time
from threading import Thread
from transformers import TextIteratorStreamer

In [None]:
@spaces.GPU(duration=150)
def talk(prompt,history):
    k = 3 # number of retrieved documents
    scores , retrieved_documents = search(prompt, k)
    formatted_prompt = format_prompt(prompt,retrieved_documents,k)
    formatted_prompt = formatted_prompt[:8000]
    messages = [{"role":"system","content":SYS_PROMPT},{"role":"user","content":formatted_prompt}]
    # tell the model to generate
    input_ids = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(
      input_ids,
      max_new_tokens=1024,
      eos_token_id=terminators,
      do_sample=True,
      temperature=0.3,
      top_p=0.8,
    )
    streamer = TextIteratorStreamer(
            tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
        )
    generate_kwargs = dict(
        input_ids= input_ids,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.8,
        temperature=0.3,
        eos_token_id=terminators,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        print(outputs)
        yield "".join(outputs)

In [None]:
# Layout

TITLE = "Houdini chatbot"

DESCRIPTION = "LLaMA-3-8b-Instruct with RAG to answer Houdini-related questions based on tokeru.com/cgwiki"

demo = gr.ChatInterface(
    fn=talk,
    chatbot=gr.Chatbot(
        show_label=True,
        show_share_button=True,
        show_copy_button=True,
        likeable=True,
        layout="bubble",
        bubble_full_width=True,
        height=700,
    ),
    theme="Soft",
    title=TITLE,
    description=DESCRIPTION,

)
demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://504b87f7b2d4f17dba.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['']
['', '']
['', '', '**Short ']
['', '', '**Short ', '']
['', '', '**Short ', '', '']
['', '', '**Short ', '', '', 'Summarized ']
['', '', '**Short ', '', '', 'Summarized ', '']
['', '', '**Short ', '', '', 'Summarized ', '', 'Answer:**\n\n']
['', '', '**Short ', '', '', 'Summarized ', '', 'Answer:**\n\n', '']
['', '', '**Short ', '', '', 'Summarized ', '', 'Answer:**\n\n', '', 'To ']
['', '', '**Short ', '', '', 'Summarized ', '', 'Answer:**\n\n', '', 'To ', 'create ']
['', '', '**Short ', '', '', 'Summarized ', '', 'Answer:**\n\n', '', 'To ', 'create ', 'a ']
['', '', '**Short ', '', '', 'Summarized ', '', 'Answer:**\n\n', '', 'To ', 'create ', 'a ', 'velocity ']
['', '', '**Short ', '', '', 'Summarized ', '', 'Answer:**\n\n', '', 'To ', 'create ', 'a ', 'velocity ', 'field ']
['', '', '**Short ', '', '', 'Summarized ', '', 'Answer:**\n\n', '', 'To ', 'create ', 'a ', 'velocity ', 'field ', 'from ']
['', '', '**Short ', '', '', 'Summarized ', '', 'Answer:**\n\n', '', 'To ', 'creat