In [None]:
import random
from tqdm import tqdm
import textwrap

from youtube_transcript_api import YouTubeTranscriptApi

from pymongo import MongoClient

from sentence_transformers import SentenceTransformer #TODO: 90% sure this is the best for embedding in our case, just check once

from qdrant_client import QdrantClient
from qdrant_client.http import models
from torchvision.transforms import Resize

import torch

from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available 
from transformers import BitsAndBytesConfig

In [None]:
# ETL

## Youtube ETL
def get_youtube_transcript(video_url: str) -> list:
    """
    Fetch the transcript of a YouTube video given its URL.

    Args:
        video_url: A YouTube URL. Supported shapes are ``watch?v=<id>`` (with or
            without further query parameters, in any order), ``youtu.be/<id>``
            and ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>`` paths.

    Returns:
        Whatever ``YouTubeTranscriptApi.get_transcript`` returns for the video id.

    Raises:
        ValueError: If no video id can be extracted from ``video_url``.
    """
    # Local import keeps the function self-contained within its cell.
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(video_url.strip())
    video_id = ""

    # Preferred source: the `v` query parameter. Parsing the query string (instead
    # of splitting on "v=") keeps trailing parameters such as "&t=42s" out of the id.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        video_id = query_ids[0]
    else:
        segments = [segment for segment in parsed.path.split("/") if segment]
        if parsed.netloc.lower().endswith("youtu.be") and segments:
            # Short links carry the id as the first path segment.
            video_id = segments[0]
        elif len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            video_id = segments[1]

    if not video_id:
        raise ValueError(f"Could not extract a YouTube video id from: {video_url!r}")

    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    return transcript


In [None]:
# MongoDB persistence
# TODO: Make a more generic function to store data into MongoDB
# NOTE(review): `video_id` and `transcript` are only locals of get_youtube_transcript
# in the cells visible here — confirm they are assigned at module level before this runs.
client = MongoClient("mongodb://localhost:27017/")
db = client["youtube_database"]   # Database name  # TODO: Categorize better
collection = db["transcripts"]    # Collection name  # TODO: Categorize better

# One document per video: where it came from, its id, and the transcript itself
document = dict(source="youtube", video_id=video_id, transcript=transcript)

collection.insert_one(document)
print("Transcript stored into MongoDB")

In [None]:
# Chunking
# Group consecutive sentences into fixed-size chunks for embedding
# TODO: Make this chunking better: maybe capitalize and find sentence ends in a better way
# NOTE(review): `sentences` is not assigned in any cell visible here — confirm where it comes from.
sentences_in_chunk = 6
chunks = [
    ". ".join(sentences[start:start + sentences_in_chunk])
    for start in range(0, len(sentences), sentences_in_chunk)
]

print(len(chunks))

In [None]:
# Embedding
# Encode every chunk into a vector with a sentence-transformers model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # TODO: Pick better models

embeddings = model.encode(chunks)
# Lookup table chunk text -> embedding (identical chunk texts share one entry)
embeddings_dict = {chunk_text: vector for chunk_text, vector in zip(chunks, embeddings)}
print(embeddings.shape)

In [None]:
## TODO: Store the embeddings in MongoDB as well (not implemented yet)

In [None]:
# Qdrant

## Similarity search using Qdrant
qdrant = QdrantClient(":memory:")  # In-memory Qdrant instance, for testing, CI/CD

# Name of the collection that holds the chunk embeddings
my_collection = "text_collection"

# Read the vector size from the embedding model instead of hard-coding 384, so the
# collection still matches the vectors when the model is swapped.
qdrant.create_collection(
    collection_name=my_collection,
    vectors_config=models.VectorParams(
        size=model.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)

# TODO: Make a function to insert embeddings into Qdrant
# Build all points first and send them in one upsert call instead of one call per chunk.
# `.tolist()` hands Qdrant a plain list of floats rather than an array object.
points = [
    models.PointStruct(id=idx, vector=embedding.tolist(), payload={"text": chunk})
    for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings))
]
if points:  # nothing to send when there are no chunks
    qdrant.upsert(collection_name=my_collection, points=points)

# Sanity check: the stored point count should equal the number of chunks.
# Printed explicitly because a bare expression in the middle of a cell is not displayed.
stored_points = qdrant.count(
    collection_name=my_collection,
    exact=True,
)
print(f"Points stored in Qdrant: {stored_points.count} (chunks: {len(chunks)})")

# TODO: Write a function to search for similar text
# Embed the sample sentence and fetch up to five nearest chunks from the collection.
# NOTE(review): `sample_sentence` is not assigned in any cell visible here — confirm it is defined before this runs.
sample_embedding = model.encode(sample_sentence)
search_result = qdrant.search(collection_name=my_collection, query_vector=sample_embedding, limit=5)

In [None]:
# LLM

login()  # Login to huggingface # TODO: Finalize whether this is required when you finalize the model

use_quantization_config = True  # NOTE(review): not read anywhere in the visible cells
model_id = "TheBloke/CapybaraHermes-2.5-Mistral-7B-GPTQ"

# Instantiate tokenizer (tokenizer turns text into numbers ready for the model)
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# Instantiate the model.
# device_map="auto" already places the weights on the available device(s), so the
# model is not moved again with .to("cuda"): that call is redundant on a single GPU
# and works against the computed placement when the weights are spread over devices.
llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=False,
    revision="main",
)

llm_model.eval()  # put model in evaluation mode

In [None]:
# ROS questions generated with GPT4
gpt4_questions = [
    "What are the main differences between ROS 1 and ROS 2?",  # fixed typo: was "WWhat"
    "Explain the concept of topics, services, and actions in ROS.",
    "What is the purpose of the catkin_make command in ROS 1?",
    "What is the function of rviz in ROS?",
    "What is the purpose of rqt_graph, and how can it help in debugging a ROS system?"
]

# Manually created question list
manual_questions = [
    "What is ros?",
]

# Full pool of queries: generated questions first, manual ones after
query_list = gpt4_questions + manual_questions

In [None]:
def prompt_formatter(query: str, 
                     context_items: list[dict]) -> str:
    """
    Build a ChatML prompt that augments ``query`` with retrieved context.

    Args:
        query: The user question, placed in the user turn.
        context_items: Dicts that each carry the retrieved text under the
            ``"sentence_chunk"`` key; any other keys are ignored here.

    Returns:
        The prompt string with a system turn (instructions plus context), a user
        turn, and an open assistant turn. No template line carries leading
        indentation and the string ends right after the assistant header line.
    """
    # One "- " bullet per retrieved chunk
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items]) #TODO: This works, but maybe make it better?

    # Wording kept verbatim from the original system message
    system_message = f"You are a robotic operating system (ROS) developer, using given context as additional information and answer the query, just your answer explanatory answer would suffice. Here is the context:{context}." # For out task we can use this as a system message

    # Assemble the template line by line: a triple-quoted literal written inside the
    # function body would carry the body's indentation into every prompt line and
    # leave trailing spaces after the assistant header.
    prompt = (
        "<|im_start|>system\n"
        f"{system_message}<|im_end|>\n"
        "<|im_start|>user\n"
        f"{query}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    return prompt

def ask(query, 
        temperature=0.7,
        top_p=0.95,
        top_k=40,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True):
    """
    Answer ``query`` with the LLM, using chunks retrieved from Qdrant as context.

    Args:
        query: Question text; it is embedded with ``model`` for retrieval and
            placed in the user turn of the prompt.
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.
        top_k: Top-k sampling cutoff forwarded to ``generate``.
        max_new_tokens: Upper bound on generated tokens, forwarded to ``generate``.
        format_answer_text: If True, return only the newly generated text with
            special tokens removed. If False, return the full decoded sequence.
        return_answer_only: If True, the second element of the result is the
            placeholder string ``"No context items returned"`` instead of the
            context items.

    Returns:
        ``(answer_text, context_items)``, where each context item is a dict with
        ``"sentence_chunk"`` and ``"score"``; or
        ``(answer_text, "No context items returned")`` when ``return_answer_only`` is True.
    """
    # Retrieve the five chunks closest to the query embedding
    search_results = qdrant.search(
        collection_name=my_collection,
        query_vector=model.encode(query),
        limit=5
    )

    # Pair each retrieved chunk with its similarity score in one pass
    context_items = [
        {"sentence_chunk": result.payload['text'], "score": result.score}
        for result in search_results
    ]

    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # Tokenize the prompt and put the tensors on the model's own device instead of
    # assuming "cuda"
    model_inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)

    # Generate an output of tokens
    outputs = llm_model.generate(**model_inputs,
                                 temperature=temperature,
                                 do_sample=True,
                                 top_p=top_p,
                                 top_k=top_k,
                                 max_new_tokens=max_new_tokens)

    if format_answer_text:
        # The generated sequence starts with the prompt tokens; cut them off by
        # token count rather than by string-matching the decoded prompt, which
        # silently leaves the prompt in the answer whenever the decoded text
        # differs from the original prompt string.
        prompt_length = model_inputs["input_ids"].shape[1]
        output_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        # Fallback in case the tokenizer does not flag the end-of-turn marker as special
        output_text = output_text.replace('<|im_end|>', '').strip()
    else:
        # Raw view: the full decoded sequence, special tokens included
        output_text = tokenizer.decode(outputs[0])

    # Only return the answer without the context items
    if return_answer_only:
        return output_text, "No context items returned"

    return output_text, context_items

def print_wrapped(text, wrap_length=80):
    """Print ``text`` after wrapping it with ``textwrap.fill`` at ``wrap_length`` columns."""
    print(textwrap.fill(text, width=wrap_length))

In [None]:
## Testing out everything

# Pick one question at random from the pool
query = random.choice(query_list)
print(f"Query: {query}")

# Answer the query with retrieval and keep the retrieved context for inspection
answer, context_items = ask(
    query=query,
    temperature=0.7,
    max_new_tokens=512,
    return_answer_only=False,
)

print("Answer:\n")
print_wrapped(answer)
print("Context items:")
context_items