In [1]:
import os
import uuid
from typing import List
from typing_extensions import TypedDict
from IPython.display import Image, display


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain.schema import Document
from langgraph.graph import START, END, StateGraph
from langchain_community.tools.tavily_search import TavilySearchResults
from pprint import pprint


from typing import Literal

from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

from langchain.document_loaders import HuggingFaceDatasetLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
def initialize_llm():
    """Create the chat model used by the rest of the notebook.

    Loads environment variables from a ``.env`` file, then builds an Azure
    OpenAI GPT-4o chat model through LangChain's ``init_chat_model`` using the
    ``gpt4o`` deployment. Swap the model identifier to use another provider.

    Returns:
        tuple: ``(llm, metadata)`` where ``llm`` is the chat model instance and
        ``metadata`` is the label string ``"CRAG, gpt4o"``.
    """
    from langchain.chat_models import init_chat_model
    from dotenv import load_dotenv  # reads secrets/config (API keys, endpoints) from a .env file

    # Must run before the model is created so the variables are in the environment;
    # presumably the Azure credentials live there — confirm against your .env.
    load_dotenv()

    llm = init_chat_model(
        "azure_openai:gpt-4o",
        azure_deployment="gpt4o",
    )
    # Plain literal: the original f-string had no placeholders.
    metadata = "CRAG, gpt4o"
    return llm, metadata

In [3]:
# Build the shared chat model once; `metadata` is the label string returned alongside it.
llm, metadata = initialize_llm()

In [4]:
class RouteQuery(BaseModel):
    """Route a user query to the most relevant datasource."""

    # Schema used for structured output from the LLM (see `with_structured_output`
    # below). NOTE(review): the docstring and the field description are
    # presumably forwarded to the model as part of the schema, so treat their
    # text as prompt content — confirm before rewording.
    # `...` marks the field as required; the Literal restricts it to two routes.
    datasource: Literal["transformer", "web_search"] = Field(
        ...,
        description="""
        
        Given a user question choose to route it to 
        transformer store or web search."""
    )


# Wrap the model so its answers come back shaped by the RouteQuery schema.
structured_llm_router = llm.with_structured_output(RouteQuery)

# System instructions describing what each datasource covers.
system = """
You are an expert at routing a user question to a 
transformer store or web search.
Transformer store has information about transformer models, guides, and tutorials.
Web search has information about current events and news.
You must choose the most relevant datasource to answer the question.
"""
route_prompt = ChatPromptTemplate.from_messages(
    [("system", system), ("human", "{question}")]
)

# Prompt piped into the structured model: {"question": ...} in, RouteQuery out.
question_router = route_prompt | structured_llm_router

# Smoke-test the router with one question per expected route.
for sample_question in (
    "What is the difference between GPT-4 and GPT-3?",
    "What is the capital of India?",
):
    print(question_router.invoke({"question": sample_question}))

datasource='transformer'
datasource='web_search'


In [5]:
def initialize_vectorstore(doc_splits, collection_name, k=4):
    """Embed document chunks into a Qdrant collection and return a retriever.

    Args:
        doc_splits: Documents (already chunked) to embed and upload.
        collection_name: Name of the Qdrant collection to write to.
        k: Number of documents the retriever returns per query. Defaults to 4.

    Returns:
        A retriever over the populated Qdrant vector store.

    Notes:
        The Qdrant endpoint and API key are read from the ``QDRANT_URL`` and
        ``QDRANT_KEY`` environment variables.
    """
    from langchain_qdrant import QdrantVectorStore

    # Sentence-transformers model run on CPU, embeddings left un-normalized.
    hf_embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": False},
    )

    vectorstore = QdrantVectorStore.from_documents(
        doc_splits,
        embedding=hf_embeddings,
        url=os.getenv("QDRANT_URL"),
        api_key=os.getenv("QDRANT_KEY"),
        collection_name=collection_name,
    )
    # The result count has to be passed via `search_kwargs`; a bare `k=` keyword
    # on `as_retriever` is not a retriever setting and never reaches the search.
    return vectorstore.as_retriever(search_kwargs={"k": k})

In [6]:
def preprocess_dataset(docs_list):
    """Split documents into overlapping chunks for embedding.

    Args:
        docs_list: Documents to split.

    Returns:
        The chunked documents produced by the text splitter.
    """
    # Chunk size/overlap are measured with the tiktoken-based length function.
    # NOTE(review): the empty `disallowed_special` presumably lets special-token
    # text in the docs be encoded as ordinary text instead of raising — confirm.
    splitter_options = {
        "chunk_size": 700,
        "chunk_overlap": 50,
        "disallowed_special": (),
    }
    chunker = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_options)
    return chunker.split_documents(docs_list)

In [None]:

# Loader for the Hugging Face dataset; the "text" column supplies the page content.
transformers_doc = HuggingFaceDatasetLoader(
    path="m-ric/transformers_documentation_en",
    page_content_column="text",
)

In [None]:

# Load the dataset and chunk it in one step; `transformers_docs` holds the chunks.
transformers_docs = preprocess_dataset(transformers_doc.load())

In [None]:
# Number of chunks produced by the split above.
len(transformers_docs)

In [None]:
# `transformers_docs` was already chunked by `preprocess_dataset` when it was
# built, so index it directly. Splitting a second time is wasted work and can
# re-fragment chunks, making the indexed set differ from the count shown above.
transformers_store = initialize_vectorstore(
    transformers_docs,
    collection_name="transformers_docs",
)