In [56]:
#load
from langchain_community.document_loaders import ArxivLoader

# arXiv identifier of the paper used as the knowledge source for this notebook.
ARXIV_ID = "2405.17147"

# Build the loader, then pull the paper in as a list of LangChain documents.
loader = ArxivLoader(query=ARXIV_ID)
docs = loader.load()

# Number of documents returned for the query.
len(docs)

1

In [57]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters, expressed in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

# Split the loaded documents into overlapping chunks ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunk_docs = text_splitter.split_documents(docs)

# Number of chunks produced.
len(chunk_docs)

32

In [58]:
import os
from dotenv import load_dotenv
# Populate the process environment from a local .env file, if one is present.
load_dotenv()

# Both values are None when the corresponding variable is not defined.
azure_api_key = os.environ.get("azure_api_key")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")


In [59]:
from langchain_openai import AzureOpenAIEmbeddings


# Azure OpenAI resource that serves the embedding deployment.
endpoint = "https://aoai-appdev-poc.openai.azure.com/"

# Embedding client; the key comes from the environment lookup done earlier.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=endpoint,
    azure_deployment="text-embedding-ada-002",
    api_key=azure_api_key,
)



In [60]:
# Show the configured embeddings client to sanity-check its settings
# (the API key is rendered masked in the repr).
embeddings

AzureOpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000026A536AE240>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000026A53803560>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='2023-05-15', openai_api_base=None, openai_api_type='azure', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=2048, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True, azure_endpoint='https://aoai-appdev-poc.openai.azure.com/', azure_ad_token=None, azure_ad_token_provider=None, validate_base_url=True)

In [61]:
#create index
from pinecone import Pinecone, ServerlessSpec

# Use the key read from the environment (PINECONE_API_KEY) instead of embedding
# a secret in the notebook. Fail early with a clear message if it is missing.
# NOTE(review): the key that was previously committed in this cell should be revoked.
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your environment or .env file."
    )

pc = Pinecone(api_key=pinecone_api_key)
index_name = "rag-app"

# Create the index only when it does not exist yet, so re-running this cell is safe.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors produced by `embeddings`.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [62]:
#Embed and store
from langchain_pinecone import PineconeVectorStore

# Embed every chunk with `embeddings` and store the vectors in the Pinecone index.
# NOTE(review): re-running this cell may upload the same chunks again — confirm
# whether de-duplication is needed.
vector_store = PineconeVectorStore.from_documents(
    documents=chunk_docs,
    embedding=embeddings,
    index_name=index_name,
)

In [64]:
# Sample question used throughout the rest of the notebook.
query = "What metrics are used to evaluate the quality of experience (QoE) for users of large language model (LLM) service"

# Retriever over the vector store that returns the 3 best-matching chunks.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# `invoke` replaces the deprecated `get_relevant_documents` and matches how the
# retriever is called elsewhere in this notebook.
retriever.invoke(query)

[Document(metadata={'Authors': 'Haiwei Dong, Shuang Xie', 'Published': '2024-05-27', 'Summary': "The rapid advancement of Large Language Models (LLMs) has significantly\nimpacted human-computer interaction, epitomized by the release of GPT-4o, which\nintroduced comprehensive multi-modality capabilities. In this paper, we first\nexplored the deployment strategies, economic considerations, and sustainability\nchallenges associated with the state-of-the-art LLMs. More specifically, we\ndiscussed the deployment debate between Retrieval-Augmented Generation (RAG)\nand fine-tuning, highlighting their respective advantages and limitations.\nAfter that, we quantitatively analyzed the requirement of xPUs in training and\ninference. Additionally, for the tokenomics of LLM services, we examined the\nbalance between performance and cost from the quality of experience (QoE)'s\nperspective of end users. Lastly, we envisioned the future hybrid architecture\nof LLM processing and its corresponding sus

In [65]:
def format_docs(docs):
    """Join the text of the given documents into one string.

    Only each document's ``page_content`` is kept; the metadata that comes back
    with retrieved documents is dropped. Documents are separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The concatenated page contents; an empty string when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# Fetch the chunks matching the query, then show their text without metadata.
retrieved_docs = retriever.invoke(query)
formatted_context = format_docs(retrieved_docs)
print(formatted_context)

CTSOC NEWS ON CONSUMER TECHNOLOGY
1
Large Language Models (LLMs):
Deployment, Tokenomics and Sustainability
Haiwei Dong Senior Member, IEEE, Shuang Xie Member, IEEE
Abstract—The rapid advancement of Large Language Models
(LLMs) has significantly impacted human-computer interaction,
epitomized by the release of GPT-4o, which introduced com-
prehensive multi-modality capabilities. In this paper, we first
explored the deployment strategies, economic considerations,
and sustainability challenges associated with the state-of-the-art
LLMs. More specifically, we discussed the deployment debate
between Retrieval-Augmented Generation (RAG) and fine-tuning,
highlighting their respective advantages and limitations. After
that, we quantitatively analyzed the requirement of xPUs in
training and inference. Additionally, for the tokenomics of LLM
services, we examined the balance between performance and cost
from the quality of experience (QoE)’s perspective of end users.

from the quality of experie

In [66]:
# Prompt template for the RAG chain.
# It tells the LLM how to behave and contains two placeholders, {question} and
# {context}, which are filled in before the prompt is sent to the model.

template = """You are an expert LLM assistant specialized in answering questions related to large language models (LLMs). Use the provided information and your knowledge to respond accurately and clearly to each question. 

Guidelines:
1. Provide concise and informative answers.
2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
3. Use examples where applicable to illustrate your answers.
4. Maintain a professional and helpful tone.

Question: {question}

Context:{context}

Answer:
"""


# The filled-in prompt (instructions + retrieved context + question) is what gets passed to the LLM to retrieve the answer.

In [67]:
# Fill the template's placeholders with the sample question and the retrieved text,
# then print the result to inspect exactly what the model would receive.
prompt = template.format(
    question=query,
    context=format_docs(retrieved_docs),
)
print(prompt)

You are an expert LLM assistant specialized in answering questions related to large language models (LLMs). Use the provided information and your knowledge to respond accurately and clearly to each question. 

Guidelines:
1. Provide concise and informative answers.
2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
3. Use examples where applicable to illustrate your answers.
4. Maintain a professional and helpful tone.

Question: What metrics are used to evaluate the quality of experience (QoE) for users of large language model (LLM) service

Context:CTSOC NEWS ON CONSUMER TECHNOLOGY
1
Large Language Models (LLMs):
Deployment, Tokenomics and Sustainability
Haiwei Dong Senior Member, IEEE, Shuang Xie Member, IEEE
Abstract—The rapid advancement of Large Language Models
(LLMs) has significantly impacted human-computer interaction,
epitomized by the release of GPT-4o, which introduced com-
prehensive multi-modality capabilities. In 

In [71]:
#here we are going to create our chain

from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Read the chat key from the environment instead of hardcoding a secret in the
# notebook. AZURE_OPENAI_API_KEY is the variable langchain-openai itself looks for.
# NOTE(review): the key that was previously committed in this cell should be rotated.
chat_api_key = os.getenv("AZURE_OPENAI_API_KEY")
if not chat_api_key:
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set; add the key for the chat resource "
        "to your environment or .env file."
    )

llm = AzureChatOpenAI(
    api_key=chat_api_key,
    temperature=0,
    azure_endpoint="https://oa-appdev-poc.openai.azure.com/",
    api_version="2024-02-01",
    # NOTE(review): the recorded run of this chain failed with
    # "chatCompletion operation does not work with the specified model,
    # text-embedding-ada-002" — confirm this deployment is backed by a chat model.
    azure_deployment="gpt-35-turbo-16k",
)

custome_rag_template = PromptTemplate.from_template(template=template)

# How the chain is wired:
#   1. The input question is fed to two branches of the first step:
#        - "context":  the retriever fetches matching documents, and its output is
#                      piped into format_docs to become a single block of text;
#        - "question": RunnablePassthrough forwards the question unchanged.
#   2. The resulting dict fills the prompt template.
#   3. The prompt is sent to the chat model.
#   4. StrOutputParser turns the model's message into a plain string.
custome_rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custome_rag_template
    | llm
    | StrOutputParser()
)


In [72]:
# Run the full RAG chain on the sample query; the chain ends in StrOutputParser,
# so a successful call yields the answer as a string.
# NOTE(review): the recorded run below ended in a 400 OperationNotSupported error —
# re-run after verifying the chat deployment.
custome_rag_chain.invoke(query)

BadRequestError: Error code: 400 - {'error': {'code': 'OperationNotSupported', 'message': 'The chatCompletion operation does not work with the specified model, text-embedding-ada-002. Please choose different model and try again. You can learn more about which models can be used with each operation here: https://go.microsoft.com/fwlink/?linkid=2197993.'}}