# NLP/GenAI Expert RAG QA System for Enterprises

In [None]:
%%capture
!pip -q install git+https://github.com/huggingface/transformers
!pip install -q datasets loralib sentencepiece
!pip -q install bitsandbytes accelerate
!pip -q install langchain
!pip install einops
!pip install faiss-gpu
!pip install langchain_community
!pip install --upgrade --quiet chromadb bs4 qdrant-client
!pip install langchainhub
!pip install -U langchain-huggingface
!pip install -U langchain-cohere
!pip install --upgrade --quiet  wikipedia
!pip install --upgrade --quiet  arxiv
!pip install --upgrade --quiet  pymupdf

!pip install xmltodict

!pip install cohere


In [None]:
import torch
import os
import bs4
import json
import numpy as np
import time


from pprint import pprint

import locale

from transformers import AutoTokenizer , AutoModelForCausalLM
from transformers import pipeline, BitsAndBytesConfig
from langchain_huggingface import HuggingFacePipeline
from langchain.llms import HuggingFacePipeline
from langchain_cohere import ChatCohere
from langchain import PromptTemplate, LLMChain
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import Qdrant
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.utils.math import cosine_similarity

from langchain_community.document_loaders import ArxivLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import WikipediaLoader
from langchain_community.document_loaders import OnlinePDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import PubMedLoader

#from langchain_community.chat_models import ChatCohere

from google.colab import userdata



In [None]:
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
%%capture
!pip install sentence_transformers

In [None]:
COHERE_API_KEY = userdata.get('COHERE_API_KEY')

## 2. Building the Components of our RAG System

### 2.1 The Embedding Model

In [None]:
%%capture
base_embeddings = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")

In [None]:
text = "This is a test document."
query_result = base_embeddings.embed_query(text)
print(f'Embedding dimension: {len(query_result)}')

doc_result = base_embeddings.embed_documents(["Germany won the World Cup 4 times.", "This is not a test document."])
len(doc_result)

Embedding dimension: 768


2

In [None]:
#Let's see how well our embeddng model works
similarity = cosine_similarity([query_result], doc_result)[0]

similarity

array([0.21410197, 0.94317833])

### 2.2. Loading and Chunking Texts

In [None]:
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

documents = loader.load()

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=0)
splits = text_splitter.split_documents(documents)
print('Number of splits/chunks: ', str(len(splits)))

Number of splits/chunks:  444


In [None]:
splits[39].page_content

'correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.'

### 2.3 Storing the Embeddings of Chunks in Vectorstores

In [None]:
vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",  # Local mode with in-memory storage only
    collection_name="test",
)
retriever = vectorstore.as_retriever()

In [None]:
query = "What is Chain of Thought doing?"
docs = vectorstore.similarity_search_by_vector(base_embeddings.embed_query(query)) # will rank the splits

In [None]:
docs

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', '_id': '5654353af63f472dabfc9c51ef4200c1', '_collection_name': 'test'}, page_content='the model’s thinking process.'),
 Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', '_id': 'd6da3217a9fc4adf92d42ab2e230e33d', '_collection_name': 'test'}, page_content='[1] Wei et al. “Chain of thought prompting elicits reasoning in large language models.” NeurIPS 2022'),
 Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', '_id': '849ab994b4c949d781c0b1d563b110cf', '_collection_name': 'test'}, page_content='the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process'),
 Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', '_id': '8d2e91904f0049fd93869bd6b69bb3a8', '_collection_name': 'test'}, page_content='Chain of thought (CoT; Wei et al. 2022) has 

### 2.4. LLM

In [None]:
from huggingface_hub import login

# Prompt for your Hugging Face API token
login(token=userdata.get('HUGGING_FACE_TOKEN'))

In [None]:
%%capture

quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         )


llm_mistral_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    torch_dtype=torch.float32,
    device_map='auto',
    quantization_config=quantization_config
)

llm_mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

In [None]:
mistral_pipe = pipeline(
    "text-generation",
    model=llm_mistral_model,
    tokenizer=llm_mistral_tokenizer,
    max_new_tokens=1000,
    temperature=0.6,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.2
)
mistral_pipe.model.config.pad_token_id = mistral_pipe.model.config.eos_token_id

    # wrapping the Hugging Face pipeline into a LangChain object

Device set to use cuda:0


In [None]:
mistral_pipe("[INST]Give me a two-sentence story about an apple![/INST]")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': '[INST]Give me a two-sentence story about an apple![/INST] Once upon a time, in the heart of an enchanted orchard, grew an apple so radiant and ripe that it granted eternal youth to those who tasted its sweet, juicy flesh. Every autumn, as leaves turned golden hues, villagers flocked to the tree, vying for a single bite, believing they would live forever.'}]

### 2.5 Testing the LLM in a LangChain Chain

In [None]:
mistral_llm_lc = HuggingFacePipeline(pipeline=mistral_pipe)

  mistral_llm_lc = HuggingFacePipeline(pipeline=mistral_pipe)


In [None]:
test_llm_template = """[INST] Give me a two-sentence story about an {object}! [/INST]"""
test_llm_prompt_template = PromptTemplate(template=test_llm_template, input_variables=["object"])



In [None]:
test_llm_chain_short = (
    {"object": RunnablePassthrough()}
    | test_llm_prompt_template
    | mistral_llm_lc
)

In [None]:
test_llm_chain_short.invoke('apple')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'[INST] Give me a two-sentence story about an apple! [/INST] In the heart of an enchanted orchard, an ethereal golden apple hung, promising eternal youth to any mortal who could claim it as their own. A brave knight embarked on a perilous quest to retrieve this elixir for his beloved, braving treacherous beasts and darkest forests alike.'

Works too. We will use this notation moving forward.

Next, how would we do this with a Cohere Chat Model instead of Mistral?

In [None]:
cohere_chat_model = ChatCohere(cohere_api_key=COHERE_API_KEY)

In [None]:
test_cohere_llm_chain_short = (
    {"object": RunnablePassthrough()}
    | test_llm_prompt_template
    | cohere_chat_model
)

In [None]:
test_cohere_llm_chain_short.invoke('apple')

AIMessage(content='The apple, ripe and red, hung tantalizingly on the branch, just out of reach. With a clever twist of the ladder, I finally plucked the fruit, savoring its crisp sweetness.', additional_kwargs={'documents': None, 'citations': None, 'search_results': None, 'search_queries': None, 'is_search_required': None, 'generation_id': '930cf861-cbe3-4aac-b738-4d2d0654938f', 'token_count': {'input_tokens': 218.0, 'output_tokens': 41.0}}, response_metadata={'documents': None, 'citations': None, 'search_results': None, 'search_queries': None, 'is_search_required': None, 'generation_id': '930cf861-cbe3-4aac-b738-4d2d0654938f', 'token_count': {'input_tokens': 218.0, 'output_tokens': 41.0}}, id='run-101df356-5456-44b1-8d9f-84ce12edcc26-0', usage_metadata={'input_tokens': 218, 'output_tokens': 41, 'total_tokens': 259})

In [None]:
output_parser = StrOutputParser()

test_cohere_llm_chain_short_formatted = (
    {"object": RunnablePassthrough()}
    | test_llm_prompt_template
    | cohere_chat_model
    | output_parser
)

test_cohere_llm_chain_short_formatted.invoke('apple')

'The apple, once a vibrant red, now lay forgotten on the ground, its skin bruised and battered. As it slowly withered away, it dreamed of the day it would have been savored and cherished, rather than discarded.'

### 2.6 Setting Up a Simple RAG Chain

In [None]:
rag_template = """[INST] Answer the question based only on the following context:
{context}

Question: {question}
[/INST]
"""
rag_prompt_template = ChatPromptTemplate.from_template(rag_template)

base_rag_chain =(
    {"context": RunnablePassthrough(),
     "question": RunnablePassthrough()}
    | rag_prompt_template
    | mistral_llm_lc
    | output_parser
)

predefined_context = "Germany has won the World Cup 4 times."
question = "How many times did Germany win the world cup?"

resp = base_rag_chain.invoke({'context': predefined_context,
                           'question': question})
print(resp)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Human: [INST] Answer the question based only on the following context:
{'context': 'Germany has won the World Cup 4 times.', 'question': 'How many times did Germany win the world cup?'}

Question: {'context': 'Germany has won the World Cup 4 times.', 'question': 'How many times did Germany win the world cup?'}
[/INST]
Answer: The answer to your question is consistent with the provided context. In this case, both the context and the question are asking about the number of times Germany has won the World Cup, so the correct answer would be four (4) wins by Germany in the World Cup tournaments.


In [None]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [None]:
rag_template = """Here is a context:\n{context} \n\nand here is a question: \n{question}"""

rag_prompt = ChatPromptTemplate.from_template(rag_template)

rag_chain = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | rag_prompt


)

In [None]:
output = rag_chain.invoke('What is Chain of Thought?')

In [None]:
print(output.messages[0].content)

Here is a context:
the model’s thinking process.

[1] Wei et al. “Chain of thought prompting elicits reasoning in large language models.” NeurIPS 2022

the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process

Tree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes 

and here is a question: 
What is Chain of Thought?


In [None]:
output_parser = StrOutputParser()

rag_template = """[INST]Please answer the question below only based on the context information provided.\n\nHere is a context:\n{context} \n\nHere is a question: \n{question}.[/INST]"""
rag_prompt = ChatPromptTemplate.from_template(rag_template)

rag_chain = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | rag_prompt
    | mistral_llm_lc

)



In [None]:
rag_chain.invoke('What is Chain of Thought?')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'Human: [INST]Please answer the question below only based on the context information provided.\n\nHere is a context:\nthe model’s thinking process.\n\n[1] Wei et al. “Chain of thought prompting elicits reasoning in large language models.” NeurIPS 2022\n\nthe problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process\n\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes \n\nHere is a question: \nWhat is Chain of Thought?.[/INST] According to the context from the paper "Chain of thought prompting elicits reasoning in large language models" by Wei et al., Chain of Thought (CoT) refers to a method used in large language models for processing complex problems. Instead of providing one direct solution, it breaks down the problem into smaller sub-problems or thought steps, generating multiple potential solutions or thoughts for each step. This results in

What about the Cohere models?

In [None]:
cohere_rag_chain = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | rag_prompt
    | cohere_chat_model
    | output_parser
)

In [None]:
cohere_rag_chain.invoke('What is Chain of Thought?')

"Chain of Thought (CoT) is a technique used in Large Language Models (LLMs) to improve the reasoning capabilities of these models. It involves breaking down a complex task or question into a series of smaller, intermediate steps, and then generating a chain of thoughts or reasoning steps to arrive at the final answer.\n\nIn the context of the provided reference, Wei et al. introduced the concept of Chain of Thought prompting, where the model is encouraged to generate a sequence of intermediate reasoning steps as part of its output. This approach aims to make the model's decision-making process more transparent and interpretable. By providing these intermediate thoughts, the model demonstrates its understanding of the problem and the logical steps it takes to solve it.\n\nThe Tree of Thoughts, as mentioned in the context, builds upon CoT by further expanding the reasoning process into a tree-like structure. It explores multiple reasoning paths at each step, allowing for more comprehensi

## 3. The RAG Model & Experimentation

### 3.1 The Vector Database

In [None]:
qdrant_vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",  # Local mode with in-memory storage only
    collection_name="rag_tech_db",
    force_recreate=True
)

retriever = qdrant_vectorstore.as_retriever()

### 3.2 Data Acquisition, Chunking, and Vectorization


In [None]:
#Note that these defaults may or may not be ideal!
CHUNK_SIZE=128
OVERLAP=0

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)

In [None]:
#assign a unique number to each document we ingest
global_doc_number = 1

In [None]:
arxiv_numbers = ('2005.11401', '2104.07567', '2104.09864', '2105.03011', '2106.09685', '2203.02155', '2211.09260', '2211.12561',
                 '2212.09741', '2305.14314', '2305.18290', '2306.15595', '2309.08872', '2309.15217', '2310.06825', '2310.11511',
                 '2311.08377', '2312.05708', '2401.06532', '2401.17268', '2402.01306', '2402.19473', '2406.04744')

In [None]:
all_arxiv_pages = []

#loop through the papers
for identifier in arxiv_numbers:
    # Construct URL using the arXiv unique identifier
    arx_url = f"https://arxiv.org/pdf/{identifier}.pdf"

    # Extract pages from the document and add them to the list of pages
    arx_loader = PyMuPDFLoader(arx_url)
    arx_pages = arx_loader.load()
    for page_num in range(len(arx_pages)):
        page = arx_pages[page_num]
        #CHANGED
        page.metadata['page_num'] = page_num
        page.metadata['doc_num'] = global_doc_number
        page.metadata['doc_source'] = "ArXiv"
        all_arxiv_pages.append(page)


    global_doc_number += 1

In [None]:
num_pages = len(all_arxiv_pages)
num_docs = global_doc_number - 1

print(f"{num_docs} documents in total")
print(f"{num_pages} pages in total")

23 documents in total
490 pages in total


In [None]:
all_arxiv_pages[5].page_content[:150]  # all pages of the Document content

'Table 1: Open-Domain QA Test Scores. For TQA,\nleft column uses the standard test set for Open-\nDomain QA, right column uses the TQA-Wiki\ntest set. See'

In [None]:
#index doc chunks
splits = text_splitter.split_documents(all_arxiv_pages)
for idx, text in enumerate(splits):
    splits[idx].metadata['split_id'] = idx

print('Number of splits/chunks: ', len(splits))

Number of splits/chunks:  17729


In [None]:
splits[0]

Document(metadata={'source': 'https://arxiv.org/pdf/2005.11401.pdf', 'file_path': 'https://arxiv.org/pdf/2005.11401.pdf', 'page': 0, 'total_pages': 19, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.21', 'creationDate': 'D:20210413004838Z', 'modDate': 'D:20210413004838Z', 'trapped': '', 'page_num': 0, 'doc_num': 1, 'doc_source': 'ArXiv', 'split_id': 0}, page_content='Retrieval-Augmented Generation for\nKnowledge-Intensive NLP Tasks\nPatrick Lewis†‡, Ethan Perez⋆,')

In [None]:
%%capture

qdrant_vectorstore.add_documents(documents=splits)

In [None]:
query = "How can we train a model for preferences?"
found_docs = qdrant_vectorstore.similarity_search_with_score(query)

In [None]:
print(found_docs[0][0].page_content)
print(found_docs[0][1])

One path forward could be to train models that can be conditioned on the preferences of certain
0.8218970700821897


In [None]:
wiki_docs = WikipediaLoader(query="Generative Artificial Intelligence", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

print('Number of documents: ', len(wiki_docs))

#index docs
wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

print('Number of splits/chunks: ', len(wiki_splits))


Number of documents:  4
Number of splits/chunks:  154


In [None]:
%%capture

#vectorstore.add_documents(documents=wiki_splits, embedding=base_embeddings)
qdrant_vectorstore.add_documents(documents=wiki_splits)

In [None]:
wiki_docs = WikipediaLoader(query="Information Retrieval", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

print('Number of documents: ', len(wiki_docs))

#index docs
wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

print('Number of splits/chunks: ', len(wiki_splits))

Number of documents:  4
Number of splits/chunks:  160


In [None]:
%%capture

#vectorstore.add_documents(documents=wiki_splits, embedding=base_embeddings)
qdrant_vectorstore.add_documents(documents=wiki_splits)

In [None]:
wiki_docs = WikipediaLoader(query="Large Language Models", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

print('Number of documents: ', len(wiki_docs))

#index docs
wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

print('Number of splits/chunks: ', len(wiki_splits))

Number of documents:  4
Number of splits/chunks:  159


In [None]:
%%capture

#vectorstore.add_documents(documents=wiki_splits, embedding=base_embeddings)
qdrant_vectorstore.add_documents(documents=wiki_splits)

In [None]:
web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2020-10-29-odqa/",
               "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
               "https://lilianweng.github.io/posts/2018-06-24-attention/",
               "https://lilianweng.github.io/posts/2023-06-23-agent/",
               "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"),

    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

web_documents = web_loader.load()

for idx, text in enumerate(web_documents):
    web_documents[idx].metadata['doc_num'] = global_doc_number
    web_documents[idx].metadata['doc_source'] = "WWW"
    global_doc_number += 1

print('Number of documents: ', len(web_documents))


Number of documents:  5


In [None]:
web_splits = text_splitter.split_documents(web_documents)

for idx, text in enumerate(web_splits):
    web_splits[idx].metadata['split_id'] = idx

print('Number of splits: ', len(web_splits))

Number of splits:  2103


In [None]:
%%capture

qdrant_vectorstore.add_documents(documents=web_splits)

### 3.3 The Test Data

In [None]:
validation_questions_answers = {
    0: {"question": "What purpose do large language models serve in the field of natural language processing?",
  "gold_answer_research": "Large language models (LLMs) serve the purpose of enabling general-purpose language generation and other natural language processing tasks such as classification. They achieve this by learning statistical relationships from text documents during computationally intensive self-supervised and semi-supervised training. LLMs can be used for text generation by predicting the next token or word, making them valuable for tasks like speech recognition, machine translation, and information retrieval. Additionally, LLMs have superseded previous models like recurrent neural networks, showcasing their efficiency and effectiveness in NLP tasks.",
  "gold_answer_marketing": "Large language models serve the purpose of improving performance in various natural language processing tasks, such as speech recognition, machine translation, natural language generation, optical character recognition, handwriting recognition, grammar induction, and information retrieval."},
1: {"question": "How does a large language model learn from text during training?",
  "gold_answer_research": "A large language model learns from text during training by first going through an unsupervised generative 'pretraining' stage where it sets initial parameters using a language modeling objective. Then, it goes through a supervised discriminative 'fine-tuning' stage where it refines its parameters based on annotated examples or task demonstrations. This dual-stage approach allows the model to learn statistical relationships from text documents in a computationally intensive process, enabling it to achieve general-purpose language generation and natural language processing tasks.",
  "gold_answer_marketing": "A large language model learns from text during training by first pretraining on a diverse dataset to acquire general language knowledge, and then fine-tuning on specific tasks or demonstrations to adapt its parameters for more targeted performance."},
2: {"question": "What are some key architectures behind the development of large language models?",
  "gold_answer_research": "Key architectures behind the development of large language models include the use of self-attention mechanisms, such as those seen in Transformer decoders. These architectures have been applied to tasks like autoregressive language modeling and have led to the dominance of Transformer-based language models in NLP. Models like BERT and GPT-2 have further advanced this paradigm, showcasing the power of large Transformer language models in achieving state-of-the-art results across various NLP tasks. Additionally, architectures like neural-retriever-in-the-loop generative-based models have shown improvements in tasks like open-domain QA and knowledge-grounded dialogue, emphasizing the importance of consistent and engaging responses in long-form generation and multi-turn conversations.",
  "gold_answer_marketing": "Key architectures behind the development of large language models include Transformer-based models such as BERT and GPT-2, which utilize self-attention mechanisms for tasks like autoregressive language modeling and knowledge-grounded dialogue. These models have shown significant success in NLP tasks and have led to advancements in general-purpose language generation and natural language processing."},
3: {"question": "Can you name some specific large language models and the companies or organizations that have developed them?",
  "gold_answer_research": "Some specific large language models include GPT-3 by OpenAI, Chinchilla by DeepMind, and BERT by Google. OpenAI developed GPT-3, DeepMind developed Chinchilla, and Google developed BERT. These models have been significant advancements in the field of natural language processing.",
  "gold_answer_marketing": "Chinchilla by DeepMind, GPT-3 by OpenAI."},
7: {"question": "What licensing models have been adopted for the distribution of source-available language models?",
  "gold_answer_research": "Based on the provided context, it seems that licensing models for the distribution of source-available language models have not been explicitly discussed in the referenced papers. However, it is crucial to consider potential licensing options such as open-source licenses (e.g., GPL, MIT) or proprietary licenses when distributing language models to ensure legal compliance and control over usage rights. Additionally, considering the implications of different licensing models on accessibility, collaboration, and commercialization is essential for determining the most suitable approach for sharing language models with the community. Further research or consultation with legal experts may be necessary to explore specific licensing strategies for source-available language models.",
  "gold_answer_marketing": "Answer: Some organizations choose open-sourcing, while others restrict access to a few organizations with resources or offer end-to-end deployment via API."},
8: {"question": "What are language models and what is their purpose in natural language processing?",
  "gold_answer_research": "Language models are probabilistic models of natural language that help predict or correct text. Their purpose in natural language processing is to assist in various tasks such as speech recognition, machine translation, natural language generation, and information retrieval. By analyzing the performance of human subjects, language models improve the understanding and generation of human-like text.",
  "gold_answer_marketing": "Language models are probabilistic models of natural language that are used in tasks such as speech recognition, machine translation, and natural language generation in natural language processing."},
9: {"question": "How have language models evolved in terms of architecture, from the 1980s to present times?",
  "gold_answer_research": "Language models have evolved significantly in terms of architecture from the 1980s to present times. In the 1980s, the first statistical language model was proposed, leading to experiments by IBM that identified areas for improvement by observing human subjects. However, it wasn't until 2017 when the transformer architecture was introduced by Google, revolutionizing the field. This development paved the way for models like BERT in 2018, which marked a shift towards large-scale transformer-based language models. These modern architectures, based on self-attention mechanisms, have dominated the field of natural language processing, achieving state-of-the-art performance in various tasks.",
  "gold_answer_marketing": "Language models have evolved from early statistical models in the 1980s to modern transformer architectures, such as BERT and GPT-2, which use self-attention mechanisms and have become dominant in natural language processing tasks."},
11: {"question": "Can you explain how maximum entropy language models work and what the partition function signifies?",
  "gold_answer_research": "Maximum entropy language models use feature functions to encode the relationship between a word and its n-gram history, aiming to maximize reward while satisfying a KL-constrained objective. The partition function, denoted as Z(x), is crucial in normalizing the probabilities of all possible outputs given the input. It represents the sum of the exponential of the reward function over all possible output sequences, making it computationally expensive to estimate but essential for accurate modeling. The partition function ensures that the model's predicted probabilities sum up to 1, providing a foundation for effective language modeling.",
  "gold_answer_marketing": "Maximum entropy language models encode the relationship between a word and the n-gram history using feature functions. The partition function in this context represents the total probability of all possible outcomes, making it a crucial factor in determining the optimal solution for the reward maximization objective."},
12: {"question": "What is the benefit of using continuous space embeddings in recurrent neural network language models?",
  "gold_answer_research": "Continuous space embeddings in recurrent neural network language models help alleviate the curse of dimensionality by representing words as non-linear combinations of weights in the embedding space. This approach helps address the data sparsity problem caused by the exponential increase in possible word sequences with vocabulary size. By utilizing continuous space embeddings, neural networks can effectively capture semantic relationships and meaning within the language model.",
  "gold_answer_marketing": "Continuous space embeddings in recurrent neural network language models help alleviate the curse of dimensionality caused by the exponential increase in possible word sequences, reducing data sparsity issues."},
13: {"question": "What challenges do large language models face in mirroring human cognitive patterns?",
  "gold_answer_research": "Large language models face challenges in mirroring human cognitive patterns because they sometimes learn patterns that humans do not learn, while also failing to learn patterns that humans typically learn. This discrepancy suggests that the models may not be plausible cognitive models, despite matching human performance in some tasks. Further research is needed to address these limitations and improve the alignment of large language models with human cognitive patterns.",
  "gold_answer_marketing": "Large language models sometimes learn patterns that humans do not learn and fail to learn patterns that humans typically do learn."},
16: {"question": "What factors influenced the development of generative language models by Anthropic?",
  "gold_answer_research": "Several factors influenced the development of generative language models by Anthropic, including the limitations in coding, math, and reasoning capabilities of the initial version Claude, the partnerships with companies like Notion and Quora to enhance the model's capabilities, and the need to address biases, unsafe content, and ethical considerations in training data. Additionally, the reliance on supervised learning and the need for controlled generation in generative models played a role in shaping the development of Anthropic's language models.",
  "gold_answer_marketing": "Factors that influenced the development of generative language models by Anthropic include partnerships with companies like Notion and Quora, limitations in coding, math, and reasoning capabilities in initial models like Claude, and the need to address biases and unsafe content in training datasets."},
17: {"question": "What is Constitutional AI and how does it affect the functionality of AI systems?",
  "gold_answer_research": "Constitutional AI is an approach developed by Anthropic for training AI systems, particularly language models like Claude, to be harmless and helpful without relying on extensive human feedback. It involves two phases: supervised learning, where the model generates responses to prompts and self-critiques based on a set of guiding principles, and reinforcement learning, where the model is trained with AI-generated feedback according to constitutional principles. This approach enables the training of AI assistants that are both helpful and harmless, with the ability to explain objections to harmful requests, enhancing transparency and reducing the need for human supervision.",
  "gold_answer_marketing": "Constitutional AI is an approach developed by Anthropic for training AI systems, particularly language models like Claude, to be harmless and helpful without relying on extensive human feedback. It involves supervised learning and reinforcement learning phases to guide the model's responses based on a set of guiding principles (a 'constitution'). This approach aims to create AI systems that are both helpful and transparent in their decision-making process, reducing the need for constant human supervision."},
18: {"question": "How do advances in AI models impact their ability to interact with different types of data, such as images?",
  "gold_answer_research": "Advances in AI models, such as multimodal models like RA-CM3, have significantly improved their ability to interact with different types of data, such as images. These models can refer to external memory, like web data, to increase their knowledge capacity, allowing them to generate correct images from entity-rich captions. Additionally, these models can perform image editing and manually specify examples in-context for better results. The use of large language models, combined with larger datasets and neural networks, has also enhanced their performance in tasks like image generation and text generation.",
  "gold_answer_marketing": "Advances in AI models, such as multimodal models like RA-CM3, allow for better interaction with different types of data, like images, by accessing external memory for increased knowledge capacity and improving performance in tasks like image generation and image editing."},
19: {"question": "What are the potential trade-offs between AI system alignment with ethical guidelines and practical utility?",
  "gold_answer_research": "The potential trade-offs between AI system alignment with ethical guidelines and practical utility include the risk of reduced performance and usability due to stringent ethical alignment measures, as seen with Claude 2. Users may face limitations and refusal of assistance for benign requests, leading to debates over the 'alignment tax' in AI development. Balancing ethical considerations with practical functionality is crucial to ensure alignment with ethical guidelines without compromising the practical utility of AI systems. Research is needed to find a middle ground that prioritizes ethical alignment while maintaining usability and performance.",
  "gold_answer_marketing": "The potential trade-offs between AI system alignment with ethical guidelines and practical utility include balancing stringent ethical alignment that may reduce usability and performance, ensuring transparency and fairness in alignment processes, and addressing the alignment tax that may impact adoption of AI systems."},
20: {"question": "How has the token handling capacity changed between different versions of the Claude model?",
  "gold_answer_research": "The token handling capacity has increased with each new version of the Claude model. Claude Instant has a context length of 100,000 tokens, Claude 2.1 doubled this to 200,000 tokens, and Claude 3 Opus default version has a context window of 200,000 tokens but can be expanded to 1 million for specific use cases. This progression shows a trend towards handling larger amounts of text data for improved performance and capabilities.",
  "gold_answer_marketing": "The token handling capacity has increased from Claude to Claude Instant to Claude 2.1, with Claude Instant having a input context length of 100,000 tokens, Claude 2.1 having a context window of 200,000 tokens, and Claude 3 Opus having a context window of 1 million tokens."},
22: {"question": "In what ways has the Claude model's ability to self-critique and revise its responses enhanced its transparency?",
  "gold_answer_research": "The Claude model's ability to self-critique and revise its responses has enhanced its transparency by allowing for iterative improvements based on past actions and mistakes. Through self-reflection, the model can refine its output by learning from feedback and generating special tokens to signal the need for retrieval or confirm the relevance, support, or completeness of its responses. This process ensures that the model's statements about the world are truthful and accurate, ultimately increasing transparency in its decision-making and reasoning processes.",
  "gold_answer_marketing": "The Claude model's ability to self-critique and revise its responses has enhanced its transparency by allowing it to generate text informed by retrieved passages, criticize the output, and signal the need for retrieval or confirm the output's relevance, support, or completeness. This self-reflection process helps improve the model's accuracy and reliability in generating responses."},
23: {"question": "How do subsequent versions of Claude compare in terms of their likelihood to produce false statements?",
  "gold_answer_research": "Claude Instant is a faster and lighter version of Claude, with an input context length of 100,000 tokens. In contrast, Claude 3 has faced criticism for its stringent ethical alignment, leading to a debate over the 'alignment tax' in AI development. Users have been refused assistance with benign requests, which has sparked discussions on balancing ethical considerations and practical functionality. This suggests that Claude Instant may have a lower likelihood of producing false statements compared to Claude 3 due to its focus on usability and performance.",
  "gold_answer_marketing": "Claude Instant is a faster, less expensive, and lighter version of Claude with a shorter input context length. Claude 3 has faced criticism for ethical alignment issues that may affect usability and performance."},
24: {"question": "Who developed the language model family known as Chinchilla?",
  "gold_answer_research": "The Chinchilla language model family was developed by the research team at DeepMind and presented in March 2022. It is named 'Chinchilla' as an advancement over the previous Gopher model family. The Chinchilla family has been trained to investigate the scaling laws of large language models and is designed to outperform GPT-3.",
  "gold_answer_marketing": "The research team at DeepMind developed the language model family known as Chinchilla."},
25: {"question": "What benchmark did Chinchilla achieve an average accuracy of 67.5% on?",
  "gold_answer_research": "Chinchilla achieved an average accuracy of 67.5% on the MMLU benchmark (Measuring Massive Multitask Language Understanding).",
  "gold_answer_marketing": "Chinchilla achieved an average accuracy of 67.5% on the MMLU benchmark (Measuring Massive Multitask Language Understanding)."},
27: {"question": "What is the relationship between Chinchilla and the Gopher language model families?",
  "gold_answer_research": "The Chinchilla family of transformer models is essentially the same as the Gopher family, with minor modifications and different training optimizers. Chinchilla uses AdamW optimizer while Gopher uses Adam optimizer. Additionally, Chinchilla uses relative positional encoding and RMSNorm instead of absolute positional encoding and LayerNorm used by Gopher. Chinchilla has 70B parameters and outperforms Gopher on the MMLU benchmark by 7%, showcasing an improvement in performance. Both families follow similar naming conventions and were developed to investigate the scaling laws of large language models.",
  "gold_answer_marketing": "Chinchilla is a family of transformer models developed by DeepMind, which is a further development over a previous model family named Gopher. Both model families were trained to investigate the scaling laws of large language models."},
28: {"question": "What distinguishes the architectures of the Chinchilla and Gopher family models in terms of optimization techniques used?",
  "gold_answer_research": "The main distinction in optimization techniques between the Chinchilla and Gopher family models lies in the choice of optimizers. The Gopher family utilizes the Adam optimizer, whereas the Chinchilla family is trained using the AdamW optimizer. Additionally, the Gopher family employs RMSNorm instead of LayerNorm, and relative positional encoding rather than absolute positional encoding. These differences in optimization techniques contribute to the unique characteristics and performance of each model family.",
  "gold_answer_marketing": "The Chinchilla family uses AdamW optimizer, while the Gopher family uses the Adam optimizer."},
30: {"question": "What is the recommended strategy for training large autoregressive language models with limited compute resources, as contributed by the Chinchilla team?",
  "gold_answer_research": "The Chinchilla team recommends that the number of training tokens should be doubled for every model size doubling to achieve better results on downstream tasks. They also suggest using larger, higher-quality training datasets to improve performance. Additionally, they mention the importance of balancing model size and efficiency to address computational costs and inference latency limitations. It is advised to focus on Transformer language models and consider sharing model parameters for quick task-switching when deploying as a service.",
  "gold_answer_marketing": "The Chinchilla team recommends doubling the number of training tokens for every model size doubling and using larger, higher-quality training datasets to achieve better results on downstream tasks."},
33: {"question": "What are some key areas of research in the field of artificial intelligence as reflected in recent academic literature?",
  "gold_answer_research": "Recent academic literature in the field of artificial intelligence reflects key areas of research such as natural language processing with state-of-the-art transformers, feature learning in infinite-width neural networks, diverse beam search for complex scene description, and the development of generative AI models capable of generating text and images. Additionally, research focuses on human preferences in dueling bandits, the use of few-shot learners in language models, and the exploration of knowledge-grounded neural conversation models. These areas of research highlight the advancements in AI technology and its applications across various domains.",
  "gold_answer_marketing": "Some key areas of research in artificial intelligence include natural language processing, deep neural networks, generative AI, AI safety, AI art, reinforcement learning, and language agents alignment."},
34: {"question": "What are some of the limitations of traditional position encoding methods in the architecture of pre-trained language models (PLMs), and what novel approach does the paper propose to address these issues?",
  "gold_answer_research": "One limitation of traditional position encoding methods in PLMs is that they may not enable length extrapolation of pre-existing models, leading to the need for substantial pre-training costs. The paper proposes a novel approach called Position Interpolation, which extends existing PLMs without deviating far from existing definitions of position encoding or attention mechanisms. This method allows for much extended context windows for text modeling, leading to significant perplexity gains and improved model performance.",
  "gold_answer_marketing": "Traditional position encoding methods in PLMs have limitations in enabling length extrapolation and adapting to extended context windows. The paper proposes a novel approach called Position Interpolation, which generates strong models that can effectively make use of much extended context windows. This method allows for substantial pre-training cost savings and preserves the quality of the original models, even for small context window tasks."},
35: {"question": "How does the Rotary Position Embedding (RoPE) approach in Transformers differ from the traditional additive method of position embedding with respect to encoding position information?",
  "gold_answer_research": "The RoPE approach in Transformers differs from the traditional additive method of position embedding by being multiplicative instead of additive. While traditional methods add position encoding to context representations, RoPE incorporates relative position information through rotation matrix product. This means that RoPE naturally includes relative position dependency in the self-attention formulation, without altering terms in the expanded formulation like the additive method does. Additionally, RoPE's properties show that it decays as the relative distance between positions increases, providing a clear theoretical interpretation of how position information is encoded.",
  "gold_answer_marketing": "The RoPE approach in Transformers differs from the traditional additive method of position embedding by incorporating relative position information through rotation matrix product instead of altering terms in the expanded formulation of additive position encoding."},
36: {"question": "What is the significance of comparing the normalized subspace similarity between ∆Wq, ∆Wv, and random Gaussian matrices when analyzing the adaptation of pre-trained language models?",
  "gold_answer_research": "Comparing the normalized subspace similarity between ∆Wq, ∆Wv, and random Gaussian matrices provides insight into the underlying mechanism for adapting pre-trained language models. It helps determine the intrinsic rank of the adaptation matrix ∆W and sheds light on the connection between ∆W and the original weight matrix W. By analyzing these similarities, we can understand how much of the adaptation is specific to the task at hand and how much is influenced by the pre-trained model. This comparison is crucial for optimizing the adaptation process and maximizing downstream performance in NLP tasks.",
  "gold_answer_marketing": "Comparing the normalized subspace similarity between ∆Wq, ∆Wv, and random Gaussian matrices helps understand the underlying mechanism for adapting pre-trained language models. It reveals the intrinsic rank and common singular value directions learned by different runs, shedding light on the fundamental principles of using pre-trained language models for downstream tasks in NLP."},
38: {"question": "What issues are associated with the homogeneity of language model training contractors, and how might it affect the behavior of the models?",
  "gold_answer_research": "The issues associated with the homogeneity of language model training contractors include potential biases in the labeling process, lack of diverse perspectives leading to limited coverage of sensitive content, and reduced robustness in model performance across different tasks. This homogeneity can affect the behavior of the models by reinforcing certain biases, increasing the risk of harmful content generation, and limiting the models' ability to generalize effectively. To address these issues, it is important to ensure diversity among labelers, incorporate varied perspectives in training data, and implement measures to enhance model robustness and performance across a range of tasks.",
  "gold_answer_marketing": "The homogeneity of language model training contractors can lead to biased or limited perspectives in the data, which may result in the models producing harmful content, gaming objectives, or lacking sensitivity to diverse viewpoints. This can affect the behavior of the models by reinforcing stereotypes, increasing toxicity, and reducing their ability to accurately represent under-represented groups."},
39: {"question": "What are common research topics and themes found in recent publications about artificial intelligence and natural language processing?",
  "gold_answer_research": "Recent publications in artificial intelligence and natural language processing have covered topics such as transformer models, feature learning in neural networks, attention mechanisms, multi-task benchmark platforms, semantic search using sentence embeddings, cross-task generalization, and question generation for question answering. Themes commonly explored include machine comprehension of text, reinforcement learning algorithms, sentence embeddings, semantic compositionality, reasoning with language models and knowledge graphs, and the gap between neural text and human text. These publications also delve into deep language understanding, retrieval-augmented transformers, image captioning, and open datasets for image-text pairs.",
  "gold_answer_marketing": "Common research topics and themes in recent publications on artificial intelligence and natural language processing include transformer models, attention mechanisms, semantic search, sentence embeddings, and question answering using language models and knowledge graphs."},
41: {"question": "Question: When conducting demographic and technical assessments of teams or research subjects, what types of data categories are typically collected and analyzed to ensure a comprehensive understanding of the group's composition and the methods used?",
  "gold_answer_research": "When conducting demographic and technical assessments of teams or research subjects, it is important to collect and analyze data categories such as age, gender, education level, professional background, and expertise in specific areas. By gathering information on these categories, you can ensure a comprehensive understanding of the group's composition and the methods used in your assessments. Additionally, it may be helpful to consider factors like cultural background, language proficiency, and geographical location to capture a more nuanced picture of the group being assessed. This detailed approach to data collection and analysis can provide valuable insights for making informed decisions and recommendations based on the gathered information.",
  "gold_answer_marketing": "Answer: Demographic data such as age, gender, education level, and technical data related to skills and experience are typically collected and analyzed for comprehensive understanding."},
43: {"question": "What kind of tasks can be performed using the datasets described in the provided text, and what are some common features of these datasets?",
  "gold_answer_research": "The datasets described in the provided text can be used for tasks such as question answering, duplicate question retrieval, entity retrieval, citation prediction, query understanding, document understanding, passage retrieval, text summarization, fact verification, and code search. Common features of these datasets include diverse task categories, comprehensive instructions, a wide range of synthetic user personalities and interaction patterns, and a focus on enhancing comprehension of documents to deliver accurate results. Additionally, the datasets cover a variety of domains such as public health, scientific exams, climate, and general knowledge.",
  "gold_answer_marketing": "The datasets described in the provided text can be used for tasks such as question answering, document summarization, duplicate question retrieval, code search, sentence simplification, dialogue generation, body retrieval, caption generation, fact verification, and more. Some common features of these datasets include diverse input-output pairs, incorporation of various knowledge-intensive datasets, and a focus on generating high-quality synthetic data points."},
44: {"question": "What conclusions can be drawn about the relationship between input prompt toxicity and output toxicity when using different language models and prompts?",
  "gold_answer_research": "Based on the findings presented in the results section, it can be concluded that the relationship between input prompt toxicity and output toxicity varies depending on the language model used and the specific prompt given. When instructed to produce a safe and respectful output, InstructGPT models generate less toxic outputs compared to GPT-3, but this advantage disappears when the respectful prompt is removed. On the other hand, when explicitly prompted to produce a toxic output, InstructGPT outputs are much more toxic than GPT-3 outputs. Additionally, the toxicity of the model outputs is highly correlated with the toxicity of the input prompt, as shown in Figure 39.",
  "gold_answer_marketing": "The study found that when instructed to produce a safe and respectful output, InstructGPT models generate less toxic outputs compared to GPT-3. However, this advantage disappears when the respectful prompt is removed. Interestingly, when explicitly prompted to produce a toxic output, InstructGPT outputs are much more toxic than GPT-3. This suggests that the toxicity of the output is highly correlated with the toxicity of the input prompt."},
45: {"question": "What are some challenges in training retrieval systems and how are negative samples used to address them?",
  "gold_answer_research": "Training retrieval systems face challenges such as redundancy in retrieved documents and lack of diversity in retrieval. Negative samples, including randomly sampled negatives, denoised hard negatives, and instruction-unfollowing negatives, are crucial for improving system performance. Carefully designed negative samples help the system effectively learn the task, but they can also lead to performance drops in out-of-domain datasets. Combining random samples and challenging negatives during training is key to building a competitive system for both in-domain and out-of-domain retrieval.",
  "gold_answer_marketing": "Some challenges in training retrieval systems include high cost of annotating datasets for new tasks and improving performance in zero-shot settings. Negative samples, such as denoised hard negative documents and instruction-unfollowing negative documents, are used to train retrieval systems effectively and address performance drops in out-of-domain datasets."},
46: {"question": "What factors have been found to potentially impact the ability of models to follow instructions, based on the analysis provided?",
  "gold_answer_research": "Based on the analysis provided, factors that have been found to potentially impact the ability of models to follow instructions include the human feedback obtained from contractors, which may be influenced by their beliefs, cultural backgrounds, and personal history. Additionally, the model's behavior can be affected by false premises in instructions, tendencies to hedge, and performance degradation with multiple explicit constraints in instructions. The models are also not fully aligned or safe, as they can generate toxic or biased outputs, make up facts, and fail to generate reasonable outputs in some cases.",
  "gold_answer_marketing": "Factors that may impact the ability of models to follow instructions include false premises in instructions, models hedging unnecessarily, performance degradation with multiple constraints in instructions, generation of toxic or biased outputs, and over-generalization leading to refusal of innocuous instructions."},
47: {"question": "What are some key factors to consider when building a successful multi-task instruction-following retrieval system as identified in the research?",
  "gold_answer_research": "Some key factors to consider when building a successful multi-task instruction-following retrieval system include the need for cross-task interdependence for training a single retriever, the flexibility and zero-shot transfer enabled by instructions compared to task identifiers, and the elimination of the need for hosting multiple task-specific retrievers. Additionally, optimizing the mix and volume of instructional data for diverse tasks is crucial, as well as considering the impact of ranking strategy in data construction. Finally, the effectiveness of the dataset scale in retrieval and the importance of carefully designed negative samples should be taken into account for improved efficiency of instruction-following retrievers.",
  "gold_answer_marketing": "Key factors to consider when building a successful multi-task instruction-following retrieval system include the effectiveness of the dataset scale in retrieval, the diversity in data and model scale, carefully designed negative samples, and the ability to adapt to new tasks via instructions."},
48: {"question": "What are the benefits of using retrieval-augmented techniques in multimodal language modeling, as demonstrated by the performance of the RA-CM3 model in the document?",
  "gold_answer_research": "The benefits of using retrieval-augmented techniques in multimodal language modeling, as demonstrated by the performance of the RA-CM3 model, include significantly better training efficiency with less training compute, outperforming existing models by using less training data, compute, and parameters. The retrieval augmentation allows the model to focus on learning how to use retrieved documents in context, leading to improved accuracy in classification tasks. Additionally, the RA-CM3 model achieves strong performance in image and caption generation, surpassing existing models like DALL-E and Flamingo despite using fewer resources.",
  "gold_answer_marketing": "The benefits of using retrieval-augmented techniques in multimodal language modeling, as demonstrated by the performance of the RA-CM3 model in the document, include outperforming existing models by using less training data, compute, and parameters, achieving significantly better training efficiency, and improving accuracy in k-shot classification tasks. Additionally, retrieval augmentation allows the model to focus on learning how to use retrieved documents in context, leading to stronger performance in tasks such as image and caption generation."},
50: {"question": "What methods are typically employed to create training data for embedding models that use task-specific instructions?",
  "gold_answer_research": "To create training data for embedding models that use task-specific instructions, a common method is to combine datasets from different sources, such as the SuperNaturalInstructions dataset with existing collections designed for embedding training. The SuperNaturalInstructions dataset provides natural language instructions, which can be paired with positive and negative examples to form training samples. Additionally, for tasks like classification or similarity, training samples can be constructed by selecting text sequences associated with different classes or similarities. This diverse training data is essential for instruction-based finetuning, which enables the embedding model to learn from a wide range of tasks and domains.",
  "gold_answer_marketing": "Training data for embedding models that use task-specific instructions is typically created by formulating a wide variety of tasks as text-to-text problems, distinguishing good/bad candidate outputs given an input text. This is done by combining datasets with natural language instructions and constructing positive and negative pairs for training."},
51: {"question": "Question: What are some of the challenges and innovations associated with fine-tuning large language models, and how does the approach discussed in the referenced text aim to address them?",
  "gold_answer_research": "Some challenges associated with fine-tuning large language models include limited access to and manipulation of knowledge, lagging performance on knowledge-intensive tasks, and the need for provenance in decision-making and updating world knowledge. The approach discussed in the referenced text aims to address these challenges by utilizing Retrieval Augmented Generation (RAG), which involves retrieving relevant passages from a corpus to feed to the language model for improved performance in tasks such as question-answering and dialogue. This iterative approach focuses on improving alignment with user intent and fine-tuning models to control sentiment and improve response quality in various language tasks.",
  "gold_answer_marketing": "The challenges with fine-tuning large language models include aligning them with user intent and controlling the quality of generated outputs. The approach discussed in the referenced text aims to address these challenges by using Retrieval Augmented Generation (RAG) to retrieve relevant passages from a corpus and feed them to the language model, improving alignment and performance."},
52: {"question": "What is a common technique used to address the outlier issue when applying block-wise k-bit quantization to input tensors, and how does it work?",
  "gold_answer_research": "A common technique used to address the outlier issue when applying block-wise k-bit quantization to input tensors is to chunk the input tensor into blocks that are independently quantized, each with their own quantization constant. This approach involves dividing the input tensor into contiguous blocks of size B by flattening the tensor and slicing it into n blocks, where n is determined by the size of the blocks. Each block is then quantized independently using a quantization constant c, which helps prevent outlier values from causing performance degradation.",
  "gold_answer_marketing": "A common technique used to address the outlier issue when applying block-wise k-bit quantization to input tensors is to chunk the input tensor into blocks that are independently quantized, each with their own quantization constant. This helps prevent performance degradation by reducing the impact of outliers on the quantization process."},
54: {"question": "What considerations or techniques are commonly implemented when setting up finetuning experiments for machine learning models?",
  "gold_answer_research": "When setting up finetuning experiments for machine learning models, it is common to use a two-stage approach. The initial stage involves setting the initial parameters using a language modeling objective. This is followed by a supervised discriminative 'fine-tuning' stage to adapt these parameters to the target task. Additionally, it is typical to train all models using the Adam optimizer and a triangular learning rate scheduler with 10% warmup. Experimentation with different hyperparameters such as number of epochs, peak learning rate, and batch size is also conducted to optimize model performance. Finally, utilizing a mixture of datasets and balancing the sizes of datasets can help improve the robustness and generalization of the finetuned models.",
  "gold_answer_marketing": "Considerations for setting up finetuning experiments for machine learning models commonly include using a language modeling objective for initial parameter setting and supervised discriminative fine-tuning for adapting parameters to the target task. Techniques such as hyperparameter search, Adam optimizer with triangular learning rate scheduler, and balancing dataset sizes through mixing strategies are also commonly implemented. Additionally, freezing some model layers during fine-tuning and incorporating negative examples for contrastive learning can be effective strategies."},
55: {"question": "What are the implications of the equivalence relation defined in the theoretical analysis of the DPO model for understanding the relationship between reward functions in reinforcement learning?",
  "gold_answer_research": "The equivalence relation defined in the theoretical analysis of the DPO model implies that two reward functions are considered equivalent if they differ by a constant function. This means that the class of learned reward models is not constrained by this reparameterization, allowing for the exact recovery of the optimal policy. Understanding this relationship between reward functions in reinforcement learning helps in defining a unique reward function within each equivalence class, which is crucial for optimizing policies under existing models of human preferences. It also highlights the generality and flexibility in the reward model due to the proposed reparameterization.",
  "gold_answer_marketing": "The equivalence relation defined in the theoretical analysis of the DPO model shows that two reward functions are considered equivalent if they differ by a fixed function. This implies that different reward functions can lead to the same optimal policy, allowing for flexibility in designing reward models in reinforcement learning."},
59: {"question": "Considering the structure and content of the provided text, what guidelines should be used to evaluate the effectiveness of a summary or chatbot response in this context?",
  "gold_answer_research": "To evaluate the effectiveness of a summary or chatbot response in this context, guidelines should include assessing the faithfulness of the answer to the retrieved context, the relevance of the answer to the question, and the focus of the retrieved context. Additionally, consider using quality metrics such as answer relevancy to rank responses based on how directly they address the question and avoid redundant or incomplete information. Lastly, take into account the performance of different tasks such as summarization, citation prediction, and passage ranking to determine the overall effectiveness of the response.",
  "gold_answer_marketing": "Answer: Evaluate based on faithfulness, answer relevance, and context relevance."},
60: {"question": "What are some recent methods and technologies that have been developed to enhance the capabilities and performance of natural language processing models?",
  "gold_answer_research": "Recent methods and technologies developed to enhance natural language processing models include retrieval-augmented multimodal language modeling, which outperforms existing models with less training data and parameters. Another advancement is the use of feature learning in infinite-width neural networks to improve performance. Additionally, embedding techniques in NLP have been developed to map words or phrases to real number vectors, enhancing the model's understanding of language. These innovations have led to improvements in tasks like query reformulation, document ranking, and fine-tuning larger language models for various applications.",
  "gold_answer_marketing": "Recent methods and technologies include retrieval-augmented language models, feature learning in infinite-width neural networks, and word embeddings."},
61: {"question": "What are some potential directions for future work mentioned in the document related to enhancing question-answering techniques for document-oriented tasks?",
  "gold_answer_research": "One potential direction for future work mentioned in the document is the development of multi-modal approaches that incorporate table and figure information into GPT-4 question-answering for documents. Another direction is to incorporate question type in the PDFTriage approach to improve the efficiency and efficacy of the approach. Additionally, the document suggests further research in document-grounded, information-seeking question answering, which the dataset is designed to facilitate.",
  "gold_answer_marketing": "Some potential future directions mentioned in the document include developing multi-modal approaches that incorporate table and figure information into question-answering for documents, and incorporating question type in the PDFTriage approach to improve efficiency and efficacy."},
62: {"question": "What information would you expect to find in section 2 of a document, based on the types of questions classified under Summarization?",
  "gold_answer_research": "Based on the types of questions classified under Summarization, you would expect to find key takeaways, concise summaries, and specific content extraction related to different sections of the document in section 2. The section likely contains detailed summaries of specific parts of the document, along with structured metadata representation and instructions for summarizing the content effectively. It may also include guidelines for extracting specific information and rewriting text for clarity and conciseness.",
  "gold_answer_marketing": "Based on the types of questions classified under Summarization, you would expect to find key takeaways, concise summaries, and specific content extraction related to the document in section 2."},
63: {"question": "What are the main advantages and attention mechanisms that contribute to the enhanced performance and efficiency of the newly introduced language model as compared to its predecessors?",
  "gold_answer_research": "The main advantages of the newly introduced language model include utilizing retrieval-augmentation to incorporate external knowledge, which improves prediction accuracy. Additionally, the model employs attention mechanisms that allow for better understanding of dependencies between source and target sequences, leading to more informed predictions. These attention mechanisms have been extended from machine translation to various other fields, enhancing the model's adaptability and performance across different tasks. Finally, the model's use of self-attention mechanisms enables better contextual representation learning, parallelization, and modeling of longer intra-token relations, improving efficiency and performance compared to previous models.",
  "gold_answer_marketing": "The main advantages of the newly introduced language model include the use of retrieval-augmented mechanisms, attention mechanisms, and context representation learning, which contribute to enhanced performance and efficiency compared to its predecessors."},
64: {"question": "What criteria are used to assess the quality of recommendations provided by different language models in a comparison study?",
  "gold_answer_research": "In a comparison study of language models, criteria such as sentence relevance, lexical accuracy, and contextual understanding are used to assess the quality of recommendations. Different tasks may benefit from different evaluation measures, such as STRINC, LEXICAL, and CXMI. Additionally, template selection plays a vital role in the quality of recommendations, with deliberate template design being important for tasks like query suggestion. The overall quality of recommendations is often judged using a Likert scale, along with metadata collection for each model output.",
  "gold_answer_marketing": "The criteria used to assess the quality of recommendations provided by different language models in a comparison study include comparing to human-created benchmarks, examining intrinsic character, comparing two models, investigating rate of learning, and analyzing learning curves."},
65: {"question": "What approaches have been proposed to enhance the task performance of language models while considering the trade-offs such as runtime efficiency, robustness to irrelevant context, and attribution quality?",
  "gold_answer_research": "Several approaches have been proposed to enhance the task performance of language models while considering trade-offs. These include using compression and selective augmentation methods to decrease the propensity of models to generate toxic or biased outputs. Adversarial setups have been suggested where labelers find worst-case behaviors of the model and add them to the dataset. Additionally, models like BART and T5 leverage bi-directional attention to achieve stronger performance on both discriminative and generative tasks. These methods aim to balance model performance with considerations such as runtime efficiency, robustness to irrelevant context, and attribution quality.",
  "gold_answer_marketing": "Approaches proposed to enhance language model task performance include compression and selective augmentation, adversarial set-ups for labeling worst-case behaviors, retrieval-augmented models, and extending existing models to enable length extrapolation while maintaining quality."},
67: {"question": "What metrics are commonly used to compare the performance of language models in various tasks, as outlined in an experimental results table?",
  "gold_answer_research": "Common metrics used to compare the performance of language models in various tasks, as outlined in an experimental results table, include Exact Match and Unigram F1. These metrics have become standard in evaluating language models. Additionally, other metrics such as BLEU score, FactScore (factuality), precision, and recall are also commonly used to assess the performance of language models across different tasks. It is important to consider a variety of metrics to get a comprehensive understanding of the effectiveness of a language model in different contexts.",
  "gold_answer_marketing": "The metrics commonly used to compare the performance of language models in various tasks are Exact Match and Unigram F1."},
69: {"question": "What is the role of manual assessment in the validation of language model predictions according to the text provided?",
  "gold_answer_research": "Manual assessment plays a crucial role in the validation of language model predictions. The engineers evaluate the quality of model outputs by having labelers rate them on test sets consisting of prompts from held-out customers. This manual assessment helps ensure that the models are aligned with a broad distribution of language tasks and can identify any behavioral issues that may arise from misalignment. Additionally, human annotators find that certain reflection token predictions are aligned with their assessments, providing valuable insights into the accuracy and effectiveness of the models.",
  "gold_answer_marketing": "Answer: Manual assessment plays a key role in evaluating the quality of language model predictions by having labelers rate the model outputs and comparing them to prompts from held-out customers."},
70: {"question": "What are the general steps outlined for training a language model in the document, and how is the training data for the generator language model collected and utilized?",
  "gold_answer_research": "The document outlines the general steps for training a language model, including incorporating retrieved documents into the main input sequence and optimizing the loss function to train the generator. The training data for the generator language model is collected through various techniques such as supervised fine-tuning, critic learning, and custom retrievers for downstream tasks. The collected data is used to train the generator on specific tasks like summarization, machine reading comprehension, and natural language to SQL translation, improving performance on those tasks.",
  "gold_answer_marketing": "The general steps for training a language model include fine-tuning on specific datasets, filtering pretraining data, and using critic learning. Training data for the generator language model is collected from open-access NLP papers and used for downstream conditional text generation tasks."},
73: {"question": "What are the three main categories used to refine language model abilities in understanding and executing search tasks according to the given document?",
  "gold_answer_research": "The three main categories used to refine language model abilities in understanding and executing search tasks are query understanding, document understanding, and query-document relationship understanding. Tasks within these categories focus on interpreting queries, comprehending documents, and understanding the relationships between queries and documents. This approach aims to enhance the models' performance in interpreting and responding to search-related instructions effectively, improving their utility in complex information retrieval scenarios.",
  "gold_answer_marketing": "The three main categories used to refine language model abilities in understanding and executing search tasks are query understanding, document understanding, and query-document relationship understanding."},
74: {"question": "What are some of the emerging research topics and challenges in the field of natural language processing and information retrieval according to recent academic conferences and publications?",
  "gold_answer_research": "Recent academic conferences and publications have highlighted emerging research topics and challenges in natural language processing and information retrieval. Some key areas of focus include efficient retrieval augmented generation, unsupervised dense information retrieval with contrastive learning, citation-informed transformers, and knowledge refinement via interaction between search engines and large language models. Additionally, challenges such as zero-shot retrieval, semantic search using GPT sentence embeddings, and prompt-based effective input reformulation for legal case retrieval have been identified as important research directions. These topics reflect the ongoing advancements and complexities in the field, driving innovation and progress in NLP and IR research.",
  "gold_answer_marketing": "Some emerging research topics and challenges in the field of natural language processing and information retrieval include efficient generation from unstructured knowledge, semantic code search evaluation, unsupervised dense information retrieval, context-aware document term weighting, knowledge refinement through interaction with large language models, and investigating the effectiveness of large language models in search re-ranking."},
75: {"question": "Question: How do models with different fine-tuning strategies compare in terms of accuracy and F1 score for fact verification tasks?",
  "gold_answer_research": "Models with different fine-tuning strategies are compared in terms of accuracy and F1 score for fact verification tasks. The introduction of LLMs has led to notable developments, with some studies leveraging prompting methods to apply LLMs in IR tasks. However, not all LLMs consistently outperform fine-tuned smaller models. For example, RankGPT based on gpt-3.5-turbo underperforms monoBERT in certain scenarios. Fine-tuning is not strictly necessary for models like GPT3, which has been evaluated on closed book question answering tasks without any updates or fine-tuning.",
  "gold_answer_marketing": "Models with different fine-tuning strategies have shown mixed results in terms of accuracy and F1 score for fact verification tasks. Some studies have found that large language models (LLMs) outperform smaller fine-tuned models, while others have reported inconsistent performance. Factors such as task complexity and the need for prompt methods to apply LLMs in information retrieval tasks can also impact the comparison."},
76: {"question": "What components does a fact verification task typically involve in order to assess the accuracy of a given statement?",
  "gold_answer_research": "A fact verification task typically involves assessing the relationship between a claim and the evidence provided, analyzing if there is enough information for a conclusive judgment. This task requires a detailed understanding of the claim and evidence to determine if it is supported or refuted. The use of performance metrics based on including gold answers in model generations instead of exact matching can help search engines deliver accurate and relevant results. Additionally, incorporating lexical measures and verification functions can aid in determining the accuracy of statements.",
  "gold_answer_marketing": "A fact verification task typically involves assessing the relationship between a claim and supporting evidence to determine accuracy."},
78: {"question": "What are the key factors that determine the performance of HALO-aligned models compared to non-HALO models, according to the results presented in the analysis?",
  "gold_answer_research": "According to the analysis presented, the key factors that determine the performance of HALO-aligned models compared to non-HALO models include the specific alignment method used (such as DPO and PPO variant), the model size (significant gap at 13B+ model sizes), and the ability to match or exceed the generation quality of SFT target sequences. Additionally, the study suggests that the cost of increasing model alignment is modest relative to pretraining, and that the modeling of human biases in HALOs may have practical benefits in improving overall performance.",
  "gold_answer_marketing": "The key factor that determines the performance of HALO-aligned models compared to non-HALO models is the model size, with HALO-aligned models generally outperforming non-HALO models at larger sizes (13B+ model sizes)."},
80: {"question": "How does the performance of KTO compare to DPO in model alignment, and what are the potential implications for data usage and training efficiency?",
  "gold_answer_research": "Based on the provided data and experiments, KTO consistently outperforms DPO in model alignment, even with restrictions such as using only one output per input. This suggests that KTO can achieve higher win rates and improve performance across various benchmarks compared to DPO. The implications of this performance difference include the ability to achieve quality generation results with significantly fewer desirable examples, potentially leading to more efficient data usage and training processes. This indicates that KTO may offer a more efficient and effective approach to model alignment compared to DPO.",
  "gold_answer_marketing": "KTO outperforms DPO in model alignment with up to 90% fewer examples. This suggests that KTO can achieve high performance even with imbalanced data, potentially leading to more efficient training processes."},
81: {"question": "What are some common approaches to building an open-domain question answering system?",
  "gold_answer_research": "Some common approaches to building an open-domain question answering system include using the RAG model, which minimizes the negative log-likelihood of answers, and comparing it to extractive QA paradigms that rely on non-parametric knowledge retrieval. Another approach is to incorporate question rewriting techniques to make open-domain QA more conversational. Additionally, utilizing datasets like QASPER, which contain questions requiring complex reasoning, can improve the performance of the system. References to papers by Anantha et al. and Asai et al. provide further insights into building ODQA systems.",
  "gold_answer_marketing": "Common approaches to building an open-domain question answering system include using retrieval over a knowledge base and incorporating the retrieved content as part of the prompt. Other methods involve pretraining models on large amounts of text data and fine-tuning them for question answering tasks."},
82: {"question": "What is the difference between open-book and closed-book question answering?",
  "gold_answer_research": "Open-book question answering involves the use of external sources of knowledge, such as Wikipedia, to retrieve information and generate a response. In contrast, closed-book question answering relies on pre-trained language models that have memorized factual knowledge within their parameters to generate responses without explicit context. Closed-book QA can be seen as analogous to a closed-book exam where no external resources are allowed. The key distinction lies in the reliance on external knowledge sources for open-book QA versus internal memorized knowledge for closed-book QA.",
  "gold_answer_marketing": "Open-book question answering involves using external sources of knowledge to answer questions, while closed-book question answering relies on pre-trained language models to provide answers without explicit context."},
84: {"question": "What are the basic components of the Retriever-Reader framework in open-domain QA?",
  "gold_answer_research": "The basic components of the Retriever-Reader framework in open-domain QA include a retriever model, which fetches relevant information based on input prompts efficiently using FAISS. The retriever component is responsible for retrieving contextually relevant documents or evidence blocks based on the input question. The reader component then processes this retrieved information to generate answers to the questions posed. This framework combines information retrieval and machine reading comprehension to achieve state-of-the-art results in open-domain question answering tasks.",
  "gold_answer_marketing": "The basic components of the Retriever-Reader framework in open-domain QA are the retriever and the reader components, which can be set up and trained independently or jointly trained end-to-end. The retriever component automatically fetches relevant information based on input prompts, while the reader component processes and comprehends the retrieved information to answer questions."},
85: {"question": "How is the TF-IDF model used in question answering retrieval systems?",
  "gold_answer_research": "In question answering retrieval systems, the TF-IDF model is used to represent queries and documents as bag-of-word vectors with terms weighted by term frequency multiplied by inverse document frequency. This allows for efficient non-learning-based search engine operations based on the vector space model. The TF-IDF model helps in calculating the relevance of documents to queries by measuring the importance of terms in the context of the entire document collection. This classic information retrieval approach aids in retrieving relevant information to answer questions accurately and efficiently.",
  "gold_answer_marketing": "The TF-IDF model is used in question answering retrieval systems to weight terms in queries and documents based on their importance in determining relevance."},
86: {"question": "Can neural networks enhance the process of information retrieval in QA systems?",
  "gold_answer_research": "Neural networks, such as MLP, LSTM, and bidirectional LSTM, can be used to learn dense representations of text for information retrieval in QA systems. These approaches, known as 'Neural IR', are a new category of methods that can improve performance in retrieval problems. The introduction of neural retrievers in recent QA literature has shown to outperform traditional word-similarity-based architectures, such as BM25, and can scale to handle knowledge-grounded dialogue tasks effectively. Additionally, incorporating pre-trained retrievers in QA systems has been shown to enhance the performance of generative language models.",
  "gold_answer_marketing": "Yes, neural networks can enhance the process of information retrieval in QA systems by improving performance in open-domain QA tasks and enabling the generation of more accurate answers."},
87: {"question": "What is the importance of fine-tuning in the context of QA data for open-domain question answering models?",
  "gold_answer_research": "Fine-tuning is important in the context of QA data for open-domain question answering models because it allows the model to adapt and improve its performance on specific QA datasets. By fine-tuning the model with common QA datasets, engineers can optimize the model's ability to answer questions accurately. However, there is a concern about the significant overlap between questions in the train and test sets of public QA datasets, which could affect the generalization ability of the fine-tuned models. Engineers should carefully consider this overlap and potentially explore ways to mitigate its impact during the fine-tuning process to ensure the model's effectiveness in real-world applications.",
  "gold_answer_marketing": "Fine-tuning is important in the context of QA data for open-domain question answering models to improve search task performance and the ability to generalize to unseen datasets."},
88: {"question": "How does pre-training with tasks like the Inverse Cloze Task benefit open-domain question answering models?",
  "gold_answer_research": "Pre-training with tasks like the Inverse Cloze Task benefits open-domain question answering models by improving the retrieval process over a knowledge base. By predicting the context given a sentence, the model can better understand the relationship between the question and the evidence. This approach helps in incorporating retrieved content effectively into the prompt, leading to higher accuracy in the question answering task. Additionally, using models pretrained with ICT can enhance the overall performance of the QA system by providing a better understanding of the context.",
  "gold_answer_marketing": "Pre-training with tasks like the Inverse Cloze Task benefits open-domain question answering models by improving retrieval and generation steps, ultimately enhancing the accuracy of the process."},
89: {"question": "What is the main goal of prompt engineering in language models?",
  "gold_answer_research": "The main goal of prompt engineering in language models is to effectively steer the behavior of the model towards desired outcomes without updating the model weights. This is achieved by composing and formatting prompts in a way that maximizes the model's performance on a specific task. Prompt engineering involves treating prompts as trainable parameters and optimizing them directly on the embedding space through methods like AutoPrompt, Prefix-Tuning, P-tuning, and Prompt-Tuning. The ultimate aim is to enhance the model's performance and alignment with user-defined tasks.",
  "gold_answer_marketing": "The main goal of prompt engineering in language models is to steer the behavior of the model for desired outcomes without updating the model weights."},
91: {"question": "What are some known biases that can affect the performance of few-shot classification in LLMs?",
  "gold_answer_research": "Some known biases that can affect the performance of few-shot classification in LLMs include majority label bias, recency bias, and common token bias. Majority label bias occurs when the distribution of labels among examples is unbalanced, recency bias refers to the tendency for the model to repeat the label at the end, and common token bias indicates that LLM tends to produce common tokens more often than rare tokens. These biases can contribute to high variance in few-shot classification tasks and may impact the model's ability to generalize effectively.",
  "gold_answer_marketing": "Some known biases that can affect the performance of few-shot classification in LLMs are majority label bias, recency bias, and common token bias."},
92: {"question": "Why might increasing model size not reduce variance in model performance with varying prompts?",
  "gold_answer_research": "Increasing model size may not necessarily reduce variance in model performance with varying prompts because the model's ability to generalize and adapt to different prompts is not solely dependent on its size. Factors such as the quality and relevance of the training examples, the learning rate or schedule, and the model's sensitivity to different hyperparameters can also play a significant role in determining performance variability. Additionally, the complexity of the task or dataset being used for training can impact how effectively the model scales with size. It is essential to consider these factors holistically when optimizing model performance rather than relying solely on increasing model size.",
  "gold_answer_marketing": "Increasing model size may not reduce variance in model performance with varying prompts because the same order of prompts may work well for one model but poorly for another. Additionally, when the validation set is limited, choosing the order of prompts that prevents the model from producing extremely unbalanced predictions or being overconfident can also affect performance."},
93: {"question": "What is the benefit of instruction-based finetuning in language models?",
  "gold_answer_research": "Instruction-based finetuning improves models' ability to generalize to unseen domains and tasks by providing task-specific representations that can be used for many downstream language tasks without additional training. This method also allows pretrained language models to follow instructions provided in prompts, enabling them to generate the desired output given specific inputs. Additionally, instruction finetuning helps transform raw pretrained LLMs into chatbot-like models, making finetuning more accessible and common, particularly for researchers with limited resources. Overall, the benefit of instruction-based finetuning is improved model performance, enhanced generalizability, and reduced communication costs in aligning with human intentions.",
  "gold_answer_marketing": "The benefit of instruction-based finetuning in language models is improved ability to generalize to unseen domains and tasks, without the need for additional training."},
94: {"question": "Can you describe a situation where retrieval-based methods would be necessary to enhance language model performance?",
  "gold_answer_research": "Retrieval-based methods are necessary to enhance language model performance in scenarios where the model needs to generate accurate and informative responses for entity-rich queries, such as 'George Washington standing in front of the Eiffel Tower.' In such cases, incorporating a retrieval module can provide additional context and relevant information to improve the model's understanding and generation of the desired output. Additionally, retrieval-based methods are crucial for question answering tasks, where the model needs to access external knowledge sources to provide accurate and comprehensive answers. By utilizing retrieval mechanisms, the language model can benefit from a wider range of information and improve its performance in handling complex and ambiguous queries effectively.",
  "gold_answer_marketing": "Retrieval-based methods are necessary to enhance language model performance in tasks like question answering, where incorporating additional information from external sources can improve the model's ability to generate accurate and relevant responses."},
95: {"question": "What is the Chain-of-Thought prompting technique and for which types of tasks is it particularly beneficial?",
  "gold_answer_research": "Chain-of-Thought (CoT) prompting is a technique that generates reasoning chains or rationales step by step to lead to a final answer, benefiting complicated reasoning tasks using large models with more than 50B parameters. It can be implemented through iterative Monte Carlo search methods or through a three-step process called augment-prune-select. CoT is particularly beneficial for enhancing model performance on complex tasks by decomposing them into smaller and simpler steps, shedding light on the model's thinking process. Task decomposition in CoT can be done with simple prompting, task-specific instructions, or human inputs.",
  "gold_answer_marketing": "Chain-of-Thought (CoT) prompting is a technique that generates reasoning chains or rationales step by step to lead to a final answer. It is particularly beneficial for complicated reasoning tasks when using large models with more than 50B parameters. Simple tasks only benefit slightly from CoT prompting."},
96: {"question": "How do augmented language models with external tools differ from regular models in functionality?",
  "gold_answer_research": "Augmented language models with external tools, such as TALM and Toolformer, are fine-tuned to learn how to use external tool APIs, expanding their capabilities beyond traditional language processing tasks. These models are trained to incorporate external tool API calls in order to improve the quality of their outputs, allowing them to perform tasks like speech recognition, machine translation, and information retrieval more effectively. By leveraging external tools, these models have the ability to access and utilize a wider range of resources and functionalities, enhancing their overall performance and versatility compared to regular language models.",
  "gold_answer_marketing": "Augmented language models with external tools differ from regular models by fine-tuning a LM to use external tool APIs, expanding the dataset to improve model outputs and enhancing tasks like speech recognition, machine translation, and natural language generation."},
97: {"question": "What can be inferred about the utilization of attention in neural networks?",
  "gold_answer_research": "Attention mechanisms in neural networks play a crucial role in allowing models to focus on specific parts of input data when making predictions or generating outputs. By assigning importance weights to different elements, such as pixels in an image or words in a sentence, attention helps the model to attend to relevant information and make more accurate predictions. The use of attention can improve the interpretability of neural networks by showing which parts of the input data are being focused on during the prediction process. Additionally, attention mechanisms, like multi-head attention, can enhance model performance by allowing the model to jointly attend to information from different representation subspaces at different positions.",
  "gold_answer_marketing": "Attention in neural networks allows the model to focus on specific parts of input data, such as images or text, in order to make predictions or generate output. It helps the model to learn relationships and correlations between different elements and improve performance in tasks like image captioning or language translation."},
101: {"question": "Can the use of attention mechanisms in deep learning models be applied to both machine translation and computer vision?",
  "gold_answer_research": "Yes, attention mechanisms in deep learning models have shown success in both machine translation and computer vision tasks. In machine translation, attention allows the model to capture dependencies between source and target sequences regardless of distance, leading to improved translation quality. Similarly, in computer vision, attention mechanisms have been used to focus on relevant parts of an image during caption generation, showcasing the ability to handle details and global dependencies effectively. Therefore, utilizing attention in both domains can enhance the performance of deep learning models significantly.",
  "gold_answer_marketing": "Yes, attention mechanisms in deep learning models can be applied to both machine translation and computer vision."},
102: {"question": "What are the potential benefits of incorporating self-attention mechanisms into Generative Adversarial Networks (GANs)?",
  "gold_answer_research": "Incorporating self-attention mechanisms into GANs can help the generator and discriminator better model relationships between spatial regions, leading to improved generation of detailed and realistic images. This is particularly useful for capturing global dependencies and enhancing the performance of transformer architectures. Additionally, self-attention can enable the model to assess its own predictions after each generated segment, allowing for customizable decoding algorithms to meet specific constraints or user preferences. Overall, self-attention in GANs can enhance detail handling and overall performance.",
  "gold_answer_marketing": "Incorporating self-attention mechanisms into GANs can help the generator and discriminator better model relationships between spatial regions, leading to improved performance in handling details and capturing global dependencies."},
103: {"question": "How does the transformer model variate from traditional sequence-aligned recurrent architectures?",
  "gold_answer_research": "The transformer model differs from traditional sequence-aligned recurrent architectures by not having a recurrent or convolutional structure. Instead, it heavily relies on self-attention mechanisms for processing sequences. This lack of recurrence and convolution, even with positional encoding, weakly incorporates sequential order, which can be a drawback for tasks sensitive to positional dependencies. Additionally, the transformer's architecture includes embedding layers, sinusoid-wave-based positional encoding, and softmax and linear layers in the final decoder output to maintain position information and facilitate processing of long sequences efficiently.",
  "gold_answer_marketing": "The transformer model differs from traditional sequence-aligned recurrent architectures by not having a recurrent or convolutional structure, and instead making heavy use of self-attention. This allows for handling very long sequences efficiently and achieving better performance on tasks involving long texts."},
104: {"question": "What implications does the concept of a Neural Turing Machine have for the theoretical power of neural networks?",
  "gold_answer_research": "The concept of a Neural Turing Machine (NTM) expands the theoretical power of neural networks by incorporating external memory storage, allowing for more complex computations and tasks. This mimics the Turing machine tape, enabling the neural network to control operation heads for reading and writing to the tape. However, the finite memory in NTM suggests it may resemble more of a 'Neural von Neumann Machine,' limiting its mathematical limitlessness seen in traditional Turing machines. Overall, the addition of external memory in NTM enhances the capabilities and potential applications of neural networks in solving more advanced problems.",
  "gold_answer_marketing": "The concept of a Neural Turing Machine suggests that neural networks can be equipped with external memory storage for more complex operations, potentially increasing their theoretical power."},
}


test_questions = {
4: {"question": "When was the transformer architecture introduced, and by which organization?"},
5: {"question": "How has the accessibility of powerful language models, such as GPT-3 and GPT-4, been controlled by their developers?"},
6: {"question": "What benchmarks or ratings are used to compare the capabilities of different language models?"},
10: {"question": "What are some of the primary applications for language models in technology and computing?"},
14: {"question": "How are language models typically evaluated and what benchmarks are used for this purpose?"},
15: {"question": "What datasets are available for evaluating language processing systems?"},
21: {"question": "What collaborations with other companies have contributed to the development of Claude's capabilities?"},
26: {"question": "According to DeepMind, how should the number of training tokens change relative to the model size?"},
29: {"question": "How do the sizes of models in the Gopher family range?"},
31: {"question": "What type of model architecture do the Gopher and Chinchilla families belong to?"},
32: {"question": "Can you name the author who wrote the novels A Farewell to Arms and The Sun Also Rises?"},
37: {"question": "What are the key advantages of InstructGPT models over GPT-3 models according to the findings in the research?"},
40: {"question": "What metrics are used to compare the performance of different models on training and validation splits according to the document provided?"},
42: {"question": "What types of evaluation metrics are commonly used to assess the accuracy of answers in AI-driven question and answer datasets?"},
49: {"question": "What factors contribute to the performance improvement in retrieval-augmented language models compared to non-retrieval-augmented models?"},
56: {"question": "What are the benchmarks used to evaluate the performance of the Deep Policy Optimization (DPO) method compared to other preference learning algorithms in the document provided?"},
57: {"question": "What methodologies have been evaluated for training language models to align with human preferences, and how do they compare in terms of effectiveness?"},
58: {"question": "What methods have been discussed in the literature for improving the alignment of language models with human preferences or feedback?"},
66: {"question": "What are some of the evaluation metrics used for assessing different types of text generation tasks presented in the study?"},
68: {"question": "Consider a document related to research in natural language processing or artificial intelligence. Can you name some of the recent topics or methods that have been discussed or introduced in the field according to the document?"},
71: {"question": "What is the significance of using reflection tokens in a model like SELF-RAG?"},
72: {"question": "How does the inclusion of selected context as opposed to appending all retrieved text spans impact computational cost during both training and inference times in language model generation tasks?"},
77: {"question": "What are the benefits of modeling human biases in Human-Aware Loss Optimizations (HALOs), and how do they compare to non-HALOs on the same datasets?"},
79: {"question": "What are the modifications made to the traditional Kahneman-Tversky model to adapt it for optimizing language model performance?"},
83: {"question": "How does a model's ability to answer questions relate to its exposure to specific types of questions during training?"},
90: {"question": "How can adding examples to a prompt affect the performance of language models?"},
98: {"question": "What are the main components of a Neural Turing Machine (NTM) architecture?"},
99: {"question": "How might a seq2seq model's limitations be addressed in natural language processing tasks?"},
100: {"question": "What differentiates hard attention from soft attention in image processing algorithms?"},
}


### 3.3 Running the RAG System


In [None]:
validation_questions_answers[0]

{'question': 'What purpose do large language models serve in the field of natural language processing?',
 'gold_answer_research': 'Large language models (LLMs) serve the purpose of enabling general-purpose language generation and other natural language processing tasks such as classification. They achieve this by learning statistical relationships from text documents during computationally intensive self-supervised and semi-supervised training. LLMs can be used for text generation by predicting the next token or word, making them valuable for tasks like speech recognition, machine translation, and information retrieval. Additionally, LLMs have superseded previous models like recurrent neural networks, showcasing their efficiency and effectiveness in NLP tasks.',
 'gold_answer_marketing': 'Large language models serve the purpose of improving performance in various natural language processing tasks, such as speech recognition, machine translation, natural language generation, optical cha

In [None]:
test_questions[4]

{'question': 'When was the transformer architecture introduced, and by which organization?'}

In [None]:
from huggingface_hub import login

login(token=userdata.get('HUGGING_FACE_TOKEN'))

quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         )


llm_mistral_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    torch_dtype=torch.float32,
    device_map='auto',
    quantization_config=quantization_config
)

llm_mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

mistral_pipe = pipeline(
    "text-generation",
    model=llm_mistral_model,
    tokenizer=llm_mistral_tokenizer,
    max_new_tokens=1000,
    temperature=0.6,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.2
)
mistral_pipe.model.config.pad_token_id = mistral_pipe.model.config.eos_token_id

mistral_llm_lc = HuggingFacePipeline(pipeline=mistral_pipe)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Device set to use cuda:0
  mistral_llm_lc = HuggingFacePipeline(pipeline=mistral_pipe)


In [None]:
baseline_embeddings = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")

### 4. Tests & Evaluation

### 4.1. Metrics


I will be using cosine similarity, BLEU score, and ROUGE-2 score as my evaluation metrics. I chose cosine similarity because it measures the semantic similarity between two text vector representations, which is useful for determining whether the meaning of the two vectors are similar. I chose BLEU score to have a metric that assesses the overlap of n-grams between the generated response and gold answer. I chose ROUGE-2 because it captures the overlap of bigrams between the two pieces of texts. BLEU measures the precision while ROUGE measures the recall.

In [None]:
!pip install nltk rouge pandas sentence_transformers

import nltk
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge
import pandas as pd
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-mpnet-base-v2')

rouge = Rouge()

nltk.download('wordnet')

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

### 4.2. Evaluation Comparisons

#### Baseline RAG Pipeline

In [None]:
data_marketing = []
data_research = []

for idx, info in validation_questions_answers.items():
    question = info['question']
    answer_marketing = info['gold_answer_marketing']
    answer_research = info['gold_answer_research']

    data_marketing.append([question, answer_marketing])
    data_research.append([question, answer_research])

df_marketing = pd.DataFrame(data_marketing, columns=['Question', 'Gold Answer Marketing'])
df_research = pd.DataFrame(data_research, columns=['Question', 'Gold Answer Research'])

In [None]:
df_marketing.head()

Unnamed: 0,Question,Gold Answer Marketing
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...
1,How does a large language model learn from tex...,A large language model learns from text during...
2,What are some key architectures behind the dev...,Key architectures behind the development of la...
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI."
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...


In [None]:
def add_rag_answer_column(df):
    df['RAG Answer'] = ''
    return df

df_marketing = add_rag_answer_column(df_marketing)
df_research = add_rag_answer_column(df_research)

In [None]:
df_marketing.head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...,
1,How does a large language model learn from tex...,A large language model learns from text during...,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.",
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,


##### All 75 Questions

In [None]:
# Start: 97.45 units
# End: 93.21 units
# Time: 29 mins

def evaluate_rag(questions, gold_answers, df, rag_chain, model):
    embeddings = SentenceTransformer(model)
    rouge = Rouge()


    for index, (question, gold_answer) in enumerate(zip(questions, gold_answers)):

        response_text = rag_chain.invoke(question)


        end_of_instruction_index = response_text.find("[/INST]") + len("[/INST]")
        rag_answer = response_text[end_of_instruction_index:].strip()


        df.loc[index, 'RAG Answer'] = rag_answer


        gold_embed = embeddings.encode([gold_answer])
        rag_embed = embeddings.encode([rag_answer])


        cos_sim = cosine_similarity(gold_embed, rag_embed)[0][0]
        df.loc[index, 'Cosine Similarity'] = cos_sim


        bleu_score = sentence_bleu([gold_answer.split()], rag_answer.split(), smoothing_function=SmoothingFunction().method1)
        rouge_scores = rouge.get_scores(rag_answer, gold_answer)[0]


        df.loc[index, 'BLEU Score'] = bleu_score
        df.loc[index, 'ROUGE Score'] = rouge_scores['rouge-2']['f']  # f-measure of ROUGE-1

    return df


df_marketing = evaluate_rag(df_marketing['Question'], df_marketing['Gold Answer Marketing'], df_marketing, rag_chain, 'all-mpnet-base-v2')
df_research = evaluate_rag(df_research['Question'], df_research['Gold Answer Research'], df_research, rag_chain, 'all-mpnet-base-v2')


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [None]:
from google.colab import drive
import os


drive.mount('/content/drive')

folder_path = '/content/drive/My Drive/w267/a5/Config_Results'

file_path_marketing = os.path.join(folder_path, 'rag_results_marketing.csv')
file_path_research = os.path.join(folder_path, 'rag_results_research.csv')

df_marketing.to_csv(file_path_marketing, index=False)
df_research.to_csv(file_path_research, index=False)

Mounted at /content/drive


In [None]:
path_marketing = '/content/drive/My Drive/w267/a5/Config_Results/Baseline_Config/rag_results_marketing.csv'
path_research = '/content/drive/My Drive/w267/a5/Config_Results/Baseline_Config/rag_results_research.csv'

df_marketing = pd.read_csv(path_marketing)
df_research = pd.read_csv(path_research)

In [None]:
df_marketing.head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...,Large language models (LLMs) serve as the core...,0.699172,0.031722,0.089888
1,How does a large language model learn from tex...,A large language model learns from text during...,A large language model learns from text during...,0.73971,0.092962,0.130435
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,"Based on the context you have provided, there ...",0.567483,0.018068,0.035264
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.","Based on the context provided, there are sever...",0.334637,0.00294,0.0
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,"Based on the context provided, there are no sp...",0.344814,0.004874,0.0


In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,"In the context provided, large language models...",0.653558,0.03153,0.099448
1,How does a large language model learn from tex...,A large language model learns from text during...,"A large language model, such as those mentione...",0.690648,0.016522,0.057471
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,"Based on the provided context, several researc...",0.491959,0.016931,0.032877
3,Can you name some specific large language mode...,Some specific large language models include GP...,"Based on the context provided, there are no sp...",0.657462,0.026737,0.087912
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...","Based on the context provided, there are no sp...",0.87962,0.012656,0.10219


In [None]:
average_cosine_similarity_m = df_marketing['Cosine Similarity'].mean()

average_bleu_score_m = df_marketing['BLEU Score'].mean()

average_rouge_score_m = df_marketing['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_m)
print("Average BLEU Score:", average_bleu_score_m)
print("Average ROUGE Score:", average_rouge_score_m)

average_cosine_similarity_r = df_research['Cosine Similarity'].mean()

average_bleu_score_r = df_research['BLEU Score'].mean()

average_rouge_score_r = df_research['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_r)
print("Average BLEU Score:", average_bleu_score_r)
print("Average ROUGE Score:", average_rouge_score_r)

Average Cosine Similarity: 0.7064249836
Average BLEU Score: 0.03663858515406235
Average ROUGE Score: 0.07667048046762966
Average Cosine Similarity: 0.6943485646666665
Average BLEU Score: 0.04830197908528949
Average ROUGE Score: 0.08251367814131899


#### Configuration 1

Embedding: all-distilroberta-v1
Chunk Size: 128
Overlap: 25
Temperature: 0.8

In [None]:
base_embeddings = HuggingFaceEmbeddings(model_name="all-distilroberta-v1")


loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=25)
splits = text_splitter.split_documents(documents)

vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="test",
    force_recreate=True
)
retriever = vectorstore.as_retriever()


qdrant_vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="rag_tech_db",
    force_recreate=True
)

retriever = qdrant_vectorstore.as_retriever()
CHUNK_SIZE=128
OVERLAP=25

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)

global_doc_number = 1
arxiv_numbers = ('2005.11401', '2104.07567', '2104.09864', '2105.03011', '2106.09685', '2203.02155', '2211.09260', '2211.12561',
                 '2212.09741', '2305.14314', '2305.18290', '2306.15595', '2309.08872', '2309.15217', '2310.06825', '2310.11511',
                 '2311.08377', '2312.05708', '2401.06532', '2401.17268', '2402.01306', '2402.19473', '2406.04744')
all_arxiv_pages = []


for identifier in arxiv_numbers:

    arx_url = f"https://arxiv.org/pdf/{identifier}.pdf"


    arx_loader = PyMuPDFLoader(arx_url)
    arx_pages = arx_loader.load()
    for page_num in range(len(arx_pages)):
        page = arx_pages[page_num]
        page.metadata['page_num'] = page_num
        page.metadata['doc_num'] = global_doc_number
        page.metadata['doc_source'] = "ArXiv"
        all_arxiv_pages.append(page)


    global_doc_number += 1

splits = text_splitter.split_documents(all_arxiv_pages)
for idx, text in enumerate(splits):
    splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=splits)


wiki_docs = WikipediaLoader(query="Generative Artificial Intelligence", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Information Retrieval", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Large Language Models", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2020-10-29-odqa/",
               "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
               "https://lilianweng.github.io/posts/2018-06-24-attention/",
               "https://lilianweng.github.io/posts/2023-06-23-agent/",
               "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"),

    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

web_documents = web_loader.load()

for idx, text in enumerate(web_documents):
    web_documents[idx].metadata['doc_num'] = global_doc_number
    web_documents[idx].metadata['doc_source'] = "WWW"
    global_doc_number += 1

web_splits = text_splitter.split_documents(web_documents)

for idx, text in enumerate(web_splits):
    web_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=web_splits)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

['9650f3a8bb0c477ba8a8a9adb0f612b1',
 '1a30fbbf36794625bc8094016792a693',
 '41a54e6347334a8cab5059e3af557d28',
 'd0d3af0986344a659c14e790adfaaa12',
 '5cda8a3676514e2d8091b4a50ccb7dbf',
 'e6fccf50b5cc492a90f8060276528553',
 '988ef1ea8f2d4ce9b5e154fab1f222a4',
 'c3345ee9a37442e6bd832cb67826c91d',
 'c1f8ee3d6c5c449588b82721e336a191',
 '02ed56ce212148739fa11ca1afa0fa16',
 '5625ba645a6a49a1bc659f5f0ca70c43',
 'd57a380c78e348a88fae520969a02a33',
 '56069dc824ac49759c02c0d398e812ec',
 '89f49e30ca124dea9f861e2202260e7e',
 'ecdb5a931b06456aafdd15804cefc12b',
 '7c58c75d275c4cf1aa9eef9ef52fd2fe',
 'de9acc3e13724f21b97056ed0cfe9244',
 'ecd126944d324a4e8ae5c81644d6a28f',
 '6668a075a897439b89b481cf5700fbf9',
 'cedf3580389b4de0b19f8c0d20a53753',
 '2276e748060e4927b32ec0d3124e6a3e',
 '16cecff2b3b84015afa6c3f7cf6c93f1',
 'bf536585533b4d4dbaeadf5d43cf14d6',
 '83e15d8664a64956a410a9e24e3d280c',
 'f0a7e72eb0544e58a0b3a8893352926e',
 '27760f875bf04f6c8f07bcd129ec4b1e',
 '9af72231bbe24d31a71ce63351b7fdaf',
 

In [None]:
mistral_pipe = pipeline(
    "text-generation",
    model=llm_mistral_model,
    tokenizer=llm_mistral_tokenizer,
    max_new_tokens=1000,
    temperature=0.8,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.2
)
mistral_pipe.model.config.pad_token_id = mistral_pipe.model.config.eos_token_id

mistral_llm_lc = HuggingFacePipeline(pipeline=mistral_pipe)

Device set to use cuda:0


In [None]:
rag_chain = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | rag_prompt
    | mistral_llm_lc
)

In [None]:
response_text = rag_chain.invoke('What is Chain of Thought?')

end_of_instruction_index = response_text.find("[/INST]") + len("[/INST]")


answer_text = response_text[end_of_instruction_index:].strip()

answer_text

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'Based on the context you have provided, "Chain of Thought" refers to a prompting technique used in artificial intelligence models like language models. It breaks down a complex problem into smaller parts or steps, generating several thoughts or solutions for each part. The chain is formed by connecting these individual thoughts together, hence improving overall model performance. This method was introduced in the paper "Efficiently Augmenting Few-shot Text-based Model with Prompt Engineering" by Wei et al. (2022).'

In [None]:
data_marketing = []
data_research = []

for idx, info in validation_questions_answers.items():
    question = info['question']
    answer_marketing = info['gold_answer_marketing']
    answer_research = info['gold_answer_research']

    data_marketing.append([question, answer_marketing])
    data_research.append([question, answer_research])


df_marketing = pd.DataFrame(data_marketing, columns=['Question', 'Gold Answer Marketing'])
df_research = pd.DataFrame(data_research, columns=['Question', 'Gold Answer Research'])

def add_rag_answer_column(df):
    df['RAG Answer'] = ''
    return df

df_marketing = add_rag_answer_column(df_marketing)
df_research = add_rag_answer_column(df_research)

In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,
1,How does a large language model learn from tex...,A large language model learns from text during...,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,
3,Can you name some specific large language mode...,Some specific large language models include GP...,
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",


In [None]:
# Start: 83.33 units
# End: 82.63 units
# Time: 8 mins

import numpy as np

def evaluate_rag(questions, gold_answers, df, rag_chain, model):
    embeddings = SentenceTransformer(model)
    rouge = Rouge()
    sampled_indices = np.random.choice(df.index, size=25, replace=False)

    for index in sampled_indices:
        question = questions.loc[index]
        gold_answer = gold_answers.loc[index]

        response_text = rag_chain.invoke(question)

        end_of_instruction_index = response_text.find("[/INST]") + len("[/INST]")
        rag_answer = response_text[end_of_instruction_index:].strip()

        df.loc[index, 'RAG Answer'] = rag_answer

        gold_embed = embeddings.encode([gold_answer])
        rag_embed = embeddings.encode([rag_answer])

        cos_sim = cosine_similarity(gold_embed, rag_embed)[0][0]
        df.loc[index, 'Cosine Similarity'] = cos_sim

        bleu_score = sentence_bleu([gold_answer.split()], rag_answer.split(), smoothing_function=SmoothingFunction().method1)
        rouge_scores = rouge.get_scores(rag_answer, gold_answer)[0]

        df.loc[index, 'BLEU Score'] = bleu_score
        df.loc[index, 'ROUGE Score'] = rouge_scores['rouge-2']['f']

    return df

df_marketing = evaluate_rag(df_marketing['Question'], df_marketing['Gold Answer Marketing'], df_marketing, rag_chain, 'all-mpnet-base-v2')
df_research = evaluate_rag(df_research['Question'], df_research['Gold Answer Research'], df_research, rag_chain, 'all-mpnet-base-v2')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [None]:
df_marketing[df_marketing['RAG Answer'] != ''].head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...,,,,
1,How does a large language model learn from tex...,A large language model learns from text during...,,,,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,,,,
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.",,,,
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,,,,


In [None]:
folder_path = '/content/drive/My Drive/w267/a5/Config_Results/Config_1_Test'  # You can change this path to your desired Google Drive folder

file_path_marketing = os.path.join(folder_path, 'rag_results_marketing_1_test.csv')
file_path_research = os.path.join(folder_path, 'rag_results_research_1_test.csv')

df_marketing.to_csv(file_path_marketing, index=False)
df_research.to_csv(file_path_research, index=False)

In [None]:
path_marketing = '/content/drive/My Drive/w267/a5/Config_Results/Config_1_Test/rag_results_marketing_1_test.csv'
path_research = '/content/drive/My Drive/w267/a5/Config_Results/Config_1_Test/rag_results_research_1_test.csv'

df_marketing = pd.read_csv(path_marketing)
df_research = pd.read_csv(path_research)

In [None]:
df_marketing.head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...,,,,
1,How does a large language model learn from tex...,A large language model learns from text during...,,,,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,,,,
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.",,,,
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,,,,


In [None]:
average_cosine_similarity_m = df_marketing['Cosine Similarity'].mean()

average_bleu_score_m = df_marketing['BLEU Score'].mean()

average_rouge_score_m = df_marketing['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_m)
print("Average BLEU Score:", average_bleu_score_m)
print("Average ROUGE Score:", average_rouge_score_m)

Average Cosine Similarity: 0.7614152096
Average BLEU Score: 0.06444823154148423
Average ROUGE Score: 0.12209917488282368


In [None]:
average_cosine_similarity_r = df_research['Cosine Similarity'].mean()

average_bleu_score_r = df_research['BLEU Score'].mean()

average_rouge_score_r = df_research['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_r)
print("Average BLEU Score:", average_bleu_score_r)
print("Average ROUGE Score:", average_rouge_score_r)

Average Cosine Similarity: 0.7285799339999999
Average BLEU Score: 0.06559480620266046
Average ROUGE Score: 0.11186037740717632


#### Configuration 2

Embedding: multi-qa-mpnet-base-dot-v1
Chunk Size: 128
Overlap: 25
Temperature: 0.6

In [None]:
base_embeddings = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")


loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=25)
splits = text_splitter.split_documents(documents)

vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="test",
    force_recreate=True
)
retriever = vectorstore.as_retriever()


qdrant_vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="rag_tech_db",
    force_recreate=True
)

retriever = qdrant_vectorstore.as_retriever()
CHUNK_SIZE=128
OVERLAP=25

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)

global_doc_number = 1
arxiv_numbers = ('2005.11401', '2104.07567', '2104.09864', '2105.03011', '2106.09685', '2203.02155', '2211.09260', '2211.12561',
                 '2212.09741', '2305.14314', '2305.18290', '2306.15595', '2309.08872', '2309.15217', '2310.06825', '2310.11511',
                 '2311.08377', '2312.05708', '2401.06532', '2401.17268', '2402.01306', '2402.19473', '2406.04744')
all_arxiv_pages = []

for identifier in arxiv_numbers:
    arx_url = f"https://arxiv.org/pdf/{identifier}.pdf"

    arx_loader = PyMuPDFLoader(arx_url)
    arx_pages = arx_loader.load()
    for page_num in range(len(arx_pages)):
        page = arx_pages[page_num]
        page.metadata['page_num'] = page_num
        page.metadata['doc_num'] = global_doc_number
        page.metadata['doc_source'] = "ArXiv"
        all_arxiv_pages.append(page)


    global_doc_number += 1

splits = text_splitter.split_documents(all_arxiv_pages)
for idx, text in enumerate(splits):
    splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=splits)


wiki_docs = WikipediaLoader(query="Generative Artificial Intelligence", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Information Retrieval", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Large Language Models", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2020-10-29-odqa/",
               "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
               "https://lilianweng.github.io/posts/2018-06-24-attention/",
               "https://lilianweng.github.io/posts/2023-06-23-agent/",
               "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"),

    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

web_documents = web_loader.load()

for idx, text in enumerate(web_documents):
    web_documents[idx].metadata['doc_num'] = global_doc_number
    web_documents[idx].metadata['doc_source'] = "WWW"
    global_doc_number += 1

web_splits = text_splitter.split_documents(web_documents)

for idx, text in enumerate(web_splits):
    web_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=web_splits)


['7fc7d019ab3d4806ad7a9348cbe012f9',
 '80f71de283664b9bb903849c489535a5',
 '8fa7406762e74f09a75031e5192f434f',
 'b1117449a7b34c38811c23d025f45d90',
 '43a88101404e4818bede75baf3c695d0',
 'ecd78d4a0eca4fc88ab87ca20ce87e18',
 'b76f6de407af4ce38e3b95b5b5d7ac05',
 '055ccc2946c741b5b500c4d52665f09d',
 'f0924bc4e2b04681af828a48d717e51c',
 '948c6d40beda4e3980ea4b823082f7e9',
 '460c254046174d199e7d3a2584c0b32e',
 '555e5ddad9ee439683480a258daf176f',
 'b25f2a4f1ecc4a89b3064f34fe86e744',
 '165b58e823af44059e41160f9f447531',
 '8e5dc0ae31aa4580a6401a0afccd1704',
 'dfc532cb261f49e0aad69bde9dfdf904',
 '8e9cd53239d14a69a8aa189a974f00a0',
 '3333896f80664ebf96a8c85f7f776269',
 '1996bfdabc1843dbb3fc24b1af159eb3',
 'ee51cf1e958b4d4e882839a9e3c1e1da',
 '656c86710871425092ea6077330fe18b',
 'b5105f173be8452d932ad37a67fc18af',
 '6d76925adddf461c916f1613b35e26a4',
 'a2484d6c709f4954a9ae53eeec455919',
 '3c479e7801274c7694056ce10aa60167',
 'c6a42580604c4de489680b5576118a83',
 '0b3d3c51bb9c485ba3a250ef89b50fc9',
 

In [None]:
mistral_pipe = pipeline(
    "text-generation",
    model=llm_mistral_model,
    tokenizer=llm_mistral_tokenizer,
    max_new_tokens=700,
    temperature=0.6,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.2
)
mistral_pipe.model.config.pad_token_id = mistral_pipe.model.config.eos_token_id

mistral_llm_lc = HuggingFacePipeline(pipeline=mistral_pipe)

Device set to use cuda:0


In [None]:
rag_chain = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | rag_prompt
    | mistral_llm_lc
)

In [None]:
response_text = rag_chain.invoke('What is Chain of Thought?')

end_of_instruction_index = response_text.find("[/INST]") + len("[/INST]")

answer_text = response_text[end_of_instruction_index:].strip()

answer_text

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'Based on the context provided, "Chain of Thought" refers to a method or approach used in machine learning and artificial intelligence, specifically in language models. It involves generating a sequence or chain of thoughts through a model\'s output, with each thought being revised based on knowledge retrieved from an external knowledge base. This process was discussed in two different papers as mentioned in the context.'

In [None]:
data_marketing = []
data_research = []

for idx, info in validation_questions_answers.items():
    question = info['question']
    answer_marketing = info['gold_answer_marketing']
    answer_research = info['gold_answer_research']

    data_marketing.append([question, answer_marketing])
    data_research.append([question, answer_research])

df_marketing = pd.DataFrame(data_marketing, columns=['Question', 'Gold Answer Marketing'])
df_research = pd.DataFrame(data_research, columns=['Question', 'Gold Answer Research'])

def add_rag_answer_column(df):
    df['RAG Answer'] = ''
    return df

df_marketing = add_rag_answer_column(df_marketing)
df_research = add_rag_answer_column(df_research)

In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,
1,How does a large language model learn from tex...,A large language model learns from text during...,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,
3,Can you name some specific large language mode...,Some specific large language models include GP...,
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",


In [None]:
df_marketing[df_marketing['RAG Answer'] != ''].head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer


In [None]:
df_research[df_research['RAG Answer'] != ''].head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer


In [None]:
folder_path = '/content/drive/My Drive/w267/a5/Config_Results/Config_2_Test'  # You can change this path to your desired Google Drive folder

file_path_marketing = os.path.join(folder_path, 'rag_results_marketing_2_test.csv')
file_path_research = os.path.join(folder_path, 'rag_results_research_2_test.csv')

df_marketing.to_csv(file_path_marketing, index=False)
df_research.to_csv(file_path_research, index=False)

In [None]:
path_marketing = '/content/drive/My Drive/w267/a5/Config_Results/Config_2_Test/rag_results_marketing_2_test.csv'
path_research = '/content/drive/My Drive/w267/a5/Config_Results/Config_2_Test/rag_results_research_2_test.csv'

df_marketing = pd.read_csv(path_marketing)
df_research = pd.read_csv(path_research)

In [None]:
df_marketing.head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...,
1,How does a large language model learn from tex...,A large language model learns from text during...,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.",
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,


In [None]:
average_cosine_similarity_m = df_marketing['Cosine Similarity'].mean()

average_bleu_score_m = df_marketing['BLEU Score'].mean()

average_rouge_score_m = df_marketing['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_m)
print("Average BLEU Score:", average_bleu_score_m)
print("Average ROUGE Score:", average_rouge_score_m)

KeyError: 'Cosine Similarity'

In [None]:
average_cosine_similarity_r = df_research['Cosine Similarity'].mean()

average_bleu_score_r = df_research['BLEU Score'].mean()

average_rouge_score_r = df_research['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_r)
print("Average BLEU Score:", average_bleu_score_r)
print("Average ROUGE Score:", average_rouge_score_r)

#### Configuration 3

Embedding: multi-qa-mpnet-base-dot-v1
Chunk Size: 300
Overlap: 50
Temperature: 0.6
Improved prompt

In [None]:
base_embeddings = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")


loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
splits = text_splitter.split_documents(documents)

vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="test",
    force_recreate=True
)
retriever = vectorstore.as_retriever()


qdrant_vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="rag_tech_db",
    force_recreate=True
)

retriever = qdrant_vectorstore.as_retriever()
CHUNK_SIZE=300
OVERLAP=50

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)

global_doc_number = 1
arxiv_numbers = ('2005.11401', '2104.07567', '2104.09864', '2105.03011', '2106.09685', '2203.02155', '2211.09260', '2211.12561',
                 '2212.09741', '2305.14314', '2305.18290', '2306.15595', '2309.08872', '2309.15217', '2310.06825', '2310.11511',
                 '2311.08377', '2312.05708', '2401.06532', '2401.17268', '2402.01306', '2402.19473', '2406.04744')
all_arxiv_pages = []

for identifier in arxiv_numbers:
    arx_url = f"https://arxiv.org/pdf/{identifier}.pdf"

    arx_loader = PyMuPDFLoader(arx_url)
    arx_pages = arx_loader.load()
    for page_num in range(len(arx_pages)):
        page = arx_pages[page_num]
        page.metadata['page_num'] = page_num
        page.metadata['doc_num'] = global_doc_number
        page.metadata['doc_source'] = "ArXiv"
        all_arxiv_pages.append(page)


    global_doc_number += 1

splits = text_splitter.split_documents(all_arxiv_pages)
for idx, text in enumerate(splits):
    splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=splits)


wiki_docs = WikipediaLoader(query="Generative Artificial Intelligence", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Information Retrieval", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Large Language Models", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2020-10-29-odqa/",
               "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
               "https://lilianweng.github.io/posts/2018-06-24-attention/",
               "https://lilianweng.github.io/posts/2023-06-23-agent/",
               "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"),

    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

web_documents = web_loader.load()

for idx, text in enumerate(web_documents):
    web_documents[idx].metadata['doc_num'] = global_doc_number
    web_documents[idx].metadata['doc_source'] = "WWW"
    global_doc_number += 1

web_splits = text_splitter.split_documents(web_documents)

for idx, text in enumerate(web_splits):
    web_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=web_splits)


['aded3b0ac8e744f4aa9622df692f4f9b',
 '130f0ac368d3444085faf98ae23acf34',
 'af4c7ec7d45f41c4a172e81bbdac716f',
 'a84f681e6b68464c857e3b08732c924d',
 '603ab1a17c194f4da713e1cb439aa08d',
 'b279114d89dd4522bd4183f250fcd49c',
 'cbaa422883044a52b74f7f91765b6b54',
 '9f2e8e9367824ce9bb16ba46b750368f',
 '90304ebfb27c4ff2bc9c6b8c667f61f5',
 'f7a4bdcf21374f2aaff0527e1307b33e',
 'c59d7313f80141eb975d0c1e9dfcdc51',
 'd040f2866efa4bc6b21e5326ecbe424c',
 '1b30c240b35a45e1b851e605567f3278',
 '9bf7ec5e25464dc18e1339cb783f75cf',
 '72327b3ca07a47098c6d9ba83c1c2aed',
 '6ea4d851a3e64fbabf7cb53930946a39',
 'e5271c54d3b748dca9eb52e6e0e8776d',
 'f87247c2dc7b4b4294250abdb1955f29',
 '043994668049481cb1bc60ed1413f3a6',
 '0805e25120164e80ac61397ffb4209d8',
 '13b5a00204bc4d55a16ebe1f6a3dd8bc',
 'c40b01ba8ac14fd8bd6560e6c6f6b426',
 'ff1e34d9eac849e9a2e83d956fefbc2f',
 'e3a4d09dc21048c4a7640e69b8fffa51',
 '66f3dcac418d4d7e8f56c3916660b701',
 '6fa077410ca8422d94265aeea2d62780',
 '4bf8cae038bd4111aa14ea84188f4304',
 

In [None]:
mistral_pipe = pipeline(
    "text-generation",
    model=llm_mistral_model,
    tokenizer=llm_mistral_tokenizer,
    max_new_tokens=700,
    temperature=0.6,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.2
)
mistral_pipe.model.config.pad_token_id = mistral_pipe.model.config.eos_token_id

mistral_llm_lc = HuggingFacePipeline(pipeline=mistral_pipe)

Device set to use cuda:0


In [None]:
prompt_eng_temp = """[INST]
You are a dedicated assistant versed in NLP and generative AI technologies. Your audience consists of a group of engineers at a technology company eager to develop innovative GenAI applications.
These engineers possess a strong technical foundation in NLP, so they require answers with sufficient technical depth.
Please rely solely on the context provided below for information:

{context}

That concludes the context section.

Next, please provide an answer to the following question, drawing exclusively from the provided context. Your response should not exceed 125 words and must cater to engineers who expect a nuanced and detailed understanding of NLP concepts. Ensure your answer is precise and detailed, avoiding any lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

prompt_mark_temp = """[INST]
You are an expert assistant specializing in NLP and generative AI, geared to assist a marketing team at a tech firm planning to introduce a range of GenAI solutions.
This team lacks a technical background in NLP, requiring explanations free from complex jargon.
Please use only the context provided below to answer their questions:

{context}

That concludes the context section.

Now, respond to the following question based on the provided context in no more than 75 words. Aim to deliver your answer in a straightforward manner suitable for non-technical team members, focusing on general insights about GenAI technologies and their implications. Ensure your response is clear and concise, strictly avoiding lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

In [None]:
eng_prompt = ChatPromptTemplate.from_template(prompt_eng_temp)
mark_prompt = ChatPromptTemplate.from_template(prompt_mark_temp)

rag_chain_eng = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | eng_prompt
    | mistral_llm_lc
)

rag_chain_mark = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | mark_prompt
    | mistral_llm_lc
)

In [None]:
response_text = rag_chain_eng.invoke('What are LLMs?')
print(response_text)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Human: [INST]
You are a dedicated assistant versed in NLP and generative AI technologies. Your audience consists of a group of engineers at a technology company eager to develop innovative GenAI applications.
These engineers possess a strong technical foundation in NLP, so they require answers with sufficient technical depth.
Please rely solely on the context provided below for information:

A large language model (LLM) is a type of computational model designed for natural language processing tasks such as language generation. As language models, LLMs acquire these abilities by learning statistical relationships from vast amounts of text during a self-supervised and semi-supervised

Abstract
Large language models (LLMs) have demon-
strated impressive capabilities in various nat-
ural language processing tasks. Despite this,
their application to information retrieval (IR)
tasks is still challenging due to the infrequent
occurrence of many IR-specific concepts in

the fast adoption of LL

In [None]:
import re

def extract_assistant_response(response):
    pattern = r"Assistant:\s*(.*)"

    match = re.search(pattern, response)

    if match:
        answer_text = match.group(1).strip()

        if not isinstance(answer_text, str):
            answer_text = str(answer_text)

        return answer_text

    return None

In [None]:
extract_assistant_response("Assistant: LLMs, or Large Language Models")

'LLMs, or Large Language Models'

In [None]:
data_marketing = []
data_research = []

for idx, info in validation_questions_answers.items():
    question = info['question']
    answer_marketing = info['gold_answer_marketing']
    answer_research = info['gold_answer_research']

    data_marketing.append([question, answer_marketing])
    data_research.append([question, answer_research])

df_marketing = pd.DataFrame(data_marketing, columns=['Question', 'Gold Answer Marketing'])
df_research = pd.DataFrame(data_research, columns=['Question', 'Gold Answer Research'])

def add_rag_answer_column(df):
    df['RAG Answer'] = ''
    return df

df_marketing = add_rag_answer_column(df_marketing)
df_research = add_rag_answer_column(df_research)

In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,
1,How does a large language model learn from tex...,A large language model learns from text during...,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,
3,Can you name some specific large language mode...,Some specific large language models include GP...,
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",


In [None]:
import numpy as np

def evaluate_rag(questions, gold_answers, df, rag_chain, model):
    embeddings = SentenceTransformer(model)
    rouge = Rouge()

    sampled_indices = np.random.choice(df.index, size=25, replace=False)

    for index in sampled_indices:
        question = questions.loc[index]
        gold_answer = gold_answers.loc[index]

        response_text = rag_chain.invoke(question)
        rag_answer = extract_assistant_response(response_text)

        df.loc[index, 'RAG Answer'] = rag_answer

        gold_embed = embeddings.encode([gold_answer])
        rag_embed = embeddings.encode([rag_answer])

        cos_sim = cosine_similarity(gold_embed, rag_embed)[0][0]
        df.loc[index, 'Cosine Similarity'] = cos_sim

        bleu_score = sentence_bleu([gold_answer.split()], rag_answer.split(), smoothing_function=SmoothingFunction().method1)
        rouge_scores = rouge.get_scores(rag_answer, gold_answer)[0]

        df.loc[index, 'BLEU Score'] = bleu_score
        df.loc[index, 'ROUGE Score'] = rouge_scores['rouge-2']['f']

    return df

df_research = evaluate_rag(df_research['Question'], df_research['Gold Answer Research'], df_research, rag_chain_eng, 'all-mpnet-base-v2')
df_marketing = evaluate_rag(df_marketing['Question'], df_marketing['Gold Answer Marketing'], df_marketing, rag_chain_mark, 'all-mpnet-base-v2')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [None]:
df_marketing[df_marketing['RAG Answer'] != ''].head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,"Large language models, as seen in recent studi...",0.67022,0.010352,0.05
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.","Absolutely! The authors S. Yin, et al. introdu...",0.321214,0.003679,0.0
6,How have language models evolved in terms of a...,Language models have evolved from early statis...,Language models have progressed significantly ...,0.61359,0.016664,0.06
9,What challenges do large language models face ...,Large language models sometimes learn patterns...,"Large language models, despite being powerful ...",0.688332,0.009614,0.053333
10,What factors influenced the development of gen...,Factors that influenced the development of gen...,"Based on the context you've given, there were ...",0.788081,0.070691,0.153846


In [None]:
df_research[df_research['RAG Answer'] != ''].head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,"In natural language processing, large language...",0.802206,0.108412,0.157303
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,"In the realm of large language models, two sig...",0.636296,0.017155,0.029412
10,What factors influenced the development of gen...,Several factors influenced the development of ...,"The evolution of generative language models, i...",0.778577,0.018209,0.047619
13,What are the potential trade-offs between AI s...,The potential trade-offs between AI system ali...,The text underlines two significant aspects: e...,0.63036,0.019639,0.065217
16,How do subsequent versions of Claude compare i...,Claude Instant is a faster and lighter version...,"Based on the provided context, there isn't eno...",0.595628,0.052099,0.051948


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

folder_path = '/content/drive/My Drive/w267/a5/Config_Results/Config_3_Test'

if not os.path.exists(folder_path):
    os.makedirs(folder_path)

file_path_marketing = os.path.join(folder_path, 'rag_results_marketing_3_test.csv')
file_path_research = os.path.join(folder_path, 'rag_results_research_3_test.csv')

df_marketing.to_csv(file_path_marketing, index=False)
df_research.to_csv(file_path_research, index=False)

In [None]:
path_marketing = '/content/drive/My Drive/w267/a5/Config_Results/Config_3_Test/rag_results_marketing_3_test.csv'
path_research = '/content/drive/My Drive/w267/a5/Config_Results/Config_3_Test/rag_results_research_3_test.csv'

df_marketing = pd.read_csv(path_marketing)
df_research = pd.read_csv(path_research)

In [None]:
df_marketing.head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...,,,,
1,How does a large language model learn from tex...,A large language model learns from text during...,,,,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,"Large language models, as seen in recent studi...",0.67022,0.010352,0.05
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.","Absolutely! The authors S. Yin, et al. introdu...",0.321214,0.003679,0.0
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,,,,


In [None]:
average_cosine_similarity_m = df_marketing['Cosine Similarity'].mean()

average_bleu_score_m = df_marketing['BLEU Score'].mean()

average_rouge_score_m = df_marketing['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_m)
print("Average BLEU Score:", average_bleu_score_m)
print("Average ROUGE Score:", average_rouge_score_m)

Average Cosine Similarity: 0.7298710300000001
Average BLEU Score: 0.05605144783764228
Average ROUGE Score: 0.09653293473474137


In [None]:
average_cosine_similarity_r = df_research['Cosine Similarity'].mean()

average_bleu_score_r = df_research['BLEU Score'].mean()

average_rouge_score_r = df_research['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_r)
print("Average BLEU Score:", average_bleu_score_r)
print("Average ROUGE Score:", average_rouge_score_r)

Average Cosine Similarity: 0.7635476008000001
Average BLEU Score: 0.05506490045007828
Average ROUGE Score: 0.08764498641823672


#### Configuration 4

Embedding: distilroberta
Chunk Size: 128
Overlap: 25
Temperature: 0.6
Improved prompts

In [None]:
base_embeddings = HuggingFaceEmbeddings(model_name="all-distilroberta-v1")


loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=25)
splits = text_splitter.split_documents(documents)

vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="test",
    force_recreate=True
)
retriever = vectorstore.as_retriever()


qdrant_vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="rag_tech_db",
    force_recreate=True
)

retriever = qdrant_vectorstore.as_retriever()
CHUNK_SIZE=128
OVERLAP=25

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)

global_doc_number = 1
arxiv_numbers = ('2005.11401', '2104.07567', '2104.09864', '2105.03011', '2106.09685', '2203.02155', '2211.09260', '2211.12561',
                 '2212.09741', '2305.14314', '2305.18290', '2306.15595', '2309.08872', '2309.15217', '2310.06825', '2310.11511',
                 '2311.08377', '2312.05708', '2401.06532', '2401.17268', '2402.01306', '2402.19473', '2406.04744')
all_arxiv_pages = []

for identifier in arxiv_numbers:
    arx_url = f"https://arxiv.org/pdf/{identifier}.pdf"

    arx_loader = PyMuPDFLoader(arx_url)
    arx_pages = arx_loader.load()
    for page_num in range(len(arx_pages)):
        page = arx_pages[page_num]
        page.metadata['page_num'] = page_num
        page.metadata['doc_num'] = global_doc_number
        page.metadata['doc_source'] = "ArXiv"
        all_arxiv_pages.append(page)


    global_doc_number += 1

splits = text_splitter.split_documents(all_arxiv_pages)
for idx, text in enumerate(splits):
    splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=splits)


wiki_docs = WikipediaLoader(query="Generative Artificial Intelligence", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Information Retrieval", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Large Language Models", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2020-10-29-odqa/",
               "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
               "https://lilianweng.github.io/posts/2018-06-24-attention/",
               "https://lilianweng.github.io/posts/2023-06-23-agent/",
               "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"),

    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

web_documents = web_loader.load()

for idx, text in enumerate(web_documents):
    web_documents[idx].metadata['doc_num'] = global_doc_number
    web_documents[idx].metadata['doc_source'] = "WWW"
    global_doc_number += 1

web_splits = text_splitter.split_documents(web_documents)

for idx, text in enumerate(web_splits):
    web_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=web_splits)


['a7605fb936624bc9a17ef8ce17c782e0',
 'a72fccf6fc1e4ec796a41dd9eb6e193e',
 '6925cf46a4ff4d83ad75d4fc181bba8c',
 '30e9403607e74419a86064e6440a428e',
 '4991c1dc821945e5b56f6c1f194562af',
 '9ba06f2865324ade964439488008ab7f',
 '2eb79181c53f4c83849c1a29a00a899e',
 '96146ae6186843428b1ccdcf79991f5d',
 '5bf1a93d955445e9b51b999347f19fff',
 'dbecd19dd42b45d08988423bca8c1e26',
 '259b1b1b4e2044579d81e1136b87ba71',
 '1802d87d90824377ba9d799587a7a866',
 '4287c4d0c2a34694abf7c086ddf2cabc',
 'c9f2fbcc0521475ab98676295041cc10',
 'b6a1ae0b7bff4811a9571db92efd5168',
 'a5dfdbea5f354dfb8db18e92e71a7ddc',
 '6998ca94d01347beacad73fac9d4c9dd',
 '7924ad39b2da4016912c3ee9c506dfe2',
 '427db18a0f3d48d5b6147f2e307e2b19',
 'd8fb21c33653403b9ca56f63d9826e11',
 '79d0d435e22449caa4b17bfcad1c8df2',
 '477d7de9419e4aa1b12bfb0900f9ce71',
 'ab9b68bc6a304b5a9bba673a9aa15b05',
 '1c1f7dc26f444298bb90d3dd6cdc0744',
 'a6a965873ba14fa488e1fd82cb095565',
 'd601e3fb6f354c99aed565233c106ba6',
 '033134655c4240599459c1888b3fee69',
 

In [None]:
mistral_pipe = pipeline(
    "text-generation",
    model=llm_mistral_model,
    tokenizer=llm_mistral_tokenizer,
    max_new_tokens=700,
    temperature=0.6,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.2
)
mistral_pipe.model.config.pad_token_id = mistral_pipe.model.config.eos_token_id

mistral_llm_lc = HuggingFacePipeline(pipeline=mistral_pipe)

Device set to use cuda:0


In [None]:
prompt_eng_temp = """[INST]
You are a dedicated assistant versed in NLP and generative AI technologies. Your audience consists of a group of engineers at a technology company eager to develop innovative GenAI applications.
These engineers possess a strong technical foundation in NLP, so they require answers with sufficient technical depth.
Please rely solely on the context provided below for information:

{context}

That concludes the context section.

Next, please provide an answer to the following question, drawing exclusively from the provided context. Your response should not exceed 125 words and must cater to engineers who expect a nuanced and detailed understanding of NLP concepts. Ensure your answer is precise and detailed, avoiding any lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

prompt_mark_temp = """[INST]
You are an expert assistant specializing in NLP and generative AI, geared to assist a marketing team at a tech firm planning to introduce a range of GenAI solutions.
This team lacks a technical background in NLP, requiring explanations free from complex jargon.
Please use only the context provided below to answer their questions:

{context}

That concludes the context section.

Now, respond to the following question based on the provided context in no more than 75 words. Aim to deliver your answer in a straightforward manner suitable for non-technical team members, focusing on general insights about GenAI technologies and their implications. Ensure your response is clear and concise, strictly avoiding lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

In [None]:
eng_prompt = ChatPromptTemplate.from_template(prompt_eng_temp)
mark_prompt = ChatPromptTemplate.from_template(prompt_mark_temp)

rag_chain_eng = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | eng_prompt
    | mistral_llm_lc
)

rag_chain_mark = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | mark_prompt
    | mistral_llm_lc
)

In [None]:
response_text = rag_chain_eng.invoke('What are LLMs?')
print(response_text)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Human: [INST]
You are a dedicated assistant versed in NLP and generative AI technologies. Your audience consists of a group of engineers at a technology company eager to develop innovative GenAI applications.
These engineers possess a strong technical foundation in NLP, so they require answers with sufficient technical depth.
Please rely solely on the context provided below for information:

2. Background
For a detailed survey, please see Appendix A. In brief, LLMs

LLMs themselves. Our work falls within the for-
mer category, placing a particular emphasis on

research in the realm of LLMs, particularly in their
application to IR tasks, and will encourage contin-

the LLM to be able to understand and adhere to specific requirements expressed in the instructions
8

That concludes the context section.

Next, please provide an answer to the following question, drawing exclusively from the provided context. Your response should not exceed 125 words and must cater to engineers who expect a 

In [None]:
response_text = rag_chain_mark.invoke('What are LLMs?')
print(response_text)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Human: [INST]
You are an expert assistant specializing in NLP and generative AI, geared to assist a marketing team at a tech firm planning to introduce a range of GenAI solutions.
This team lacks a technical background in NLP, requiring explanations free from complex jargon.
Please use only the context provided below to answer their questions:

2. Background
For a detailed survey, please see Appendix A. In brief, LLMs

LLMs themselves. Our work falls within the for-
mer category, placing a particular emphasis on

research in the realm of LLMs, particularly in their
application to IR tasks, and will encourage contin-

the LLM to be able to understand and adhere to specific requirements expressed in the instructions
8

That concludes the context section.

Now, respond to the following question based on the provided context in no more than 75 words. Aim to deliver your answer in a straightforward manner suitable for non-technical team members, focusing on general insights about GenAI tech

In [None]:
extract_assistant_response("Assistant: LLMs, or Large Language Models")

'LLMs, or Large Language Models'

In [None]:
data_marketing = []
data_research = []

for idx, info in validation_questions_answers.items():
    question = info['question']
    answer_marketing = info['gold_answer_marketing']
    answer_research = info['gold_answer_research']

    data_marketing.append([question, answer_marketing])
    data_research.append([question, answer_research])

df_marketing = pd.DataFrame(data_marketing, columns=['Question', 'Gold Answer Marketing'])
df_research = pd.DataFrame(data_research, columns=['Question', 'Gold Answer Research'])

def add_rag_answer_column(df):
    df['RAG Answer'] = ''
    return df

df_marketing = add_rag_answer_column(df_marketing)
df_research = add_rag_answer_column(df_research)

In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,
1,How does a large language model learn from tex...,A large language model learns from text during...,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,
3,Can you name some specific large language mode...,Some specific large language models include GP...,
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",


In [None]:
import numpy as np

def evaluate_rag(questions, gold_answers, df, rag_chain, model):
    embeddings = SentenceTransformer(model)
    rouge = Rouge()

    sampled_indices = np.random.choice(df.index, size=25, replace=False)

    for index in sampled_indices:
        question = questions.loc[index]
        gold_answer = gold_answers.loc[index]

        response_text = rag_chain.invoke(question)
        rag_answer = extract_assistant_response(response_text)

        df.loc[index, 'RAG Answer'] = rag_answer

        gold_embed = embeddings.encode([gold_answer])
        rag_embed = embeddings.encode([rag_answer])

        cos_sim = cosine_similarity(gold_embed, rag_embed)[0][0]
        df.loc[index, 'Cosine Similarity'] = cos_sim

        bleu_score = sentence_bleu([gold_answer.split()], rag_answer.split(), smoothing_function=SmoothingFunction().method1)
        rouge_scores = rouge.get_scores(rag_answer, gold_answer)[0]

        df.loc[index, 'BLEU Score'] = bleu_score
        df.loc[index, 'ROUGE Score'] = rouge_scores['rouge-2']['f']  # f-measure of ROUGE-2

    return df

df_research = evaluate_rag(df_research['Question'], df_research['Gold Answer Research'], df_research, rag_chain_eng, 'all-mpnet-base-v2')
df_marketing = evaluate_rag(df_marketing['Question'], df_marketing['Gold Answer Marketing'], df_marketing, rag_chain_mark, 'all-mpnet-base-v2')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [None]:
df_marketing[df_marketing['RAG Answer'] != ''].head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,The text suggests that some large language mod...,0.367518,0.005781,0.0
5,What are language models and what is their pur...,Language models are probabilistic models of na...,Language models are computer systems that unde...,0.881172,0.025282,0.057971
6,How have language models evolved in terms of a...,Language models have evolved from early statis...,Language models' evolution includes larger cap...,0.653416,0.013565,0.048193
10,What factors influenced the development of gen...,Factors that influenced the development of gen...,Anthropic's generative language models were li...,0.776801,0.02032,0.038095
13,What are the potential trade-offs between AI s...,The potential trade-offs between AI system ali...,In using AI systems aligned with ethical guide...,0.771681,0.012932,0.047619


In [None]:
df_research[df_research['RAG Answer'] != ''].head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
1,How does a large language model learn from tex...,A large language model learns from text during...,"In the context provided, large language models...",0.72482,0.01691,0.046784
5,What are language models and what is their pur...,Language models are probabilistic models of na...,"Language models, as mentioned in the context, ...",0.855735,0.026707,0.086331
6,How have language models evolved in terms of a...,Language models have evolved significantly in ...,"The evolution of language models, as described...",0.631812,0.007458,0.011299
8,What is the benefit of using continuous space ...,Continuous space embeddings in recurrent neura...,In recurrent neural network (RNN) language mod...,0.898656,0.017331,0.059172
10,What factors influenced the development of gen...,Several factors influenced the development of ...,The choice of generative language models by An...,0.812466,0.043868,0.044693


In [None]:
folder_path = '/content/drive/My Drive/w267/a5/Config_Results/Config_4_Test'  # You can change this path to your desired Google Drive folder

if not os.path.exists(folder_path):
    os.makedirs(folder_path)

file_path_marketing = os.path.join(folder_path, 'rag_results_marketing_4_test.csv')
file_path_research = os.path.join(folder_path, 'rag_results_research_4_test.csv')

df_marketing.to_csv(file_path_marketing, index=False)
df_research.to_csv(file_path_research, index=False)

In [None]:
path_marketing = '/content/drive/My Drive/w267/a5/Config_Results/Config_4_Test/rag_results_marketing_4_test.csv'
path_research = '/content/drive/My Drive/w267/a5/Config_Results/Config_4_Test/rag_results_research_4_test.csv'

df_marketing = pd.read_csv(path_marketing)
df_research = pd.read_csv(path_research)

In [None]:
df_marketing.head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...,,,,
1,How does a large language model learn from tex...,A large language model learns from text during...,,,,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,,,,
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.",,,,
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,The text suggests that some large language mod...,0.367518,0.005781,0.0


In [None]:
average_cosine_similarity_m = df_marketing['Cosine Similarity'].mean()

average_bleu_score_m = df_marketing['BLEU Score'].mean()

average_rouge_score_m = df_marketing['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_m)
print("Average BLEU Score:", average_bleu_score_m)
print("Average ROUGE Score:", average_rouge_score_m)

Average Cosine Similarity: 0.6998309508000001
Average BLEU Score: 0.024744194735221305
Average ROUGE Score: 0.05581841283596332


In [None]:
average_cosine_similarity_r = df_research['Cosine Similarity'].mean()

average_bleu_score_r = df_research['BLEU Score'].mean()

average_rouge_score_r = df_research['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_r)
print("Average BLEU Score:", average_bleu_score_r)
print("Average ROUGE Score:", average_rouge_score_r)

Average Cosine Similarity: 0.7730058852
Average BLEU Score: 0.04522843515707667
Average ROUGE Score: 0.07400082304268442


#### Configuration 5

Model: cohere
Embedding: multi-qa-mpnet-base-dot-v1
Chunk Size: 128
Overlap: 25
Temperature: 0.6
Improved prompts

In [None]:
base_embeddings = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")


loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=25)
splits = text_splitter.split_documents(documents)

vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="test",
    force_recreate=True
)
retriever = vectorstore.as_retriever()


qdrant_vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="rag_tech_db",
    force_recreate=True
)

retriever = qdrant_vectorstore.as_retriever()
CHUNK_SIZE=128
OVERLAP=25

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)

global_doc_number = 1
arxiv_numbers = ('2005.11401', '2104.07567', '2104.09864', '2105.03011', '2106.09685', '2203.02155', '2211.09260', '2211.12561',
                 '2212.09741', '2305.14314', '2305.18290', '2306.15595', '2309.08872', '2309.15217', '2310.06825', '2310.11511',
                 '2311.08377', '2312.05708', '2401.06532', '2401.17268', '2402.01306', '2402.19473', '2406.04744')
all_arxiv_pages = []

for identifier in arxiv_numbers:
    arx_url = f"https://arxiv.org/pdf/{identifier}.pdf"

    arx_loader = PyMuPDFLoader(arx_url)
    arx_pages = arx_loader.load()
    for page_num in range(len(arx_pages)):
        page = arx_pages[page_num]
        page.metadata['page_num'] = page_num
        page.metadata['doc_num'] = global_doc_number
        page.metadata['doc_source'] = "ArXiv"
        all_arxiv_pages.append(page)


    global_doc_number += 1

splits = text_splitter.split_documents(all_arxiv_pages)
for idx, text in enumerate(splits):
    splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=splits)


wiki_docs = WikipediaLoader(query="Generative Artificial Intelligence", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Information Retrieval", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Large Language Models", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2020-10-29-odqa/",
               "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
               "https://lilianweng.github.io/posts/2018-06-24-attention/",
               "https://lilianweng.github.io/posts/2023-06-23-agent/",
               "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"),

    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

web_documents = web_loader.load()

for idx, text in enumerate(web_documents):
    web_documents[idx].metadata['doc_num'] = global_doc_number
    web_documents[idx].metadata['doc_source'] = "WWW"
    global_doc_number += 1

web_splits = text_splitter.split_documents(web_documents)

for idx, text in enumerate(web_splits):
    web_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=web_splits)


['8d7018d714054568b5c7bff3889219a3',
 '427a692570cb44a897508ab1770f9f87',
 '222104abcae94cfeb3552815da904d3c',
 'bf9596d14c9b44d0bab5640da37af89a',
 '661e9fc6e95a44139b4d674b1773b326',
 '020f22526ed4498eb1c2569685c29c17',
 'a91e16aecc064b50af64e6aa600ebf59',
 'd43c9f2553394938946e8003ca2a8d36',
 'ad14903c843a44b4b40c2b87cafa60de',
 '161ea4d48b5a43fc9bad8a667663c3b4',
 'c0df6df7702145f7be463bb58b567037',
 '4728945d686548bdb966c54ee9d7f001',
 '050378a0b8574c21b4008253ccc99456',
 'b623b007f85442c3a4512d1c065d215a',
 '9c9e36621be54df483b3ea26009059b1',
 'f355be628dff4902a22f12197eb5f9b8',
 '2c1d9eb2340246dbbf793ea8e96a331c',
 '69c0b06fd0e848539b72963e6b190ea9',
 '78affa7f40a44c3794d701634e72bcdc',
 '521f3ec375444be08d6395dbcc6ee785',
 '034179de81654a68ad0f8b25630de846',
 '6fc1f60b569c47adb387503f33e01e1c',
 'ba97cfc020124e039ebe435db90365b1',
 'bd2bd543f0ef4857ac77e503dee52aba',
 'f644de14991147e6a9384526e74f8fb1',
 '1ec9f709adc146d1afe6dca4b478efa3',
 'a8d1c22e83d2457a919bdcb800ffd9dc',
 

In [None]:
prompt_eng_temp = """[INST]
You are a dedicated assistant versed in NLP and generative AI technologies. Your audience consists of a group of engineers at a technology company eager to develop innovative GenAI applications.
These engineers possess a strong technical foundation in NLP, so they require answers with sufficient technical depth.
Please rely solely on the context provided below for information:

{context}

That concludes the context section.

Next, please provide an answer to the following question, drawing exclusively from the provided context. Your response should not exceed 125 words and must cater to engineers who expect a nuanced and detailed understanding of NLP concepts. Ensure your answer is precise and detailed, avoiding any lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

prompt_mark_temp = """[INST]
You are an expert assistant specializing in NLP and generative AI, geared to assist a marketing team at a tech firm planning to introduce a range of GenAI solutions.
This team lacks a technical background in NLP, requiring explanations free from complex jargon.
Please use only the context provided below to answer their questions:

{context}

That concludes the context section.

Now, respond to the following question based on the provided context in no more than 75 words. Aim to deliver your answer in a straightforward manner suitable for non-technical team members, focusing on general insights about GenAI technologies and their implications. Ensure your response is clear and concise, strictly avoiding lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

In [None]:
cohere_chat_model = ChatCohere(cohere_api_key=COHERE_API_KEY, temperature=0.6)

output_parser = StrOutputParser()

eng_prompt = ChatPromptTemplate.from_template(prompt_eng_temp)
mark_prompt = ChatPromptTemplate.from_template(prompt_mark_temp)

cohere_rag_chain_eng = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | eng_prompt
    | cohere_chat_model
    | output_parser
)

cohere_rag_chain_mark = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | mark_prompt
    | cohere_chat_model
    | output_parser
)


In [None]:
resp = cohere_rag_chain_mark.invoke('What are LLMs?')
resp

'Large Language Models, or LLMs, are advanced computer systems designed to understand and process human language. These models are a type of artificial intelligence that can perform various natural language processing tasks, such as language translation, text generation, and even answering questions. What sets LLMs apart is their ability to learn and improve over time, making them incredibly versatile and powerful tools. By analyzing vast amounts of text data, LLMs can generate human-like responses and assist with numerous applications, revolutionizing how we interact with technology and opening up new possibilities for businesses and individuals alike.'

In [None]:
data_marketing = []
data_research = []

for idx, info in validation_questions_answers.items():
    question = info['question']
    answer_marketing = info['gold_answer_marketing']
    answer_research = info['gold_answer_research']

    data_marketing.append([question, answer_marketing])
    data_research.append([question, answer_research])

df_marketing = pd.DataFrame(data_marketing, columns=['Question', 'Gold Answer Marketing'])
df_research = pd.DataFrame(data_research, columns=['Question', 'Gold Answer Research'])

def add_rag_answer_column(df):
    df['RAG Answer'] = ''
    return df

df_marketing = add_rag_answer_column(df_marketing)
df_research = add_rag_answer_column(df_research)

In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,
1,How does a large language model learn from tex...,A large language model learns from text during...,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,
3,Can you name some specific large language mode...,Some specific large language models include GP...,
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",


In [None]:
import numpy as np

def evaluate_rag(questions, gold_answers, df, rag_chain, model):
    embeddings = SentenceTransformer(model)
    rouge = Rouge()

    sampled_indices = np.random.choice(df.index, size=25, replace=False)

    for index in sampled_indices:
        question = questions.loc[index]
        gold_answer = gold_answers.loc[index]

        rag_answer = rag_chain.invoke(question)
        time.sleep(6)

        df.loc[index, 'RAG Answer'] = rag_answer

        gold_embed = embeddings.encode([gold_answer])
        rag_embed = embeddings.encode([rag_answer])

        cos_sim = cosine_similarity(gold_embed, rag_embed)[0][0]
        df.loc[index, 'Cosine Similarity'] = cos_sim

        bleu_score = sentence_bleu([gold_answer.split()], rag_answer.split(), smoothing_function=SmoothingFunction().method1)
        rouge_scores = rouge.get_scores(rag_answer, gold_answer)[0]

        df.loc[index, 'BLEU Score'] = bleu_score
        df.loc[index, 'ROUGE Score'] = rouge_scores['rouge-2']['f']

    return df

df_research = evaluate_rag(df_research['Question'], df_research['Gold Answer Research'], df_research, cohere_rag_chain_eng, 'multi-qa-mpnet-base-dot-v1')
df_marketing = evaluate_rag(df_marketing['Question'], df_marketing['Gold Answer Marketing'], df_marketing, cohere_rag_chain_mark, 'multi-qa-mpnet-base-dot-v1')

In [None]:
df_marketing[df_marketing['RAG Answer'] != ''].head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
1,How does a large language model learn from tex...,A large language model learns from text during...,"Large language models, like those used in GenA...",0.800879,0.005638,0.014493
6,How have language models evolved in terms of a...,Language models have evolved from early statis...,Language models have come a long way since the...,0.715312,0.012523,0.027778
13,What are the potential trade-offs between AI s...,The potential trade-offs between AI system ali...,"In the development of AI systems, aligning wit...",0.700517,0.014302,0.041379
14,How has the token handling capacity changed be...,The token handling capacity has increased from...,The Claude model has significantly improved it...,0.79268,0.011749,0.084034
18,What benchmark did Chinchilla achieve an avera...,Chinchilla achieved an average accuracy of 67....,The context mentions that Chinchilla achieved ...,0.643733,0.093426,0.183673


In [None]:
df_research[df_research['RAG Answer'] != ''].head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
5,What are language models and what is their pur...,Language models are probabilistic models of na...,A language model is a fundamental concept in N...,0.902406,0.008231,0.086957
11,What is Constitutional AI and how does it affe...,Constitutional AI is an approach developed by ...,Constitutional AI is a training methodology de...,0.841982,0.017093,0.072289
15,In what ways has the Claude model's ability to...,The Claude model's ability to self-critique an...,The Claude model's self-reflection capabilitie...,0.857539,0.021116,0.115226
16,How do subsequent versions of Claude compare i...,Claude Instant is a faster and lighter version...,The context provided introduces two subsequent...,0.625683,0.032577,0.059322
17,Who developed the language model family known ...,The Chinchilla language model family was devel...,The Chinchilla family of language models was i...,0.742812,0.015648,0.047059


In [None]:
import os

folder_path = '/content/drive/My Drive/w267/a5/Config_Results/Config_5_Test'

if not os.path.exists(folder_path):
    os.makedirs(folder_path)

file_path_marketing = os.path.join(folder_path, 'rag_results_marketing_5_test.csv')
file_path_research = os.path.join(folder_path, 'rag_results_research_5_test.csv')

df_marketing.to_csv(file_path_marketing, index=False)
df_research.to_csv(file_path_research, index=False)

In [None]:
path_marketing = '/content/drive/My Drive/w267/a5/Config_Results/Config_5_Test/rag_results_marketing_5_test.csv'
path_research = '/content/drive/My Drive/w267/a5/Config_Results/Config_5_Test/rag_results_research_5_test.csv'

df_marketing = pd.read_csv(path_marketing)
df_research = pd.read_csv(path_research)

In [None]:
df_marketing.head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...,,,,
1,How does a large language model learn from tex...,A large language model learns from text during...,"Large language models, like those used in GenA...",0.800879,0.005638,0.014493
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,,,,
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.",,,,
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,,,,


In [None]:
average_cosine_similarity_m = df_marketing['Cosine Similarity'].mean()

average_bleu_score_m = df_marketing['BLEU Score'].mean()

average_rouge_score_m = df_marketing['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_m)
print("Average BLEU Score:", average_bleu_score_m)
print("Average ROUGE Score:", average_rouge_score_m)

Average Cosine Similarity: 0.7172273128000001
Average BLEU Score: 0.018411748647950456
Average ROUGE Score: 0.04870211509156528


In [None]:
average_cosine_similarity_r = df_research['Cosine Similarity'].mean()

average_bleu_score_r = df_research['BLEU Score'].mean()

average_rouge_score_r = df_research['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_r)
print("Average BLEU Score:", average_bleu_score_r)
print("Average ROUGE Score:", average_rouge_score_r)

Average Cosine Similarity: 0.787453394375
Average BLEU Score: 0.03739493287186066
Average ROUGE Score: 0.08515126454150808


#### Configuration 6

Model: cohere
Embedding: multi-qa-mpnet-base-dot-v1
Chunk Size: 128
Overlap: 25
Temperature: 0.6
Improved prompts

In [None]:
base_embeddings = HuggingFaceEmbeddings(model_name="all-distilroberta-v1")


loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=25)
splits = text_splitter.split_documents(documents)

vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="test",
    force_recreate=True
)
retriever = vectorstore.as_retriever()


qdrant_vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="rag_tech_db",
    force_recreate=True
)

retriever = qdrant_vectorstore.as_retriever()
CHUNK_SIZE=128
OVERLAP=25

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)

global_doc_number = 1
arxiv_numbers = ('2005.11401', '2104.07567', '2104.09864', '2105.03011', '2106.09685', '2203.02155', '2211.09260', '2211.12561',
                 '2212.09741', '2305.14314', '2305.18290', '2306.15595', '2309.08872', '2309.15217', '2310.06825', '2310.11511',
                 '2311.08377', '2312.05708', '2401.06532', '2401.17268', '2402.01306', '2402.19473', '2406.04744')
all_arxiv_pages = []

for identifier in arxiv_numbers:
    arx_url = f"https://arxiv.org/pdf/{identifier}.pdf"

    arx_loader = PyMuPDFLoader(arx_url)
    arx_pages = arx_loader.load()
    for page_num in range(len(arx_pages)):
        page = arx_pages[page_num]
        page.metadata['page_num'] = page_num
        page.metadata['doc_num'] = global_doc_number
        page.metadata['doc_source'] = "ArXiv"
        all_arxiv_pages.append(page)


    global_doc_number += 1

splits = text_splitter.split_documents(all_arxiv_pages)
for idx, text in enumerate(splits):
    splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=splits)


wiki_docs = WikipediaLoader(query="Generative Artificial Intelligence", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Information Retrieval", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Large Language Models", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2020-10-29-odqa/",
               "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
               "https://lilianweng.github.io/posts/2018-06-24-attention/",
               "https://lilianweng.github.io/posts/2023-06-23-agent/",
               "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"),

    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

web_documents = web_loader.load()

for idx, text in enumerate(web_documents):
    web_documents[idx].metadata['doc_num'] = global_doc_number
    web_documents[idx].metadata['doc_source'] = "WWW"
    global_doc_number += 1

web_splits = text_splitter.split_documents(web_documents)

for idx, text in enumerate(web_splits):
    web_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=web_splits)


['11b2816349a24b74ac75c28e11da2ede',
 'b96286800a9a4848930809a682e91c73',
 '61b1ec3b1709470faf8cda4fa52e8cc3',
 'f9a54cbc093f47dea79e64c3b44c10c9',
 '0f963ee5b8054ac8b8fcde9576ae4f39',
 '733da611c23d4f0a829c6cba0ae6436c',
 '9c2328897e8c46e1acc5f3c8da2ac671',
 '89b9cee4b41e4bc1a19410eb253d4c41',
 '28d3c1bc68ec411082c4a01969be760b',
 '4064c51a63dc42d881d1844714460a67',
 '484dd79d0f1f44d3a9c982d7a9e06b47',
 '4a461aae951c40dbaaac83c72620043a',
 'a880c3d12a3441d8a9e51850a7bf14e3',
 '5d130b1d4c2544aa9afead8231e75efd',
 'fa128c99a0af480dbfcd7893e3bc1e6a',
 '64c36955938841af9e320f614f91168c',
 '608a1daf432e47b7b9cafca3e937dfa5',
 '027c3ce4047d4dc0a1be468aa826138b',
 'bc5022f225f44750900df4d4a9bc4ac9',
 '2d2bb547992446139b6e12d6e3376647',
 'e272cfe970104c0b8b07f07c339a93ea',
 'dd0042b6b70f43e78179186bdaf5df4c',
 '88b5ca8913464469a04e3d5839d6add4',
 '0a35b368d6cc4745b3f3f8093d395a09',
 'e0b4256fb8cd4abaa4b1d4d7ce7906ca',
 'aaac133f77e54bdbb1c55553cb9d70ee',
 '5c179227fa144316986fe34b1d2e24ca',
 

In [None]:
prompt_eng_temp = """[INST]
You are a dedicated assistant versed in NLP and generative AI technologies. Your audience consists of a group of engineers at a technology company eager to develop innovative GenAI applications.
These engineers possess a strong technical foundation in NLP, so they require answers with sufficient technical depth.
Please rely solely on the context provided below for information:

{context}

That concludes the context section.

Next, please provide an answer to the following question, drawing exclusively from the provided context. Your response should not exceed 125 words and must cater to engineers who expect a nuanced and detailed understanding of NLP concepts. Ensure your answer is precise and detailed, avoiding any lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

prompt_mark_temp = """[INST]
You are an expert assistant specializing in NLP and generative AI, geared to assist a marketing team at a tech firm planning to introduce a range of GenAI solutions.
This team lacks a technical background in NLP, requiring explanations free from complex jargon.
Please use only the context provided below to answer their questions:

{context}

That concludes the context section.

Now, respond to the following question based on the provided context in no more than 75 words. Aim to deliver your answer in a straightforward manner suitable for non-technical team members, focusing on general insights about GenAI technologies and their implications. Ensure your response is clear and concise, strictly avoiding lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

In [None]:
cohere_chat_model = ChatCohere(cohere_api_key=COHERE_API_KEY, temperature=0.6)

output_parser = StrOutputParser()

eng_prompt = ChatPromptTemplate.from_template(prompt_eng_temp)
mark_prompt = ChatPromptTemplate.from_template(prompt_mark_temp)

cohere_rag_chain_eng = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | eng_prompt
    | cohere_chat_model
    | output_parser
)

cohere_rag_chain_mark = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | mark_prompt
    | cohere_chat_model
    | output_parser
)


In [None]:
resp = cohere_rag_chain_mark.invoke('What are LLMs?')
resp

'LLM stands for Large Language Model, a type of artificial intelligence technology that has revolutionized the field of natural language processing (NLP). These models are designed to understand and generate human-like language, enabling them to perform various tasks, such as answering questions, summarizing text, and even writing creative content. The key strength of LLMs lies in their ability to learn from vast amounts of data, allowing them to grasp the intricacies of language, including grammar, vocabulary, and context. This technology is particularly useful for businesses looking to automate and enhance their language-related processes, such as customer service, content creation, and information retrieval.'

In [None]:
data_marketing = []
data_research = []

for idx, info in validation_questions_answers.items():
    question = info['question']
    answer_marketing = info['gold_answer_marketing']
    answer_research = info['gold_answer_research']

    data_marketing.append([question, answer_marketing])
    data_research.append([question, answer_research])

df_marketing = pd.DataFrame(data_marketing, columns=['Question', 'Gold Answer Marketing'])
df_research = pd.DataFrame(data_research, columns=['Question', 'Gold Answer Research'])

def add_rag_answer_column(df):
    df['RAG Answer'] = ''
    return df

df_marketing = add_rag_answer_column(df_marketing)
df_research = add_rag_answer_column(df_research)

In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,
1,How does a large language model learn from tex...,A large language model learns from text during...,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,
3,Can you name some specific large language mode...,Some specific large language models include GP...,
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",


In [None]:
import numpy as np

def evaluate_rag(questions, gold_answers, df, rag_chain, model):
    embeddings = SentenceTransformer(model)
    rouge = Rouge()

    sampled_indices = np.random.choice(df.index, size=25, replace=False)

    for index in sampled_indices:
        question = questions.loc[index]
        gold_answer = gold_answers.loc[index]

        rag_answer = rag_chain.invoke(question)
        time.sleep(6)

        df.loc[index, 'RAG Answer'] = rag_answer

        gold_embed = embeddings.encode([gold_answer])
        rag_embed = embeddings.encode([rag_answer])

        cos_sim = cosine_similarity(gold_embed, rag_embed)[0][0]
        df.loc[index, 'Cosine Similarity'] = cos_sim

        bleu_score = sentence_bleu([gold_answer.split()], rag_answer.split(), smoothing_function=SmoothingFunction().method1)
        rouge_scores = rouge.get_scores(rag_answer, gold_answer)[0]

        df.loc[index, 'BLEU Score'] = bleu_score
        df.loc[index, 'ROUGE Score'] = rouge_scores['rouge-2']['f']

    return df

df_research = evaluate_rag(df_research['Question'], df_research['Gold Answer Research'], df_research, cohere_rag_chain_eng, 'all-distilroberta-v1')
df_marketing = evaluate_rag(df_marketing['Question'], df_marketing['Gold Answer Marketing'], df_marketing, cohere_rag_chain_mark, 'all-distilroberta-v1')


The reference to the research paper by Solaiman and Dennison (2021) suggests that the authors propose a process for adapting language models to societal needs, but it does not indicate their involvement in developing the Chinchilla models. Given the provided context, the specific developers or research teams behind the Chinchilla language models remain unclear.' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[index, 'RAG Answer'] = rag_answer
  df.loc[index, 'RAG Answer'] = rag_answer


In [None]:
df_research[df_research['RAG Answer'] != ''].head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,,,,
1,How does a large language model learn from tex...,A large language model learns from text during...,,,,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,,,,
3,Can you name some specific large language mode...,Some specific large language models include GP...,,,,
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",The context provided discusses the distributio...,0.694982,0.053362,0.099174


In [None]:
import os

folder_path = '/content/drive/My Drive/w267/a5/Config_Results/Config_6_Test'

if not os.path.exists(folder_path):
    os.makedirs(folder_path)

file_path_marketing = os.path.join(folder_path, 'rag_results_marketing_6_test.csv')
file_path_research = os.path.join(folder_path, 'rag_results_research_6_test.csv')

df_marketing.to_csv(file_path_marketing, index=False)
df_research.to_csv(file_path_research, index=False)

In [None]:
path_marketing = '/content/drive/My Drive/w267/a5/Config_Results/Config_6_Test/rag_results_marketing_6_test.csv'
path_research = '/content/drive/My Drive/w267/a5/Config_Results/Config_6_Test/rag_results_research_6_test.csv'

df_marketing = pd.read_csv(path_marketing)
df_research = pd.read_csv(path_research)

In [None]:
df_marketing.head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...,,,,
1,How does a large language model learn from tex...,A large language model learns from text during...,,,,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,The development of Large Language Models (LLMs...,0.661399,0.006831,0.025641
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.",,,,
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,The context provided discusses the use of lang...,0.3092,0.004493,0.015267


In [None]:
average_cosine_similarity_m = df_marketing['Cosine Similarity'].mean()

average_bleu_score_m = df_marketing['BLEU Score'].mean()

average_rouge_score_m = df_marketing['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_m)
print("Average BLEU Score:", average_bleu_score_m)
print("Average ROUGE Score:", average_rouge_score_m)

Average Cosine Similarity: 0.6325043104
Average BLEU Score: 0.015255728796396872
Average ROUGE Score: 0.04698817610672561


In [None]:
average_cosine_similarity_r = df_research['Cosine Similarity'].mean()

average_bleu_score_r = df_research['BLEU Score'].mean()

average_rouge_score_r = df_research['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_r)
print("Average BLEU Score:", average_bleu_score_r)
print("Average ROUGE Score:", average_rouge_score_r)

Average Cosine Similarity: 0.7213262683999999
Average BLEU Score: 0.04281695614894522
Average ROUGE Score: 0.09371255120486074


#### Configuration 7

Model: cohere
Embedding: multi-qa-mpnet-base-dot-v1
Chunk Size: 128
Overlap: 25
Temperature: 0.6
Improved prompts

In [None]:
base_embeddings = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")


loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=0)
splits = text_splitter.split_documents(documents)

vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="test",
    force_recreate=True
)
retriever = vectorstore.as_retriever()


qdrant_vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="rag_tech_db",
    force_recreate=True
)

retriever = qdrant_vectorstore.as_retriever()
CHUNK_SIZE=128
OVERLAP=0

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)

global_doc_number = 1
arxiv_numbers = ('2005.11401', '2104.07567', '2104.09864', '2105.03011', '2106.09685', '2203.02155', '2211.09260', '2211.12561',
                 '2212.09741', '2305.14314', '2305.18290', '2306.15595', '2309.08872', '2309.15217', '2310.06825', '2310.11511',
                 '2311.08377', '2312.05708', '2401.06532', '2401.17268', '2402.01306', '2402.19473', '2406.04744')
all_arxiv_pages = []

for identifier in arxiv_numbers:
    arx_url = f"https://arxiv.org/pdf/{identifier}.pdf"

    arx_loader = PyMuPDFLoader(arx_url)
    arx_pages = arx_loader.load()
    for page_num in range(len(arx_pages)):
        page = arx_pages[page_num]
        page.metadata['page_num'] = page_num
        page.metadata['doc_num'] = global_doc_number
        page.metadata['doc_source'] = "ArXiv"
        all_arxiv_pages.append(page)


    global_doc_number += 1

splits = text_splitter.split_documents(all_arxiv_pages)
for idx, text in enumerate(splits):
    splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=splits)


wiki_docs = WikipediaLoader(query="Generative Artificial Intelligence", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Information Retrieval", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Large Language Models", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2020-10-29-odqa/",
               "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
               "https://lilianweng.github.io/posts/2018-06-24-attention/",
               "https://lilianweng.github.io/posts/2023-06-23-agent/",
               "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"),

    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

web_documents = web_loader.load()

for idx, text in enumerate(web_documents):
    web_documents[idx].metadata['doc_num'] = global_doc_number
    web_documents[idx].metadata['doc_source'] = "WWW"
    global_doc_number += 1

web_splits = text_splitter.split_documents(web_documents)

for idx, text in enumerate(web_splits):
    web_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=web_splits)


['6f29f5f133a249a6b4590f925ef04f10',
 'e32011e496964b3187297f0036cabd95',
 '7406e6b8c7a74706914e22f63894d223',
 'a654ad5f2a74486babbda8089726cf11',
 'd553ad9be39f4c79a1005b854ab333b0',
 '55d8618481404010a9d4cf7a83051fe8',
 '1d1d5ced5eb54d239f7c8a127315c771',
 'd212a9bbfae0442d9061b3a7b25e128b',
 '174830193842441b9ccba441c13e68d0',
 '73e85f5590744bddbd8fb43c7bea316a',
 '04cf108f2d31408ebde7ea18c613231c',
 '711b121b881140cc9752852751966063',
 'dccc9c4c79fe495395d0d4d94e1ead18',
 '82728795803a40348b0bd903de207f81',
 '01949d48efbc4d4381cb81e4a956644d',
 '965307c80aef49cdbec4c7ba1964d72f',
 'e05689ef2047447cb882878c81a37c1e',
 'e4b890ec0eba4272bdcef099111178f0',
 'e9754281658a4d3c973ef821c679df69',
 '0db2fffaadd24184931970a8224050d3',
 '059393a86f3e4ce8a67d4c9543a9c3ed',
 'd0b11fc1f47543889f96f35ff54e0503',
 '66ff49a9dacb44e8b29281f678683b41',
 '18c903bb8abb48b3ac170fc175feb63f',
 '6267644327b6428d9277298767dadc3d',
 'c85b8c795e4248f19ae38840d6d41d39',
 'eadc8ecb333149ba938773920238e708',
 

In [None]:
prompt_eng_temp = """[INST]
You are a dedicated assistant versed in NLP and generative AI technologies. Your audience consists of a group of engineers at a technology company eager to develop innovative GenAI applications.
These engineers possess a strong technical foundation in NLP, so they require answers with sufficient technical depth.
Please rely solely on the context provided below for information:

{context}

That concludes the context section.

Next, please provide an answer to the following question, drawing exclusively from the provided context. Your response should not exceed 125 words and must cater to engineers who expect a nuanced and detailed understanding of NLP concepts. Ensure your answer is precise and detailed, avoiding any lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

prompt_mark_temp = """[INST]
You are an expert assistant specializing in NLP and generative AI, geared to assist a marketing team at a tech firm planning to introduce a range of GenAI solutions.
This team lacks a technical background in NLP, requiring explanations free from complex jargon.
Please use only the context provided below to answer their questions:

{context}

That concludes the context section.

Now, respond to the following question based on the provided context in no more than 75 words. Aim to deliver your answer in a straightforward manner suitable for non-technical team members, focusing on general insights about GenAI technologies and their implications. Ensure your response is clear and concise, strictly avoiding lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

In [None]:
cohere_chat_model = ChatCohere(cohere_api_key=COHERE_API_KEY, temperature=0.2)

output_parser = StrOutputParser()

eng_prompt = ChatPromptTemplate.from_template(prompt_eng_temp)
mark_prompt = ChatPromptTemplate.from_template(prompt_mark_temp)

cohere_rag_chain_eng = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | eng_prompt
    | cohere_chat_model
    | output_parser
)

cohere_rag_chain_mark = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | mark_prompt
    | cohere_chat_model
    | output_parser
)


In [None]:
resp = cohere_rag_chain_mark.invoke('What are LLMs?')
resp

"LLMs, or Large Language Models, are advanced artificial intelligence systems designed to understand and generate human language. These models have been trained on vast amounts of text data, enabling them to learn patterns and acquire knowledge from it. The context provided suggests that LLMs are powerful tools with a wide range of applications, especially in the field of natural language processing (NLP). The goal of making these models open-source is to increase accessibility, allowing more people to benefit from and contribute to the development of this technology. This is particularly relevant for the marketing team's efforts to promote and educate their audience about GenAI solutions."

In [None]:
data_marketing = []
data_research = []

for idx, info in validation_questions_answers.items():
    question = info['question']
    answer_marketing = info['gold_answer_marketing']
    answer_research = info['gold_answer_research']

    data_marketing.append([question, answer_marketing])
    data_research.append([question, answer_research])

df_marketing = pd.DataFrame(data_marketing, columns=['Question', 'Gold Answer Marketing'])
df_research = pd.DataFrame(data_research, columns=['Question', 'Gold Answer Research'])

def add_rag_answer_column(df):
    df['RAG Answer'] = ''
    return df

df_marketing = add_rag_answer_column(df_marketing)
df_research = add_rag_answer_column(df_research)

In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,
1,How does a large language model learn from tex...,A large language model learns from text during...,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,
3,Can you name some specific large language mode...,Some specific large language models include GP...,
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",


In [None]:
import numpy as np

def evaluate_rag(questions, gold_answers, df, rag_chain, model):
    embeddings = SentenceTransformer(model)
    rouge = Rouge()

    sampled_indices = np.random.choice(df.index, size=25, replace=False)

    for index in sampled_indices:
        question = questions.loc[index]
        gold_answer = gold_answers.loc[index]

        rag_answer = rag_chain.invoke(question)
        time.sleep(6)

        df.loc[index, 'RAG Answer'] = rag_answer

        gold_embed = embeddings.encode([gold_answer])
        rag_embed = embeddings.encode([rag_answer])

        cos_sim = cosine_similarity(gold_embed, rag_embed)[0][0]
        df.loc[index, 'Cosine Similarity'] = cos_sim

        bleu_score = sentence_bleu([gold_answer.split()], rag_answer.split(), smoothing_function=SmoothingFunction().method1)
        rouge_scores = rouge.get_scores(rag_answer, gold_answer)[0]

        df.loc[index, 'BLEU Score'] = bleu_score
        df.loc[index, 'ROUGE Score'] = rouge_scores['rouge-2']['f']

    return df

df_research = evaluate_rag(df_research['Question'], df_research['Gold Answer Research'], df_research, cohere_rag_chain_eng, 'all-distilroberta-v1')
df_marketing = evaluate_rag(df_marketing['Question'], df_marketing['Gold Answer Marketing'], df_marketing, cohere_rag_chain_mark, 'all-distilroberta-v1')

In [None]:
df_marketing[df_marketing['RAG Answer'] != ''].head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.",Some well-known large language models include ...,0.510007,0.00239,0.0
8,What is the benefit of using continuous space ...,Continuous space embeddings in recurrent neura...,Continuous space embeddings are a powerful too...,0.693313,0.011834,0.046875
14,How has the token handling capacity changed be...,The token handling capacity has increased from...,The Claude model has seen significant improvem...,0.805822,0.019193,0.101449
16,How do subsequent versions of Claude compare i...,"Claude Instant is a faster, less expensive, an...",The context provided offers a glimpse into the...,0.401349,0.012892,0.030534
17,Who developed the language model family known ...,The research team at DeepMind developed the la...,The Chinchilla language model family was devel...,0.670671,0.013155,0.040816


In [None]:
df_research[df_research['RAG Answer'] != ''].head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,The architecture of Large Language Models (LLM...,0.745779,0.016385,0.04918
5,What are language models and what is their pur...,Language models are probabilistic models of na...,Language models are powerful tools in the fiel...,0.800639,0.013942,0.061538
7,Can you explain how maximum entropy language m...,Maximum entropy language models use feature fu...,Maximum Entropy (MaxEnt) language models are a...,0.776427,0.097799,0.171875
12,How do advances in AI models impact their abil...,"Advances in AI models, such as multimodal mode...","Recent advancements in AI models, particularly...",0.660681,0.017115,0.098361
14,How has the token handling capacity changed be...,The token handling capacity has increased with...,The Claude model has undergone significant enh...,0.772212,0.051533,0.121827


In [None]:
import os

folder_path = '/content/drive/My Drive/w267/a5/Config_Results/Config_7_Test'  # You can change this path to your desired Google Drive folder

if not os.path.exists(folder_path):
    os.makedirs(folder_path)

file_path_marketing = os.path.join(folder_path, 'rag_results_marketing_7_test.csv')
file_path_research = os.path.join(folder_path, 'rag_results_research_7_test.csv')

df_marketing.to_csv(file_path_marketing, index=False)
df_research.to_csv(file_path_research, index=False)

In [None]:
path_marketing = '/content/drive/My Drive/w267/a5/Config_Results/Config_7_Test/rag_results_marketing_7_test.csv'
path_research = '/content/drive/My Drive/w267/a5/Config_Results/Config_7_Test/rag_results_research_7_test.csv'

df_marketing = pd.read_csv(path_marketing)
df_research = pd.read_csv(path_research)

In [None]:
df_marketing.head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...,,,,
1,How does a large language model learn from tex...,A large language model learns from text during...,,,,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,,,,
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.",Some well-known large language models include ...,0.510007,0.00239,0.0
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,,,,


In [None]:
average_cosine_similarity_m = df_marketing['Cosine Similarity'].mean()

average_bleu_score_m = df_marketing['BLEU Score'].mean()

average_rouge_score_m = df_marketing['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_m)
print("Average BLEU Score:", average_bleu_score_m)
print("Average ROUGE Score:", average_rouge_score_m)

Average Cosine Similarity: 0.6508690852
Average BLEU Score: 0.021848950084805564
Average ROUGE Score: 0.05661759882904614


In [None]:
average_cosine_similarity_r = df_research['Cosine Similarity'].mean()

average_bleu_score_r = df_research['BLEU Score'].mean()

average_rouge_score_r = df_research['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_r)
print("Average BLEU Score:", average_bleu_score_r)
print("Average ROUGE Score:", average_rouge_score_r)

Average Cosine Similarity: 0.7102951528
Average BLEU Score: 0.041500320099066845
Average ROUGE Score: 0.09636985001261347


## Top 3 Configurations

### Top Configuration for Marketing Team

Embedding: all-distilroberta-v1,
Chunk Size: 128,
Overlap: 25,
Temperature: 0.8,
Model: mistral,
Improved prompts

In [None]:
base_embeddings = HuggingFaceEmbeddings(model_name="all-distilroberta-v1")


loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=25)
splits = text_splitter.split_documents(documents)

vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="test",
    force_recreate=True
)
retriever = vectorstore.as_retriever()


qdrant_vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="rag_tech_db",
    force_recreate=True
)

retriever = qdrant_vectorstore.as_retriever()
CHUNK_SIZE=128
OVERLAP=25

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)

global_doc_number = 1
arxiv_numbers = ('2005.11401', '2104.07567', '2104.09864', '2105.03011', '2106.09685', '2203.02155', '2211.09260', '2211.12561',
                 '2212.09741', '2305.14314', '2305.18290', '2306.15595', '2309.08872', '2309.15217', '2310.06825', '2310.11511',
                 '2311.08377', '2312.05708', '2401.06532', '2401.17268', '2402.01306', '2402.19473', '2406.04744')
all_arxiv_pages = []

for identifier in arxiv_numbers:
    arx_url = f"https://arxiv.org/pdf/{identifier}.pdf"

    arx_loader = PyMuPDFLoader(arx_url)
    arx_pages = arx_loader.load()
    for page_num in range(len(arx_pages)):
        page = arx_pages[page_num]
        page.metadata['page_num'] = page_num
        page.metadata['doc_num'] = global_doc_number
        page.metadata['doc_source'] = "ArXiv"
        all_arxiv_pages.append(page)


    global_doc_number += 1

splits = text_splitter.split_documents(all_arxiv_pages)
for idx, text in enumerate(splits):
    splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=splits)


wiki_docs = WikipediaLoader(query="Generative Artificial Intelligence", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Information Retrieval", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Large Language Models", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2020-10-29-odqa/",
               "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
               "https://lilianweng.github.io/posts/2018-06-24-attention/",
               "https://lilianweng.github.io/posts/2023-06-23-agent/",
               "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"),

    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

web_documents = web_loader.load()

for idx, text in enumerate(web_documents):
    web_documents[idx].metadata['doc_num'] = global_doc_number
    web_documents[idx].metadata['doc_source'] = "WWW"
    global_doc_number += 1

web_splits = text_splitter.split_documents(web_documents)

for idx, text in enumerate(web_splits):
    web_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=web_splits)


  base_embeddings = HuggingFaceEmbeddings(model_name="all-distilroberta-v1")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

['e409f03ed82b4bf2b2dba2316a847300',
 '9e5d6db63c984705bafa67a218768190',
 'adc941e208dd40dea35d786d725b1b63',
 '59872d2162e64fe9aab1f99b376277d3',
 '57036c6aca1e4f35a82347af233e3d7f',
 'cf1d0fd8c5a840198969d9f2e712c681',
 'b1c431bb6f094e998b9f958c9ec3fe83',
 'ae27125c69824fcfb81956f0182d7027',
 '8a78643c493d4f4092b046abd58bf39b',
 '562495a0983d4abe9d8fcbf7a63d630f',
 '553e74eb845a4d3cadbbb74e6db06f24',
 '9063a9948ee94fec803634a98d2fafcb',
 '911d35a48b3643eeab548e9b3320da84',
 'b71aaed52b124e3eb70eddb3830e3649',
 'cddbb2d9a81448ce815c8cf38b76e964',
 'b77ab52249d84aff9ffc9a0075d75291',
 'a38b4d4bc0304e4aba2f53c4f700b492',
 '6d3df45ab8df439b8bc7028cc38cabd9',
 'b854709d8f774b43849be5a01e3f896c',
 '88eda963deda4d53b9c73d375291328b',
 'd840642a45a24db6a08ffff3b992ba7c',
 '15b982a8b02f48fe971ab4d7b24142d8',
 'fbabd780214d4a2da29c3808ba01ead4',
 '736faeca5138425dada7c4fbf37939e2',
 'c7756bcc836846f1a57829e89b9eb2b0',
 '4ec855bde3714be68a9a3f7f288f2922',
 'c5701dab219b4357a65bb0a235908c78',
 

In [None]:
from huggingface_hub import login

login(token=userdata.get('HUGGING_FACE_TOKEN'))

quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         )


llm_mistral_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    torch_dtype=torch.float32,
    device_map='auto',
    quantization_config=quantization_config
)

llm_mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
mistral_pipe = pipeline(
    "text-generation",
    model=llm_mistral_model,
    tokenizer=llm_mistral_tokenizer,
    max_new_tokens=1000,
    temperature=0.8,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.2
)
mistral_pipe.model.config.pad_token_id = mistral_pipe.model.config.eos_token_id

mistral_llm_lc = HuggingFacePipeline(pipeline=mistral_pipe)

Device set to use cuda:0
  mistral_llm_lc = HuggingFacePipeline(pipeline=mistral_pipe)


In [None]:
prompt_mark_temp = """[INST]
You are an expert assistant specializing in NLP and generative AI, geared to assist a marketing team at a tech firm planning to introduce a range of GenAI solutions.
This team lacks a technical background in NLP, requiring explanations free from complex jargon.
Please use only the context provided below to answer their questions:

{context}

That concludes the context section.

Now, respond to the following question based on the provided context in no more than 75 words. Aim to deliver your answer in a straightforward manner suitable for non-technical team members, focusing on general insights about GenAI technologies and their implications. Ensure your response is clear and concise, strictly avoiding lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

In [None]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [None]:
mark_prompt = ChatPromptTemplate.from_template(prompt_mark_temp)

rag_chain_mark = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | mark_prompt
    | mistral_llm_lc
)

In [None]:
import re

def extract_assistant_response(response):
    pattern = r"Assistant:\s*(.*)"

    match = re.search(pattern, response)

    if match:
        answer_text = match.group(1).strip()

        if not isinstance(answer_text, str):
            answer_text = str(answer_text)

        return answer_text

    return None

In [None]:
response_text = rag_chain_mark.invoke('What is Chain of Thought?')
extract_assistant_response(response_text)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'Chain of Thought refers to a method used by some AI models, as described in Wei et al.\'s research in 2022. Instead of processing information linearly, these models generate several possible ideas or "thoughts" for each part of a task or problem. This approach allows them to tackle intricate tasks more effectively. In simpler terms, it\'s like having a brainstorm session with different ideas before choosing the best one.'

In [None]:
data_marketing = []

for idx, info in validation_questions_answers.items():
    question = info['question']
    answer_marketing = info['gold_answer_marketing']

    data_marketing.append([question, answer_marketing])

# Create DataFrames
df_marketing = pd.DataFrame(data_marketing, columns=['Question', 'Gold Answer Marketing'])

def add_rag_answer_column(df):
    df['RAG Answer'] = ''
    return df

df_marketing = add_rag_answer_column(df_marketing)


In [None]:
df_marketing.head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...,
1,How does a large language model learn from tex...,A large language model learns from text during...,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.",
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,


In [None]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge

def evaluate_rag(questions, gold_answers, df, rag_chain, model):
    embeddings = SentenceTransformer(model)
    rouge = Rouge()

    for index, (question, gold_answer) in enumerate(zip(questions, gold_answers)):
        response_text = rag_chain.invoke(question)


        rag_answer = extract_assistant_response(response_text)

        df.loc[index, 'RAG Answer'] = rag_answer

        gold_embed = embeddings.encode([gold_answer])
        rag_embed = embeddings.encode([rag_answer])

        cos_sim = cosine_similarity(gold_embed, rag_embed)[0][0]
        df.loc[index, 'Cosine Similarity'] = cos_sim

        bleu_score = sentence_bleu([gold_answer.split()], rag_answer.split(), smoothing_function=SmoothingFunction().method1)
        rouge_scores = rouge.get_scores(rag_answer, gold_answer)[0]

        df.loc[index, 'BLEU Score'] = bleu_score
        df.loc[index, 'ROUGE Score'] = rouge_scores['rouge-2']['f']

    return df

df_marketing = evaluate_rag(df_marketing['Question'], df_marketing['Gold Answer Marketing'], df_marketing, rag_chain_mark, 'all-mpnet-base-v2')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [None]:
df_marketing.head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...,Large Language Models (LLMs) are advanced tool...,0.786122,0.006091,0.0
1,How does a large language model learn from tex...,A large language model learns from text during...,A large language model learns by analyzing vas...,0.786722,0.06948,0.083333
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,The creation of large language models involves...,0.655316,0.046626,0.066667
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.",IBM's Deep Blue project introduced large langu...,0.382179,0.003745,0.0
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,"The text mentions two approaches: firstly, dep...",0.355925,0.007781,0.027397


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

folder_path = '/content/drive/My Drive/w267/a5/Config_Results/Marketing_Best_Config'  # You can change this path to your desired Google Drive folder

if not os.path.exists(folder_path):
    os.makedirs(folder_path)

file_path_marketing = os.path.join(folder_path, 'rag_results_marketing_best_test.csv')

df_marketing.to_csv(file_path_marketing, index=False)

In [None]:
path_marketing = '/content/drive/My Drive/w267/a5/Config_Results/Marketing_Best_Config/rag_results_marketing_best_test.csv'

df_marketing = pd.read_csv(path_marketing)

In [None]:
df_marketing.head()

Unnamed: 0,Question,Gold Answer Marketing,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models serve the purpose of imp...,Large Language Models (LLMs) are advanced tool...,0.786122,0.006091,0.0
1,How does a large language model learn from tex...,A large language model learns from text during...,A large language model learns by analyzing vas...,0.786722,0.06948,0.083333
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,The creation of large language models involves...,0.655316,0.046626,0.066667
3,Can you name some specific large language mode...,"Chinchilla by DeepMind, GPT-3 by OpenAI.",IBM's Deep Blue project introduced large langu...,0.382179,0.003745,0.0
4,What licensing models have been adopted for th...,Answer: Some organizations choose open-sourcin...,"The text mentions two approaches: firstly, dep...",0.355925,0.007781,0.027397


In [None]:
average_cosine_similarity_m = df_marketing['Cosine Similarity'].mean()

average_bleu_score_m = df_marketing['BLEU Score'].mean()

average_rouge_score_m = df_marketing['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_m)
print("Average BLEU Score:", average_bleu_score_m)
print("Average ROUGE Score:", average_rouge_score_m)

Average Cosine Similarity: 0.7302247442666665
Average BLEU Score: 0.03701302521438866
Average ROUGE Score: 0.06732216990156031


### Top Configuration for Research Team

Model: cohere
Embedding: multi-qa-mpnet-base-dot-v1
Chunk Size: 128
Overlap: 25
Temperature: 0.6
Improved prompts

In [None]:
base_embeddings = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")


loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=25)
splits = text_splitter.split_documents(documents)

vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="test",
    force_recreate=True
)
retriever = vectorstore.as_retriever()


qdrant_vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="rag_tech_db",
    force_recreate=True
)

retriever = qdrant_vectorstore.as_retriever()
CHUNK_SIZE=128
OVERLAP=25

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)

global_doc_number = 1
arxiv_numbers = ('2005.11401', '2104.07567', '2104.09864', '2105.03011', '2106.09685', '2203.02155', '2211.09260', '2211.12561',
                 '2212.09741', '2305.14314', '2305.18290', '2306.15595', '2309.08872', '2309.15217', '2310.06825', '2310.11511',
                 '2311.08377', '2312.05708', '2401.06532', '2401.17268', '2402.01306', '2402.19473', '2406.04744')
all_arxiv_pages = []

for identifier in arxiv_numbers:
    arx_url = f"https://arxiv.org/pdf/{identifier}.pdf"

    arx_loader = PyMuPDFLoader(arx_url)
    arx_pages = arx_loader.load()
    for page_num in range(len(arx_pages)):
        page = arx_pages[page_num]
        page.metadata['page_num'] = page_num
        page.metadata['doc_num'] = global_doc_number
        page.metadata['doc_source'] = "ArXiv"
        all_arxiv_pages.append(page)


    global_doc_number += 1

splits = text_splitter.split_documents(all_arxiv_pages)
for idx, text in enumerate(splits):
    splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=splits)


wiki_docs = WikipediaLoader(query="Generative Artificial Intelligence", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Information Retrieval", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Large Language Models", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2020-10-29-odqa/",
               "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
               "https://lilianweng.github.io/posts/2018-06-24-attention/",
               "https://lilianweng.github.io/posts/2023-06-23-agent/",
               "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"),

    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

web_documents = web_loader.load()

for idx, text in enumerate(web_documents):
    web_documents[idx].metadata['doc_num'] = global_doc_number
    web_documents[idx].metadata['doc_source'] = "WWW"
    global_doc_number += 1

web_splits = text_splitter.split_documents(web_documents)

for idx, text in enumerate(web_splits):
    web_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=web_splits)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

['80b2374206fb4f7bb06c0bd4911c66be',
 'ca9c7169356e491abf3501e4b8b349fd',
 '6b357ab0296a4b5abb6c3c20b4baa94a',
 'd72bb10407284594890a2b72aba25488',
 '80e2c5817a4a440e9836220be504348c',
 'db8ef233dad243aea05736b6d55c0674',
 'd7b7058399c2485aaf6174b673666556',
 '28e27084f7104d2b8f255590361a9008',
 'c7ec75b2e6264c1fbc0bb5db8b579811',
 '5b35db3bbf53459bba22077433eff2c7',
 '296805fa41184184b5258a55f7a89303',
 'd5cd3a9e55784453a87125519b6f2c09',
 'ef9cabfbb1d44252b271738d762119fd',
 'da80068042a241339c9f18dfa3cf0d4a',
 'b0d6b79a38ed4937ab13a16e48ec169c',
 'e89f9198a696460f9546b49b47e9cb4b',
 'eeb6d32d01fd465982dc74a2b3ab93fc',
 '1c4832adfcf34c3d99a868f86d335b98',
 '22d61fcfff1248bfb89d17c339ed6efc',
 'ec91ade52e5c4388bec223a58633e4b7',
 '36ca246225d943cfb38e809e2a44dd4d',
 'ce965e062460409883bcd1b88f55c320',
 '97985c9415474ab28f07311505d1572e',
 '5abae73a3c5c43dfaf1479aaffc7977e',
 '7997e2e635904818b02f137e9d8f0938',
 'ed3ba4cb5fb14969a058d70e137c3f85',
 '3166a0ab512a4e5eb710152f53bb9a84',
 

In [None]:
prompt_eng_temp = """[INST]
You are a dedicated assistant versed in NLP and generative AI technologies. Your audience consists of a group of engineers at a technology company eager to develop innovative GenAI applications.
These engineers possess a strong technical foundation in NLP, so they require answers with sufficient technical depth.
Please rely solely on the context provided below for information:

{context}

That concludes the context section.

Next, please provide an answer to the following question, drawing exclusively from the provided context. Your response should not exceed 125 words and must cater to engineers who expect a nuanced and detailed understanding of NLP concepts. Ensure your answer is precise and detailed, avoiding any lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""


In [None]:
cohere_chat_model = ChatCohere(cohere_api_key=COHERE_API_KEY, temperature=0.6)

output_parser = StrOutputParser()

eng_prompt = ChatPromptTemplate.from_template(prompt_eng_temp)

cohere_rag_chain_eng = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | eng_prompt
    | cohere_chat_model
    | output_parser
)

In [None]:
resp = cohere_rag_chain_eng.invoke('What are LLMs?')
resp

'A Large Language Model (LLM) is a sophisticated computational model that has revolutionized Natural Language Processing (NLP) tasks. LLMs are designed to understand and generate human-like language, making them powerful tools for various applications. These models have a unique ability to process and analyze vast amounts of textual data, learning complex patterns and structures inherent in language.\n\nLLMs fall under the broader category of deep learning models, often consisting of transformer-based architectures. They are trained on extensive datasets, containing diverse linguistic contexts, which enables them to capture intricate semantic relationships and nuances in language. This training process equips LLMs with the capability to perform a wide range of tasks, including language translation, text summarization, sentiment analysis, and even creative writing, all while maintaining a high degree of contextual understanding.\n\nThe provided context suggests that LLMs are not just to

In [None]:
data_research = []

for idx, info in validation_questions_answers.items():
    question = info['question']
    answer_research = info['gold_answer_research']

    data_research.append([question, answer_research])


df_research = pd.DataFrame(data_research, columns=['Question', 'Gold Answer Research'])

def add_rag_answer_column(df):
    df['RAG Answer'] = ''
    return df

df_research = add_rag_answer_column(df_research)

In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,
1,How does a large language model learn from tex...,A large language model learns from text during...,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,
3,Can you name some specific large language mode...,Some specific large language models include GP...,
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",


In [None]:
def evaluate_rag(questions, gold_answers, df, rag_chain, model):
    embeddings = SentenceTransformer(model)
    rouge = Rouge()

    for index, (question, gold_answer) in enumerate(zip(questions, gold_answers)):

        rag_answer = rag_chain.invoke(question)
        time.sleep(6)

        df.loc[index, 'RAG Answer'] = rag_answer

        gold_embed = embeddings.encode([gold_answer])
        rag_embed = embeddings.encode([rag_answer])

        cos_sim = cosine_similarity(gold_embed, rag_embed)[0][0]
        df.loc[index, 'Cosine Similarity'] = cos_sim

        bleu_score = sentence_bleu([gold_answer.split()], rag_answer.split(), smoothing_function=SmoothingFunction().method1)
        rouge_scores = rouge.get_scores(rag_answer, gold_answer)[0]

        df.loc[index, 'BLEU Score'] = bleu_score
        df.loc[index, 'ROUGE Score'] = rouge_scores['rouge-2']['f']

    return df

df_research = evaluate_rag(df_research['Question'], df_research['Gold Answer Research'], df_research, cohere_rag_chain_eng, 'multi-qa-mpnet-base-dot-v1')


In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,Large language models (LLMs) have revolutioniz...,0.919003,0.029979,0.052632
1,How does a large language model learn from tex...,A large language model learns from text during...,"Large language models, with their extensive pr...",0.731893,0.031179,0.094017
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,The architecture of Large Language Models (LLM...,0.772751,0.019715,0.070312
3,Can you name some specific large language mode...,Some specific large language models include GP...,The landscape of large language models (LLMs) ...,0.730328,0.014157,0.055249
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",The context provided discusses the distributio...,0.759128,0.061657,0.10084


In [None]:
folder_path = '/content/drive/My Drive/w267/a5/Config_Results/Research_Best_Config'

if not os.path.exists(folder_path):
    os.makedirs(folder_path)

file_path_research = os.path.join(folder_path, 'rag_results_research_best_test.csv')

df_marketing.to_csv(file_path_marketing, index=False)
df_research.to_csv(file_path_research, index=False)

In [None]:
path_research = '/content/drive/My Drive/w267/a5/Config_Results/Research_Best_Config/rag_results_research_best_test.csv'

df_research = pd.read_csv(path_research)

In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,Large language models (LLMs) have revolutioniz...,0.919003,0.029979,0.052632
1,How does a large language model learn from tex...,A large language model learns from text during...,"Large language models, with their extensive pr...",0.731893,0.031179,0.094017
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,The architecture of Large Language Models (LLM...,0.772751,0.019715,0.070312
3,Can you name some specific large language mode...,Some specific large language models include GP...,The landscape of large language models (LLMs) ...,0.730328,0.014157,0.055249
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",The context provided discusses the distributio...,0.759128,0.061657,0.10084


In [None]:
average_cosine_similarity_r = df_research['Cosine Similarity'].mean()

average_bleu_score_r = df_research['BLEU Score'].mean()

average_rouge_score_r = df_research['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_r)
print("Average BLEU Score:", average_bleu_score_r)
print("Average ROUGE Score:", average_rouge_score_r)

Average Cosine Similarity: 0.8015972873333334
Average BLEU Score: 0.03813580619750066
Average ROUGE Score: 0.08967720401914901


### Second Best Configuration for Research Team

Model: Mistral
Embedding: distilroberta
Chunk Size: 128
Overlap: 25
Temperature: 0.6
Improved prompts

In [None]:
base_embeddings = HuggingFaceEmbeddings(model_name="all-distilroberta-v1")


loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=25)
splits = text_splitter.split_documents(documents)

vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="test",
    force_recreate=True
)
retriever = vectorstore.as_retriever()


qdrant_vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="rag_tech_db",
    force_recreate=True
)

retriever = qdrant_vectorstore.as_retriever()
CHUNK_SIZE=128
OVERLAP=25

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)

global_doc_number = 1
arxiv_numbers = ('2005.11401', '2104.07567', '2104.09864', '2105.03011', '2106.09685', '2203.02155', '2211.09260', '2211.12561',
                 '2212.09741', '2305.14314', '2305.18290', '2306.15595', '2309.08872', '2309.15217', '2310.06825', '2310.11511',
                 '2311.08377', '2312.05708', '2401.06532', '2401.17268', '2402.01306', '2402.19473', '2406.04744')
all_arxiv_pages = []

for identifier in arxiv_numbers:
    arx_url = f"https://arxiv.org/pdf/{identifier}.pdf"

    arx_loader = PyMuPDFLoader(arx_url)
    arx_pages = arx_loader.load()
    for page_num in range(len(arx_pages)):
        page = arx_pages[page_num]
        page.metadata['page_num'] = page_num
        page.metadata['doc_num'] = global_doc_number
        page.metadata['doc_source'] = "ArXiv"
        all_arxiv_pages.append(page)


    global_doc_number += 1

splits = text_splitter.split_documents(all_arxiv_pages)
for idx, text in enumerate(splits):
    splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=splits)


wiki_docs = WikipediaLoader(query="Generative Artificial Intelligence", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Information Retrieval", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Large Language Models", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2020-10-29-odqa/",
               "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
               "https://lilianweng.github.io/posts/2018-06-24-attention/",
               "https://lilianweng.github.io/posts/2023-06-23-agent/",
               "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"),

    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

web_documents = web_loader.load()

for idx, text in enumerate(web_documents):
    web_documents[idx].metadata['doc_num'] = global_doc_number
    web_documents[idx].metadata['doc_source'] = "WWW"
    global_doc_number += 1

web_splits = text_splitter.split_documents(web_documents)

for idx, text in enumerate(web_splits):
    web_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=web_splits)


['bc105c12fab241e6aa4b5c10d0735f27',
 'dcdb730c328a475bb2f691779c3ccc30',
 '8bbb696b76d844398d7a0a5b58ea8bbe',
 '621f8ace80874738bc7d1101471e95e3',
 '4719fc62626a4210aef555cb94cae224',
 '102193e99ce54515b446d93ea25657da',
 '3d73dcbb76fc48fd83b174d193e31a92',
 'd116e98b37ae42ae8a06a6776878ad07',
 '349a070533794e71ae9bbf53451aa06f',
 'b0b85c001f284a42b51ab72c9e35e242',
 '57334f781e774c6195c461a62874820a',
 '947604b0f4ff453581fa6b7cb53d5e8c',
 'ec2c5462726041f3b8ca2204ac331c2c',
 '0c53629989df4bbbb761eae936ef4daa',
 'e212f16ca89a40d19c65ed9c534f3f2c',
 '14062cac3aa04917bdc75dc9bc6b0832',
 '2cc76013a44a4dc89b3e165ab5fde116',
 '103f8a32dcae4f4fa74684ac08cba923',
 'fe671364733d4bb5b48653881b2bebd1',
 '964ac2fa35e84c8d90c6925c324936b1',
 'de60a882039b4c808c817a02a9d116e8',
 '2d4e92eed1054973886df8cc9f5eb030',
 '328f938c6caa4d56bac5043869075a70',
 'ee3b63570e1343d3aa656612a3634443',
 '58db6933e80c4cf7af3564f15f35f393',
 '11e8c45fe45240919fabb56066b55b8c',
 '5b0587e29e5b45228b69120de3aec2ab',
 

In [None]:
mistral_pipe = pipeline(
    "text-generation",
    model=llm_mistral_model,
    tokenizer=llm_mistral_tokenizer,
    max_new_tokens=700,
    temperature=0.6,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.2
)
mistral_pipe.model.config.pad_token_id = mistral_pipe.model.config.eos_token_id

mistral_llm_lc = HuggingFacePipeline(pipeline=mistral_pipe)

Device set to use cuda:0


In [None]:
prompt_eng_temp = """[INST]
You are a dedicated assistant versed in NLP and generative AI technologies. Your audience consists of a group of engineers at a technology company eager to develop innovative GenAI applications.
These engineers possess a strong technical foundation in NLP, so they require answers with sufficient technical depth.
Please rely solely on the context provided below for information:

{context}

That concludes the context section.

Next, please provide an answer to the following question, drawing exclusively from the provided context. Your response should not exceed 125 words and must cater to engineers who expect a nuanced and detailed understanding of NLP concepts. Ensure your answer is precise and detailed, avoiding any lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

In [None]:
eng_prompt = ChatPromptTemplate.from_template(prompt_eng_temp)

rag_chain_eng = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | eng_prompt
    | mistral_llm_lc
)


In [None]:
response_text = rag_chain_eng.invoke('What are LLMs?')
print(response_text)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Human: [INST]
You are a dedicated assistant versed in NLP and generative AI technologies. Your audience consists of a group of engineers at a technology company eager to develop innovative GenAI applications.
These engineers possess a strong technical foundation in NLP, so they require answers with sufficient technical depth.
Please rely solely on the context provided below for information:

2. Background
For a detailed survey, please see Appendix A. In brief, LLMs

LLMs themselves. Our work falls within the for-
mer category, placing a particular emphasis on

research in the realm of LLMs, particularly in their
application to IR tasks, and will encourage contin-

the LLM to be able to understand and adhere to specific requirements expressed in the instructions
8

That concludes the context section.

Next, please provide an answer to the following question, drawing exclusively from the provided context. Your response should not exceed 125 words and must cater to engineers who expect a 

In [None]:
extract_assistant_response("Assistant: LLMs, or Large Language Models")

'LLMs, or Large Language Models'

In [None]:
data_research = []

for idx, info in validation_questions_answers.items():
    question = info['question']
    answer_research = info['gold_answer_research']
    data_research.append([question, answer_research])

df_research = pd.DataFrame(data_research, columns=['Question', 'Gold Answer Research'])

def add_rag_answer_column(df):
    df['RAG Answer'] = ''
    return df

df_research = add_rag_answer_column(df_research)

In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,
1,How does a large language model learn from tex...,A large language model learns from text during...,
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,
3,Can you name some specific large language mode...,Some specific large language models include GP...,
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",


In [None]:
import numpy as np

def evaluate_rag(questions, gold_answers, df, rag_chain, model):
    embeddings = SentenceTransformer(model)
    rouge = Rouge()

    for index, (question, gold_answer) in enumerate(zip(questions, gold_answers)):
        response_text = rag_chain.invoke(question)

        rag_answer = extract_assistant_response(response_text)

        df.loc[index, 'RAG Answer'] = rag_answer

        gold_embed = embeddings.encode([gold_answer])
        rag_embed = embeddings.encode([rag_answer])

        cos_sim = cosine_similarity(gold_embed, rag_embed)[0][0]
        df.loc[index, 'Cosine Similarity'] = cos_sim

        bleu_score = sentence_bleu([gold_answer.split()], rag_answer.split(), smoothing_function=SmoothingFunction().method1)
        rouge_scores = rouge.get_scores(rag_answer, gold_answer)[0]

        df.loc[index, 'BLEU Score'] = bleu_score
        df.loc[index, 'ROUGE Score'] = rouge_scores['rouge-2']['f']

    return df

df_research = evaluate_rag(df_research['Question'], df_research['Gold Answer Research'], df_research, rag_chain_eng, 'all-mpnet-base-v2')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,In the realm of Natural Language Processing (N...,0.74364,0.004066,0.022857
1,How does a large language model learn from tex...,A large language model learns from text during...,"In the context provided, we see that large lan...",0.704636,0.010697,0.04878
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,"In the abstract provided, no specific mention ...",0.664728,0.088252,0.107527
3,Can you name some specific large language mode...,Some specific large language models include GP...,"In the context provided, no explicit mention w...",0.716404,0.043547,0.09375
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",The text suggests that there's ongoing researc...,0.791488,0.015821,0.080537


In [None]:
folder_path = '/content/drive/My Drive/w267/a5/Config_Results/Research_Second_Best_Config'

if not os.path.exists(folder_path):
    os.makedirs(folder_path)

file_path_research = os.path.join(folder_path, 'rag_results_research_second_best_test.csv')

df_research.to_csv(file_path_research, index=False)

In [None]:
path_research = '/content/drive/My Drive/w267/a5/Config_Results/Research_Second_Best_Config/rag_results_research_second_best_test.csv'

df_research = pd.read_csv(path_research)

In [None]:
df_research.head()

Unnamed: 0,Question,Gold Answer Research,RAG Answer,Cosine Similarity,BLEU Score,ROUGE Score
0,What purpose do large language models serve in...,Large language models (LLMs) serve the purpose...,In the realm of Natural Language Processing (N...,0.74364,0.004066,0.022857
1,How does a large language model learn from tex...,A large language model learns from text during...,"In the context provided, we see that large lan...",0.704636,0.010697,0.04878
2,What are some key architectures behind the dev...,Key architectures behind the development of la...,"In the abstract provided, no specific mention ...",0.664728,0.088252,0.107527
3,Can you name some specific large language mode...,Some specific large language models include GP...,"In the context provided, no explicit mention w...",0.716404,0.043547,0.09375
4,What licensing models have been adopted for th...,"Based on the provided context, it seems that l...",The text suggests that there's ongoing researc...,0.791488,0.015821,0.080537


In [None]:
average_cosine_similarity_r = df_research['Cosine Similarity'].mean()

average_bleu_score_r = df_research['BLEU Score'].mean()

average_rouge_score_r = df_research['ROUGE Score'].mean()

print("Average Cosine Similarity:", average_cosine_similarity_r)
print("Average BLEU Score:", average_bleu_score_r)
print("Average ROUGE Score:", average_rouge_score_r)

Average Cosine Similarity: 0.7643433446666666
Average BLEU Score: 0.03490047324552584
Average ROUGE Score: 0.06876010691840893


## 5. Results

### 5.1 Model Specifications

Document the detailed specs of your choices. Also comment on how you valued the needs of the marketing tean vs the needs of the researchers, in case you had to make a trade-off.


### Setup Chosen Configuration (Best Performing Configuration for Research)

Model: cohere
Embedding: multi-qa-mpnet-base-dot-v1
Chunk Size: 128
Overlap: 25
Temperature: 0.6
Improved prompts

In [None]:
base_embeddings = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")


loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=25)
splits = text_splitter.split_documents(documents)

vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="test",
    force_recreate=True
)
retriever = vectorstore.as_retriever()


qdrant_vectorstore = Qdrant.from_documents(splits,
    base_embeddings,
    location=":memory:",
    collection_name="rag_tech_db",
    force_recreate=True
)

retriever = qdrant_vectorstore.as_retriever()
CHUNK_SIZE=128
OVERLAP=25

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)

global_doc_number = 1
arxiv_numbers = ('2005.11401', '2104.07567', '2104.09864', '2105.03011', '2106.09685', '2203.02155', '2211.09260', '2211.12561',
                 '2212.09741', '2305.14314', '2305.18290', '2306.15595', '2309.08872', '2309.15217', '2310.06825', '2310.11511',
                 '2311.08377', '2312.05708', '2401.06532', '2401.17268', '2402.01306', '2402.19473', '2406.04744')
all_arxiv_pages = []

for identifier in arxiv_numbers:
    arx_url = f"https://arxiv.org/pdf/{identifier}.pdf"

    arx_loader = PyMuPDFLoader(arx_url)
    arx_pages = arx_loader.load()
    for page_num in range(len(arx_pages)):
        page = arx_pages[page_num]
        page.metadata['page_num'] = page_num
        page.metadata['doc_num'] = global_doc_number
        page.metadata['doc_source'] = "ArXiv"
        all_arxiv_pages.append(page)


    global_doc_number += 1

splits = text_splitter.split_documents(all_arxiv_pages)
for idx, text in enumerate(splits):
    splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=splits)


wiki_docs = WikipediaLoader(query="Generative Artificial Intelligence", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Information Retrieval", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


wiki_docs = WikipediaLoader(query="Large Language Models", load_max_docs=4).load()
for idx, text in enumerate(wiki_docs):
    wiki_docs[idx].metadata['doc_num'] = global_doc_number
    wiki_docs[idx].metadata['doc_source'] = "Wikipedia"

    global_doc_number += 1

wiki_splits = text_splitter.split_documents(wiki_docs)
for idx, text in enumerate(wiki_splits):
    wiki_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=wiki_splits)


web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2020-10-29-odqa/",
               "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
               "https://lilianweng.github.io/posts/2018-06-24-attention/",
               "https://lilianweng.github.io/posts/2023-06-23-agent/",
               "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"),

    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

web_documents = web_loader.load()

for idx, text in enumerate(web_documents):
    web_documents[idx].metadata['doc_num'] = global_doc_number
    web_documents[idx].metadata['doc_source'] = "WWW"
    global_doc_number += 1

web_splits = text_splitter.split_documents(web_documents)

for idx, text in enumerate(web_splits):
    web_splits[idx].metadata['split_id'] = idx

qdrant_vectorstore.add_documents(documents=web_splits)


['a1927a1d2b1149ad8f45e467de388c06',
 'd4c0833cb9854289942be73adb94fa50',
 '75ebdcc035564007a8fe4248262f64d5',
 'df548f2b7d2342079d7ec01504154055',
 'bb42fc9b6a1a4c3aad05e7004e57f113',
 '299f5b2fbf754879bca5082c3ea382c5',
 '3192470a412b4a59b8c1863ca48b50b1',
 '7733a5cf22d84532ad1a0002a8c3308b',
 'ec8776675e2f4badb373b76225662000',
 'f5cbfa6d51034bacb151c844ab57f520',
 '54d0c007d6c74d38ab765d6927c25241',
 'cdf03a1ed98748059fa2cc1cac351d5b',
 '0ae7ee475d414d9787d67cc96f97245c',
 '2f5106d91be843b9a903ff686a35ac9b',
 '99b1b98b6d504b38a028bef5cfdd94ee',
 '1084b2a719bf4f71985fdf5a9f757ba3',
 '4e5af7b928434ecd831349f372ce540c',
 '6d3db304951841cfbf6249d996bfca3d',
 '46a36a992bc74fafa5fa17ff7397e35d',
 '1c82d957832d4ff494b8c0fa1b5053fc',
 '9661d0c10bbd4e8d88d0ec06a6bde015',
 'f10a28bdf0ea4dbb8e3bbd432ea0b3a0',
 '20d10f9f53964918aa869d1ae8a0316b',
 '9e8949245e7240daa1ed5c72388e896f',
 'f3e9731120614a4681776a2b85726ce4',
 'f80c5ec4823d43bfb6aa67d2f4a8dfb4',
 '1523516f58714c49b5919997bb887446',
 

In [None]:
prompt_eng_temp = """[INST]
You are a dedicated assistant versed in NLP and generative AI technologies. Your audience consists of a group of engineers at a technology company eager to develop innovative GenAI applications.
These engineers possess a strong technical foundation in NLP, so they require answers with sufficient technical depth.
Please rely solely on the context provided below for information:

{context}

That concludes the context section.

Next, please provide an answer to the following question, drawing exclusively from the provided context. Your response should not exceed 125 words and must cater to engineers who expect a nuanced and detailed understanding of NLP concepts. Ensure your answer is precise and detailed, avoiding any lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

prompt_mark_temp = """[INST]
You are an expert assistant specializing in NLP and generative AI, geared to assist a marketing team at a tech firm planning to introduce a range of GenAI solutions.
This team lacks a technical background in NLP, requiring explanations free from complex jargon.
Please use only the context provided below to answer their questions:

{context}

That concludes the context section.

Now, respond to the following question based on the provided context in no more than 75 words. Aim to deliver your answer in a straightforward manner suitable for non-technical team members, focusing on general insights about GenAI technologies and their implications. Ensure your response is clear and concise, strictly avoiding lists or bullet points.

Here is the question:

{question}
[/INST]

Assistant:"""

In [None]:
cohere_chat_model = ChatCohere(cohere_api_key=COHERE_API_KEY, temperature=0.6)

output_parser = StrOutputParser()

eng_prompt = ChatPromptTemplate.from_template(prompt_eng_temp)
mark_prompt = ChatPromptTemplate.from_template(prompt_mark_temp)

cohere_rag_chain_eng = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | eng_prompt
    | cohere_chat_model
    | output_parser
)

cohere_rag_chain_mark = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
    | mark_prompt
    | cohere_chat_model
    | output_parser
)


In [None]:
def evaluate_responses(engineer_resp, marketing_resp, gold_answer_research, gold_answer_marketing, model_name):
    embeddings = SentenceTransformer(model_name)
    rouge = Rouge()

    gold_embed_research = embeddings.encode([gold_answer_research])
    resp_embed_research = embeddings.encode([engineer_resp])
    gold_embed_marketing = embeddings.encode([gold_answer_marketing])
    resp_embed_marketing = embeddings.encode([marketing_resp])

    cos_sim_research = cosine_similarity(gold_embed_research, resp_embed_research)[0][0]
    cos_sim_marketing = cosine_similarity(gold_embed_marketing, resp_embed_marketing)[0][0]

    bleu_score_research = sentence_bleu([gold_answer_research.split()], engineer_resp.split(), smoothing_function=SmoothingFunction().method1)
    rouge_scores_research = rouge.get_scores(engineer_resp, gold_answer_research)[0]

    bleu_score_marketing = sentence_bleu([gold_answer_marketing.split()], marketing_resp.split(), smoothing_function=SmoothingFunction().method1)
    rouge_scores_marketing = rouge.get_scores(marketing_resp, gold_answer_marketing)[0]

    results = {
        'Research': {
            'Cosine Similarity': cos_sim_research,
            'BLEU Score': bleu_score_research,
            'ROUGE Score': rouge_scores_research['rouge-2']['f']
        },
        'Marketing': {
            'Cosine Similarity': cos_sim_marketing,
            'BLEU Score': bleu_score_marketing,
            'ROUGE Score': rouge_scores_marketing['rouge-2']['f']
        }
    }

    return results

In [None]:
engineer_resp = cohere_rag_chain_eng.invoke(validation_questions_answers[0]['question'])
marketing_resp = cohere_rag_chain_mark.invoke(validation_questions_answers[0]['question'])

In [None]:
print(engineer_resp)

Large language models (LLMs) have become indispensable tools in Natural Language Processing (NLP) due to their ability to handle a vast array of language-related tasks. These models, often boasting billions of parameters, are trained on extensive text corpora, enabling them to learn the intricacies of language, including grammar, semantics, and context. This comprehensive training allows LLMs to generate human-like text, complete with accurate predictions and a nuanced understanding of language patterns.

In the context of NLP, LLMs are particularly valuable for tasks such as speech recognition, where they excel at predicting high-probability word sequences, thus enhancing accuracy and reducing errors. Moreover, these models can be applied to a broad spectrum of language-centric tasks, from text generation and translation to sentiment analysis and language understanding. The versatility of LLMs in tackling diverse NLP challenges makes them a cornerstone in the development of advanced A

In [None]:
print(marketing_resp)

Large language models are powerful tools in the world of Natural Language Processing (NLP) and artificial intelligence. These models have been developed to understand and process human language, making them incredibly valuable for various tasks. They can assist with speech recognition, ensuring accuracy by predicting high-probability words and phrases. The models' capabilities extend to a wide range of language-related tasks, such as text generation, translation, and summarization, all of which are essential for many applications, especially in the tech industry. Understanding and utilizing these models can significantly enhance the team's marketing strategies for GenAI solutions.


In [None]:
search_results = qdrant_vectorstore.similarity_search(validation_questions_answers[0]['question'])

In [None]:
for doc in search_results:
    context = doc.metadata['summary'] if 'summary' in doc.metadata else doc.page_content
    print("Context Provided:")
    print(context)
    print("\n")

    source = doc.metadata['source']
    doc_num = doc.metadata['doc_num']
    doc_source = doc.metadata['doc_source']
    page_num = doc.metadata.get('page_num', 'N/A')
    print("Document Source:")
    print(f"URL: {source}, Document Number: {doc_num}, Document Source: {doc_source}, Page Number: {page_num}")
    print("--------------------------------------------------------------------------\n")

Context Provided:
A language model is a probabilistic model of a natural language. In 1980, the first significant statistical language model was proposed, and during the decade IBM performed ‘Shannon-style’ experiments, in which potential sources for language modeling improvement were identified by observing and analyzing the performance of human subjects in predicting or correcting text.
Language models are useful for a variety of tasks, including speech recognition (helping prevent predictions of low-probability (e.g. nonsense) sequences), machine translation, natural language generation (generating more human-like text), optical character recognition, route optimization, handwriting recognition, grammar induction, and information retrieval.
Large language models, currently their most advanced form, are a combination of larger datasets (frequently using words scraped from the public internet), feedforward neural networks, and transformers. They have superseded recurrent neural networ

In [None]:
question_index = 0
validation_data = validation_questions_answers[question_index]

evaluation_results = evaluate_responses(
    engineer_resp,
    marketing_resp,
    validation_data['gold_answer_research'],
    validation_data['gold_answer_marketing'],
    'multi-qa-mpnet-base-dot-v1'
)

print(evaluation_results)

{'Research': {'Cosine Similarity': 0.92672336, 'BLEU Score': 0.043424186661179996, 'ROUGE Score': 0.10389609931672965}, 'Marketing': {'Cosine Similarity': 0.8481807, 'BLEU Score': 0.021049522867395996, 'ROUGE Score': 0.07999999619072018}}


In [None]:
engineer_resp = cohere_rag_chain_eng.invoke(validation_questions_answers[50]['question'])
marketing_resp = cohere_rag_chain_mark.invoke(validation_questions_answers[50]['question'])

In [None]:
print(engineer_resp)

The creation of training data for embedding models that incorporate task-specific instructions is a nuanced process. One approach, as outlined in the provided context, is the INSTRUCTOR method, which involves fine-tuning embedding models using both the input text and its corresponding task instructions. This technique ensures that the generated embeddings are not only contextually relevant but also task-aware.

The training data for such models can be derived from diverse sources, emphasizing the importance of instruction fine-tuning. By including task instructions alongside the text data, the model learns to capture the nuances of different tasks and domains. This is particularly beneficial when dealing with varied datasets, as it allows the model to adapt to the specific requirements of each task. 

The key innovation here is the model's ability to encode both the input text and the task instructions, resulting in embeddings that are highly attuned to the given task and domain, thus 

In [None]:
print(marketing_resp)

In the context of GenAI, creating training data for embedding models often involves a technique called 'instruction finetuning'. This method ensures that the model learns not just from the text itself but also from specific task instructions associated with that text. By doing so, the model becomes more adept at understanding the context and purpose of the text, which is crucial for generating relevant and accurate embeddings. This process is particularly important when dealing with diverse datasets, as it helps the model adapt to various tasks and domains, making it more versatile and effective for real-world applications.


In [None]:
search_results = qdrant_vectorstore.similarity_search(validation_questions_answers[50]['question'])

In [None]:
for doc in search_results:
    context = doc.metadata['summary'] if 'summary' in doc.metadata else doc.page_content
    print("Context Provided:")
    print(context)
    print("\n")

    source = doc.metadata['source']
    doc_num = doc.metadata['doc_num']
    doc_source = doc.metadata['doc_source']
    page_num = doc.metadata.get('page_num', 'N/A')
    print("Document Source:")
    print(f"URL: {source}, Document Number: {doc_num}, Document Source: {doc_source}, Page Number: {page_num}")
    print("--------------------------------------------------------------------------\n")

Context Provided:
model that generates task- and domain-aware em-
beddings given a text input and its task instructions.


Document Source:
URL: https://arxiv.org/pdf/2212.09741.pdf, Document Number: 9, Document Source: ArXiv, Page Number: 1
--------------------------------------------------------------------------

Context Provided:
finetuned embedding models. Given an input text
x and a task instruction Ix, INSTRUCTOR encodes


Document Source:
URL: https://arxiv.org/pdf/2212.09741.pdf, Document Number: 9, Document Source: ArXiv, Page Number: 2
--------------------------------------------------------------------------

Context Provided:
tance of instruction finetuning when diverse data
are used for embedding training. Note that train-


Document Source:
URL: https://arxiv.org/pdf/2212.09741.pdf, Document Number: 9, Document Source: ArXiv, Page Number: 5
--------------------------------------------------------------------------

Context Provided:
only text inputs but also task instruc

In [None]:
question_index = 50
validation_data = validation_questions_answers[question_index]

evaluation_results = evaluate_responses(
    engineer_resp,
    marketing_resp,
    validation_data['gold_answer_research'],
    validation_data['gold_answer_marketing'],
    'multi-qa-mpnet-base-dot-v1'
)

print(evaluation_results)

{'Research': {'Cosine Similarity': 0.7510888, 'BLEU Score': 0.056243171100448235, 'ROUGE Score': 0.07594936232014131}, 'Marketing': {'Cosine Similarity': 0.7339634, 'BLEU Score': 0.03375843711950882, 'ROUGE Score': 0.04285713839693924}}


In [None]:
test_questions[83]['question']

"How does a model's ability to answer questions relate to its exposure to specific types of questions during training?"

In [None]:
engineer_resp = cohere_rag_chain_eng.invoke(test_questions[83]['question'])
marketing_resp = cohere_rag_chain_mark.invoke(test_questions[83]['question'])

In [None]:
print(engineer_resp)

The model's performance on question-answering tasks is intricately linked to the diversity and quality of questions it encounters during training. When a model is exposed to a wide range of questions, including those with answers present in the training data and novel questions with unseen answers, it learns to generalize and adapt its responses. This is crucial for developing robust question-answering systems.

For questions with answers in the training set, the model can easily memorize and recall the correct response, demonstrating its ability to learn from direct examples. However, the true test of a model's intelligence lies in its capacity to handle novel questions. By encountering questions that require reasoning and extrapolation, the model learns to identify relevant patterns and relationships in the data, enabling it to provide reasonable answers even when the exact question-answer pair is not present in its training experience.

Furthermore, the concept of refusals is an adv

In [None]:
print(marketing_resp)

The model's performance in answering questions is closely tied to the quality and diversity of its training data, especially the types of questions it encounters during this phase. If a model is trained on a dataset with questions and their corresponding answers, it can learn to memorize and replicate these answers accurately when it sees the same questions again. However, the real challenge and power of GenAI come into play when the model encounters novel questions, requiring it to generalize and apply its knowledge to new situations. This is where the model's ability to refuse to answer becomes crucial, ensuring it doesn't provide incorrect or misleading information.


In [None]:
search_results = qdrant_vectorstore.similarity_search(test_questions[83]['question'])

In [None]:
for doc in search_results:
    context = doc.metadata['summary'] if 'summary' in doc.metadata else doc.page_content
    print("Context Provided:")
    print(context)
    print("\n")

    source = doc.metadata['source']
    doc_num = doc.metadata['doc_num']
    doc_source = doc.metadata['doc_source']
    page_num = doc.metadata.get('page_num', 'N/A')
    print("Document Source:")
    print(f"URL: {source}, Document Number: {doc_num}, Document Source: {doc_source}, Page Number: {page_num}")
    print("--------------------------------------------------------------------------\n")

Context Provided:
A model is able to correctly memorize and respond with the answer to a question that has been seen at training time.


Document Source:
URL: https://lilianweng.github.io/posts/2020-10-29-odqa/, Document Number: 36, Document Source: WWW, Page Number: N/A
--------------------------------------------------------------------------

Context Provided:
A model is able to answer novel questions which have answers not contained in the training dataset.


Document Source:
URL: https://lilianweng.github.io/posts/2020-10-29-odqa/, Document Number: 36, Document Source: WWW, Page Number: N/A
--------------------------------------------------------------------------

Context Provided:
A model is able to answer novel questions at test time and choose an answer from the set of answers it has seen during


Document Source:
URL: https://lilianweng.github.io/posts/2020-10-29-odqa/, Document Number: 36, Document Source: WWW, Page Number: N/A
-----------------------------------------------