In [9]:
import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Wikipedia pages to ingest; only these three tournaments are loaded.
links = ['https://en.wikipedia.org/wiki/2002_FIFA_World_Cup',
         'https://en.wikipedia.org/wiki/2006_FIFA_World_Cup',
         'https://en.wikipedia.org/wiki/2010_FIFA_World_Cup']

# NOTE(review): the remaining tournaments are disabled by leaving them inside a
# bare string literal. It has no effect on `links`, but as the last expression
# of the cell it is echoed as the cell output.
''',
         'https://en.wikipedia.org/wiki/2014_FIFA_World_Cup',
         'https://en.wikipedia.org/wiki/2018_FIFA_World_Cup',
         'https://en.wikipedia.org/wiki/2022_FIFA_World_Cup',
         'https://en.wikipedia.org/wiki/2026_FIFA_World_Cup']'''

",\n         'https://en.wikipedia.org/wiki/2014_FIFA_World_Cup',\n         'https://en.wikipedia.org/wiki/2018_FIFA_World_Cup',\n         'https://en.wikipedia.org/wiki/2022_FIFA_World_Cup',\n         'https://en.wikipedia.org/wiki/2026_FIFA_World_Cup']"

In [10]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by newlines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with one document's content after another; an empty
        string when ``docs`` is empty.
    """
    contents = [document.page_content for document in docs]
    return "\n".join(contents)

In [11]:
# Chunks per tournament year. Years whose page is not listed in `links`
# keep their empty list.
d = {'2002':[],
     '2006':[],
     '2010':[],
     '2014':[],
     '2018':[],
     '2022':[],
     '2026':[]}

# The splitter settings do not change between links, so build it once
# instead of on every iteration.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap = 100)

for link in links:
    loader = WebBaseLoader(link)
    docs = loader.load()
    splits = text_splitter.split_documents(docs)
    # The last path segment looks like "<year>_FIFA_World_Cup"; take the part
    # before the first underscore rather than relying on fixed slice offsets.
    year = link.rsplit('/', 1)[-1].split('_', 1)[0]
    d[year] = splits


# 2026 is not among the loaded links, so this displays an empty list.
d['2026']

[]

In [12]:
# Display every chunk stored for the 2002 page (the full list is echoed).
d['2002']

[Document(page_content='2002 FIFA World Cup - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload file\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\n Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n\n\n1Host selection\n\n\n\n\n\n\n\n2Qualification\n\n\n\nToggle Qualification subsection\n\n\n\n\n\n2.1List of qualified teams\n\n\n\n\n\n\n\n\n\n3Venues\n\n\n\n\n\n\n

In [13]:
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

In [14]:
import ollama
#Define the Ollama LLM function
def ollama_llm(question, context):
    """Ask the ``mistral`` model a question together with supporting context.

    Args:
        question: The user's question, interpolated into the prompt.
        context: Text placed after the question under a ``Context:`` label.

    Returns:
        The ``content`` field of the ``message`` in the chat response.
    """
    user_message = {
        'role': 'user',
        'content': f"Question: {question}\n\nContext: {context}",
    }
    reply = ollama.chat(model='mistral', messages=[user_message])
    return reply['message']['content']


In [15]:
# Build the persisted Chroma index: load -> split -> embed -> store.

# All links are handed to a single loader in one call.
loader = WebBaseLoader(links)
docs = loader.load()
# Same chunk size as the per-year loop, but with a 200-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap = 200)
splits = text_splitter.split_documents(docs)
embeddings = OllamaEmbeddings(model="mistral")
# NOTE(review): presumably drops any previous store reference before the
# rebuild below - confirm whether this reset is actually needed.
vectorStore = None
# NOTE(review): persist_directory is an absolute, machine-specific path; the
# same literal is used again when the store is reopened from disk.
vectorStore = Chroma.from_documents(documents = splits, embedding = embeddings, persist_directory="/home/shivam/Desktop/Git-repo/RAG-chatbot")
#retriever = vectorStore.as_retriever()

# Needs improvement

1. docs need to be formatted : remove unnecessary characters and spaces.
2. webbase loader does not work well when we pass links all together: pass links one by one.
3. Improve RAG chain:
    3.1. Try increasing the chunk size.
    3.2. Try creating chunks wrt sections.
    3.3. Increase relevance: how??
    3.4. Re-ranking
    3.5. Fact extraction
    3.6. prompt compression
    

# Creating my own embedding model
Idea:
1. Take the retrieved docs for any given prompt
2. Pass it through your embedding functions
3. Now perform a similarity test between your prompt and the retrieved docs
4. Define a criterion to select only the helpful documents, then proceed with generating the response.

In [16]:
# Function for RAG with semantic search
#Add tests


'''
1. Calling our RAG chain. (rag_chain_test_semantic): This function takes in the prompt and retriever as arguments
2. The RAG chain retrieves the top documents from the VectorDB. (Chroma)
3. We perform semantic search on the retrieved documents, keeping only those whose score lies in [0.6*max_similarity_score, max_similarity_score] (similarity scores are calculated between each retrieved document and the prompt to find the relevant data).
4. Next, we format the retrieved context and pass it to the function ollama_llm.
5. The ollama_llm function takes the prompt alongside the context generated in step 4 and generates a response from the given LLM model. (In our case mistral)

'''


from sentence_transformers import SentenceTransformer, util
# Module-level sentence-transformers checkpoint (multi-qa-MiniLM-L6-cos-v1).
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
# Ollama embedding function (mistral); a later cell passes it to Chroma as
# `embedding_function` when reopening the persisted store.
embeddings = OllamaEmbeddings(model="mistral")


def list_of_retrieved_docs(retrieved_docs):
    """Return the text of each retrieved document with newlines stripped.

    Args:
        retrieved_docs: Sequence of objects exposing a ``page_content`` string.

    Returns:
        A list of strings in the same order as ``retrieved_docs``, where every
        ``"\\n"`` has been removed from the content. Empty input gives ``[]``.
    """
    # A plain comprehension replaces the former index-keyed dict, whose
    # "not in" check could never be false for unique loop indices.
    return [doc.page_content.replace("\n", "") for doc in retrieved_docs]

def valid_docs(similarity_scores, max_similarity, threshold_ratio=0.6):
    """Return the indices of scores that reach a fraction of the best score.

    Args:
        similarity_scores: Sequence of numeric similarity scores.
        max_similarity: The reference (best) score the cutoff is derived from.
        threshold_ratio: Fraction of ``max_similarity`` a score must reach to
            be kept. Defaults to 0.6, the previously hard-coded value.

    Returns:
        List of positions ``i`` such that
        ``similarity_scores[i] >= threshold_ratio * max_similarity``,
        in ascending order. Empty input gives ``[]``.
    """
    cutoff = threshold_ratio * max_similarity
    return [idx for idx, score in enumerate(similarity_scores) if score >= cutoff]

# Loaded SentenceTransformer instances keyed by checkpoint name, so the model
# is read from disk once instead of on every query.
_SENTENCE_MODEL_CACHE = {}


def _load_sentence_model(model_name="multi-qa-MiniLM-L6-cos-v1"):
    """Return the SentenceTransformer for ``model_name``, loading it at most once."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def semantic_search_on_retrieved_docs(prompt,list_retrieved_docs, retrieved_docs):
    """Keep only the retrieved documents that score close to the best match.

    The prompt and every document text are encoded with the sentence model,
    scored with a dot product, and filtered through ``valid_docs`` relative to
    the highest score.

    Args:
        prompt: The user question to compare against.
        list_retrieved_docs: Plain-text version of each retrieved document,
            index-aligned with ``retrieved_docs``.
        retrieved_docs: The original document objects returned by the retriever.

    Returns:
        The subset of ``retrieved_docs`` whose indices ``valid_docs`` selects,
        in their original order. Returns ``[]`` when nothing was retrieved.
    """
    # Nothing to rank: avoid calling max() on an empty score list.
    if not list_retrieved_docs:
        return []

    encoder = _load_sentence_model()
    query_embedding = encoder.encode(prompt)
    retrieved_docs_embedding = encoder.encode(list_retrieved_docs)
    # dot_score returns one row per query; there is a single query here.
    similarity_scores = util.dot_score(query_embedding, retrieved_docs_embedding)[0].tolist()
    max_similarity = max(similarity_scores)
    valid_doc_index = valid_docs(similarity_scores, max_similarity)
    return [retrieved_docs[idx] for idx in valid_doc_index]

def rag_chain_test_semantic(question,retriever):
    """Answer ``question`` using retrieved context filtered by semantic search.

    Args:
        question: The user's question.
        retriever: Object whose ``invoke(question)`` returns candidate documents.

    Returns:
        Whatever ``ollama_llm`` returns for the question and the formatted
        context built from the best-scoring documents.
    """
    candidates = retriever.invoke(question)
    candidate_texts = list_of_retrieved_docs(candidates)
    # Narrow the retriever's candidates down to the most relevant ones.
    relevant_docs = semantic_search_on_retrieved_docs(question, candidate_texts, candidates)
    return ollama_llm(question, format_docs(relevant_docs))

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
# Question to run through the RAG chain.
prompt = "Who was top scorer in FIFA world cup 2010?"
# Reset the store handle; the next cell reopens the persisted index from disk.
vectorStore = None

In [34]:
# Reopen the Chroma store from its persist directory instead of rebuilding it.
# NOTE(review): `embeddings` comes from an earlier cell, and the path is the
# same absolute, machine-specific directory used when the store was created.
vectorStore = Chroma(persist_directory = "/home/shivam/Desktop/Git-repo/RAG-chatbot", embedding_function = embeddings)
# Retriever with default settings; passed to rag_chain_test_semantic below.
retriever = vectorStore.as_retriever()

In [35]:
# Run the full chain: retrieve, filter by similarity, then ask the LLM.
rag_chain_test_semantic(prompt, retriever)

' The top scorers in the FIFA World Cup 2010 were Thomas Müller of Germany, David Villa of Spain, Wesley Sneijder of the Netherlands, and Diego Forlán of Uruguay, each scoring five goals during the tournament. However, the Golden Boot went to Thomas Müller due to his three assists, while the Silver Boot was awarded to David Villa who played a total of 635 minutes, and the Bronze Boot went to Wesley Sneijder with a total of 652 minutes played. Therefore, David Villa was not the top scorer in the tournament but he finished second behind Thomas Müller.'

In [20]:
# add one link at a time: https://www.reddit.com/r/learnmachinelearning/comments/1b2wmfk/how_to_add_millions_of_documents_to_chromadb/