In [1]:
import os
from langchain_cohere.chat_models import ChatCohere
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_cohere.embeddings import CohereEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
from pymongo.mongo_client import MongoClient
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
chat_model = ChatCohere(cohere_api_key = os.getenv("COHERE_API_KEY"), model = "command-r-plus", temperature = 0.5)

In [3]:
response = chat_model.invoke(input = "Hello World!")

In [4]:
prompt_template = PromptTemplate.from_template(template = "Tell me a joke about {subject}")

In [5]:
response = chat_model.invoke(prompt_template.format(subject = "dog"))

In [6]:
str_parser = StrOutputParser()

In [7]:
str_parser.invoke(response)

'Why do dogs make terrible dance partners? \n\nBecause they have two left feet!'

In [8]:
def get_joke(subject : str) -> str:
    prompt_template = PromptTemplate.from_template(template = "Tell me a joke about {subject}")
    str_parser = StrOutputParser()
    response = str_parser.invoke(chat_model.invoke(prompt_template.format(subject = subject)))
    return response

In [9]:
get_joke("dog")

'Why do dogs make terrible dance partners? \n\nBecause they have two left feet!'

In [10]:
get_joke_chain = prompt_template | chat_model | str_parser

In [11]:
get_joke_chain.invoke(input = {'subject' : 'dog'})

'Why do dogs make terrible dance partners? \n\nBecause they have two left feet!'

In [12]:
document_sample = Document(page_content = "Page Content of Document 1", metadata = {'id' : 1, 'author' : 'Aritta'})

In [13]:
document_sample

Document(page_content='Page Content of Document 1', metadata={'id': 1, 'author': 'Aritta'})

In [14]:
embeddings_model = CohereEmbeddings(cohere_api_key = os.getenv("COHERE_API_KEY"), model = "embed-english-v3.0")

In [15]:
len(embeddings_model.embed_query(text = "Hello, how are you?"))

1024

In [16]:
mongo_client = MongoClient(host = os.getenv("ATLAS_CONNECTION_STRING"))

In [17]:
products_database = mongo_client["products"]
knowledge_collection = products_database["knowledge"]

In [18]:
vectorstore = MongoDBAtlasVectorSearch(collection = knowledge_collection, embedding = embeddings_model, index_name = "knowledge_index")

In [19]:
# list(knowledge_collection.find())
for i in knowledge_collection.find():
    print(i)

In [20]:
knowledge_collection.insert_one({'hello':'world'})

InsertOneResult(ObjectId('6679206327469b706797e31d'), acknowledged=True)

In [21]:
for i in knowledge_collection.find():
    print(i)

{'_id': ObjectId('6679206327469b706797e31d'), 'hello': 'world'}


In [22]:
import pandas as pd

df = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
df

Unnamed: 0_level_0,passage
id,Unnamed: 1_level_1
0,"Uruguay (official full name in ; pron. , Eas..."
1,"It is bordered by Brazil to the north, by Arge..."
2,Montevideo was founded by the Spanish in the e...
3,The economy is largely based in agriculture (m...
4,"According to Transparency International, Urugu..."
...,...
3196,"*In 2007, a duck in Tallahassee, Florida survi..."
3197,*A rare genetic mutation sees some ducks born ...
3198,*The Moche people of ancient Peru worshipped n...
3199,*Angel Wing - A disease common in ducks.


In [24]:
dataset = df[:10]
dataset

Unnamed: 0_level_0,passage
id,Unnamed: 1_level_1
0,"Uruguay (official full name in ; pron. , Eas..."
1,"It is bordered by Brazil to the north, by Arge..."
2,Montevideo was founded by the Spanish in the e...
3,The economy is largely based in agriculture (m...
4,"According to Transparency International, Urugu..."
5,In November 2007 it became the first Latin Ame...
6,88% of the population are of European descent....
7,"The name ""Uruguay"" comes from GuaranÃ­. It has..."
8,"* ""River of colorful or 'painted' chinchillas ..."
9,"* ""River of those who bring food"": an anonymou..."


In [25]:
ingestion_docs = []
author, source = 'Aritta', 'Wikipedia'
id = 1
for i in dataset.iterrows():
    ingestion_docs.append(Document(page_content = i[1]["passage"], metadata = {'id' : id, 'author' : author, 'source' : source}))
    id+=1

In [26]:
ingestion_docs

[Document(page_content='Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area.', metadata={'id': 1, 'author': 'Aritta', 'source': 'Wikipedia'}),
 Document(page_content='It is bordered by Brazil to the north, by Argentina across the bank of both the Uruguay River to the west and the estuary of RÃ\xado de la Plata to the southwest, and the South Atlantic Ocean to the southeast. It is the second smallest independent country in South America, larger only than Suriname and the French overseas department of French Guiana.', metadata={'id': 2, 'author': 'Aritta', 'source': 'Wikipedia'}),
 Document(page_content='Montevideo was founded by the Spanish in the early 18th century as a military stronghold. Uruguay won its independence in 1828 following a three-way struggle between Spain, Argentina and Brazi

In [27]:
insert_ids = vectorstore.add_documents(ingestion_docs)
insert_ids

[ObjectId('6679206727469b706797e31e'),
 ObjectId('6679206727469b706797e31f'),
 ObjectId('6679206727469b706797e320'),
 ObjectId('6679206727469b706797e321'),
 ObjectId('6679206727469b706797e322'),
 ObjectId('6679206727469b706797e323'),
 ObjectId('6679206727469b706797e324'),
 ObjectId('6679206727469b706797e325'),
 ObjectId('6679206727469b706797e326'),
 ObjectId('6679206727469b706797e327')]

In [28]:
search_result = vectorstore.similarity_search(query = "Tell me about Uruguay.", k = 3)
search_result

[]

In [29]:
qa_prompt_template = PromptTemplate.from_template("Answer the question based on the provided documents: {context}\n\nQuestion: {query}")

In [30]:
context = ''
id = 1
for i in search_result:
    context += f'Source {id}:\n' + i.page_content + '\n\n'
    id += 1

In [31]:
context

''

In [32]:
qa_prompt_template.format(query = "Tell me about Brazil.", context = context)

'Answer the question based on the provided documents: \n\nQuestion: Tell me about Brazil.'

In [33]:
def get_answer(query: str) -> str:
    qa_prompt_template = PromptTemplate.from_template("Answer the question based on the provided documents: {context}\n\nQuestion: {query}")
    str_parser = StrOutputParser()
    search_result = vectorstore.similarity_search(query = query)
    context = ''
    id = 1
    for i in search_result:
        context += f'\n\nSource {id}:\n' + i.page_content
        id += 1
    qa_chain = qa_prompt_template | chat_model | str_parser
    response = qa_chain.invoke({'context': context, 'query': query})
    return response

In [34]:
get_answer(query = 'What country is in the north border of Uruguay?')

'Brazil and Argentina are the countries that share a border with Uruguay.'

In [35]:
# LangSmith Website: https://smith.langchain.com/o/01459434-cc55-50e2-ac3c-57013defb5cc/
# LangChain Documentation Website: https://python.langchain.com/v0.2/docs/introduction/
# Cohere API Key Website: https://dashboard.cohere.com/api-keys
# MongoDB Atlas Website: https://cloud.mongodb.com/v2/6674d3b4061745758b0d5ed5#/overview
# RAG Dataset Website: https://huggingface.co/datasets/rag-datasets/rag-mini-wikipedia
