## install dependencies

In [16]:
!pip install langchain
!pip install langchain_community
!pip install -qU chromadb langchain-chroma
!pip install groq

Collecting groq
  Downloading groq-1.0.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-1.0.0-py3-none-any.whl (138 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.3/138.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-1.0.0


## setup environment

In [17]:
from dotenv import load_dotenv
from langchain_core.documents import Document # common format for data retrieval and processing workflows
from langchain_community.document_loaders import TextLoader, DirectoryLoader # load the entire directory, and filter out files with pattern matching
from langchain_community.embeddings import HuggingFaceEmbeddings
import os
from langchain_chroma import Chroma

load_dotenv()
groqAPIKEY = os.getenv("GROQ_API_KEY")

In [18]:
kb_dir_path = "/content/drive/MyDrive"
dir_loader = DirectoryLoader(
    kb_dir_path,
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding":"utf-8"},
    show_progress=True
)

docs = dir_loader.load()

100%|██████████| 3/3 [00:00<00:00, 283.18it/s]


In [24]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# define the embedding method
embeddings=HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

  embeddings=HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [25]:
# embedding the documents to store them in our vector DB
vector_store = Chroma(
    collection_name="rag",
    embedding_function=embeddings
)
vector_store.add_documents(documents=docs)

['8f02823a-e2eb-4779-b7e1-33cbf6003b56',
 'b9fd6fad-455e-40dd-86dd-86cbe9b2b93d',
 '2dff6619-dbe8-4f3f-9f75-f5d034f7e353']

In [26]:
results = vector_store.similarity_search(query="what are the traits of a good corpus?", k=1)
for doc in results:
  print(doc)

page_content='a good corpus is something that consists of one topic, and is easily identifiable.
' metadata={'source': '/content/drive/MyDrive/test1.txt'}


In [27]:
# retrieve data via a retriever
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 1, "fetch_k": 2, "lambda_mult": 0.5},
)

Query Processing Phase

In [36]:
user_query = input("Ask me anything, I'll answer if I know something about it.")
results = retriever.invoke(user_query)

context = ""
for i, res in enumerate(results):
  print(f"doc {i} = {res}")
  context+=res.page_content
  context+="\n---\n"

Ask me anything, I'll answer if I know something about it.what is a good corpus?
doc 0 = page_content='a good corpus is something that consists of one topic, and is easily identifiable.
' metadata={'source': '/content/drive/MyDrive/test1.txt'}


Generation Phase

In [37]:
from groq import Groq
from google.colab import userdata

apikey=userdata.get('GROQ_API_KEY')
client = Groq(
    api_key=apikey
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant. Respond only from the given context, if you don't know, respond saying you don't know about the query."
        },
        {
            "role": "user",
            "content": "Find the context below:"
        },
        {
            "role": "user",
            "content": context
        },
        {
            "role": "user",
            "content": user_query
        }
    ],
    model="groq/compound",
    temperature=0.5,
    top_p=1
)

print(chat_completion.choices[0].message.content)

A good corpus is one that focuses on a single topic and is easily identifiable.
