In [None]:
!pip install OpenAi
!pip install langchain
!pip install python-dotenv
!pip install langchain-openai
!pip install chromadb

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
from langchain_openai import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.text_splitter import  CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
#from langchain.vectorstores.chroma import Chroma
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
import langchain

# Avoid the Colab output scrollbar by making long lines wrap.
from IPython.display import HTML, display


def set_css(*_args, **_kwargs):
  """Emit a <style> element that makes <pre> output wrap long lines.

  Used as an IPython 'pre_run_cell' callback. IPython hands such callbacks an
  execution-info object; it is not needed here, so any positional or keyword
  arguments are accepted and ignored. Calling it with no arguments still works.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))


# Run set_css before every cell execution so each cell's output is styled.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# Load variables from the .env file located by find_dotenv(); override=True
# lets values from that file replace variables already set in the environment.
load_dotenv(find_dotenv(), override=True)

# Fail early with a clear message when the key is missing, instead of looking
# it up and discarding the result.
if not os.environ.get('OPENAI_API_KEY'):
  raise RuntimeError(
      "OPENAI_API_KEY is not set; define it in a .env file or in the environment."
  )

# Chat model passed as `llm` to the RetrievalQA chains built later on.
chatt = ChatOpenAI(verbose=True)

In [None]:
# Use the maintained implementation from langchain-openai; the class exported
# by langchain.embeddings is deprecated and emits a deprecation warning when
# instantiated.
from langchain_openai import OpenAIEmbeddings

# Embedding model shared by the Chroma store and the custom retriever.
embeddings = OpenAIEmbeddings()

# Optional sanity check of the embedding output:
# emb = embeddings.embed_query("Hi there")
# print(type(emb), len(emb))

  warn_deprecated(


In [None]:
# Splitter configuration: break the text on newlines, aim for chunks of about
# 200 characters, and do not repeat any text between neighbouring chunks.
text_splitter1 = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=0)

# Read facts.txt and split it in a single step; a separate loader.load() call
# is not needed because load_and_split does both.
loader = TextLoader("facts.txt")
docs = loader.load_and_split(text_splitter=text_splitter1)

In [None]:
# Quick look at the splitter output: container type, chunk count, first chunk.
print(type(docs))
print(len(docs))
print(docs[0])

# Dump the text of every chunk, each followed by an empty line.
for doc in docs:
  print(f"{doc.page_content}\n")

---
<h1> CHROMA DB </h1>
from langchain.vectorstores.chroma import Chroma

*Nota: Ambas están usando el mismo directorio: "emb1"*

In [None]:
# Build the Chroma index only once. Chroma.from_documents inserts every chunk
# again each time it runs against the same persist directory, which produces
# duplicated search hits. When the directory already holds data, re-open it
# instead of re-inserting.
# Delete the directory to force a rebuild (for example after editing facts.txt).
persist_dir = "emb1"

if os.path.isdir(persist_dir) and os.listdir(persist_dir):
  # Existing index on disk: open it without adding documents.
  db = Chroma(
      persist_directory=persist_dir,
      embedding_function=embeddings
  )
else:
  # First run: embed the chunks and persist them.
  db = Chroma.from_documents(
      docs,
      embedding=embeddings,
      persist_directory=persist_dir
  )

In [None]:
# Ask for the 5 closest chunks (the library default is 4); every hit comes
# back paired with its score.
results = db.similarity_search_with_score(
    query="What is an interesting fact about the english language?",
    k=5,
)

In [None]:
# Each hit is a (document, score) pair: print a separator, the score, and
# then the text of the matching chunk.
for document, score in results:
  print("\n")
  print(score)
  print(document.page_content)



0.3504563570022583
1. "Dreamt" is the only English word that ends with the letters "mt."
2. An ostrich's eye is bigger than its brain.
3. Honey is the only natural food that is made without destroying any kind of life.


0.3504563570022583
1. "Dreamt" is the only English word that ends with the letters "mt."
2. An ostrich's eye is bigger than its brain.
3. Honey is the only natural food that is made without destroying any kind of life.


0.35151627327188995
1. "Dreamt" is the only English word that ends with the letters "mt."
2. An ostrich's eye is bigger than its brain.
3. Honey is the only natural food that is made without destroying any kind of life.


0.3533172011375427
4. A snail can sleep for three years.
5. The longest word in the English language is 'pneumonoultramicroscopicsilicovolcanoconiosis.'
6. The elephant is the only mammal that can't jump.


0.3533819317817688
4. A snail can sleep for three years.
5. The longest word in the English language is 'pneumonoultramicroscop

---
<h1> CHROMA DB - Retrievers </h1>
from langchain.vectorstores import Chroma

*Nota: Ambas están usando el mismo directorio: "emb1"*


In [None]:
# Open the index persisted in "emb1"; the same embedding model is supplied so
# queries are embedded the same way as the stored chunks.
db = Chroma(embedding_function=embeddings, persist_directory="emb1")

# Standard vector-store retriever on top of that index.
retriever = db.as_retriever()

# Question-answering chain: the retriever fetches chunks and the "stuff" chain
# type places them together into the prompt sent to the chat model.
chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    retriever=retriever,
    llm=chatt,
)

In [None]:
# Turn on LangChain's global debug flag so each chain step is traced.
langchain.debug = True

# Chain.run is deprecated; invoke() takes the inputs as a dict and returns a
# dict. RetrievalQA reads the question from "query" and stores the answer
# under "result", so `result` is still the answer string.
result = chain.invoke(
    {"query": "What is an interesting fact about the english language?"}
)["result"]
print(result)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What is an interesting fact about the english language?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is an interesting fact about the english language?",
  "context": "1. \"Dreamt\" is the only English word that ends with the letters \"mt.\"\n2. An ostrich's eye is bigger than its brain.\n3. Honey is the only natural food that is made without destroying any kind of life.\n\n1. \"Dreamt\" is the only English word that ends with the letters \"mt.\"\n2. An ostrich's eye is bigger than its brain.\n3. Honey is the only natural food that is made without destroying any kind of life.\n\n1. \"Dreamt\" is the only English word that ends with th

---
<h1> CHROMA DB - Custom Retrievers </h1>
from langchain.vectorstores import Chroma

This time, duplicated chunks are removed from the retrieved results.
This code could live in a separate .py file.

*Note: Both of them use the same directory: "emb1"*




In [None]:
import asyncio
from typing import List

from langchain.embeddings.base import Embeddings
from langchain.vectorstores import Chroma
from langchain.schema import BaseRetriever, Document


class RedundantFilterRetriever(BaseRetriever):
  """Retriever that filters near-duplicate chunks out of Chroma results.

  The query is embedded explicitly and passed to Chroma's
  max-marginal-relevance (MMR) search by vector.

  The retrieval logic lives in the `_get_relevant_documents` /
  `_aget_relevant_documents` hooks that BaseRetriever expects subclasses to
  implement; the public entry points are inherited from BaseRetriever.
  """

  # Model used to turn the query string into a vector.
  embeddings: Embeddings
  # Chroma store that is searched.
  chroma: Chroma
  # Forwarded to the MMR search as `lambda_mult`. Per the LangChain docs,
  # values near 1 favour relevance and values near 0 favour diversity.
  lambda_mult: float = 0.8

  def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
    """Return the documents selected by an MMR search for `query`.

    Args:
      query: Text to search for.
      run_manager: Callback manager supplied by BaseRetriever; unused here.

    Returns:
      The documents returned by Chroma's MMR search.
    """
    # Embed the query string, then let the MMR search pick the results.
    emb = self.embeddings.embed_query(query)
    return self.chroma.max_marginal_relevance_search_by_vector(
        embedding=emb,
        lambda_mult=self.lambda_mult
    )

  async def _aget_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
    """Async variant: run the synchronous search in a worker thread.

    Accepts the same arguments and returns the same documents as
    `_get_relevant_documents`, instead of silently returning an empty list.
    """
    return await asyncio.to_thread(self._get_relevant_documents, query)

In [None]:
# Open the index persisted in "emb1" with the shared embedding model.
db = Chroma(embedding_function=embeddings, persist_directory="emb1")

# Custom retriever: it needs the embedding model to embed the query itself
# and the Chroma store to search.
retriever = RedundantFilterRetriever(chroma=db, embeddings=embeddings)

# Same question-answering chain as before, now fed by the custom retriever.
chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    retriever=retriever,
    llm=chatt,
)

In [None]:
# Turn on LangChain's global debug flag so each chain step is traced.
langchain.debug = True

# Chain.run is deprecated; invoke() takes the inputs as a dict and returns a
# dict. RetrievalQA reads the question from "query" and stores the answer
# under "result", so `result` is still the answer string.
result = chain.invoke(
    {"query": "What is an interesting fact about the english language?"}
)["result"]
print(result)

  warn_deprecated(


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What is an interesting fact about the english language?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is an interesting fact about the english language?",
  "context": "1. \"Dreamt\" is the only English word that ends with the letters \"mt.\"\n2. An ostrich's eye is bigger than its brain.\n3. Honey is the only natural food that is made without destroying any kind of life.\n\n4. A snail can sleep for three years.\n5. The longest word in the English language is 'pneumonoultramicroscopicsilicovolcanoconiosis.'\n6. The elephant is the only mammal that can't jump.\n\n86. Broccoli and cauliflower are the only vegetables that are flowers.\n87