In [1]:
# from transformers import BloomTokenizerFast, BloomForQuestionAnswering, BloomForCausalLM, TrainingArguments, Trainer
import os
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
import pinecone
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain import HuggingFaceHub
from langchain.prompts import PromptTemplate
from config.config import BASE_DIR, DATA_DIR, EMBEDDING_MODEL_NAME, PINECONE_INDEX_NAME


  from tqdm.autonotebook import tqdm


In [2]:
# Load settings from a .env file (python-dotenv) before the os.environ lookups
# in the following cell.
load_dotenv()

True

In [3]:

# Credentials read from the environment; each is None when the variable is unset.
API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPEN_AI_KEY")
# Pinecone environment, defaulting to "us-west4-gcp-free" when not configured.
YOUR_ENV = os.getenv("PINECONE_ENVIRONMENT", "us-west4-gcp-free")
# Name of the Pinecone index, taken from the project configuration module.
index_name = PINECONE_INDEX_NAME

In [4]:
# Connect the Pinecone client using the credentials read from the environment.
pinecone.init(
    api_key=API_KEY,
    environment=YOUR_ENV
)

# Create the target index only if it does not exist yet. The check is made
# against `index_name` itself (not "is there any index at all"), so an
# unrelated pre-existing index no longer masks a missing target index.
if index_name not in pinecone.list_indexes():
    # The index dimension must equal the embedding vector length. Measure it
    # by embedding a short probe string with the configured model instead of
    # relying on a variable left over from an earlier kernel session.
    embedding_dimension = len(
        HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME).embed_query("dimension probe")
    )
    pinecone.create_index(
        name=index_name,
        metric="cosine",
        shards=1,
        dimension=embedding_dimension,
    )

# Display the description of the index this notebook actually uses.
pinecone.describe_index(index_name)

IndexDescription(name='stsb-test', metric='cosine', replicas=1, dimension=768.0, shards=1, pods=1, pod_type='p1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')

In [5]:
# Handle to the Pinecone index named in the configuration cell.
index = pinecone.Index(index_name)

# Key passed to the vector store as its text field.
text_field = "text"

In [6]:
# Embedding model wrapper; the model name comes from the project config.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [7]:
# LangChain vector store built from the Pinecone index, the query-embedding
# function, and the text field name.
vectorstore = Pinecone(
    index,
    embeddings.embed_query,
    text_field,
)

In [8]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model used for answer generation (gpt-3.5-turbo, temperature 0.0).
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name='gpt-3.5-turbo', temperature=0.0)

# Retrieval QA chain of type "stuff", fed by the vector store's retriever.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever())

In [9]:
from langchain.chains import RetrievalQAWithSourcesChain

# Retrieval QA chain whose result also carries a 'sources' entry; it uses the
# same LLM and a retriever from the same vector store as `qa`.
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever())

In [10]:
# Question later sent to the sources-aware QA chain.
query_NVIDIA = "What is NVIDIa?"


In [15]:
# Raw similarity lookup for the query "Blake Gerard", limited to the 3 most
# relevant documents.
vectorstore.similarity_search("Blake Gerard", k=3)

[Document(page_content='W. Lehnert, C. Cardie, D. Fisher, J. McCarthy, E. Riloff, and S. Soderland. University of Massachusetts: Description of the CIRCUS system as used for MUC-4. In Proceedings of the Fourth Message Understanding Conference (MUC-4), pages 282-288, 1992.\n\nW. Lehnert, J. McCarthy, S. Soderland, E. Riloff, C. Cardie, J. Peterson, F. Feng, C. Dolan, and S. Goldman. University of Massachusetts/Hughes: Description of the CIRCUS system as used for MUC-5. In Proceedings of the Fourth Message Understanding Conference (MUC-5), pages 277-290, 1993.\n\nW. Lehnert. Symbolic/subsymbolic sentence analysis: Exploiting the best of two worlds. In J. Barnden and J. Pollack, editors, Advances in Connectionist and Neural Computation Theory, Vol. 1, pages 135-164. Ablex Publishers, Norwood, NJ, 1991.\n\nR. H. Merchant. Tipster program overview. In Proceedings of the TIPSTER Text Program (Phase I), pages 1-2, 1993.\n\nD. Moldovan, S. Cha, M. Chung, K. Hendrickson, J. Kim, and S. Kowalski

In [24]:
# Run the sources-aware QA chain on the NVIDIA question; the next cell reads
# the 'answer' and 'sources' entries of the result.
response = qa_with_sources(query_NVIDIA)

In [25]:
# Display the generated answer together with the sources it cites.
response["answer"], response["sources"]

('There is no information available about NVIDIA in the provided sources.\nSOURCES:',
 '')

In [45]:
# Question about implementing a disambiguation mechanism, sent to the
# sources-aware QA chain in the next cell.
query_disambiguation = "How could i implement a disambiguation mechanism?"

In [46]:
# Run the sources-aware QA chain on the disambiguation question; this rebinds
# `response` from the previous query.
response = qa_with_sources(query_disambiguation)

In [47]:
# Display the generated answer together with the sources it cites.
response["answer"], response["sources"]

('To implement a disambiguation mechanism, you can use an algorithm that coordinates anaphora resolution and prepositional phrase (PP) disambiguation. The algorithm applies resolution rules based on the focusing approach to the conceptual representation and uses attachment rules to fill empty roles in the Conceptual Structures (CSs). The algorithm is applied sentence by sentence, and the resolution of an anaphor is postponed if it is preceded by an unattached preposition. The algorithm was developed in the context of the COBALT project and is described in detail in the paper "An Algorithm to Co-Ordinate Anaphora Resolution and PPS Disambiguation Process" by Azzam (1994).\n',
 '9502033.xml')

In [51]:
# Senior-living question sent to the sources-aware QA chain.
# NOTE(review): the question text ends mid-sentence ("what are ") — confirm
# whether the rest of the question was lost or this wording is intended.
query_senior_living = "I have some questions about senior living, what are "

In [54]:
# Run the sources-aware QA chain on the senior-living question; this rebinds
# `response` from the previous query.
response = qa_with_sources(query_senior_living)

In [55]:
# Display the generated answer together with the sources it cites.
response["answer"], response["sources"]

("The best way to get started with senior living is to have open conversations with friends, family, healthcare professionals, and senior living experts. They can guide and support you through the decision-making process. Additionally, you can consider factors such as whether household chores and daily tasks have become overwhelming, if you need assistance with personal care, and if you have safety concerns. It's important to note that senior living offers various lifestyle options that prioritize independence while providing necessary care and support. The affordability of senior living can vary depending on factors such as location, level of care required, amenities offered, and the specific community chosen. However, many people are surprised to learn that the cost of senior living is often lower than the cost of staying in their current homes. It's recommended to use a Cost Comparison Calculator to get a more accurate comparison. Senior living offers benefits such as socializing on

In [16]:
# Spanish-language query (roughly: "Tell me about senior living"), used to
# exercise the chain with a non-English question.
query_espanol = "Dime acerca de la vida de la tercera edad"

In [17]:
# Run the sources-aware QA chain on the Spanish query; this rebinds
# `response` from the previous query.
response = qa_with_sources(query_espanol)

In [19]:
# Display the generated answer together with the sources it cites.
response["answer"], response["sources"]

('La vida de la tercera edad en las comunidades de Carlton ofrece numerosos beneficios, como socializar según tus propios términos, un entorno de apoyo con personal capacitado disponible para brindar asistencia, seguridad y tecnología, vivir sin preocupaciones de tareas domésticas, y acceso a entretenimiento y actividades en la comunidad. La vida de la tercera edad no es lo mismo que una residencia de ancianos, ya que se enfoca en brindar cuidado cuando se necesita mientras se preserva la independencia. Además, la asequibilidad de la vida de la tercera edad puede variar según varios factores. Para determinar si es el momento de considerar la vida de la tercera edad, es útil evaluar si estás buscando compañía, oportunidades para socializar y un sentido de comunidad, y si tienes preocupaciones de seguridad como caídas, navegar escaleras o conducir.\n',
 'common.txt')

In [14]:
# Spanish-language question about a person (roughly: "Who is Blake Gerard and
# what does he do?"). This rebinds both `query_espanol` and `response`.
query_espanol = "Quien es y que hace Blake Gerard?"

# Ask the sources-aware QA chain, then show the answer with its cited sources.
response = qa_with_sources(query_espanol)
response["answer"], response["sources"]

('There is no information available about Blake Gerard in the provided document.\n',
 '9505043.xml, 9506016.xml')