In [6]:
# Read the entire UTF-8 text file into memory as one string.
# NOTE: `data` is rebound by the PDF-loading cell further down.
with open("data.txt", mode="r", encoding="utf-8") as text_file:
    data = text_file.read()

In [78]:
from dotenv import load_dotenv
import os

# Load variables from the local .env file so that the os.environ / os.getenv
# lookups in the later cells can find the API credentials.
load_dotenv()

True

In [49]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [50]:
# Parse the source PDF into a list of documents; this rebinds `data`.
PDF_PATH = "./ramayana.pdf"
loader = PyPDFLoader(PDF_PATH)
data = loader.load()

In [51]:
# Report how many documents the loader produced.
print (f'You have {len(data)} document(s) in your data')
# Note: this measures only data[0] (the first loaded document), not the
# combined length of everything in `data`.
print (f'There are {len(data[0].page_content)} characters in your document')

You have 1960 document(s) in your data
There are 437 characters in your document


In [52]:
# Re-chunk the loaded documents: chunk_size=2000 with no overlap between
# neighbouring chunks (sizes are presumably measured in characters, the
# splitter's default — confirm if a custom length function is ever added).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=2000,
)
texts = text_splitter.split_documents(data)

# Show how many chunks came out of the split
print(f'You have {len(texts)} document(s) in your data')

You have 2003 document(s) in your data


In [53]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [72]:
# Read the API credentials from environment variables.
# Each name is None when its variable is not set; no fallback value is
# applied in this cell.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_API_ENV = os.getenv('PINECONE_API_ENV')

In [55]:
# Embedding client; it is handed to the Pinecone vector store further down.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [98]:
# Show which embedding model the client is configured to use.
print(embeddings.model)

text-embedding-ada-002


In [88]:
# Name of the Pinecone index this notebook works against
index_name = "ramayana"

# Connect the Pinecone client. The API key and environment name are read
# from the process environment (both values are listed at app.pinecone.io).
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_API_ENV"),
)

In [89]:
# Sanity check before embedding: every chunk's page_content must be str or
# bytes; stop at the first chunk that is not.
for t in texts:
    if isinstance(t.page_content, (str, bytes)):
        continue
    raise TypeError(f't.page_content should be a string or bytes-like object, but got {type(t.page_content)}')

In [92]:
# Embed every chunk and write it into the existing Pinecone index.
#
# The index must already exist. Previously a missing index made this cell a
# silent no-op, leaving `docsearch` undefined (or stale from an earlier kernel
# run), so the search cell below failed with an unrelated NameError. Fail
# here instead, with a message that names the real problem.
if index_name not in pinecone.list_indexes():
    raise ValueError(
        f"Pinecone index {index_name!r} was not found; create it at "
        "app.pinecone.io or correct `index_name` before building the vector store."
    )

print("hi!")
# NOTE(review): only page_content is passed, so any metadata carried by the
# items in `texts` is not sent along with the vectors — confirm this is intended.
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

hi!


In [93]:
# Natural-language question; reused later as the question given to the QA chain.
query = "What is the role of Hanuman in the Ramayana?"
# Retrieve the stored chunks most similar to the question from the vector store.
docs = docsearch.similarity_search(query)

In [99]:
# Display the retrieved chunks for inspection.
docs

[Document(page_content='1860 The Ramayana\nthe moon to the sun, the evening to the morning sun, the sun of\nwinter to that of spring; the young son betrays and overthrows\nthe old one. …Râmas, who treacherously kills the old king of\nthe monkeys, Bâlin, is the equivalent of Vish Gus, who hurls his\npredecessor Indras from his throne; and Sugrívas, the new king\nof the monkeys resembles Indras when he promises to find the\nravished Sítá, in the same way as Vish Gus in one of his incarna-\ntionsfindsagainthelostvedás.Andthereareotherindicationsin\nthe Râmâya Gam of opposition between Indras and the monkeys\nwho assist Râmas. The great monkey Hanumant, of the reddish\ncolourofgold,hashisjawbroken,Indrashavingstruckhimwith\nhisthunderboltandcausedhimtofalluponamountain,because,\nwhile yet a child, he threw himself off a mountain into the air in\norder to arrest the course of the sun, whose rays had no effect\nupon him. (The cloud rises from the mountain and hides the\nsun, which is unable 

In [95]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
# Completion model with temperature 0; the key is read from the environment.
llm = OpenAI(
    temperature=0,
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
)

# Question-answering chain built with the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")

In [96]:
# Answer the question using the retrieved chunks as the input documents.
chain.run(input_documents=docs, question=query)

" Hanuman is a character in the Ramayana who is the son of the Wind God and is sent by the king Sugriva to find the kidnapped Sita. He is described as having reddish gold color and is able to fly and change shape at will. He is also able to withstand the power of the sun and Indra's thunderbolt. He is a loyal friend to Rama and helps him in his quest to rescue Sita."