In [1]:
!pip install langchain



In [19]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

In [17]:
# Make Google Drive available inside the Colab runtime.
# Running this cell asks for authorization: choose the account and click "Allow".
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
# Expose the OpenAI API key through the 'OPENAI_API_KEY' environment variable.
# The key is never written into the notebook: if the variable is not already set
# (or is empty), it is requested through a hidden-input prompt instead.
import os
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('Enter your OpenAI API key: ')

In [5]:
!pip install chromadb



In [6]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0


In [7]:
!pip install openai

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8


In [18]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-3.15.1-py3-none-any.whl (271 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/271.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/271.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m271.0/271.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-3.15.1


In [20]:
# Build a Chroma vector store from the demo PDF and persist it to disk.

# 1. Load the PDF from the mounted Drive.
#    `load()` is used rather than `load_and_split()`: the latter already chunks the
#    text with its own default splitter, and feeding that into the
#    CharacterTextSplitter below would split the content twice. With `load()` the
#    splitter in step 2 is the only place where chunking happens.
loader = PyPDFLoader('/content/drive/MyDrive/Demo-pdf-3-idiots.pdf')
documents_ = loader.load()

# 2. Split the loaded documents into small chunks.
#    separator="\n"   : text is cut at newline characters.
#    chunk_size=100   : target chunk length, in characters.
#    chunk_overlap=20 : characters shared between neighbouring chunks.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=100, chunk_overlap=20)
documents = text_splitter.split_documents(documents_)

# 3. Embed every chunk with OpenAI embeddings and store the vectors in Chroma.
#    `persist_directory` is where the database files are written.
#    NOTE(review): './MyDrive' is relative to the current working directory; it is
#    not the mounted Drive folder under /content/drive — confirm this is intended.
vectordb = Chroma.from_documents(
    documents,
    embedding=OpenAIEmbeddings(),
    persist_directory='./MyDrive',
)

# 4. Write the vector store to the persist directory.
vectordb.persist()



In [34]:
# Ask the vector store for the entry stored under the ID 'doc0', requesting only
# its embeddings (the vector representation) in the result.
# NOTE(review): the recorded output of this cell shows an empty 'ids' list —
# confirm that 'doc0' is an ID that actually exists in this collection.
doc0_lookup = vectordb.get(ids=['doc0'], include=['embeddings'])
doc0_lookup

{'ids': [],
 'embeddings': [[0.0018996911215881183,
   -0.021708788988049572,
   -0.011597751980536478,
   -0.0020855304932160413,
   0.0016020039799989451,
   0.010021559257205395,
   -0.022796292527734817,
   -0.00855549263759048,
   -0.028275112520913585,
   -0.014426639727433561,
   -0.007247734562166396,
   0.019630142133015407,
   0.011205424883872165,
   -0.010200515689143395,
   0.005909003025307009,
   -0.004990130809532928,
   0.01832238266060742,
   -0.007536818029143165,
   0.03064284377424712,
   -0.013993014992629709,
   -0.031578921709431454,
   0.020167010497506802,
   0.003758084605036698,
   -0.028247580762153893,
   -0.013359784541156786,
   -0.012836681124722632,
   0.009057947700616164,
   -0.008142516256195095,
   0.014344044451154484,
   -0.004377549177129775,
   0.012781617607203248,
   -0.004270863379105316,
   0.0031678726430411194,
   -0.008837693630538627,
   -0.014027429225418024,
   -0.010964521529063555,
   -0.01942365301099511,
   -0.005124347900655778,


In [21]:
from langchain.memory import ConversationSummaryMemory
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

In [25]:
# Chat model ("llm" = Large Language Model) with default settings. The same
# instance is passed both to the conversation memory and to the chain below.
llm = ChatOpenAI()

# Retriever built on top of the vector database created earlier; the chain uses
# it to look up stored documents for a query.
retriever = vectordb.as_retriever()

# Conversation memory for the chatbot.
#   llm                       : the chat model associated with this memory.
#   memory_key="chat_history" : key under which the history is exposed (it shows
#                               up as 'chat_history' in the chain's result).
#   return_messages=True      : return the history as message objects rather
#                               than as a single string.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

# Conversational retrieval chain wiring the model, the retriever and the memory
# together; calling `qa` with a question runs the whole pipeline.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory,
)

In [26]:
# Put the question you want answered from the PDF here.
# Example query used in this notebook: 'Brief the story of 3-idiots'.
question = "Brief the story of 3-idiots"
qa(question)

{'question': 'Brief the story of 3-idiots',
 'chat_history': [SystemMessage(content='', additional_kwargs={})],
 'answer': 'The story of 3 Idiots revolves around three friends named Rancho, Raju, and Farhan who enroll in an elite engineering college. The film explores their journey and the life lessons they learn along the way that cannot be taught through books. The story begins with their entry into the college and their initial experiences with ragging.'}