In [46]:
# import libraries
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os

In [2]:
# Point a PyPDFDirectoryLoader at the 'Machine Learning Notes' folder
pdf_folder = 'Machine Learning Notes'
loader = PyPDFDirectoryLoader(pdf_folder)
loader

<langchain_community.document_loaders.pdf.PyPDFDirectoryLoader at 0x7f3d7dd54610>

In [4]:
# Load the PDF data found in the Machine Learning Notes folder
data = loader.load()
# Show a count and the first item only: echoing the whole list dumps the
# full text of every PDF into the notebook output.
print(f'{len(data)} documents loaded')
data[:1]

[Document(page_content='  Advancements in Generative AI: A \nComprehensive Review of GANs, GPT, Autoencoders, Diffusion Model, and \nTransformers. \nStaphord Bengesi1, Hoda El -Sayed1, Md Kamruzzaman Sarker1, Yao  Houkpati1, John \nIrungu3   and Timothy Oladunni2 \n1 Dept. of Computer Science, Bowie State University, Bowie, MD 20715 USA  \n2 Dept. of Computer Science, Morgan State University, Baltimore, MD 21251 USA  \n3 Dept.  of Computer Science, University of the District of Columbia , Washington, DC 20008  USA  \nCorresponding author: sbengesi@bowiestate.edu  \n \nABSTRACT  The launch of ChatGPT has garnered global attention, marking a significant milestone in the \nfield of Generative Artificial Intelligence. While Generative AI has been in effect for the past decade, the \nintroduction of ChatGPT has ignited a new wave of research and innovation in the AI domain. This surge in interest has led to the development and release of numerous cutting -edge tools, such as Bard, Stable Di

In [5]:
# Text splitter used to cut the loaded documents into chunks
# (chunk_size=500, chunk_overlap=20)
splitter_settings = {'chunk_size': 500, 'chunk_overlap': 20}
test_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [6]:
# Split the loaded documents into chunks
test_chunk = test_splitter.split_documents(data)
# Preview the first two chunks only: echoing the whole list floods the
# notebook output with every chunk's text.
test_chunk[:2]

[Document(page_content='Advancements in Generative AI: A \nComprehensive Review of GANs, GPT, Autoencoders, Diffusion Model, and \nTransformers. \nStaphord Bengesi1, Hoda El -Sayed1, Md Kamruzzaman Sarker1, Yao  Houkpati1, John \nIrungu3   and Timothy Oladunni2 \n1 Dept. of Computer Science, Bowie State University, Bowie, MD 20715 USA  \n2 Dept. of Computer Science, Morgan State University, Baltimore, MD 21251 USA  \n3 Dept.  of Computer Science, University of the District of Columbia , Washington, DC 20008  USA', metadata={'source': 'Machine Learning Notes/AI paper.pdf', 'page': 0}),
 Document(page_content='Corresponding author: sbengesi@bowiestate.edu  \n \nABSTRACT  The launch of ChatGPT has garnered global attention, marking a significant milestone in the \nfield of Generative Artificial Intelligence. While Generative AI has been in effect for the past decade, the \nintroduction of ChatGPT has ignited a new wave of research and innovation in the AI domain. This surge in interest ha

In [21]:
# How many chunks did the splitter produce?
chunk_count = len(test_chunk)
chunk_count

233

In [15]:
# Text of the first chunk (it comes from page 0)
first_chunk = test_chunk[0]
print(first_chunk.page_content)

Advancements in Generative AI: A 
Comprehensive Review of GANs, GPT, Autoencoders, Diffusion Model, and 
Transformers. 
Staphord Bengesi1, Hoda El -Sayed1, Md Kamruzzaman Sarker1, Yao  Houkpati1, John 
Irungu3   and Timothy Oladunni2 
1 Dept. of Computer Science, Bowie State University, Bowie, MD 20715 USA  
2 Dept. of Computer Science, Morgan State University, Baltimore, MD 21251 USA  
3 Dept.  of Computer Science, University of the District of Columbia , Washington, DC 20008  USA


In [16]:
# Metadata attached to the first chunk (source file and page number)
first_chunk_meta = test_chunk[0].metadata
print(first_chunk_meta)

{'source': 'Machine Learning Notes/AI paper.pdf', 'page': 0}


In [17]:
# Text of the second chunk (also from page 0)
second_chunk = test_chunk[1]
print(second_chunk.page_content)

Corresponding author: sbengesi@bowiestate.edu  
 
ABSTRACT  The launch of ChatGPT has garnered global attention, marking a significant milestone in the 
field of Generative Artificial Intelligence. While Generative AI has been in effect for the past decade, the 
introduction of ChatGPT has ignited a new wave of research and innovation in the AI domain. This surge in interest has led to the development and release of numerous cutting -edge tools, such as Bard, Stable Diffusion,


In [20]:
# Text of the third chunk
third_chunk = test_chunk[2]
print(third_chunk.page_content)

DALL -E, Make-A- Video, Runway ML, and Jukebox, among others. These tools exhibit remarkable 
capabilities, encompassing tasks ranging from text generation and music composition, image creation, video 
production, code generation, and even scientific work. They are built upon various state -of-the-art models, 
including Stable Diffusion, transformer models like GPT- 3 (recent GPT -4), variational autoencoders, and


In [23]:
# Text of the last chunk.
# Index with -1 rather than a hard-coded position: the number of chunks
# changes whenever the PDFs or the splitter settings change, and a fixed
# index would then raise IndexError or silently show a different chunk.
print(test_chunk[-1].page_content)

[175]  B. Lane, “An AI -generated Rihanna cover of 
Beyoncé’s ‘Cuff It’ is going viral, and it could open up a new legal nightmare for the music industry,” Insider. 
Accessed: Oct. 14, 2023. [Online]. Available: 
https://www.insider.com/rihanna -ai-cuff-it-cover -
legal -nightmare -music -industry -2023- 4


In [25]:
# Make sure OPENAI_API_KEY exists in the environment for the OpenAI clients.
# Export the real key in the shell before starting Jupyter; never paste it
# into the notebook. An already exported key is left alone; the empty
# placeholder is only written when the variable is missing, so running this
# cell can no longer wipe a working key.
if 'OPENAI_API_KEY' not in os.environ:
    os.environ['OPENAI_API_KEY'] = ''

In [27]:
# Create the OpenAIEmbeddings object that is used below to embed the query and the chunks
embeddings = OpenAIEmbeddings()

In [None]:
# Smoke-test the embedding call with a sample query; the result should be a vector.
# NOTE: this failed for the author because of OpenAI pricing/billing limits.
# Embed once and reuse the result: calling embed_query a second time just to
# take the length repeats the same billed request.
sample_embedding = embeddings.embed_query('How are you?')
len(sample_embedding)    # length of the embedding vector (its dimensionality)

In [41]:
# Pinecone vector database configuration (API key from the Pinecone website).
# The key is read from the environment only. SECURITY: this cell previously
# carried a literal key as the fallback default; treat that key as leaked and
# rotate it in the Pinecone console.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '')
if not PINECONE_API_KEY:
    # Surface the problem here instead of as an opaque auth failure later on.
    print('PINECONE_API_KEY is not set; export it before starting Jupyter.')
#PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'index')

In [40]:
# Create the Pinecone client.
# The client class is imported under an alias because the imports cell already
# binds the name `Pinecone` to LangChain's vector store
# (`from langchain.vectorstores import Pinecone`). Importing the client under
# the same name would shadow it, and on a fresh Restart & Run All the later
# vector-store call on `Pinecone` would hit the client class instead.
from pinecone import Pinecone as PineconeClient, ServerlessSpec
pc = PineconeClient(api_key=PINECONE_API_KEY)

#### Create the index on Pinecone. Use the free tier, since index usage is otherwise billed.

In [48]:
# Name of the Pinecone index to use ('testing' is a placeholder; the index was never actually created)
# index = Pinecone.Index('testing')
index_name = 'testing'

##### Create embeddings for each of the test chunks

In [None]:
# Embed every chunk and store it in the Pinecone index named by `index_name`.
# from_documents is used instead of from_texts on the bare page_content so
# that each chunk's metadata (source file, page) is stored with its text and
# is available on retrieved results.
# `Pinecone` here must be LangChain's vector store class, not the pinecone client.
# NOTE: this errored for the author because the index had not been created in
# Pinecone and because of billing.
docsearch = Pinecone.from_documents(test_chunk, embeddings, index_name=index_name)

In [47]:
# Question answering over the indexed chunks (practice code for retrieving answers).
# `llm` has to be a model *instance*: the original passed the OpenAI class
# itself, which gives the chain nothing it can call.
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type='stuff', retriever=docsearch.as_retriever())
query = 'any question asked from the pdf paper'
# qa.run(query)
# doc = docsearch.similarity_search(query)    --> similarity search of the query against the docsearch index