In [None]:
# !pip install -qU \
#   langchain==0.0.276 \
#   openai==0.27.10 \
#   tiktoken==0.4.0 \
#   sentence-transformers==2.2.2 \
#   spacy==3.6.1 \
#   nltk==3.8.1 \
#   pinecone-client==2.2.2 \
#   wikipedia==1.4.0 \
#   pypdf==3.15.4

In [10]:
from langchain.embeddings import OpenAIEmbeddings   
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter, TokenTextSplitter, SpacyTextSplitter, SentenceTransformersTokenTextSplitter

import pinecone
import itertools
import time
import uuid
from tqdm.autonotebook import tqdm

from config import OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_ENVIRONMENT, PINECONE_INDEX_NAME, EMBEDDING_MODEL, SPLITTER_CHUNK_SIZE, SPLITTER_CHUNK_OVERLAP

OpenAI Init

In [11]:
# Embedding client used by the indexing and query cells below.
# Both the model name and the API key are read from config.py.
embedding_model = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=OPENAI_API_KEY)

Pinecone Init

In [12]:
# Connect the Pinecone client using the credentials from config.py.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT
)

# Create the index only when it does not exist yet; an existing index is reused as-is.
if PINECONE_INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        name=PINECONE_INDEX_NAME,
        metric='cosine',
        dimension=1536  # 1536 dim of text-embedding-ada-002
    )
    # Poll the index status until it reports ready instead of sleeping for a
    # fixed second, which does not guarantee the index can accept requests.
    while not pinecone.describe_index(PINECONE_INDEX_NAME).status['ready']:
        time.sleep(1)

# Handle used by the upsert and query cells; the trailing expression displays
# the current index statistics as the cell output.
pinecone_index = pinecone.Index(PINECONE_INDEX_NAME)
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.04929,
 'namespaces': {'': {'vector_count': 4929}},
 'total_vector_count': 4929}

In [11]:
paper_list = ["data/paper1.pdf", "data/paper2.pdf", "data/paper3.pdf", "data/test.pdf", "data/mypaper.pdf"]

# Load one PDF and let the loader split it into page-level documents.
loader = PyPDFLoader("data/mypaper.pdf")
pages = loader.load_and_split()
print(f"Pages shape: {len(pages)}")

# Fail with a clear message instead of an IndexError on pages[0] below.
if not pages:
    raise ValueError("No text could be extracted from the PDF; nothing to embed.")

text_splitter = TokenTextSplitter(chunk_size=SPLITTER_CHUNK_SIZE, chunk_overlap=SPLITTER_CHUNK_OVERLAP)

# Every chunk in this cell comes from the same file, so the source recorded on
# the first document is used for all of them.
source = pages[0].metadata["source"]

# total_sentences[k] is the k-th chunk of text and page_number_list[k] is the
# page number stored alongside it; the two lists are kept the same length.
total_sentences = []
page_number_list = []
for idx, page in enumerate(pages):
    # Loader page metadata plus one (presumably 0-based -> 1-based; confirm against the loader).
    page_num = page.metadata["page"] + 1
    print(f"Page [{idx+1}]")
    sentences = text_splitter.split_text(page.page_content)
    print(f"Sentences shape: {len(sentences)}")
    total_sentences.extend(sentences)
    page_number_list.extend([page_num] * len(sentences))

# Due to OpenAI rate limits, all chunks are embedded in a single
# embed_documents call rather than one call per chunk.
paper_embedding = embedding_model.embed_documents(total_sentences)

# Build the Pinecone upsert payload. The id is derived deterministically from
# (source, page, chunk position) so that re-running this cell overwrites the
# same vectors instead of adding duplicates under fresh random ids.
to_upsert = []
for i, (sentence, page_num, sentence_vector) in enumerate(
    zip(total_sentences, page_number_list, paper_embedding)
):
    to_upsert.append({
        "id": str(uuid.uuid5(uuid.NAMESPACE_URL, f"{source}#page={page_num}#chunk={i}")),
        "values": sentence_vector,
        "metadata": {
            "text": sentence,
            "source": source,
            "page": page_num
        }
    })


Pages shape: 38
Page [1]
Sentences shape: 7
Page [2]
Sentences shape: 16
Page [3]
Sentences shape: 62
Page [4]
Sentences shape: 34
Page [5]
Sentences shape: 26
Page [6]
Sentences shape: 26
Page [7]
Sentences shape: 21
Page [8]
Sentences shape: 7
Page [9]
Sentences shape: 9
Page [10]
Sentences shape: 23
Page [11]
Sentences shape: 29
Page [12]
Sentences shape: 35
Page [13]
Sentences shape: 26
Page [14]
Sentences shape: 17
Page [15]
Sentences shape: 18
Page [16]
Sentences shape: 11
Page [17]
Sentences shape: 10
Page [18]
Sentences shape: 26
Page [19]
Sentences shape: 18
Page [20]
Sentences shape: 34
Page [21]
Sentences shape: 3
Page [22]
Sentences shape: 10
Page [23]
Sentences shape: 21
Page [24]
Sentences shape: 22
Page [25]
Sentences shape: 21
Page [26]
Sentences shape: 28
Page [27]
Sentences shape: 36
Page [28]
Sentences shape: 8
Page [29]
Sentences shape: 21
Page [30]
Sentences shape: 22
Page [31]
Sentences shape: 15
Page [32]
Sentences shape: 21
Page [33]
Sentences shape: 21
Page [34

In [12]:
batch_size = 32
n = len(to_upsert)
print(f"Total number: {n}")

# Upload the payload in fixed-size slices. List slicing clamps at the end of
# the list, so the final (possibly shorter) batch needs no special case.
for start in range(0, n, batch_size):
    stop = min(n, start + batch_size)
    batch = to_upsert[start:stop]
    pinecone_index.upsert(vectors=batch)
    print(f"Uploaded batch [{start} : {stop}]")



Total number: 843
Uploaded batch [0 : 32]
Uploaded batch [32 : 64]
Uploaded batch [64 : 96]
Uploaded batch [96 : 128]
Uploaded batch [128 : 160]
Uploaded batch [160 : 192]
Uploaded batch [192 : 224]
Uploaded batch [224 : 256]
Uploaded batch [256 : 288]
Uploaded batch [288 : 320]
Uploaded batch [320 : 352]
Uploaded batch [352 : 384]
Uploaded batch [384 : 416]
Uploaded batch [416 : 448]
Uploaded batch [448 : 480]
Uploaded batch [480 : 512]
Uploaded batch [512 : 544]
Uploaded batch [544 : 576]
Uploaded batch [576 : 608]
Uploaded batch [608 : 640]
Uploaded batch [640 : 672]
Uploaded batch [672 : 704]
Uploaded batch [704 : 736]
Uploaded batch [736 : 768]
Uploaded batch [768 : 800]
Uploaded batch [800 : 832]
Uploaded batch [832 : 843]


Test

In [None]:
# Similarity search for a single question; prints the top 3 matches.
query = ["How to treat patient with ACHD?"]
# embed_documents returns one vector per input text (a list of vectors).
query_embedding = embedding_model.embed_documents(query)
# Pinecone's `vector` argument takes one flat list of floats, so pass the
# single embedding explicitly by keyword rather than the enclosing list.
res = pinecone_index.query(vector=query_embedding[0], top_k=3, include_metadata=True)
for match in res['matches']:
    print("="*30)
    print(f"Score: {match['score']:.2f} \t Source: {match['metadata']['source']}")
    print("="*30)
    print(f"{match['metadata']['text']}")
    print("="*30)
    print()

In [None]:
# Similarity search for a single question; prints the top 3 matches with page numbers.
query = ["Why the dot products get large?"]
# embed_documents returns one vector per input text (a list of vectors).
query_embedding = embedding_model.embed_documents(query)
# Pinecone's `vector` argument takes one flat list of floats, so pass the
# single embedding explicitly by keyword rather than the enclosing list.
res = pinecone_index.query(vector=query_embedding[0], top_k=3, include_metadata=True)
for match in res['matches']:
    print("="*30)
    # The page value is cast to int before printing.
    print(f"Score: {match['score']:.2f} \t Source: {match['metadata']['source']} \t Page: {int(match['metadata']['page'])}")
    print("="*30)
    print(f"{match['metadata']['text']}")
    print("="*30)
    print()

In [13]:
# Similarity search for a question unrelated to the indexed paper; prints the top 3 matches.
query = ["Who is bill gates?"]
# embed_documents returns one vector per input text (a list of vectors).
query_embedding = embedding_model.embed_documents(query)
# Pinecone's `vector` argument takes one flat list of floats, so pass the
# single embedding explicitly by keyword rather than the enclosing list.
res = pinecone_index.query(vector=query_embedding[0], top_k=3, include_metadata=True)
for match in res['matches']:
    print("="*30)
    # The page value is cast to int before printing.
    print(f"Score: {match['score']:.2f} \t Source: {match['metadata']['source']} \t Page: {int(match['metadata']['page'])}")
    print("="*30)
    print(f"{match['metadata']['text']}")
    print("="*30)
    print()

Score: 0.76 	 Source: data/mypaper.pdf 	 Page: 3
2023-09-01 00:00:00

Score: 0.76 	 Source: data/mypaper.pdf 	 Page: 3
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . 14
4.1.3 Computer Vision . . .

Score: 0.76 	 Source: data/mypaper.pdf 	 Page: 3
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 2
2 Research

