In [None]:
# !pip install -qU \
#   langchain==0.0.276 \
#   openai==0.27.10 \
#   tiktoken==0.4.0 \
#   sentence-transformers==2.2.2 \
#   spacy==3.6.1 \
#   nltk==3.8.1 \
#   pinecone-client==2.2.2 \
#   pypdf==3.15.4

In [7]:
from langchain.embeddings import OpenAIEmbeddings   
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter, TokenTextSplitter, SpacyTextSplitter, SentenceTransformersTokenTextSplitter

import pinecone
import itertools
import time
import uuid
from tqdm.autonotebook import tqdm

from config import OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_ENVIRONMENT, PINECONE_INDEX_NAME, EMBEDDING_MODEL, SPLITTER_CHUNK_SIZE, SPLITTER_CHUNK_OVERLAP

OpenAI Init

In [2]:
# Embedding client used for both document chunks and queries.
# The API key and model name are imported from the local `config` module.
embedding_model = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=OPENAI_API_KEY)

Pinecone Init

In [3]:
# Connect to Pinecone and make sure the target index exists before using it.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT
)

if PINECONE_INDEX_NAME not in pinecone.list_indexes():
    # Create the index on first run; later runs reuse the existing one.
    pinecone.create_index(
        name=PINECONE_INDEX_NAME,
        metric='cosine',
        dimension=1536  # 1536 dim of text-embedding-ada-002; must match EMBEDDING_MODEL
    )
    # A fixed one-second sleep does not guarantee the new index is usable,
    # so poll its reported status instead, with an upper bound so a failed
    # creation cannot hang the notebook forever.
    ready_deadline = time.monotonic() + 300
    while not pinecone.describe_index(PINECONE_INDEX_NAME).status['ready']:
        if time.monotonic() > ready_deadline:
            raise TimeoutError(
                f"Pinecone index '{PINECONE_INDEX_NAME}' was not ready after 300 seconds"
            )
        time.sleep(1)

pinecone_index = pinecone.Index(PINECONE_INDEX_NAME)
# Last expression of the cell: display the current index statistics.
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0043,
 'namespaces': {'': {'vector_count': 430}},
 'total_vector_count': 430}

In [4]:
# Load one PDF, cut it into token-based chunks, embed every chunk, and build
# the list of Pinecone records (`to_upsert`) consumed by the upload cell.
paper_list = ["data/paper1.pdf", "data/paper2.pdf", "data/paper3.pdf", "data/test.pdf"]
# Paper ingested by this run; change the index to ingest another entry.
paper_path = paper_list[0]

loader = PyPDFLoader(paper_path)
pages = loader.load_and_split()
print(f"Pages shape: {len(pages)}")

text_splitter = TokenTextSplitter(chunk_size=SPLITTER_CHUNK_SIZE, chunk_overlap=SPLITTER_CHUNK_OVERLAP)

# NOTE: despite the name, these are token-window chunks, not linguistic sentences.
total_sentences = []
for idx, page in enumerate(pages):
    print(f"Page [{idx+1}]")
    sentences = text_splitter.split_text(page.page_content)
    print(f"Sentences shape: {len(sentences)}")

    total_sentences.extend(sentences)

# Skip the embedding request entirely when the PDF produced no text.
paper_embedding = embedding_model.embed_documents(total_sentences) if total_sentences else []
assert len(paper_embedding) == len(total_sentences), "expected one embedding per chunk"

# Ids are derived from the paper path and chunk position, so re-running this
# cell overwrites the same records instead of adding duplicates to the index
# (random uuid4 ids made every re-run insert a fresh copy of the paper).
to_upsert = [
    {
        "id": str(uuid.uuid5(uuid.NAMESPACE_URL, f"{paper_path}#{chunk_idx}")),
        "values": sentence_vector,
        "metadata": {'text': chunk_text, 'source': paper_path}
    }
    for chunk_idx, (chunk_text, sentence_vector) in enumerate(zip(total_sentences, paper_embedding))
]


Pages shape: 42
Page [1]
Sentences shape: 47
Page [2]
Sentences shape: 10
Page [3]
Sentences shape: 38
Page [4]
Sentences shape: 31
Page [5]
Sentences shape: 43
Page [6]
Sentences shape: 9
Page [7]
Sentences shape: 40
Page [8]
Sentences shape: 11
Page [9]
Sentences shape: 41
Page [10]
Sentences shape: 19
Page [11]
Sentences shape: 37
Page [12]
Sentences shape: 18
Page [13]
Sentences shape: 39
Page [14]
Sentences shape: 19
Page [15]
Sentences shape: 38
Page [16]
Sentences shape: 9
Page [17]
Sentences shape: 45
Page [18]
Sentences shape: 10
Page [19]
Sentences shape: 40
Page [20]
Sentences shape: 15
Page [21]
Sentences shape: 41
Page [22]
Sentences shape: 14
Page [23]
Sentences shape: 39
Page [24]
Sentences shape: 13
Page [25]
Sentences shape: 37
Page [26]
Sentences shape: 37
Page [27]
Sentences shape: 12
Page [28]
Sentences shape: 38
Page [29]
Sentences shape: 42
Page [30]
Sentences shape: 11
Page [31]
Sentences shape: 46
Page [32]
Sentences shape: 26
Page [33]
Sentences shape: 50
Page 

In [5]:
# Upload the prepared records to Pinecone in slices of `batch_size`.
batch_size = 32
n = len(to_upsert)
print(f"Total number: {n}")

for start in range(0, n, batch_size):
    stop = min(n, start + batch_size)
    # Slicing up to `stop` yields a shorter final batch when n is not a multiple of batch_size.
    pinecone_index.upsert(vectors=to_upsert[start:stop])
    print(f"Uploaded batch [{start} : {stop}]")



Total number: 1307
Uploaded batch [0 : 32]
Uploaded batch [32 : 64]
Uploaded batch [64 : 96]
Uploaded batch [96 : 128]
Uploaded batch [128 : 160]
Uploaded batch [160 : 192]
Uploaded batch [192 : 224]
Uploaded batch [224 : 256]
Uploaded batch [256 : 288]
Uploaded batch [288 : 320]
Uploaded batch [320 : 352]
Uploaded batch [352 : 384]
Uploaded batch [384 : 416]
Uploaded batch [416 : 448]
Uploaded batch [448 : 480]
Uploaded batch [480 : 512]
Uploaded batch [512 : 544]
Uploaded batch [544 : 576]
Uploaded batch [576 : 608]
Uploaded batch [608 : 640]
Uploaded batch [640 : 672]
Uploaded batch [672 : 704]
Uploaded batch [704 : 736]
Uploaded batch [736 : 768]
Uploaded batch [768 : 800]
Uploaded batch [800 : 832]
Uploaded batch [832 : 864]
Uploaded batch [864 : 896]
Uploaded batch [896 : 928]
Uploaded batch [928 : 960]
Uploaded batch [960 : 992]
Uploaded batch [992 : 1024]
Uploaded batch [1024 : 1056]
Uploaded batch [1056 : 1088]
Uploaded batch [1088 : 1120]
Uploaded batch [1120 : 1152]
Uploaded

Test queries

In [8]:
# Retrieve the five stored chunks closest to a question and print score + text.
query = "How to treat patient with ACHD?"
# embed_query returns one flat vector; embed_documents would return a list of
# vectors, which is not the shape the `vector` argument of Index.query expects.
query_embedding = embedding_model.embed_query(query)
res = pinecone_index.query(vector=query_embedding, top_k=5, include_metadata=True)
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

0.87: with ACHD must be considered during HTx assessment.
RECOMMENDATION
17. We recommend particular attention be paid to the

0.87: oided in patients with AR.RECOMMENDATION
15. We recommend early referral for assessment of HTx
in patients with ACHD
0.87: ACHD patients.78It should be emphasized that observationalRECOMMENDATION
16. We recommend patients with ACHD undergo eval-

0.86: . ACHD patients should be referred early
and followed by transplant and ACHD teams to determine
optimal timing for transplant listing. HT
0.85: 
symptoms when alternate management options are no longer
effective and/or bene ﬁcial.
Practical tip. ACHD patients


In [7]:
# Retrieve the five stored chunks closest to a question and print score + text.
query = "How is the training on the WMT 2014 dataset?"
# embed_query returns one flat vector; embed_documents would return a list of
# vectors, which is not the shape the `vector` argument of Index.query expects.
query_embedding = embedding_model.embed_query(query)
res = pinecone_index.query(vector=query_embedding, top_k=5, include_metadata=True)
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

0.82: les, at a fraction of the training cost of any of
the competitive models.
On the WMT 2014 English-to-French translation task
0.81:  the significantly larger WMT
2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece
vocabulary [
0.80: MT 2014 English-
to-German translation task, improving over the existing best results, including
ensembles, by over 2 BLEU
0.80: Parser Training WSJ 23 F1
Vinyals & Kaiser el al. (2014) [37] WSJ only, discriminative 88
0.80: .
For translation tasks, the Transformer can be trained significantly faster than architectures based
on recurrent or convolutional layers. On both WMT
