In [None]:
# !pip install -qU \
#   langchain==0.0.276 \
#   openai==0.27.10 \
#   tiktoken==0.4.0 \
#   sentence-transformers==2.2.2 \
#   spacy==3.6.1 \
#   nltk==3.8.1 \
#   pinecone-client==2.2.2 \
#   pypdf==3.15.4

In [1]:
from langchain.embeddings import OpenAIEmbeddings   
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter, TokenTextSplitter, SpacyTextSplitter, SentenceTransformersTokenTextSplitter

import pinecone
import itertools
import time
import uuid
from tqdm.auto import tqdm

  from tqdm.autonotebook import tqdm


In [2]:
from config import OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_ENVIRONMENT, PINECONE_INDEX_NAME, EMBEDDING_MODEL, SPLITTER_CHUNK_SIZE, SPLITTER_CHUNK_OVERLAP

OpenAI Init

In [3]:
# Embedding client shared by the indexing cell (embed_documents) and the
# query cells below. Model name and API key both come from config.py.
embedding_model = OpenAIEmbeddings(
    model=EMBEDDING_MODEL,
    openai_api_key=OPENAI_API_KEY,
)

Pinecone Init

In [4]:
# Connect to Pinecone and make sure the target index exists before use.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT
)

# Vector size of text-embedding-ada-002.
# NOTE(review): this must match the output size of EMBEDDING_MODEL from
# config.py — confirm if a different embedding model is configured.
EMBEDDING_DIMENSION = 1536

if PINECONE_INDEX_NAME not in pinecone.list_indexes():
    # we create a new index if it doesn't exist
    pinecone.create_index(
        name=PINECONE_INDEX_NAME,
        metric='cosine',
        dimension=EMBEDDING_DIMENSION
    )
    # Index creation is asynchronous: a fixed one-second sleep does not
    # guarantee the index is usable, so poll its status until it is ready.
    while not pinecone.describe_index(PINECONE_INDEX_NAME).status['ready']:
        time.sleep(1)

pinecone_index = pinecone.Index(PINECONE_INDEX_NAME)
# Last expression of the cell: displays the current index statistics.
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [5]:
# Load the PDF, cut it into token-based chunks, embed every chunk and build
# the list of records that the next cell upserts into Pinecone.
PDF_PATH = "data/test.pdf"

loader = PyPDFLoader(PDF_PATH)
# NOTE(review): load_and_split() applies the loader's default text splitter
# on top of the per-page split, so `pages` may not map 1:1 to PDF pages —
# confirm whether loader.load() was intended.
pages = loader.load_and_split()
print(f"Pages shape: {len(pages)}")

text_splitter = TokenTextSplitter(chunk_size=SPLITTER_CHUNK_SIZE, chunk_overlap=SPLITTER_CHUNK_OVERLAP)

# Despite the name, these are token-window chunks, not linguistic sentences.
total_sentences = []
for idx, page in enumerate(pages):
    print(f"Page [{idx+1}]")
    sentences = text_splitter.split_text(page.page_content)
    print(f"Sentences shape: {len(sentences)}")
    total_sentences += sentences

paper_embedding = embedding_model.embed_documents(total_sentences)

# Deterministic IDs (uuid5 over source path, chunk position and chunk text)
# instead of random uuid4: re-running the notebook with the same input then
# overwrites the same vectors on upsert rather than inserting duplicates.
to_upsert = [
    {
        "id": str(uuid.uuid5(uuid.NAMESPACE_URL, f"{PDF_PATH}#{position}:{chunk_text}")),
        "values": chunk_vector,
        "metadata": {'text': chunk_text}
    }
    for position, (chunk_text, chunk_vector) in enumerate(zip(total_sentences, paper_embedding))
]






Pages shape: 16
Page [1]
Sentences shape: 29
Page [2]
Sentences shape: 33
Page [3]
Sentences shape: 5
Page [4]
Sentences shape: 18
Page [5]
Sentences shape: 23
Page [6]
Sentences shape: 32
Page [7]
Sentences shape: 34
Page [8]
Sentences shape: 32
Page [9]
Sentences shape: 37
Page [10]
Sentences shape: 34
Page [11]
Sentences shape: 35
Page [12]
Sentences shape: 41
Page [13]
Sentences shape: 40
Page [14]
Sentences shape: 11
Page [15]
Sentences shape: 13
Page [16]
Sentences shape: 13


In [6]:
# Push the prepared records to Pinecone in fixed-size batches.
batch_size = 32
n = len(to_upsert)
print(f"Total number: {n}")

for start in range(0, n, batch_size):
    # Slicing clamps at the end of the list, so the final (possibly shorter)
    # batch needs no special case.
    stop = min(n, start + batch_size)
    pinecone_index.upsert(vectors=to_upsert[start:stop])
    print(f"Uploaded batch [{start} : {stop}]")



Total number: 430
Uploaded batch [0 : 32]
Uploaded batch [32 : 64]
Uploaded batch [64 : 96]
Uploaded batch [96 : 128]
Uploaded batch [128 : 160]
Uploaded batch [160 : 192]
Uploaded batch [192 : 224]
Uploaded batch [224 : 256]
Uploaded batch [256 : 288]
Uploaded batch [288 : 320]
Uploaded batch [320 : 352]
Uploaded batch [352 : 384]
Uploaded batch [384 : 416]
Uploaded batch [416 : 430]


Test: query the index

In [8]:
# Sanity check: retrieve the five chunks closest to a question about the paper.
query = "Why the dot products get large"
# embed_query returns one flat vector for a single text, which is the shape
# the index query expects (embed_documents would return a list of vectors).
query_embedding = embedding_model.embed_query(query)
res = pinecone_index.query(vector=query_embedding, top_k=5, include_metadata=True)
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

0.83:  values of
dk, the dot products grow large in magnitude, pushing the softmax function into regions where it has
extremely small gradients4.
0.81:  values we then perform the attention function in parallel, yielding dv-dimensional
4To illustrate why the dot products get large, assume that the components
0.79:  small gradients4. To counteract this effect, we scale the dot products by1√dk.
3.2.2 Multi-Head
0.79: , additive attention outperforms
dot product attention without scaling for larger values of dk[3]. We suspect that for large values of
dk,
0.79: 1 Scaled Dot-Product Attention
We call our particular attention "Scaled Dot-Product Attention" (Figure 2). The input consists of



In [9]:
# Second sanity check: retrieve the five chunks closest to a training question.
query = "How is the training on the WMT 2014 dataset?"
# embed_query returns one flat vector for a single text, which is the shape
# the index query expects (embed_documents would return a list of vectors).
query_embedding = embedding_model.embed_query(query)
res = pinecone_index.query(vector=query_embedding, top_k=5, include_metadata=True)
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

0.82: les, at a fraction of the training cost of any of
the competitive models.
On the WMT 2014 English-to-French translation task
0.81:  the significantly larger WMT
2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece
vocabulary [
0.80: MT 2014 English-
to-German translation task, improving over the existing best results, including
ensembles, by over 2 BLEU
0.80: Parser Training WSJ 23 F1
Vinyals & Kaiser el al. (2014) [37] WSJ only, discriminative 88
0.80: .
For translation tasks, the Transformer can be trained significantly faster than architectures based
on recurrent or convolutional layers. On both WMT
