### Input PolicyCard Data

In [None]:
# NOTE(review): unfinished placeholder — the PolicyCard records must be assigned here
# before the notebook can run. The indexing cell further down calls data.iloc[...] and
# batch.iterrows(), so a pandas DataFrame (not a plain dict) appears to be expected,
# with 'id' and 'context' columns plus the metadata columns listed there — TODO confirm.
data = # dictionary

### Initialize the Embedding Model and Vector DB

In [10]:
# imports
from getpass import getpass
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

from tqdm.auto import tqdm
from uuid import uuid4

from langchain.vectorstores import Pinecone

In [4]:
# Ask for the OpenAI key interactively so it is never written into the notebook.
OPENAI_API_KEY = getpass("OpenAI API Key: ")

# Embedding model; `embed` is reused below for both document and query embeddings.
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(model=model_name, openai_api_key=OPENAI_API_KEY)

OpenAI API Key: ········


In [6]:
# Both values are shown in the Pinecone console (app.pinecone.io): the API key,
# and next to it the ENV (cloud region).
YOUR_API_KEY = getpass("Pinecone API Key: ")
YOUR_ENV = input("Pinecone environment: ")

index_name = 'langchain-retrieval-agent'
pinecone.init(api_key=YOUR_API_KEY, environment=YOUR_ENV)

# Only create the index when it does not exist yet, so re-running this cell is harmless.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(
        name=index_name,
        metric='dotproduct',
        dimension=1536,  # 1536 dim of text-embedding-ada-002
    )

Pinecone API Key: ········
Pinecone environment: gcp-starter


In [7]:
# connect to index
# The gRPC client is used for the upsert (indexing) step; the notebook later rebinds
# `index` to pinecone.Index before building the LangChain vector store.
index = pinecone.GRPCIndex(index_name)
# Last expression of the cell: its return value is shown as the cell output.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

### Indexing Framework

In [None]:
# Number of records embedded and upserted per request.
batch_size = 100

# Columns of `data` copied verbatim into each vector's Pinecone metadata.
metadata_fields = [
    'GeneratedFromUserNeedsIDs',
    'Category',
    'Effective Date',
    'PolicyMakers',
    'VotingStatus',
    'RegionalInfo',
    'Other',
]

# NOTE(review): this loop expects `data` to be a pandas DataFrame with 'id' and
# 'context' columns in addition to the metadata columns above — confirm against
# the input cell at the top of the notebook.
for i in tqdm(range(0, len(data), batch_size)):
    # get end of batch
    i_end = min(len(data), i + batch_size)
    batch = data.iloc[i:i_end]

    # Build one metadata dict per record. The raw context is stored under 'text'
    # because the vector store created at the end of this notebook is configured
    # with the "text" field as the source of each document's content; without it,
    # retrieval has no text to return.
    metadatas = []
    for _, record in batch.iterrows():
        metadata = {field: record[field] for field in metadata_fields}
        metadata['text'] = record['context']
        metadatas.append(metadata)

    # get the list of contexts / documents (plain list of strings, not a Series)
    documents = batch['context'].tolist()
    # create document embeddings
    embeds = embed.embed_documents(documents)
    # get IDs; Pinecone vector IDs must be strings
    ids = batch['id'].astype(str).tolist()
    # add everything to pinecone as (id, values, metadata) tuples
    index.upsert(vectors=list(zip(ids, embeds, metadatas)))

In [9]:
# check number of vectors in index
# (run after the indexing cell; total_vector_count stays 0 until records are upserted)

index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

### Creating a Vector Store

In [None]:
# Swap the gRPC client for the regular index client, which is what the
# LangChain wrapper is given here.
index = pinecone.Index(index_name)

# Name of the metadata field handed to the vector store as its text key.
text_field = "text"

vectorstore = Pinecone(index, embed.embed_query, text_field)