In [None]:
#!pip install pinecone
#!pip install pandas



In [None]:
#Importing the standard-library helpers, data libraries, and the Pinecone library
import os
import time
from getpass import getpass

import numpy as np
import pandas as pd
from google.colab import drive
from pinecone import Pinecone
from pinecone import ServerlessSpec

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Location of the source CSV on the mounted Google Drive
file_path = "/content/drive/My Drive/Geography311/Data_for_Hitachi_1_Project.csv"

#Number of documents kept for this run (same value that is later used as the
#embedding batch size)
MAX_DOCS = 96

#Build the working frame in one chain so no column is assigned on a slice of
#another frame (assigning on the result of head() can raise SettingWithCopyWarning)
df = (
    pd.read_csv(file_path)
    #Rows without text would otherwise become the literal string "nan" below
    .dropna(subset=['text'])
    .head(MAX_DOCS)
    #The embedding service expects plain strings
    .assign(text=lambda frame: frame['text'].astype(str))
)

In [None]:
#Initialize the client connection to Pinecone.
#The API key is read from the PINECONE_API_KEY environment variable and, if that
#is not set, requested through a hidden prompt, so the secret is never stored in
#the notebook. Any key that was previously written in this cell must be treated
#as compromised and rotated in the Pinecone console.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
#Creating a serverless index with a dimension and a similarity metric
#based on the embedding model we will use to create the vector embeddings
index_name = "document--embeddings"

#Only create the index when it does not exist yet, so re-running this cell
#does not fail with a 409 ALREADY_EXISTS conflict
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        #The dimension must match the output size of the embedding model in use
        dimension=1024,
        #Cosine similarity is used to compare the embeddings
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2025-01', 'x-cloud-trace-context': '6a8cebd620f06f9d4dad2d6f17631b45', 'date': 'Mon, 31 Mar 2025 23:00:22 GMT', 'server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [None]:
def send_in_batches(data, batch_size=96, delay=5):
    """Embed the ``text`` of every record in ``data``, one batch per request.

    Args:
        data: Sequence of dicts that each contain a ``'text'`` key.
        batch_size: Maximum number of texts sent in a single request.
        delay: Seconds to wait between two consecutive requests, to stay
            below the service rate limit.

    Returns:
        A list with one embedding per record, in the same order as ``data``.
        The list is empty when ``data`` is empty.
    """
    collected = []
    for start in range(0, len(data), batch_size):
        # Pause only between requests: never before the first one and
        # never after the last one
        if start > 0:
            time.sleep(delay)

        batch = data[start:start + batch_size]

        # Send the batch to Pinecone's inference service
        response = pc.inference.embed(
            model="multilingual-e5-large",
            inputs=[d['text'] for d in batch],
            parameters={"input_type": "passage", "truncate": "END"}
        )

        # Keep every embedding of the batch instead of discarding the response
        collected.extend(response)

    return collected

# Build one {"id", "text"} record per DataFrame row
data = df.apply(lambda row: {"id": f"vec{row['Id']}", "text": row['text']}, axis=1).tolist()

# Embed all records in batches, waiting 5 seconds between requests, and keep
# the result so the vectors are not thrown away after the request
embeddings = send_in_batches(data, batch_size=96, delay=5)

# Show the first embedding as a quick sanity check
if embeddings:
    print(embeddings[0])

{'vector_type': dense, 'values': [0.0145263671875, -0.027496337890625, ..., -0.0460205078125, -0.01428985595703125]}


In [None]:
#Test input data: a list of dictionaries, each with an ID and a text field that holds the content of the document
#data = [
#    {"id": "vec1", "text": "Apple is a popular fruit known for its sweetness and crisp texture."},
#   {"id": "vec2", "text": "The tech company Apple is known for its innovative products like the iPhone."},
#   {"id": "vec3", "text": "Many people enjoy eating apples as a healthy snack."},
#    {"id": "vec4", "text": "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces."},
#    {"id": "vec5", "text": "An apple a day keeps the doctor away, as the saying goes."},
#    {"id": "vec6", "text": "Apple Computer Company was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership."}
#]




#Ask Pinecone's inference service for one embedding per document in `data`
embeddings = pc.inference.embed(
    #Only the text of each document is sent to the model
    inputs=[doc['text'] for doc in data],
    #Embedding model hosted by Pinecone
    model="multilingual-e5-large",
    #Additional parameters for the embedding process:
    # input_type="passage" marks the inputs as passages to be stored (as opposed to search queries)
    # truncate="END" cuts a text from the end if it exceeds the model's input length
    parameters=dict(input_type="passage", truncate="END"),
)

#Print the embedding of the first document in `data`; the result is a numerical vector
print(embeddings[0])

{'vector_type': dense, 'values': [0.0145263671875, -0.027496337890625, ..., -0.0460205078125, -0.01428985595703125]}


In [None]:
# Block until Pinecone reports the index as ready, checking once per second
while True:
    if pc.describe_index(index_name).status['ready']:
        break
    time.sleep(1)

# Handle used for all reads and writes against the index named index_name
index = pc.Index(index_name)

# Pair every document with its embedding and shape the pair into the record
# layout expected by upsert: id, vector values, and the text as metadata
vectors = [
    {
        "id": doc['id'],
        "values": emb['values'],
        "metadata": {'text': doc['text']},
    }
    for doc, emb in zip(data, embeddings)
]

# Write the records into the "ns1" namespace of the index
index.upsert(namespace="ns1", vectors=vectors)

{'upserted_count': 96}

In [None]:
#Fetch the current statistics of the Pinecone index (dimension, metric, fullness,
#and vector counts per namespace and in total) and print them
index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'ns1': {'vector_count': 6}},
 'total_vector_count': 6,
 'vector_type': 'dense'}


In [None]:
#Free-text question used for the similarity search (an example for the test data)
query = "Who talks about technology"

#Embed the question with the same model name used for the documents;
#input_type="query" marks the input as a search query
embedding = pc.inference.embed(
    inputs=[query],
    model="multilingual-e5-large",
    parameters=dict(input_type="query"),
)

In [None]:
#Look up the stored vectors that are closest to the query embedding
results = index.query(
    #The query vector that is compared against the stored vectors
    vector=embedding[0].values,
    #Namespace to search in
    namespace="ns1",
    #Return only the 3 most similar vectors
    top_k=3,
    #Return each match's metadata (the document text) ...
    include_metadata=True,
    #... but leave out the raw vector values
    include_values=False,
)

#Print the most similar matches
print(results)

{'matches': [{'id': 'vec63',
              'metadata': {'text': 'in article <1pi9btinnqa5@gapcaltechedu> '
                                   'keith@ccocaltechedu (keith allan '
                                   'schneider) writes:\n'
                                   '|> kmr4@pocwruedu (keith m ryan) writes:\n'
                                   '|> \n'
                                   '|> >>then why do people keep asking the '
                                   'same questions over and over?\n'
                                   '|> >because you rarely ever answer them\n'
                                   '|> \n'
                                   "|> nope i've answered each question posed "
                                   'and most were answered multiple\n'
                                   '|> times\n'
                                   '\n'
                                   "\the:   fifty dollars if i can't answer "
                                   'your question\n'
  