# Text Embeddings in Pinecone Index

In [3]:
!pip install sentence-transformers
!pip install pinecone

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.4.1
Collecting pinecone
  Downloading pinecone-6.0.1-py3-none-any.whl.metadata (8.8 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone-6.0.1-py3-none-any.whl (421 kB)
Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone
Successfully installed pinecone-6.0.1 pinecone-plugin-interface-0.0.7


In [None]:
from pinecone import Pinecone
import boto3
from sentence_transformers import SentenceTransformer
import json
import os
from pinecone import (
    Pinecone,
    ServerlessSpec,
    CloudProvider,
    AwsRegion,
    Metric,
    DeletionProtection,
    VectorType
)

In [5]:

os.environ["PINECONE_API_KEY"] = "pcsk_4bEMf4_9Sn1sJL6a6vJL1Tu6nRqmssTwc5guzsTsyXKYJ7U8Vf14Hh7SdiX1oAgBg3Kuwn"
os.environ["PINECONE_ENV"] = "us-east1-aws"
s3_client = boto3.client('s3')


## Pinecone for Reddit Posts

In [10]:
# Initialize Pinecone and AWS
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'),
              environment="us-east1-aws")

# Define the Pinecone index name and embeddings dimension
index_name = 'lands-between-eldenringbuilds'
embedding_dimension = 768  # Update based on the embedding model you're using (e.g., 768 for BERT)

# Create the Pinecone index if it doesn't exist
if not any(index['name'] == index_name for index in pc.list_indexes()):
    pc.create_index(index_name, 
                    dimension=embedding_dimension,
                    spec=ServerlessSpec(cloud=CloudProvider.AWS,
                                        region=AwsRegion.US_EAST_1)
                   )


In [42]:
# Create an index instance
index = pc.Index(index_name)

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('bert-base-uncased')  # You can use any SentenceTransformer model

def retrieve_s3_files(bucket_name, prefix):
    """ Retrieve list of text file keys from an S3 bucket """
    response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    
    return [item['Key'] for item in response.get('Contents', []) if item['Key'].endswith('.json')]


def read_s3_file(bucket_name, file_key):
    """ Read a file from S3 and return its content """
    response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
    return response['Body'].read().decode('utf-8')


def generate_embeddings(text_data):
    """ Generate embeddings for a given text """
    return model.encode(text_data).tolist()

def insert_into_pinecone(vectors):
    """Upsert embeddings into Pinecone."""
    index.upsert(vectors=vectors)
    print(f"Inserted {len(vectors)} records into Pinecone.")


def process_s3_files(bucket_name, prefix):
    """Process S3 files containing Reddit JSON data, generate embeddings, and store in Pinecone """

    file_keys = retrieve_s3_files(bucket_name, prefix)  # Get list of files from S3
    vectors = []  # Store vectors to insert into Pinecone

    for file_key in file_keys:
        text_data = read_s3_file(bucket_name, file_key)  # Read JSON file from S3
        reddit_posts = json.loads(text_data)  # Parse JSON

        for post in reddit_posts:
            post_id = post["id"]
            subreddit = post["metadata"]["subreddit"]
            url = post["metadata"]["url"]
            author = post["metadata"]["author"]
            timestamp = post["metadata"]["timestamp"]

            # Process title embedding
            title_embedding = generate_embeddings(post["title"])
            vectors.append({
                "id": f"{post_id}-title",
                "values": title_embedding,
                "metadata": {
                    "type": "title",
                    "subreddit": subreddit,
                    "url": url,
                    "author": author,
                    "timestamp": timestamp
                }
            })

            # Process body embedding
            if post["body"]:
                body_embedding = generate_embeddings(post["body"])
                vectors.append({
                    "id": f"{post_id}-body",
                    "values": body_embedding,
                    "metadata": {
                        "type": "body",
                        "subreddit": subreddit,
                        "url": url,
                        "author": author,
                        "timestamp": timestamp
                    }
                })

            # Process comments embeddings
            for idx, comment in enumerate(post["comments"]):
                comment_embedding = generate_embeddings(comment)
                vectors.append({
                    "id": f"{post_id}-comment-{idx}",
                    "values": comment_embedding,
                    "metadata": {
                        "type": "comment",
                        "subreddit": subreddit,
                        "url": url,
                        "author": author,
                        "timestamp": timestamp
                    }
                })

        if vectors:  # Only insert if we have embeddings to upsert
            insert_into_pinecone(vectors)
            print(f"Processed {len(file_keys)} files and inserted {len(vectors)} embeddings into Pinecone.")
        else:
            print("No valid embeddings found to insert into Pinecone.")

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


In [43]:
# Example usage s3://webscrape-lands-between/reddit_data/EldenringBuilds/
S3_BUCKET_NAME = 'webscrape-lands-between'
PREFIX = "reddit_data/EldenringBuilds"
process_s3_files(S3_BUCKET_NAME, PREFIX)

Inserted 53 records into Pinecone.
Processed 2 files and inserted 53 embeddings into Pinecone.
Inserted 106 records into Pinecone.
Processed 2 files and inserted 106 embeddings into Pinecone.


## Pinecone for Webpage Scrapes

In [25]:
# Initialize Pinecone and AWS
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'),
              environment="us-east1-aws")

# Define the Pinecone index name and embeddings dimension
index_name = 'lands-between-index'
embedding_dimension = 768  # Update based on the embedding model you're using (e.g., 768 for BERT)

# Create the Pinecone index if it doesn't exist
if not any(index['name'] == index_name for index in pc.list_indexes()):
    pc.create_index(index_name, 
                    dimension=embedding_dimension,
                    spec=ServerlessSpec(cloud=CloudProvider.AWS,
                                        region=AwsRegion.US_EAST_1)
                   )

# Create an index instance
index = pc.Index(index_name)

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('bert-base-uncased')  # You can use any SentenceTransformer model


def retrieve_s3_files(bucket_name, prefix):
    """ Retrieve list of text file keys from an S3 bucket """
    response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    return [item['Key'] for item in response.get('Contents', []) if item['Key'].endswith('.txt')]


def read_s3_file(bucket_name, file_key):
    """ Read a file from S3 and return its content """
    response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
    return response['Body'].read().decode('utf-8')


def generate_embeddings(text_data):
    """ Generate embeddings for a given text """
    return model.encode(text_data).tolist()


def insert_into_pinecone(embeddings, metadatas):
    # Use string for id (convert integers to strings)
    vectors = [
        {"id": str(i), "values": embedding, "metadata": metadata} 
        for i, (embedding, metadata) in enumerate(zip(embeddings, metadatas))
    ]

    # Upsert the vectors
    index.upsert(vectors=vectors)


def process_s3_files(bucket_name, prefix):
    """ Process S3 files, generate embeddings, and store them in Pinecone """

    file_keys = retrieve_s3_files(bucket_name, prefix)

    all_embeddings = []
    all_metadatas = []
    
    for file_key in file_keys:
        #print(f"Generating Embeddings for {file_key}")
        text_data = read_s3_file(bucket_name, file_key)
        embeddings = generate_embeddings(text_data)

        # Create metadata for each embedding (you can include other info here if needed)
        metadata = {"file_name": file_key}

        all_embeddings.append(embeddings)
        all_metadatas.append(metadata)
    print("Upsert into pineceone.")
    # Insert the embeddings into Pinecone
    insert_into_pinecone(all_embeddings, all_metadatas)

    print(f"Processed {len(all_embeddings)} files and inserted embeddings into Pinecone.")

# Example usage
S3_BUCKET_NAME = 'webscrape-lands-between'
PREFIX = "scraped_data"
process_s3_files(S3_BUCKET_NAME, PREFIX)

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


Upsert into pineceone.
Processed 391 files and inserted embeddings into Pinecone.
