# Generate Embeddings

In [3]:
pip install sentence-transformers
pip install pinecone

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.4.1
Note: you may need to restart the kernel to use updated packages.


## Pinecone Client

In [25]:
from pinecone import Pinecone
import boto3
from sentence_transformers import SentenceTransformer
import json
import os
from pinecone import (
    Pinecone,
    ServerlessSpec,
    CloudProvider,
    AwsRegion,
    Metric,
    DeletionProtection,
    VectorType
)

# Initialize Pinecone and AWS
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'),
              environment="us-east1-aws")

s3_client = boto3.client('s3')

# Define the Pinecone index name and embeddings dimension
index_name = 'lands-between-index'
embedding_dimension = 768  # Update based on the embedding model you're using (e.g., 768 for BERT)

# Create the Pinecone index if it doesn't exist
if not any(index['name'] == index_name for index in pc.list_indexes()):
    pc.create_index(index_name, 
                    dimension=embedding_dimension,
                    spec=ServerlessSpec(cloud=CloudProvider.AWS,
                                        region=AwsRegion.US_EAST_1)
                   )

# Create an index instance
index = pc.Index(index_name)

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('bert-base-uncased')  # You can use any SentenceTransformer model


def retrieve_s3_files(bucket_name, prefix):
    """ Retrieve list of text file keys from an S3 bucket """
    response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    return [item['Key'] for item in response.get('Contents', []) if item['Key'].endswith('.txt')]


def read_s3_file(bucket_name, file_key):
    """ Read a file from S3 and return its content """
    response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
    return response['Body'].read().decode('utf-8')


def generate_embeddings(text_data):
    """ Generate embeddings for a given text """
    return model.encode(text_data).tolist()


def insert_into_pinecone(embeddings, metadatas):
    # Use string for id (convert integers to strings)
    vectors = [
        {"id": str(i), "values": embedding, "metadata": metadata} 
        for i, (embedding, metadata) in enumerate(zip(embeddings, metadatas))
    ]

    # Upsert the vectors
    index.upsert(vectors=vectors)


def process_s3_files(bucket_name, prefix):
    """ Process S3 files, generate embeddings, and store them in Pinecone """

    file_keys = retrieve_s3_files(bucket_name, prefix)

    all_embeddings = []
    all_metadatas = []
    
    for file_key in file_keys:
        #print(f"Generating Embeddings for {file_key}")
        text_data = read_s3_file(bucket_name, file_key)
        embeddings = generate_embeddings(text_data)

        # Create metadata for each embedding (you can include other info here if needed)
        metadata = {"file_name": file_key}

        all_embeddings.append(embeddings)
        all_metadatas.append(metadata)
    print("Upsert into pineceone.")
    # Insert the embeddings into Pinecone
    insert_into_pinecone(all_embeddings, all_metadatas)

    print(f"Processed {len(all_embeddings)} files and inserted embeddings into Pinecone.")

# Example usage
S3_BUCKET_NAME = 'webscrape-lands-between'
PREFIX = "scraped_data"
process_s3_files(S3_BUCKET_NAME, PREFIX)


No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


Upsert into pineceone.
Processed 391 files and inserted embeddings into Pinecone.
