In [None]:
# Step 1: Set up AWS S3 client to fetch data from the bucket
s3_client = boto3.client('s3')

# Define bucket and object details (modify these as needed)
bucket_name = 'your-bucket-name'
s3_file_key = 'your-data-file.csv'

# Local path the S3 object is downloaded to; named once so the download
# and the subsequent read cannot drift apart.
local_csv_path = 'local_data.csv'

# Download the CSV file from S3 to local
s3_client.download_file(bucket_name, s3_file_key, local_csv_path)

# Step 2: Load the dataset (assuming CSV format with a 'text' column)
df = pd.read_csv(local_csv_path)

# Empty CSV cells are read as NaN (a float), and blank strings carry no
# content to embed, so drop both and coerce the rest to str before the
# texts are handed to the embedding step.
texts = [
    entry
    for entry in df['text'].dropna().astype(str)
    if entry.strip()
]

# Step 3: Set up OpenAI API access to generate embeddings.
# NOTE: the vectors come from an OpenAI *embedding* model selected by
# LangChain's OpenAIEmbeddings wrapper, not from GPT-4.
import os

# Prefer the OPENAI_API_KEY environment variable so the secret does not have
# to live in the notebook; the placeholder is kept as a fallback to edit.
openai.api_key = os.environ.get('OPENAI_API_KEY') or 'your-openai-api-key'

# Initialize ChromaDB client and obtain the collection for embeddings.
# get_or_create_collection is used instead of create_collection because the
# latter raises when a collection with this name already exists.
client = chromadb.Client()
collection = client.get_or_create_collection("my_text_embeddings")

# Initialize LangChain's OpenAI embeddings wrapper. The key is passed
# explicitly: the wrapper takes its key from this argument or from the
# environment, so assigning openai.api_key alone does not configure it.
embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key)

# Step 4: Convert the text entries into embeddings (OpenAI embedding model via
# LangChain) and store them in ChromaDB.
# Entries are processed in batches so that each batch costs one embedding
# request and one insert instead of one of each per row.
batch_size = 100

for start in range(0, len(texts), batch_size):
    batch = texts[start:start + batch_size]

    # embed_documents returns one vector per input text, i.e. it is already
    # the list-of-vectors shape that collection.add expects -- it must not be
    # wrapped in another list.
    vectors = embeddings.embed_documents(batch)

    # ChromaDB requires a unique id for every record; derive a deterministic
    # one from the entry's position in `texts`.
    collection.add(
        ids=[f"doc-{start + offset}" for offset in range(len(batch))],
        documents=batch,
        embeddings=vectors,
    )

print("Embeddings stored in ChromaDB successfully.")

# Step 5: Query the vector store using natural language query
def query_chroma(query_text, n_results=5):
    """Return the stored entries most similar to a natural-language query.

    The query is embedded with the module-level ``embeddings`` wrapper and
    the resulting vector is searched against the module-level ``collection``.

    Args:
        query_text: The natural-language query string.
        n_results: Maximum number of nearest entries to retrieve
            (defaults to 5, the previously hard-coded value).

    Returns:
        The result object produced by ``collection.query``. Because a single
        query embedding is submitted, each per-query field (for example
        ``results['documents']``) holds exactly one inner list.
    """
    # Convert the natural language query into an embedding
    query_embedding = embeddings.embed_query(query_text)

    # Search for the most similar entries in the vector store; the embedding
    # is wrapped in a list because the API accepts a batch of queries.
    return collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
    )

# Example: Querying the vector store
query = "Find entries related to customer satisfaction."
results = query_chroma(query)

# Step 6: Display the retrieved results
# results['documents'] contains one list of documents per submitted query.
# Only one query was issued, so its matches are the first (and only) inner
# list; iterating the outer list would print a single "result" holding all
# of the documents at once.
for rank, document in enumerate(results['documents'][0], start=1):
    print(f"Result {rank}: {document}")
