In [2]:
from pinecone import Pinecone
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os
# import pandas as pd
load_dotenv()

True

In [35]:
class VectorSpaceAnalyzer:
    def __init__(self, api_key, index_name):  #environment, 
        self.api_key = api_key
        # self.environment = environment
        self.index_name = index_name
        self.pinecone = self.initialize_pinecone()
        self.index = self.pinecone.Index(self.index_name)
        self.all_vectors = None

    def initialize_pinecone(self):
        pinecone = Pinecone(api_key=self.api_key)
        index = pinecone.Index(self.index_name)
        # pinecone.init(api_key=self.api_key, environment=self.environment)
        return pinecone

    def fetch_vectors(self, batch_size=1000):
        if self.all_vectors is None:
            index_info = self.index.describe_index_stats()
            
            # Try different possible keys for total vector count
            total_vector_count = None
            for key in ['total_vector_count', 'totalVectorCount', 'vector_count']:
                if key in index_info:
                    total_vector_count = index_info[key]
                    break
            
            # If not found in top level, check in namespaces
            if total_vector_count is None and 'namespaces' in index_info:
                default_namespace = index_info['namespaces'].get('', {})
                for key in ['vector_count', 'vectorCount']:
                    if key in default_namespace:
                        total_vector_count = default_namespace[key]
                        break
            
            if total_vector_count is None:
                raise ValueError("Unable to determine total vector count from index stats")

            print(f"Total vector count: {total_vector_count}")
####
            # # pinecone = Pinecone(api_key=self.api_key)
            # # index = pinecone.Index(self.index_name)            
            # index_info = self.index.describe_index_stats()
            # print(index_info)
            # total_vector_count = index_info["totalVectorCount"]
            # # total_vector_count = index_info.get("totalVectorCount")
            # print(int(total_vector_count))
####



            vectors = []
            for i in range(0, int(total_vector_count), batch_size):
                ids = [str(j) for j in range(i, min(i+batch_size, total_vector_count))]
                response = self.index.fetch(ids)
                batch_vectors = [v['values'] for v in response['vectors'].values()]
                vectors.extend(batch_vectors)
            print(f"Fetched {len(vectors)} vectors.")
            self.all_vectors = np.array(vectors) #.reshape(-1, 1)
            print(f"Shape of all_vectors: {self.all_vectors.shape}")
        
        if self.all_vectors is None or len(self.all_vectors) == 0:
            raise ValueError("No vectors were fetched from the index.")
    
        if self.all_vectors.ndim == 1:
            self.all_vectors = self.all_vectors.reshape(1, -1)
        
        return self.all_vectors

    def find_optimal_clusters(self, max_clusters=20):
        vectors = self.fetch_vectors()

        
        if len(vectors) < 2:
            print("Not enough vectors to perform clustering.")
            return 1
    
        max_clusters = min(max_clusters, len(vectors) - 1)
        
        
        sse = []

        for k in range(1, max_clusters + 1):
            kmeans = KMeans(n_clusters=k, random_state=2024, n_init=10)
            kmeans.fit(vectors)
            sse.append(kmeans.inertia_)
        
        if len(sse) < 3:
            print("Not enough data points to determine optimal clusters. Using 1 cluster.")
            return 1

        # Determine the "elbow" point
        elbows = np.diff(sse, 2)
        optimal_clusters = np.argmax(elbows) + 2

        return optimal_clusters

    def analyze_vector_space(self):
        vectors = self.fetch_vectors()
        if len(vectors) < 2:
            print("Not enough vectors to perform analysis.")
            return None
        optimal_clusters = self.find_optimal_clusters()
        print(f"Optimal number of clusters: {optimal_clusters}")

        # Perform K-means clustering
        kmeans = KMeans(n_clusters=optimal_clusters, random_state=2024, n_init=10)
        cluster_labels = kmeans.fit_predict(vectors)

        # Perform PCA for visualization
        pca = PCA(n_components=2)
        vectors_2d = pca.fit_transform(vectors)

        # Visualize the clusters
        plt.figure(figsize=(10, 8))
        scatter = plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], c=cluster_labels, cmap='viridis')
        plt.colorbar(scatter)
        plt.title('PCA visualization of vector clusters')
        plt.xlabel('First Principal Component')
        plt.ylabel('Second Principal Component')
        plt.savefig('vector_clusters.png')
        plt.close()

        # Find centroids in original space
        centroids = kmeans.cluster_centers_

        # Function to find nearest vector to a given point
        def find_nearest_vector(point, vectors):
            distances = np.linalg.norm(vectors - point, axis=1)
            return np.argmin(distances)

        # Find nearest actual vectors to centroids
        nearest_to_centroids = [find_nearest_vector(centroid, vectors) for centroid in centroids]

        # Find outliers (e.g., points far from their cluster center)
        distances_to_centroid = np.min(kmeans.transform(vectors), axis=1)
        outlier_threshold = np.percentile(distances_to_centroid, 95)  # Top 5% as outliers
        outlier_indices = np.where(distances_to_centroid > outlier_threshold)[0]

        return {
            "cluster_labels": cluster_labels,
            "centroids": centroids,
            "nearest_to_centroids": nearest_to_centroids,
            "outlier_indices": outlier_indices
        }

    def generate_sample_queries(self, analysis_results, num_samples=5):
        cluster_labels, centroids, nearest_to_centroids, outlier_indices = (
            analysis_results["cluster_labels"],
            analysis_results["centroids"],
            analysis_results["nearest_to_centroids"],
            analysis_results["outlier_indices"]
        )

        # Generate sample queries for centroids
        centroid_queries = []
        for i, nearest_idx in enumerate(nearest_to_centroids):
            cluster_id = cluster_labels[nearest_idx]
            query = f"This is a sample query for the vectors in cluster {cluster_id}."
            centroid_queries.append(query)

        # Generate sample queries for outliers
        outlier_queries = []
        for i, outlier_idx in enumerate(outlier_indices[:num_samples]):
            query = f"This is a sample query for an outlier vector."
            outlier_queries.append(query)

        return {
            "centroid_queries": centroid_queries,
            "outlier_queries": outlier_queries
        }

In [36]:
# Example usage
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX = os.getenv("PINECONE_INDEX")
# api_key = "YOUR_API_KEY"
# environment = "YOUR_ENVIRONMENT"
# index_name = "YOUR_INDEX_NAME"

analyzer = VectorSpaceAnalyzer(PINECONE_API_KEY, PINECONE_INDEX) #environment,
analysis_results = analyzer.analyze_vector_space()
sample_queries = analyzer.generate_sample_queries(analysis_results)

print("Centroid sample queries:")
for query in sample_queries["centroid_queries"]:
    print(query)

print("\nOutlier sample queries:")
for query in sample_queries["outlier_queries"]:
    print(query)

Total vector count: 29754
Fetched 0 vectors.
Shape of all_vectors: (0,)


ValueError: No vectors were fetched from the index.