In [None]:
import sys
sys.path.append("../../")

from itops.storage.azure_blob.azure_blob_helper import AzureBlobHelper
from itops.config.configs import CONFIGS
from itops.storage.azure_blob.parquet_helper import ParquetHelper
from concurrent.futures import ThreadPoolExecutor
from sentence_transformers import SentenceTransformer

In [None]:
import os

# Logical CPU count as reported by the OS (os.cpu_count() may return None).
num_of_cores = os.cpu_count()

print(f"Number of CPU cores: {num_of_cores}")

In [None]:
# --- Run configuration -------------------------------------------------------
MODEL_NAME = "all-MiniLM-L6-v2"  # model name handed to SentenceTransformer in later cells
FILENAME = 'ITSM_Data-RUN0007-ITSM-PARQUET-MINI.parquet'
CONTAINER_NAME = 'itops'
ACCOUNT_NAME = 'stgtxtsql'

# The storage key is read from the project config object rather than being
# written into the notebook.
blob_helper = AzureBlobHelper(
    account_name=ACCOUNT_NAME,
    container_name=CONTAINER_NAME,
    account_key=CONFIGS.AZURE_BLOB_STORAGE_KEY,
)
file_helper = ParquetHelper(azure_blob_helper=blob_helper)

# Working frame; later cells add an "embeddings" column to it.
# NOTE(review): assumes read_file returns a pandas DataFrame (later cells use
# df.iloc / df.columns) -- confirm against ParquetHelper.
df = file_helper.read_file(FILENAME)

# Independent snapshot of the data as loaded. A plain `df_actual = df` would
# only alias the same object, so every later in-place change to `df` would
# show up in `df_actual` as well.
df_actual = df.copy()

In [None]:
# Preview the text column that gets embedded; show a bounded sample rather
# than the whole column.
df["themes"].head(10)

In [None]:
import numpy as np
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

class EmbeddingGenerator:
    """Attach sentence-embedding vectors to the ``themes`` column of a DataFrame.

    The ``SentenceTransformer`` is built lazily and cached once per thread, so a
    model is constructed at most once per worker thread instead of once per row.
    """

    def __init__(self, embedding_model_name):
        """Remember which model to use; no model is loaded here.

        Args:
            embedding_model_name (str): name passed to ``SentenceTransformer``
                when a model is first needed.
        """
        import threading  # function-scope import keeps this cell self-contained

        self.embedding_model_name = embedding_model_name
        # Per-thread cache. Each worker thread gets its own model instance
        # because sharing one model object across threads is not assumed to be
        # safe here.
        self._thread_state = threading.local()

    def _get_model(self):
        """Return the calling thread's cached model, building it on first use."""
        cached = getattr(self._thread_state, "cached", None)
        # Rebuild if this thread has no model yet, or if the configured name
        # was changed after the cached model was created.
        if cached is None or cached[0] != self.embedding_model_name:
            cached = (
                self.embedding_model_name,
                SentenceTransformer(self.embedding_model_name),
            )
            self._thread_state.cached = cached
        return cached[1]

    def get_embedding_query_vector(self, query, model_name=None):
        """Get the vector of the query.

        Args:
            query (string): user input
            model_name (string, optional): accepted for backward compatibility
                and ignored; the model is always the one named by
                ``self.embedding_model_name``.

        Returns:
            The value returned by the model's ``encode(query)``.
        """
        return self._get_model().encode(query)

    def generate_embedding_dataset(self, df):
        """Embed every value of ``df["themes"]`` and store it in ``df["embeddings"]``.

        Rows are encoded one at a time on a thread pool. ``executor.map`` yields
        results in input order, so embeddings line up with the rows of ``df``.

        Args:
            df: DataFrame with a ``themes`` column.

        Returns:
            The same ``df`` object that was passed in. It is modified in place:
            an ``embeddings`` column holding one NumPy array per row is added
            (or overwritten). Pass ``df.copy()`` to keep the input unchanged.
        """
        # Read the column directly instead of building one Series per row.
        themes = df["themes"].tolist()

        def embed_row(numbered_theme):
            # 1-based position used only for progress output; positional
            # numbering works for any index type (string, datetime, ...).
            row_number, content = numbered_theme
            embedding = self.get_embedding_query_vector(content, self.embedding_model_name)
            print(f"Completed EMBEDDING for ROW: {row_number}")
            return np.array(embedding)

        with ThreadPoolExecutor() as executor:
            embedding_list = list(executor.map(embed_row, enumerate(themes, start=1)))

        df["embeddings"] = embedding_list
        return df

# Example usage:
# df = pd.DataFrame({"themes": ["text1", "text2", "text3"]})
# generator = EmbeddingGenerator("your_model_name")
# df_with_embeddings = generator.generate_embedding_dataset(df)

In [None]:
generator = EmbeddingGenerator(MODEL_NAME)

# generate_embedding_dataset writes the "embeddings" column into the frame it
# receives and returns that same object. Passing a copy keeps
# `df_with_embeddings` independent of `df`, so the later comparison against the
# sequentially computed `df["embeddings"]` checks two separate results instead
# of comparing one column with itself.
df_with_embeddings = generator.generate_embedding_dataset(df.copy())

In [None]:
# Inspect the frame returned by the parallel run and fail fast if the expected
# column was not produced.
assert "embeddings" in df_with_embeddings.columns, "parallel run did not add an 'embeddings' column"
df_with_embeddings.columns

In [None]:
# Loaded models keyed by model name, so repeated calls (for example one call
# per DataFrame row) build each model only once.
_MODEL_CACHE = {}


def get_embedding_query_vector(query, model_name):
    """Get the vector of the query.

    Args:
        query (string): user input
        model_name (string): name passed to ``SentenceTransformer``; the
            resulting model is cached and reused for later calls that use the
            same name.

    Returns:
        The value returned by the model's ``encode(query)``.
    """
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _MODEL_CACHE[model_name] = model
    return model.encode(query)

In [None]:
# Sequential baseline: embed each row's "themes" value one after another and
# store the vectors on `df`.
embedding_list = []
for row_number in range(1, len(df) + 1):
    theme_text = df.iloc[row_number - 1]["themes"]
    embedding_list.append(get_embedding_query_vector(theme_text, MODEL_NAME))
    print(f"Completed {row_number} ROW")
df["embeddings"] = embedding_list

In [None]:
import numpy as np

# Gather each frame's per-row embeddings into a single NumPy array.
# NOTE(review): this check is only meaningful when `df_with_embeddings` and
# `df` are distinct objects; if the generator returned `df` itself, both sides
# read the same column.
arr1, arr2 = (
    np.array(frame["embeddings"].tolist())
    for frame in (df_with_embeddings, df)
)

# Exact comparison: same shape and every element identical.
equal_rows = np.array_equal(arr1, arr2)
print("Are all embeddings equal:", equal_rows)