In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, LoggingHandler
import logging
from tqdm.notebook import tqdm
import re

In [None]:
# Directory prefix (note the trailing slash) that is prepended to the parquet
# file names built with str.format in the cells below.
path = 'Data/'

In [None]:
# Load the topics table from "<path>topics_embeddings.parquet"; the encoding
# cell further down reads its 'phrases_2' column.
texts_df = pd.read_parquet('{}topics_embeddings.parquet'.format(path))
texts_df  # bare expression: rendered as the notebook cell output

In [None]:
# Console output configuration: cap NumPy's array printing threshold at 100
# and route INFO-level log records through sentence_transformers'
# LoggingHandler with a timestamped format.
np.set_printoptions(threshold=100)
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Instantiate the Sentence Transformer model used for phrases that have no
# precomputed embedding.
model = SentenceTransformer(
    'sentence-transformers/distiluse-base-multilingual-cased-v2'
)

# Read the external embeddings file.
# NOTE(review): this is read from the current working directory, not from
# `path` like the other parquet files — confirm that is intended.
embeddings_df = pd.read_parquet('embeddings.parquet')

# Optional: ensure the embedding column holds NumPy arrays. Only cells that
# are Python lists are converted; any other value is left untouched.
embeddings_df['embeddings'] = embeddings_df['embeddings'].apply(
    lambda x: np.array(x) if isinstance(x, list) else x
)

# Build a phrase -> embedding lookup for known embeddings. Zipping the two
# columns yields the same mapping as looping over iterrows() (the last row
# still wins for a duplicated phrase) without building a Series per row.
embedding_lookup = dict(zip(embeddings_df['phrases'], embeddings_df['embeddings']))

# Encode function using known embeddings where possible
def model_encode_with_known(model):
    """Return one summed embedding per row of ``texts_df['phrases_2']``.

    Every phrase in a row is normalised first: each run of digits and/or
    non-word characters is replaced by a single space, then the phrase is
    stripped and lower-cased. If the normalised phrase is a key of the
    module-level ``embedding_lookup``, element ``[0]`` of the stored value is
    used; otherwise the phrase is encoded with ``model``. The vectors of all
    phrases in the row are added into a zero-initialised accumulator.

    Args:
        model: Object exposing ``encode(text, show_progress_bar=False)``;
            the notebook passes a ``SentenceTransformer``.

    Returns:
        A list with one ``numpy`` array of shape ``(512,)`` per row, in row
        order. A row with no phrases yields an all-zero vector.
    """
    # Accumulator size, kept from the original hard-coded value.
    # NOTE(review): presumably the output dimension of the loaded model —
    # confirm if the model is ever swapped.
    embedding_dim = 512

    # Compile once instead of resolving the pattern again for every phrase.
    non_word_run = re.compile("(\\d|\\W)+")

    embeddings = []
    # Iterate the Series itself rather than ``.items()``: the index labels
    # were never used, and a sized iterable lets tqdm show a total and an ETA.
    for phrases in tqdm(texts_df['phrases_2']):
        embedding = np.zeros(embedding_dim)
        for phrase in phrases:
            phrase = non_word_run.sub(" ", phrase).strip().lower()

            if phrase in embedding_lookup:
                # NOTE(review): only element 0 of the stored value is used,
                # so stored embeddings are presumably nested (e.g. 1 x 512).
                # If they are flat vectors this adds a single scalar to every
                # component instead — confirm against embeddings.parquet.
                emb = embedding_lookup[phrase][0]
            else:
                emb = model.encode(phrase, show_progress_bar=False)
            embedding += emb
        embeddings.append(embedding)
    return embeddings

# Apply encoding: store the list returned by model_encode_with_known (one
# summed vector per row) in a new 'embedding_2' column.
texts_df['embedding_2'] = model_encode_with_known(model)
texts_df  # bare expression: rendered as the notebook cell output

In [None]:
# Write the frame to "<path>topics_embeddings_2.parquet", a different file
# name from the one it was loaded from.
texts_df.to_parquet('{}topics_embeddings_2.parquet'.format(path))