# Set the Working Directory

In [None]:
import os

# Treat the process's current directory as the project root for this notebook.
working_dir = os.getcwd()

# Guard clause: abort immediately if that path is not an existing directory,
# otherwise report that setup can continue.
if not os.path.isdir(working_dir):
    raise ValueError("Working directory does not exist")

print("Working directory is ready!")

# Set the Trace Directory

In [None]:
import yaml

# Load configuration file from the working directory.
# The encoding is pinned to UTF-8 (matching how the text features are read
# further down) so parsing does not depend on the platform's default encoding.
config_path = os.path.join(working_dir, "configuration.yaml")
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Resolve trace directory: <working_dir>/data/<config["working_trace"]>
trace_dir = os.path.join(working_dir, "data", config["working_trace"])

# Stop here if the configured trace is missing; the later cells read from it.
if os.path.isdir(trace_dir):
    print(f"✅ Trace directory ready: {trace_dir}")
else:
    raise FileNotFoundError(f"❌ Trace directory not found: {trace_dir}\n")

# Generate Text Embeddings

In [None]:
import numpy as np
from openai import OpenAI
from tqdm.notebook import tqdm
import yaml

# Sub-directories of the trace directory that hold one ".txt" file per item.
# For each of them, a sibling "<feature>_embedding" directory is filled with
# one ".npy" embedding per text file.
text_features = [
    "features/user_defined_metadata",
    "features/llm_generated_description",
    "features/llm_generated_keywords",
]

# Initialize OpenAI client once; the same client serves every feature.
client = OpenAI(
    api_key=config["openai"]["api_key"],
)

for feature in text_features:
    text_dir = os.path.join(trace_dir, feature)

    # Maps file ID (filename without its ".txt" extension) -> file contents.
    texts = {}

    for filename in tqdm(os.listdir(text_dir)):
        if not filename.endswith(".txt"):
            continue  # Skip non-text files

        file_path = os.path.join(text_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        # Strip only the trailing extension: str.replace would also remove a
        # ".txt" occurring in the middle of the name and could merge two IDs.
        file_id = filename[: -len(".txt")]
        texts[file_id] = text

    print(f"✅ Loaded {len(texts)} file IDs from `{text_dir}`")

    # Build the output path from the relative feature name rather than from
    # `text_dir`, which already contains `trace_dir`.
    output_dir = os.path.join(trace_dir, f"{feature}_embedding")
    os.makedirs(output_dir, exist_ok=True)

    for idx, file_id in enumerate(texts, start=1):
        output_file = os.path.join(output_dir, f"{file_id}.npy")

        # Existing files are kept, so an interrupted run can be resumed.
        if os.path.exists(output_file):
            print(f"✅ Embedding already exists: {file_id}.npy")
            continue

        # The embeddings endpoint rejects an empty input string; skip such
        # files instead of aborting the remaining work.
        if not texts[file_id]:
            print(f"⚠️ [{idx}/{len(texts)}] Skipping empty text file: {file_id}.txt")
            continue

        print(f"🛠️ [{idx}/{len(texts)}] Generating: {file_id}.npy")

        # Generate text embeddings
        embedding = client.embeddings.create(
            input=texts[file_id],
            model="text-embedding-ada-002"
        )
        embedding = embedding.data[0].embedding

        # Save embeddings
        with open(output_file, "wb") as f:
            np.save(f, embedding)

        print(f"📦 Saved embedding: {file_id}.npy (shape: {np.shape(embedding)})")