In [5]:
import pandas as pd
import openai
import os
from tqdm.auto import tqdm

# No arguments are passed, so the client presumably picks up its credentials
# from the environment -- confirm against the deployment setup.
client = openai.OpenAI()

# Embedding model configuration
EMBEDDING_MODEL = "text-embedding-3-large"
# Vector size requested from the API; forwarded as `dimensions` on every call.
EMBEDDING_DIMENSIONS = 3072

# --- 2. Load Data From Excel File ---
file_path = '../data/knowledge_base/additional_sources_ifrs.xlsx'

try:
    print(f"Loading data from '{file_path}'...")
    df = pd.read_excel(file_path, engine='openpyxl', sheet_name='Summary')
    print(f"Successfully loaded {len(df)} rows.")
except Exception as e:
    print(f"An error occurred while reading the Excel file: {e}")
    # Nothing below can run without the data, so stop here. SystemExit(1) is
    # used instead of the bare `exit()` helper: `exit` is injected by the
    # `site` module (not guaranteed to exist) and, called without an argument,
    # reports success (status 0) even though loading failed.
    raise SystemExit(1) from e

# --- 3. Define Embedding Function ---
def get_embedding(text: str, model: str, dimensions: int) -> list[float]:
    """Generate an embedding vector for ``text`` using the OpenAI API.

    This function is deliberately best-effort: it never raises, so a single
    bad row cannot abort a whole batch run.

    Args:
        text: Text to embed. Non-string values are coerced with ``str()``;
            missing values (``None`` or float NaN) are treated as empty.
        model: Name of the embedding model to request.
        dimensions: Size of the embedding vector to request.

    Returns:
        The embedding taken from the first item of the API response, or an
        empty list when the text is missing/blank or the request fails.
    """
    try:
        # A missing cell arrives as None or NaN (NaN is the only float that is
        # not equal to itself). str() would turn those into the literal words
        # "None"/"nan", which would then be embedded as if they were content.
        if text is None or (isinstance(text, float) and text != text):
            return []
        # Ensure text is a non-empty string
        cleaned = str(text).strip()
        if not cleaned:
            return []
        response = client.embeddings.create(
            input=[cleaned], model=model, dimensions=dimensions
        )
        return response.data[0].embedding
    except Exception as e:
        print(f"An error occurred while getting embedding: {e}")
        return []  # Return an empty list to avoid breaking the process


# --- 4. Process DataFrame ---
def _blank_if_missing(value):
    """Return '' for a missing cell (None/NaN), otherwise the value unchanged."""
    return '' if pd.isna(value) else value


# Combine relevant columns into a single text field for better contextual embedding.
# Empty Excel cells are read as NaN; they are blanked first so the literal
# text "nan" does not end up inside the embedded text.
df['combined_text'] = df.apply(
    lambda row: (
        f"Source: {_blank_if_missing(row['Source'])}\n"
        f"Label: {_blank_if_missing(row['Label'])}\n\n"
        f"{_blank_if_missing(row['Main text'])}"
    ),
    axis=1
)

# Truncate to the first 5000 characters to manage cost and token limits
df['truncated_text'] = df['combined_text'].str[:5000]

# Generate embeddings with a progress bar
tqdm.pandas(desc=f"Generating {EMBEDDING_MODEL} embeddings")
df['embedding'] = df['truncated_text'].progress_apply(
    lambda text: get_embedding(text, model=EMBEDDING_MODEL, dimensions=EMBEDDING_DIMENSIONS)
)

# get_embedding returns an empty list instead of raising, so failures would
# otherwise be written to disk unnoticed. Count them and report.
failed_count = int(df['embedding'].map(len).eq(0).sum())
if failed_count:
    print(
        f"\nWarning: {failed_count} of {len(df)} rows have an empty embedding "
        "(blank text or API error)."
    )

# --- 5. Save to Parquet File ---
# Only the combined text and its embedding are written; the original sheet
# columns and the truncated_text helper column are left out of the file.
# Note that the saved text is the full combined_text, while the embedding was
# computed from its first 5000 characters.
cols = ['combined_text', 'embedding']
# Define a descriptive output filename and save the result
output_filename = 'ifrs_knowledge_base_with_embeddings.parquet'
df[cols].to_parquet(output_filename, engine='pyarrow')

print(f"\nSuccessfully processed {len(df)} rows.")
print(f"DataFrame with embeddings saved to '{output_filename}'.")

Loading data from '../data/knowledge_base/additional_sources_ifrs.xlsx'...
Successfully loaded 52 rows.


Generating text-embedding-3-large embeddings:   0%|          | 0/52 [00:00<?, ?it/s]


Successfully processed 52 rows.
DataFrame with embeddings saved to 'ifrs_knowledge_base_with_embeddings.parquet'.
