# Set the Working Directory

In [None]:
import os

# Use the current working directory of the process as the project root.
working_dir = os.getcwd()

# Guard clause: stop immediately when the directory cannot be found.
if not os.path.isdir(working_dir):
    raise ValueError("Working directory does not exist")

print("Working directory is ready!")

# Set the Trace Directory

In [None]:
import yaml

# Load configuration file.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII characters in
# the YAML file.
config_path = os.path.join(working_dir, "configuration.yaml")
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Resolve trace directory: <working_dir>/data/<config["working_trace"]>
trace_dir = os.path.join(working_dir, "data", config["working_trace"])

# Fail early when the trace directory is missing.
if os.path.isdir(trace_dir):
    print(f"✅ Trace directory ready: {trace_dir}")
else:
    raise FileNotFoundError(f"❌ Trace directory not found: {trace_dir}\n")

# Load Descriptions and Metadata
⚠️ Before running this cell, make sure the [LLM-generated descriptions have been created](llm_generated_description.ipynb) and the [user-defined metadata has been processed](user_defined_metadata.ipynb).

In [None]:
from utils import load_pickle_file

# Locate the two pickle files inside the trace directory.
metadata_path = os.path.join(trace_dir, "metadata.pickle")
descriptions_path = os.path.join(trace_dir, "descriptions.pickle")

# Load them in the same order as before: metadata first, then descriptions.
metadata = load_pickle_file(metadata_path)
descriptions = load_pickle_file(descriptions_path)

# Generate LLM-Generated Keywords

In [None]:
from openai import OpenAI
import yaml

def gpt_wrapper(client, platform, keyword_pool, description, metadata, channeltags, model="gpt-4o"):
    """Ask a chat model to pick keywords for one video from a fixed pool.

    A single user message is built from the arguments and sent through
    ``client.chat.completions.create``.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        platform: Platform name inserted into the prompt (e.g. "TikTok").
        keyword_pool: The candidate keywords, already formatted as text.
        description: Description of the video, inserted into the prompt.
        metadata: Metadata text for the video; may be empty.
        channeltags: Auxiliary text for the video; may be empty.
        model: Name of the chat model to query. Defaults to "gpt-4o", the
            value that was previously hard-coded.

    Returns:
        ``response.choices[0].message.content`` exactly as returned by the
        client (no post-processing is applied).
    """
    prompt = (
        f"I will give you the description, metadata, and auxiliaries of a {platform} video. "
        f"I will give you a pool of keywords: {keyword_pool}. "
        "I want you to select keywords related to the video from this pool based on the description, metadata, and auxiliaries. "
        "I want you to only reply with the keywords and nothing else. "
        "Note that the metadata and auxiliaries may be empty or may provide no additional information.\n"
        f"The description is: \"{description}\"\n"
        f"The metadata is: \"{metadata}\"\n"
        f"The auxiliaries are: \"{channeltags}\"\n\n"
        "Your answer is:"
    )

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    # Only the first choice is used.
    return response.choices[0].message.content

# Initialize OpenAI client with the API key from the loaded configuration
client = OpenAI(
    api_key=config["openai"]["api_key"],
)

# One text file per item is written here; existing files act as a cache,
# so re-running the cell only generates what is still missing.
output_dir = os.path.join(trace_dir, "features", "llm_generated_keywords")
os.makedirs(output_dir, exist_ok=True)

# Loop-invariant, only used for progress messages.
total_files = len(metadata)

for idx, file_id in enumerate(metadata, start=1):
    output_file = os.path.join(output_dir, f"{file_id}.txt")

    if os.path.exists(output_file):
        print(f"✅ Keywords already exist: {file_id}.txt")
        continue

    print(f"🛠️ [{idx}/{total_files}] Generating: {file_id}.txt")

    # Generate keywords based on LLM-generated descriptions and user-defined metadata
    keywords = gpt_wrapper(
        client=client,
        platform="TikTok",
        keyword_pool=", ".join(config["keyword_pool"]),
        description=descriptions[file_id],
        metadata=metadata[file_id]["desc"],
        channeltags=", ".join(metadata[file_id]["channelTags"])
    )

    # The API may return no text content. Check BEFORE opening the file:
    # opening in "w" mode creates it, and a failed write would leave an
    # empty file behind that later runs would mistake for a finished result.
    if keywords is None:
        print(f"⚠️ No keywords returned for {file_id}; skipping so it can be retried")
        continue

    # Save keywords
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(keywords)

    print(f"📦 Saved keywords: {file_id}.txt")
    print(f"📝 Keywords {idx}/{total_files}: {keywords}")

In [None]:
import os
import pickle
from tqdm.notebook import tqdm

# Collect the per-item keyword files into one dict and store it as a pickle.
keyword_dir = os.path.join(trace_dir, "features", "llm_generated_keywords")
output_path = os.path.join(trace_dir, "keywords.pickle")

# Maps each id in `metadata` to the raw text of its keyword file.
keywords = {}

for file_id in tqdm(metadata, desc="Loading keywords"):
    keyword_file = os.path.join(keyword_dir, f"{file_id}.txt")

    # The file is opened in text mode, so read() always returns str;
    # no bytes check or decode step is needed.
    with open(keyword_file, "r", encoding="utf-8") as f:
        keywords[file_id] = f.read()

# Save keywords to pickle
with open(output_path, "wb") as f:
    pickle.dump(keywords, f)

print(f"✅ Saved {len(keywords)} keywords to {output_path}")