# Set the Working Directory

In [None]:
import os

# Use the process's current working directory as the root for all paths below.
working_dir = os.getcwd()

# Stop immediately if that directory is not usable; otherwise confirm it.
if not os.path.isdir(working_dir):
    raise ValueError("Working directory does not exist")
print("Working directory is ready!")

# Set the Trace Directory

In [None]:
import yaml

# Load configuration file.
# It is decoded explicitly as UTF-8 so parsing does not depend on the
# platform's default text encoding (the other text files in this notebook
# are opened as UTF-8 as well).
config_path = os.path.join(working_dir, "configuration.yaml")
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Resolve trace directory: <working_dir>/data/<config["working_trace"]>
trace_dir = os.path.join(working_dir, "data", config["working_trace"])

if os.path.isdir(trace_dir):
    print(f"✅ Trace directory ready: {trace_dir}")
else:
    raise FileNotFoundError(f"❌ Trace directory not found: {trace_dir}\n")

In [None]:
from utils import read_file_ids

# Collect the video file IDs for this trace directory.
# overwrite=False is forwarded to the helper unchanged
# (presumably it reuses a previously stored listing; confirm in utils).
file_ids = read_file_ids(
    overwrite=False,
    trace_dir=trace_dir,
)

print(f"📁 Total video files found: {len(file_ids)}")

# Load User-Defined Metadata

In [None]:
import json
import pickle
from tqdm.notebook import tqdm

def read_metadata(trace_dir, file_ids):
    """Load the per-video metadata JSON files and measure how much is missing.

    For every ID in ``file_ids`` the file
    ``<trace_dir>/metadata/<file_id>.json`` is parsed and its ``desc``,
    ``contents`` and ``channelTags`` fields are kept.

    Args:
        trace_dir: Trace directory containing the ``metadata`` sub-directory.
        file_ids: Sized iterable of video IDs to load.

    Returns:
        A tuple ``(metadata, empty_desc_ratio, empty_tags_ratio)``.
        ``metadata`` maps each file ID to a dict with the keys ``desc``,
        ``contents`` and ``channelTags``. The ratios are the fraction of
        videos whose description is blank and whose channel tag list is
        empty, respectively. Both ratios are ``0.0`` when ``file_ids`` is
        empty.

    Raises:
        FileNotFoundError: If the metadata file of any ID does not exist.
    """
    total = len(file_ids)
    if total == 0:
        # Nothing to load; returning early avoids dividing by zero below.
        return {}, 0.0, 0.0

    metadata = {}
    empty_desc_count = 0
    empty_tags_count = 0

    for file_id in tqdm(file_ids, desc="Processing metadata"):
        metadata_path = os.path.join(trace_dir, "metadata", f"{file_id}.json")
        if not os.path.exists(metadata_path):
            raise FileNotFoundError(f"❌ Metadata file not found: {metadata_path}")

        with open(metadata_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # A JSON ``null`` description is normalised to "" so that the string
        # operations here and in later cells do not fail on None.
        desc = data.get("desc") or ""
        channel_tags = data.get("channelTags", [])

        metadata[file_id] = {
            "desc": desc,
            "contents": data.get("contents", ""),
            "channelTags": channel_tags,
        }

        if not desc.strip():
            empty_desc_count += 1
        if not channel_tags:
            empty_tags_count += 1

    return metadata, empty_desc_count / total, empty_tags_count / total

# Read every video's metadata, then report the share of blank descriptions
# and of empty channel-tag lists.
metadata, empty_desc_ratio, empty_tags_ratio = read_metadata(
    trace_dir=trace_dir,
    file_ids=file_ids,
)
print(
    f"📊 Empty description ratio: {empty_desc_ratio:.6f}",
    f"📊 Empty channel tag ratio: {empty_tags_ratio:.6f}",
    sep="\n",
)

# Process User-Defined Metadata

## Extract and Rank Top Hashtags by Frequency

In [None]:
from collections import defaultdict

# Count how often each (hashtagId, hashtagName) pair is attached to a video.
hashtag_counts = defaultdict(int)

for file_id, entry in metadata.items():
    # "contents" and "textExtra" may be missing, empty or null in the raw
    # metadata; all three cases are treated as "no items".
    for content in entry.get("contents") or []:
        for tag in content.get("textExtra") or []:
            if "hashtagId" in tag and "hashtagName" in tag:
                hashtag_key = (tag["hashtagId"], tag["hashtagName"])
                hashtag_counts[hashtag_key] += 1

# Sort hashtags by frequency (descending); the sort is stable, so hashtags
# with equal counts keep their first-seen order.
sorted_hashtags = sorted(hashtag_counts.items(), key=lambda x: -x[1])

print(f"🔢 Total unique hashtags: {len(sorted_hashtags)}")

# Display top 1% (or at least 10) for manual inspection.
# top_n is capped at the number of hashtags that actually exist so that code
# which uses it as a count never assumes more entries than are available.
top_fraction = 0.01
top_n = min(
    len(sorted_hashtags),
    max(10, int(len(sorted_hashtags) * top_fraction)),
)

print(f"\n📋 Top {top_fraction:.0%} most frequent hashtags:")
for (hashtag_id, hashtag_name), count in sorted_hashtags[:top_n]:
    print(f"#{hashtag_name} (ID: {hashtag_id}) : {count} times")

## Visualize Top Hashtags

In [None]:
import japanize_matplotlib  # Support Japanese text (e.g., #fypシ) in matplotlib
import matplotlib.pyplot as plt
import numpy as np

# Bar chart of the most frequent hashtags, saved as a PDF under
# <trace_dir>/figures and shown inline.
fig, ax = plt.subplots(figsize=(6, 3))

# Each entry of sorted_hashtags is ((hashtag_id, hashtag_name), count).
labels = [hashtag[0][1] for hashtag in sorted_hashtags[:top_n]]
y = [hashtag[1] for hashtag in sorted_hashtags[:top_n]]
x = range(1, len(y) + 1)

ax.bar(x, y, color="#00A4EF")

# Size the axis and ticks by the number of bars actually drawn, not by top_n:
# when fewer than top_n hashtags exist, the tick count would otherwise differ
# from the label count, which set_xticklabels rejects.
ax.set_xlim(0, len(y) + 1)
ax.set_xticks(np.arange(1, len(y) + 1, 1))
ax.set_xticklabels(labels, rotation=60, ha="right")
ax.tick_params(axis='x', labelsize=14)

ax.tick_params(axis='y', labelsize=14)
ax.set_ylabel('Frequencies', fontsize=18)

ax.grid(axis="y", ls='--', zorder=3)

output_dir = os.path.join(trace_dir, "figures")
os.makedirs(output_dir, exist_ok=True)
fig.savefig(os.path.join(output_dir, "top_hashtag_frequency_distribution.pdf"), bbox_inches="tight")

plt.show()

## Filter Meaningless Hashtags (e.g., `foryoupage`)

In [None]:
def select_meaningless_hashtags(trace_dir, sorted_hashtags, top_n, overwrite=False):
    """Return the hashtag names the user marked as meaningless.

    The selection is stored in ``<trace_dir>/meaningless_hashtags.pickle``.
    When that file exists and ``overwrite`` is false it is loaded and returned
    as is. Otherwise the user is asked about each of the first ``top_n``
    entries of ``sorted_hashtags`` and the answers are saved to that file.

    Args:
        trace_dir: Directory holding the pickle file.
        sorted_hashtags: Sequence of ``((hashtag_id, hashtag_name), count)``.
        top_n: Number of leading entries offered for review.
        overwrite: Ask again even if a stored selection exists.

    Returns:
        List of the selected hashtag names.
    """
    cache_file = os.path.join(trace_dir, "meaningless_hashtags.pickle")

    # Reuse the stored selection unless the caller wants to redo it
    if not overwrite and os.path.isfile(cache_file):
        with open(cache_file, "rb") as fh:
            return pickle.load(fh)

    print("\n📝 Manual Selection: Common but Meaningless Hashtags")
    print("Some videos include frequently used hashtags that are unrelated to their actual content.")
    print("These are often added to boost visibility or influence the platform's recommendation algorithm.")
    print("Please review the following hashtags and indicate whether to include them in the 'meaningless' list.")
    print("Press [y] to include, [n] to skip (default: n):\n")

    accept = {"yes", "y"}
    reject = {"no", "n"}
    chosen = []

    for idx, (hashtag_meta, count) in enumerate(sorted_hashtags[:top_n]):
        hashtag_name = hashtag_meta[1]

        # Keep asking until the answer is recognised; an empty answer means "no"
        while True:
            answer = input(f"{idx}. Include \"{hashtag_name}\"? [y/n]: ").strip().lower() or "no"
            if answer in accept or answer in reject:
                break
            print("❌ Invalid input. Please enter 'y' or 'n'.")

        if answer in accept:
            chosen.append(hashtag_name)

    # Persist the selection so later runs can skip the prompts
    with open(cache_file, "wb") as fh:
        pickle.dump(chosen, fh)

    return chosen

# Obtain the selection (stored or interactively chosen) and list it
meaningless_hashtags = select_meaningless_hashtags(
    trace_dir,
    sorted_hashtags,
    top_n,
    overwrite=False,
)
output = ", ".join([f"#{hashtag}" for hashtag in meaningless_hashtags])
print(f"\n📌 Selected common hashtags:\n{output}")

In [None]:
import re

# Build a single pattern that removes each meaningless hashtag only as a whole
# token:
#   "#name" is removed unless "name" is merely the prefix of a longer hashtag;
#   "name"  is removed when it stands alone as a word.
# Plain substring replacement would also cut these names out of longer words
# and hashtags (e.g. removing "fyp" turns "#fypage" into "age").
hashtag_names = sorted(
    {name for name in meaningless_hashtags if name},
    key=lambda name: (-len(name), name),  # longest first, deterministic order
)
hashtag_pattern = None
if hashtag_names:
    alternatives = "|".join(re.escape(name) for name in hashtag_names)
    hashtag_pattern = re.compile(rf"(?:#|(?<![\w#]))(?:{alternatives})(?!\w)")

for file_id in tqdm(metadata, desc="Processing metadata"):
    # A missing (None) description is treated as empty text
    text = metadata[file_id]["desc"] or ""

    # Remove meaningless hashtags
    if hashtag_pattern is not None:
        text = hashtag_pattern.sub("", text)

    metadata[file_id]["desc"] = text

# Save metadata to pickle
output_path = os.path.join(trace_dir, "metadata.pickle")

with open(output_path, "wb") as f:
    pickle.dump(metadata, f)

print(f"✅ Saved {len(metadata)} metadata to {output_path}")

## Pad Missing Metadata and Translate Non-English Metadata

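⚠️ Before running this cell, make sure the LLM-generated descriptions have been created by running [llm_generated_description.ipynb](llm_generated_description.ipynb). Videos whose user-defined description is empty are given a title generated from those descriptions.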

In [None]:
from deep_translator import GoogleTranslator
from openai import OpenAI
import yaml

def load_text(file_name, base_dir=None):
    """Return the text of an LLM-generated description file.

    Args:
        file_name: Name of the file inside
            ``<base_dir>/features/llm_generated_description``.
        base_dir: Trace directory to read from. When omitted, the
            notebook-level ``trace_dir`` is used.

    Returns:
        The file content as ``str``.
    """
    root = trace_dir if base_dir is None else base_dir
    description_file = os.path.join(root, "features", "llm_generated_description", file_name)
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the description files are written as UTF-8, like
    # the text files this notebook writes; confirm in the generating notebook.
    with open(description_file, encoding="utf-8") as file:
        return file.read()

def gpt_wrapper(client, platform, description, model="gpt-4o"):
    """Ask a chat model for a title that fits a video description.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``;
            this notebook passes an ``openai.OpenAI`` client.
        platform: Platform name inserted into the prompt (e.g. "TikTok").
        description: Text description of the video.
        model: Name of the chat model to query. Defaults to "gpt-4o".

    Returns:
        The content of the first returned choice, with surrounding whitespace
        removed.

    Raises:
        ValueError: If the first choice carries no text content.
    """
    prompt = (
        f"I will provide you with the description of a {platform} video. "
        "I want you to give a title to this video.\n"
        f"The description is: \"{description}\"\n"
        "Your answer is:"
    )
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    content = response.choices[0].message.content
    if content is None:
        # The API types the message content as optional; fail with a clear
        # message instead of an AttributeError on None.
        raise ValueError("The model returned no text content for the title request")
    return content.strip()

# Initialize OpenAI client
client = OpenAI(
    api_key=config["openai"]["api_key"],
)

# One text file per video is written to <trace_dir>/features/user_defined_metadata
output_dir = os.path.join(trace_dir, "features", "user_defined_metadata")
os.makedirs(output_dir, exist_ok=True)

for idx, file_id in enumerate(metadata, start=1):
    output_file = os.path.join(output_dir, f"{file_id}.txt")

    # Files from an earlier run are kept, so an interrupted run can be resumed
    if os.path.exists(output_file):
        print(f"✅ Metadata already exists: {file_id}.txt")
        continue

    print(f"🛠️ [{idx}/{len(metadata)}] Generating: {file_id}.txt")

    # User-defined description as currently stored in `metadata`
    desc = metadata[file_id]["desc"]

    if not desc.strip():
        # Blank description (empty or whitespace only, including tabs and
        # newlines): generate a title from the LLM-generated description
        desc = gpt_wrapper(client, "TikTok", load_text(f"{file_id}.txt"))
    else:
        # Translate metadata into English; keep the original text if the
        # translator returns nothing
        translation = GoogleTranslator(source="auto", target="en").translate(desc)
        if translation:
            desc = translation

    # Save processed metadata
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(desc)

    print(f"📦 Saved translated/generated metadata: {file_id}.txt")
    # Progress is counted against `metadata`, the collection being iterated
    print(f"📝 Metadata {idx}/{len(metadata)}: {desc}")