In [3]:
import pandas as pd
import os 

In [9]:
import os
import pandas as pd

def _summarize_topics(csv_path, date_col, min_count_threshold):
    """Return the topic share distribution for one CSV, or a status string.

    Rows whose ``date_col`` value cannot be parsed as a datetime are ignored.
    The result is ``{section: share}`` (shares rounded to 3 decimals) when at
    least ``min_count_threshold`` rows remain, otherwise "Not enough data";
    "Missing columns" is returned when ``date_col`` or ``merged_section`` is
    absent from the file.
    """
    df = pd.read_csv(csv_path)
    if date_col not in df.columns or "merged_section" not in df.columns:
        return "Missing columns"

    # Work on a filtered view instead of overwriting the date column.
    parsed_dates = pd.to_datetime(df[date_col], errors="coerce")
    dated = df.loc[parsed_dates.notna()]
    if len(dated) < min_count_threshold:
        return "Not enough data"

    return dated["merged_section"].value_counts(normalize=True).round(3).to_dict()


def build_topic_matrix(data_root="data", min_count_threshold=5):
    """Build an outlet x platform table of topic distributions.

    Each platform has its own sub-folder of ``data_root`` containing files
    named ``<outlet>_..._with_sections.csv``. The outlet is the part of the
    filename before the first underscore (case-insensitive), and a file only
    counts for a platform if the platform name also appears in its filename.

    Parameters
    ----------
    data_root : str
        Directory that contains the per-platform sub-folders.
    min_count_threshold : int
        Minimum number of rows with a parseable date required to report a
        distribution for a file.

    Returns
    -------
    pandas.DataFrame
        One row per outlet (sorted by name) with columns ``Outlet``,
        ``Tiktok``, ``Twitter``, ``Youtube`` and ``Bluesky``. Each platform
        cell is either a ``{section: share}`` dict or one of the status
        strings "Folder not found", "File not found", "Missing columns",
        "Not enough data" or "Error: <message>".
    """
    suffix = "_with_sections.csv"

    platform_folders = {
        "tiktok": "tiktok",
        "twitter": "twitter",
        "youtube": "youtube",
        "bluesky": "bluesky"
    }

    date_cols = {
        "tiktok": "create_time",
        "twitter": "date",
        "youtube": "publishedAt",
        "bluesky": "created_at"
    }

    # Scan every platform folder exactly once. Outlets are collected from all
    # platforms (not just one), so an outlet that only exists on, say, TikTok
    # still gets a row, and a missing folder never aborts the whole run.
    outlet_set = set()
    platform_index = {}   # platform -> {outlet: filename}
    platform_status = {}  # platform -> status text when the folder is unusable
    for platform_key, folder in platform_folders.items():
        folder_path = os.path.join(data_root, folder)
        if not os.path.isdir(folder_path):
            platform_status[platform_key] = "Folder not found"
            continue
        try:
            # os.listdir order is arbitrary; sort so the chosen file is stable.
            candidates = sorted(
                f for f in os.listdir(folder_path) if f.endswith(suffix)
            )
        except OSError as e:
            platform_status[platform_key] = f"Error: {e}"
            continue

        index = {}
        for f in candidates:
            outlet = f.split("_")[0].lower()
            if not outlet:
                continue
            outlet_set.add(outlet)
            # Match on the exact outlet token rather than a prefix, so "abc"
            # cannot pick up "abcnews_...". First file in sorted order wins.
            if platform_key in f.lower():
                index.setdefault(outlet, f)
        platform_index[platform_key] = index

    result = []
    for outlet in sorted(outlet_set):
        row = {"Outlet": outlet}
        for platform_key, folder in platform_folders.items():
            column = platform_key.capitalize()
            if platform_key in platform_status:
                row[column] = platform_status[platform_key]
                continue

            matched_file = platform_index[platform_key].get(outlet)
            if matched_file is None:
                row[column] = "File not found"
                continue

            try:
                row[column] = _summarize_topics(
                    os.path.join(data_root, folder, matched_file),
                    date_cols[platform_key],
                    min_count_threshold,
                )
            except Exception as e:
                # Best-effort: one unreadable file must not hide the others.
                row[column] = f"Error: {e}"
        result.append(row)

    # Explicit columns keep the frame's shape stable even with zero outlets.
    columns = ["Outlet"] + [key.capitalize() for key in platform_folders]
    return pd.DataFrame(result, columns=columns)


In [10]:
# Build the outlet-by-platform topic matrix from the CSVs under ./data,
# using the function's default min_count_threshold.
df_matrix = build_topic_matrix(data_root="data")
# Bare last expression so the notebook renders the frame as the cell output.
df_matrix


Unnamed: 0,Outlet,Tiktok,Twitter,Youtube,Bluesky
0,abc,File not found,"{'Politics': 0.214, 'Crime': 0.142, 'Entertain...",File not found,File not found
1,bbc,"{'Entertainment': 0.187, 'World': 0.175, 'Poli...",{'Entertainment': 1.0},File not found,File not found
2,breitbart,File not found,"{'Politics': 0.374, 'Entertainment': 0.108, 'G...",File not found,File not found
3,chicagotribune,File not found,"{'Sports': 0.19, 'Local News': 0.152, 'Politic...",File not found,File not found
4,cnn,"{'World': 0.281, 'Politics': 0.238, 'Breaking ...","{'Politics': 0.205, 'World': 0.205, 'Crime': 0...",File not found,"{'Politics': 0.225, 'Business & Economy': 0.12..."
5,dailywire,"{'Politics': 0.273, 'Entertainment': 0.182, 'L...","{'Politics': 0.534, 'World': 0.167, 'Defense &...",File not found,File not found
6,foxnews,"{'Politics': 0.449, 'World': 0.136, 'Crime': 0...","{'Politics': 0.39, 'Sports': 0.122, 'Crime': 0...",File not found,"{'Unknown': 0.993, 'I'm sorry, but I need the ..."
7,latimes,File not found,"{'Entertainment': 0.178, 'Sports': 0.132, 'Loc...",File not found,File not found
8,newsmax,File not found,"{'Politics': 0.506, 'Business & Economy': 0.11...",File not found,File not found
9,nypost,File not found,"{'Sports': 0.266, 'Entertainment': 0.126, 'Cri...",File not found,File not found
