In [3]:
import pandas as pd
import os 

In [14]:
import os
import pandas as pd
from collections import defaultdict

def build_topic_matrix(data_root="data", min_count_threshold=5):
    platform_folders = {
        "tiktok": "tiktok",
        "twitter": "twitter",
        "youtube": "youtube",
        "bluesky": "bluesky"
    }

    date_cols = {
        "tiktok": "create_time",
        "twitter": "date",
        "youtube": "publishedAt",
        "bluesky": "created_at"
    }

    # Prepare a nested dictionary: {outlet: {Platform: value}}
    outlet_matrix = defaultdict(dict)

    for platform_key, folder in platform_folders.items():
        folder_path = os.path.join(data_root, folder)
        if not os.path.isdir(folder_path):
            continue

        for file in os.listdir(folder_path):
            if file.endswith("_with_sections.csv"):
                try:
                    outlet_name = file.split("_")[0].lower()
                    file_path = os.path.join(folder_path, file)
                    df = pd.read_csv(file_path)

                    date_col = date_cols.get(platform_key.lower())
                    if date_col not in df.columns or "merged_section" not in df.columns:
                        outlet_matrix[outlet_name][platform_key.capitalize()] = "Missing columns"
                        continue

                    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
                    df = df.dropna(subset=[date_col])

                    if len(df) >= min_count_threshold:
                        topic_counts = df["merged_section"].value_counts(normalize=True).round(3)
                        outlet_matrix[outlet_name][platform_key.capitalize()] = topic_counts.to_dict()
                    else:
                        outlet_matrix[outlet_name][platform_key.capitalize()] = "Not enough data"
                except Exception as e:
                    outlet_matrix[outlet_name][platform_key.capitalize()] = f"Error: {e}"

    # Convert to final DataFrame
    rows = []
    for outlet, platform_data in outlet_matrix.items():
        row = {"Outlet": outlet}
        for platform in platform_folders.keys():
            row[platform.capitalize()] = platform_data.get(platform.capitalize(), "File not found")
        rows.append(row)

    return pd.DataFrame(rows)


In [15]:
df_matrix = build_topic_matrix(data_root="data")
df_matrix


Unnamed: 0,Outlet,Tiktok,Twitter,Youtube,Bluesky
0,foxnews,"{'Politics': 0.438, 'World News': 0.064, 'Ente...","{'Politics': 0.386, 'Sports': 0.089, 'Entertai...","{'Politics': 0.689, 'World': 0.16, 'Crime': 0....",File not found
1,npr,"{'Politics': 0.167, 'World News': 0.121, 'Brea...",File not found,"{'Politics': 0.241, 'World': 0.177, 'General N...",File not found
2,reuters,"{'Politics': 0.177, 'World News': 0.119, 'Ente...","{'Business': 0.218, 'Politics': 0.156, 'Financ...","{'World': 0.25, 'Politics': 0.207, 'Business &...","{'Business': 0.205, 'Sports': 0.202, 'Finance'..."
3,dailywire,"{'Politics': 0.273, 'Entertainment': 0.182, 'L...","{'Politics': 0.534, 'World News': 0.095, 'Inte...","{'Politics': 0.903, 'Entertainment': 0.032, 'J...",File not found
4,vice,"{'Entertainment': 0.442, 'Music': 0.116, 'Life...",File not found,"{'Arts & Culture': 0.407, 'Entertainment': 0.2...",File not found
5,huffpost,"{'Politics': 0.647, 'Sports': 0.088, 'Entertai...",File not found,"{'Politics': 0.444, 'Health': 0.333, 'Entertai...","{'Politics': 0.342, 'Entertainment': 0.167, 'L..."
6,nytimes,"{'Politics': 0.242, 'World News': 0.17, 'Break...","{'Sports': 0.276, 'Politics': 0.226, 'Opinion'...","{'Politics': 0.423, 'World': 0.282, 'Entertain...","{'Politics': 0.234, 'Sports': 0.071, 'World Ne..."
7,bbcnews,"{'Politics': 0.15, 'Entertainment': 0.14, 'Wor...",File not found,"{'World': 0.338, 'Unknown': 0.182, 'Politics':...",File not found
8,msnbc,"{'Politics': 0.712, 'Breaking News': 0.054, 'W...",File not found,"{'Politics': 0.674, 'World': 0.106, 'Breaking ...","{'Politics': 0.468, 'Opinion': 0.161, 'Breakin..."
9,nowthisimpact,"{'Politics': 0.473, 'Entertainment': 0.088, 'O...",File not found,"{'Politics': 0.473, 'Entertainment': 0.11, 'He...",File not found
