# Create a mapping of all tags {'tag': frequency}

Extract tags from the profile data files and build a mapping from each tag to its frequency, saved as JSON.

In [8]:
import json
import os
from collections import Counter

# Profile data files containing tags.
# NOTE: paths are resolved relative to the current working directory, so this
# cell expects to be run from a directory one level below the project root.
base_dir = os.path.abspath("..")  # go one level back to root
general_profiles = os.path.join(base_dir, "data", "scraped_data", "general_profiles_data.json")
# The path gets its own name so it is not confused with the loaded data,
# which is bound to `profiles_by_comm` below.
profiles_by_comm_path = os.path.join(base_dir, "data", "scraped_data", "profiles_by_comm_data.json")
output_path = os.path.join(base_dir, "data", "tag_clustering", "all_tags.json")


def _profile_tags(profile):
    """Return the tags of a single profile record as a list.

    A missing or ``None`` "tags" entry yields an empty list. A bare string is
    treated as one tag rather than being split into characters.
    """
    tags = profile.get("tags") or []
    # NOTE(review): the 'N/A' filter below suggests missing tags may be stored
    # as a plain string instead of a list - confirm against the scraper output.
    if isinstance(tags, str):
        return [tags]
    return list(tags)


all_tags = []

# General profiles: iterated as {key: profile}.
with open(general_profiles, "r", encoding="utf-8") as file:
    profiles = json.load(file)

for profile in profiles.values():
    all_tags.extend(_profile_tags(profile))

# Profiles grouped by community: iterated as {key: {key: profile}}.
with open(profiles_by_comm_path, "r", encoding="utf-8") as file:
    profiles_by_comm = json.load(file)

for community in profiles_by_comm.values():
    for profile in community.values():
        all_tags.extend(_profile_tags(profile))

# Remove 'N/A' tag values
all_tags = [tag for tag in all_tags if tag != "N/A"]

# Count tag frequencies and save to JSON
tag_counts = dict(Counter(all_tags))

# Make sure the target directory exists so the write cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as out:
    json.dump(tag_counts, out, indent=4)

print(f"Total unique tags (excluding 'N/A'): {len(tag_counts)}")
print(f"Saved to: {output_path}")

Total unique tags (excluding 'N/A'): 2965
Saved to: C:\Users\nastya\thesis\tum-thesis\data\tag_clustering\all_tags.json
