In [12]:
import pandas as pd
import json
from collections import defaultdict

# === Configuration ===
# Directory holding the raw LastFM Asia files (edges CSV, target CSV, features JSON).
DATA_DIR = "/Users/ananyaparikh/Documents/Coding/Force-DirectedHierarchyGraph/NewData2"

# A community is eligible when its node count lies in
# [MIN_COMMUNITY_SIZE, MAX_COMMUNITY_SIZE] (inclusive). The first N_COMMUNITIES
# eligible communities, in order of first appearance in target.csv, are kept.
MIN_COMMUNITY_SIZE = 300
MAX_COMMUNITY_SIZE = 800
N_COMMUNITIES = 5

# Output files, written to the current working directory. The next cell reads
# these exact names back, so the names themselves must not change.
FEATURES_OUT = "filtered_lastfm_features.json"
EDGES_OUT = "filtered_lastfm_edges.csv"
TARGET_OUT = "filtered_lastfm_target.csv"

# === Step 1: Load files ===
edges_df = pd.read_csv(f"{DATA_DIR}/lastfm_asia_edges.csv")
target_df = pd.read_csv(f"{DATA_DIR}/lastfm_asia_target.csv")

with open(f"{DATA_DIR}/lastfm_asia_features.json") as f:
    features_dict = json.load(f)

# === Group node IDs by community ===
# Node IDs are stored as strings because JSON object keys (features_dict) are
# always strings. Converting the whole column with astype(str) instead of
# calling str() on iterrows() rows avoids the per-row dtype upcasting that
# iterrows() performs, and is much faster.
community_to_nodes = defaultdict(list)
for community, node_id in zip(target_df['target'], target_df['id'].astype(str)):
    community_to_nodes[community].append(node_id)

# === Automatically pick N_COMMUNITIES communities within the size range ===
selected_communities = [
    c for c, nodes in community_to_nodes.items()
    if MIN_COMMUNITY_SIZE <= len(nodes) <= MAX_COMMUNITY_SIZE
][:N_COMMUNITIES]

# Warn (but keep going) when fewer communities qualify than requested.
if len(selected_communities) < N_COMMUNITIES:
    print(
        f"Not enough communities with {MIN_COMMUNITY_SIZE}–{MAX_COMMUNITY_SIZE} nodes. Only found:",
        len(selected_communities),
    )

# Collect node IDs from these communities
selected_nodes = set()
for c in selected_communities:
    selected_nodes.update(community_to_nodes[c])

print(f"Selected communities: {selected_communities}")
print(f"Total selected nodes: {len(selected_nodes)}")

# === Filter features.json ===
# Iterate the source dict rather than the set so the key order of the written
# JSON follows the input file and is stable from run to run.
filtered_features = {
    nid: feats for nid, feats in features_dict.items() if nid in selected_nodes
}

# === Filter edges.csv ===
# Keep only edges whose both endpoints are in the selected node set.
filtered_edges_df = edges_df[
    edges_df['node_1'].astype(str).isin(selected_nodes) &
    edges_df['node_2'].astype(str).isin(selected_nodes)
]

# === Filter target.csv ===
filtered_target_df = target_df[target_df['id'].astype(str).isin(selected_nodes)]

# === Save filtered files ===
with open(FEATURES_OUT, "w") as f:
    json.dump(filtered_features, f, indent=2)

filtered_edges_df.to_csv(EDGES_OUT, index=False)
filtered_target_df.to_csv(TARGET_OUT, index=False)

# Report the names that were actually written.
print("Filtered files saved as:")
for out_path in (FEATURES_OUT, EDGES_OUT, TARGET_OUT):
    print(f"• {out_path}")

Selected communities: [8, 3, 5, 6, 14]
Total selected nodes: 2599
Filtered files saved as:
• filtered_lastfm_features_5com.json
• filtered_lastfm_edges_5com.csv
• filtered_lastfm_target_5com.csv


In [13]:
# Read back the three filtered artifacts from the working directory.
edges = pd.read_csv("filtered_lastfm_edges.csv")
targets = pd.read_csv("filtered_lastfm_target.csv")
with open("filtered_lastfm_features.json") as features_file:
    features = json.load(features_file)

# Summary of the filtered subgraph: the node count is the number of entries in
# the features mapping, the edge count is the number of rows in the edge table.
summary_lines = (
    f"Number of nodes: {len(features)}",
    f"Number of edges: {len(edges)}",
    "Communities and counts:",
)
for summary_line in summary_lines:
    print(summary_line)

# Rows per community label in the filtered target table.
print(targets['target'].value_counts())


Number of nodes: 2599
Number of edges: 8121
Communities and counts:
target
6     655
14    570
3     515
8     468
5     391
Name: count, dtype: int64
