In [17]:
import json
from collections import defaultdict
from pathlib import Path

import pandas as pd

# === Step 1: Load original data ===
# Every input file lives in the same directory; keeping that location in one
# constant means the notebook can be pointed at another checkout by editing a
# single line instead of every path.
DATA_DIR = Path("/Users/ananyaparikh/Documents/Coding/Force-DirectedHierarchyGraph/NewData2")

# Edge list (columns 'node_1' and 'node_2' are used further down) and the
# per-node community labels (columns 'id' and 'target' are used further down).
edges_df = pd.read_csv(DATA_DIR / "lastfm_asia_edges.csv")
target_df = pd.read_csv(DATA_DIR / "lastfm_asia_target.csv")

# Per-node features keyed by node id. JSON object keys are always strings,
# which is why node ids are converted with str() before lookups below.
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding.
with open(DATA_DIR / "lastfm_asia_features.json", encoding="utf-8") as f:
    features_dict = json.load(f)

# === Step 2: Group node IDs by community ===
# Maps community label -> list of node ids (as strings, so they can be looked
# up directly in the JSON-loaded features_dict).
#
# The two columns are iterated directly instead of using iterrows():
# iterrows() builds a Series per row and coerces the whole row to one dtype,
# so an integer id would be rendered as e.g. "12.0" if the frame ever
# contained a float column, and the feature lookup would then silently miss.
community_to_nodes = defaultdict(list)
for node_id, community in zip(target_df['id'], target_df['target']):
    community_to_nodes[community].append(str(node_id))

# === Step 3: Select up to 5 communities with 300–800 nodes each ===
# Selection parameters, named so the comment and the code cannot drift apart.
MIN_COMMUNITY_SIZE = 300
MAX_COMMUNITY_SIZE = 800
N_COMMUNITIES = 5

# Communities are considered in the insertion order of community_to_nodes;
# the first N_COMMUNITIES whose size is inside the inclusive range are kept.
communities_in_range = [
    c for c, nodes in community_to_nodes.items()
    if MIN_COMMUNITY_SIZE <= len(nodes) <= MAX_COMMUNITY_SIZE
]
selected_communities = communities_in_range[:N_COMMUNITIES]
if len(selected_communities) < N_COMMUNITIES:
    print("⚠️ Only found", len(selected_communities), "communities in the target size range.")

# Union of the node ids (strings) of every selected community.
selected_nodes = set()
for c in selected_communities:
    selected_nodes.update(community_to_nodes[c])

print(f"Selected communities: {selected_communities}")
print(f"Total selected nodes: {len(selected_nodes)}")

# === Step 4: Filter features ===
# Walk features_dict (i.e. the order of the source JSON) and keep the selected
# ids, rather than iterating the selected_nodes set: string hashing is
# randomized per interpreter session, so iterating the set would give the
# saved JSON a different key order on every kernel restart.
# Selected nodes that have no entry in features_dict are simply absent.
filtered_features = {
    nid: feats for nid, feats in features_dict.items() if nid in selected_nodes
}

# === Step 5: Filter edges ===
# Keep an edge only when BOTH endpoints belong to the selection. Ids are
# compared as strings because selected_nodes holds strings.
filtered_edges_df = edges_df[
    edges_df['node_1'].astype(str).isin(selected_nodes) &
    edges_df['node_2'].astype(str).isin(selected_nodes)
]

# === Step 6: Filter targets ===
# Community labels restricted to the selected nodes.
filtered_target_df = target_df[target_df['id'].astype(str).isin(selected_nodes)]

# === Step 7: Save as correctly formatted files ===
# One constant for the output directory and one variable per output file, so
# the names passed to open() and the names printed at the end cannot diverge.
OUTPUT_DIR = Path("/Users/ananyaparikh/Documents/Coding/Force-DirectedHierarchyGraph/NewData2")
features_path = OUTPUT_DIR / "filtered_lastfm_features_5com.json"
edges_path = OUTPUT_DIR / "filtered_lastfm_edges_5com.txt"
community_path = OUTPUT_DIR / "filtered_lastfm_community_5com.json"

# Save features as JSON
with open(features_path, "w") as f:
    json.dump(filtered_features, f, indent=2)

# Save edges as .txt file: one "node_1 node_2" pair per line, no header.
# The columns are zipped directly instead of using iterrows(), which is slow
# and coerces each row to a single dtype (ids could be written as floats).
with open(edges_path, "w") as f:
    f.writelines(
        f"{src} {dst}\n"
        for src, dst in zip(filtered_edges_df['node_1'], filtered_edges_df['node_2'])
    )

# Save community mapping as JSON: {node id as string: community label as int}.
# int() converts the label to a plain Python int, which json can serialize.
filtered_community_map = {
    str(node_id): int(community)
    for node_id, community in zip(filtered_target_df['id'], filtered_target_df['target'])
}
with open(community_path, "w") as f:
    json.dump(filtered_community_map, f, indent=2)

print("✅ Done! Filtered files saved to:")
for saved_path in (features_path, edges_path, community_path):
    print(f"• {saved_path.name}")

Selected communities: [8, 3, 5, 6, 14]
Total selected nodes: 2599
✅ Done! Filtered files saved to:
• filtered_lastfm_features_5com.json
• filtered_lastfm_edges_5com.txt
• filtered_lastfm_community_5com.json


In [16]:
# Sanity check: reload the files written by the filtering cell and report
# basic statistics of the filtered subset.
#
# NOTE(review): this cell previously read "filtered_lastfm_edges.csv",
# "filtered_lastfm_target.csv" and "filtered_lastfm_features.json" relative to
# the working directory. No cell visible in this notebook writes those files,
# so the check only ran when stale copies happened to exist. It now reads the
# "_5com" files that the filtering cell saves — confirm this is the intended
# data set.
FILTERED_DIR = Path("/Users/ananyaparikh/Documents/Coding/Force-DirectedHierarchyGraph/NewData2")

# The edge file is space-separated with no header row; name the columns the
# same way as the source edge list.
edges = pd.read_csv(
    FILTERED_DIR / "filtered_lastfm_edges_5com.txt",
    sep=" ",
    header=None,
    names=["node_1", "node_2"],
)

with open(FILTERED_DIR / "filtered_lastfm_features_5com.json", encoding="utf-8") as f:
    features = json.load(f)

# The community file is a JSON object {node id (string): community label}.
# Rebuild an 'id'/'target' table from it so the value_counts() call below
# works on a 'target' column.
with open(FILTERED_DIR / "filtered_lastfm_community_5com.json", encoding="utf-8") as f:
    community_map = json.load(f)
targets = pd.DataFrame(
    [(int(node_id), community) for node_id, community in community_map.items()],
    columns=["id", "target"],
)

# Basic stats
print(f"Number of nodes: {len(features)}")
print(f"Number of edges: {len(edges)}")
print("Communities and counts:")
print(targets['target'].value_counts())


Number of nodes: 2599
Number of edges: 8121
Communities and counts:
target
6     655
14    570
3     515
8     468
5     391
Name: count, dtype: int64
