# Extend profile data by tag_to_cluster mapping

## Ceate a dict: {tag: cluster}

In [3]:
import json
import os

base_dir = os.path.abspath("..")
input_path = os.path.join(base_dir, "data", "tag_clustering", "tags_clustered.json")
output_path = os.path.join(base_dir, "data", "bipartite_network", "tag_to_cluster_mapping.json")

# Load file with clusters
with open(input_path, "r", encoding="utf-8") as f:
    clustered = json.load(f)

# Invert to {tag: cluster}
tag_to_cluster = {}
for cluster, tags in clustered.items():
    for tag in tags:
        tag_to_cluster[tag] = cluster

# Save file with mapping
with open(output_path, "w", encoding="utf-8") as out:
    json.dump(tag_to_cluster, out, indent=2)

print(f"Saved tag-to-cluster mapping to: {output_path}")
print(f"Total tags mapped: {len(tag_to_cluster)}")


Saved tag-to-cluster mapping to: C:\Users\nastya\thesis\tum-thesis\data\bipartite_network\tag_to_cluster_mapping.json
Total tags mapped: 2965


## extend profiles file by adding
        "tags": [
            "Lung disease",
            "Oxygen Therapy",
            "Respiratory failure"
        ],
        "tags_clusters" [
		        "Respiratory desease", 
		        "Respiratory desease", 
		        "Respiratory desease"
        ],

### general_profiles_data.json

In [6]:
import json
import os

base_dir = os.path.abspath("..")
general_profiles_path = os.path.join(base_dir, "data", "scraped_data", "general_profiles_data.json")
tag_map_path = os.path.join(base_dir, "data", "bipartite_network", "tag_to_cluster_mapping.json")
output_path = os.path.join(base_dir, "data", "bipartite_network", "general_profiles_data_with_tag_clusters.json")

# Load files
with open(general_profiles_path, "r", encoding="utf-8") as f:
    general_profiles = json.load(f)

with open(tag_map_path, "r", encoding="utf-8") as f:
    tag_to_cluster = json.load(f)

# Extend each profile with tag_clusters
for username, profile in general_profiles.items():
    tags = profile.get("tags", [])
    tag_clusters = [tag_to_cluster.get(tag, "N/A") for tag in tags]
    profile["tag_clusters"] = tag_clusters

# Save the updated profiles
with open(output_path, "w", encoding="utf-8") as out:
    json.dump(general_profiles, out, indent=2)

print(f"Extended profiles saved to: {output_path}")

Extended profiles saved to: C:\Users\nastya\thesis\tum-thesis\data\bipartite_network\general_profiles_data_with_tag_clusters.json


### profile_by_comm_data.json

In [4]:
import json
import os

base_dir = os.path.abspath("..")
profiles_path = os.path.join(base_dir, "data", "scraped_data", "profiles_by_comm_data.json")
tag_map_path = os.path.join(base_dir, "data", "bipartite_network", "tag_to_cluster_mapping.json")
output_path = os.path.join(base_dir, "data", "bipartite_network", "profiles_by_comm_data_with_tag_clusters.json")

# Load files
with open(profiles_path, "r", encoding="utf-8") as f:
    profiles_data = json.load(f)

with open(tag_map_path, "r", encoding="utf-8") as f:
    tag_to_cluster = json.load(f)

# Extend each profile with tag_clusters
for comm, users in profiles_data.items():
    for username, profile in users.items():
        tags = profile.get("tags", [])
        tag_clusters = [tag_to_cluster.get(tag, "N/A") for tag in tags]
        profile["tag_clusters"] = tag_clusters

# Save the updated profiles
with open(output_path, "w", encoding="utf-8") as out:
    json.dump(profiles_data, out, indent=2)

print(f"Extended profiles saved to: {output_path}")

Extended profiles saved to: C:\Users\nastya\thesis\tum-thesis\data\bipartite_network\profiles_by_comm_data_with_tag_clusters.json


# Create Bipartite Network

## General Pattern

### Create an edge list (source-target node): 3 cols
coachtrip | repiratory desease | 3 (weight for req of co-occurence)

In [7]:
import json
import os
import networkx as nx
from collections import defaultdict, Counter
import pandas as pd

base_dir = os.path.abspath("..")
file_path = os.path.join(base_dir, "data", "bipartite_network", "general_patterns", "general_profiles_data_with_tag_clusters.json")
output_path = os.path.join(base_dir, "data", "bipartite_network", "general_patterns", "cluster_co-occurrence_edges.json")

# Load file with profiles & tag-to-cluster mapping
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# 1. Create an edge list (user, cluster, weight)
# user - source node
# cluster - target node
# weight - for frequency of co-occurrence
edge_counts = defaultdict(int)
users_with_tags = set()

for user, info in data.items():
    clusters = info.get("tag_clusters", [])
    for cluster in clusters:
        if cluster != "N/A":
            edge_counts[(user, cluster)] += 1
            users_with_tags.add(user)

# 2. Create a bipartite graph (2 types of nodes: Users & Clusters)
# Input: nodes (source & target) & edge list
B = nx.Graph()

# Add nodes (Users & Clusters)
users = set()
clusters = set()

for (user, cluster), weight in edge_counts.items():
    users.add(user)
    clusters.add(cluster)
    B.add_node(user, bipartite=0)  # users
    B.add_node(cluster, bipartite=1)  # clusters
    B.add_edge(user, cluster, weight=weight)

# Visualize an edge list
edge_list = [(u, c, w) for (u, c), w in edge_counts.items()]
df = pd.DataFrame(edge_list, columns=["user", "cluster", "weight"])
print(df.head(10))

# 3. Project to cluster-cluster co-occurrence network
projected = nx.bipartite.weighted_projected_graph(B, clusters)

# 4. Save the projected network edge list (cluster co-occurrence)
cooccurrence_edges = [
    {"source": u, "target": v, "weight": d["weight"]}
    for u, v, d in projected.edges(data=True)
]

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(cooccurrence_edges, f, indent=2)

print(f"\nProjected cluster co-occurrence edges saved to: {output_path}")

print("\nSummary:")
print(f"Total users in file: {len(data)}")
print(f"Users with at least one cluster: {len(users_with_tags)}")
print(f"Total user-cluster edges: {len(edge_counts)}")
print(f"Unique clusters: {len(clusters)}")
print(f"Edges in projected cluster-cluster graph: {projected.number_of_edges()}")

         user                              cluster  weight
0   Coachtrip  Respiratory Conditions & Treatments       3
1  Wegeners88                  Autoimmune Diseases       1
2  Wegeners88      Metabolic & Endocrine Disorders       2
3  Wegeners88        Preventive Health & Screening       1
4  Wegeners88              Cardiovascular Diseases       2
5  Wegeners88         Hematology & Blood Disorders       1
6  Wegeners88        Lifestyle, Diet & Supplements       3
7  Tumorboy20               Neurological Disorders       2
8  Bodypump51              Cardiovascular Diseases       8
9  Bodypump51        Lifestyle, Diet & Supplements       4

Projected cluster co-occurrence edges saved to: C:\Users\nastya\thesis\tum-thesis\data\bipartite_network\general_patterns\cluster_co-occurrence_edges.json

Summary:
Total users in file: 11518
Users with at least one cluster: 8554
Total user-cluster edges: 22138
Unique clusters: 23
Edges in projected cluster-cluster graph: 253


## Community-Specific Patterns

### Create for EACH community a separate network:
coachtrip | repiratory desease | 3 (weight) | +1 new col: comm_url

In [8]:
import json
import os
import networkx as nx
from collections import defaultdict
import pandas as pd

base_dir = os.path.abspath("..")
input_path = os.path.join(base_dir, "data", "bipartite_network", "community-specific_patterns", "profiles_by_comm_data_with_tag_clusters.json")
output_dir = os.path.join(base_dir, "data", "bipartite_network", "community-specific_patterns", "cluster_co-occurrence_edges_by_comm")
os.makedirs(output_dir, exist_ok=True)

# Load file with profiles by community & tag-to-cluster mapping
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# total_communities = len(data)
valid_communities = 0
skipped_communities = 0

# Create for EACH community a SEPARATE network
for comm_url, users_dict in data.items():
    print(f"\nProcessing community: {comm_url}")
    edge_counts = defaultdict(int)
    
    # # 1. Create an edge list (user, cluster, weight)
    # user - source node
    # cluster - target node
    # weight - for frequency of co-occurrence
    for user, profile in users_dict.items():
        clusters = profile.get("tag_clusters", [])
        for cluster in clusters:
            if cluster != "N/A":
                edge_counts[(user, cluster)] += 1

    # Skip community where members have not set any tags
    if not edge_counts:
        print(f"Skipping {comm_url}: no user-cluster edges.")
        skipped_communities += 1
        continue
    valid_communities += 1
    
    # 2. Create a bipartite graph (2 types of nodes: Users & Clusters)
    # Input: nodes (source & target) & edge list
    B = nx.Graph()

    # Add nodes (Users & Clusters)
    users = set()
    clusters = set()
    
    for (user, cluster), weight in edge_counts.items():
        users.add(user)
        clusters.add(cluster)
        B.add_node(user, bipartite=0)
        B.add_node(cluster, bipartite=1)
        B.add_edge(user, cluster, weight=weight)
    
    # Visualize an edge list
    edge_list = [(user, cluster, weight, comm_url) for (user, cluster), weight in edge_counts.items()]
    df = pd.DataFrame(edge_list, columns=["user", "cluster", "weight", "comm_url"])
    print(df.head(10))
    
    # 3. Project to cluster-cluster co-occurrence network
    projected = nx.bipartite.weighted_projected_graph(B, clusters)
    
    # 4. Save the projected network edge list (cluster co-occurrence)
    cooccurrence_edges = [
        {"source": u, "target": v, "weight": d["weight"]}
        for u, v, d in projected.edges(data=True)
    ]
    
    # Get filename from "comm_url" (e.g. "/pmrgcauk" -> "pmrgcauk.json")
    community_filename = comm_url.strip("/").replace("/", "_") + "_co-occurrence.json"
    output_path = os.path.join(output_dir, community_filename)

    with open(output_path, "w", encoding="utf-8") as f_out:
        json.dump(cooccurrence_edges, f_out, indent=2)

    print(f"\nProjected cluster co-occurrence edges saved to: {output_path}")
    
print("\nSummary:")
print(f"Total communities: {len(data)}")
print(f"Communities with tags: {valid_communities}")
print(f"Communities skipped (no tags): {skipped_communities}")


Processing community: /pmrgcauk
            user                                            cluster  weight  \
0     DorsetLady                  Pain & Musculoskeletal Conditions       5   
1     DorsetLady                                Autoimmune Diseases       1   
2     DorsetLady                    Metabolic & Endocrine Disorders       1   
3     DorsetLady                      Lifestyle, Diet & Supplements       1   
4     DorsetLady  Reproductive & Sexual Health (Women's Health &...       1   
5  SheffieldJane                  Pain & Musculoskeletal Conditions       4   
6  SheffieldJane                    Metabolic & Endocrine Disorders       5   
7  SheffieldJane                                Autoimmune Diseases       3   
8  SheffieldJane                      Lifestyle, Diet & Supplements       1   
9  SheffieldJane                            Cardiovascular Diseases       3   

    comm_url  
0  /pmrgcauk  
1  /pmrgcauk  
2  /pmrgcauk  
3  /pmrgcauk  
4  /pmrgcauk  
5  /pmr