In [1]:
import pandas as pd
import numpy as np

In [2]:
# read data
data = pd.read_csv('../data/comments_data.csv')

# ignore columns parent_id
data = data.drop(columns=['parent_id'])

data.head()

Unnamed: 0,link,comment_id,username,text,created_at,like_count,reply_count
0,https://www.instagram.com/p/DOV0Rn2DNf-/,17916292923173899,lorenzoabim,Anjir Dahnil kebagian kue nya :),1757333808,192,16
1,https://www.instagram.com/p/DOV0Rn2DNf-/,18095910739726736,fndnrz,@risqi_aguspianto hahaha bukan pendukung prabo...,1757372912,0,0
2,https://www.instagram.com/p/DOV0Rn2DNf-/,17980524434883939,muchlizdarma1,@lorenzoabim kerjanya setaun sekali,1757387323,0,0
3,https://www.instagram.com/p/DOV0Rn2DNf-/,18173137474359052,ikbarsinaga999,@al_hazhouse.land Kaesang hahahha,1757676329,0,0
4,https://www.instagram.com/p/DOV0Rn2DNf-/,18038689634666892,rahmadsejati,"harapan rakyat sederhana 🙏 kerja nyata, hasil ...",1757334122,757,2


In [4]:
# convert comment_time to datetime
data['created_at'] = pd.to_datetime(data['created_at'], unit='s')

# extract date from comment_time
data['date'] = data['created_at'].dt.date

data.head()

Unnamed: 0,link,comment_id,username,text,created_at,like_count,reply_count,date
0,https://www.instagram.com/p/DOV0Rn2DNf-/,17916292923173899,lorenzoabim,Anjir Dahnil kebagian kue nya :),2025-09-08 12:16:48,192,16,2025-09-08
1,https://www.instagram.com/p/DOV0Rn2DNf-/,18095910739726736,fndnrz,@risqi_aguspianto hahaha bukan pendukung prabo...,2025-09-08 23:08:32,0,0,2025-09-08
2,https://www.instagram.com/p/DOV0Rn2DNf-/,17980524434883939,muchlizdarma1,@lorenzoabim kerjanya setaun sekali,2025-09-09 03:08:43,0,0,2025-09-09
3,https://www.instagram.com/p/DOV0Rn2DNf-/,18173137474359052,ikbarsinaga999,@al_hazhouse.land Kaesang hahahha,2025-09-12 11:25:29,0,0,2025-09-12
4,https://www.instagram.com/p/DOV0Rn2DNf-/,18038689634666892,rahmadsejati,"harapan rakyat sederhana 🙏 kerja nyata, hasil ...",2025-09-08 12:22:02,757,2,2025-09-08


#### Clean Text

In [None]:
import re

def clean_comment(text):
    if pd.isna(text):
        return None
    
    # delete mention (@username)
    # text = re.sub(r'@\w+', '', text)

    # lowercase
    text = text.lower()
    
    # delete link
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # delete non-printable characters except emoticons
    text = re.sub(r'[^\x00-\x7F]+', lambda m: m.group(0), text)
    
    # delete whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # delete emoji
    text = re.sub(r'[^\w\s,]', '', text)
    
    return text if text else None

In [7]:
# apply to dataset
data['clean_comment'] = data['text'].apply(clean_comment)

# drop null or empty
data = data.dropna(subset=['clean_comment']).reset_index(drop=True)

data.head()

Unnamed: 0,link,comment_id,username,text,created_at,like_count,reply_count,date,clean_comment
0,https://www.instagram.com/p/DOV0Rn2DNf-/,17916292923173899,lorenzoabim,Anjir Dahnil kebagian kue nya :),2025-09-08 12:16:48,192,16,2025-09-08,anjir dahnil kebagian kue nya
1,https://www.instagram.com/p/DOV0Rn2DNf-/,18095910739726736,fndnrz,@risqi_aguspianto hahaha bukan pendukung prabo...,2025-09-08 23:08:32,0,0,2025-09-08,hahaha bukan pendukung prabowo dari tahun 2014
2,https://www.instagram.com/p/DOV0Rn2DNf-/,17980524434883939,muchlizdarma1,@lorenzoabim kerjanya setaun sekali,2025-09-09 03:08:43,0,0,2025-09-09,kerjanya setaun sekali
3,https://www.instagram.com/p/DOV0Rn2DNf-/,18173137474359052,ikbarsinaga999,@al_hazhouse.land Kaesang hahahha,2025-09-12 11:25:29,0,0,2025-09-12,land kaesang hahahha
4,https://www.instagram.com/p/DOV0Rn2DNf-/,18038689634666892,rahmadsejati,"harapan rakyat sederhana 🙏 kerja nyata, hasil ...",2025-09-08 12:22:02,757,2,2025-09-08,"harapan rakyat sederhana kerja nyata, hasil t..."


#### Community

In [22]:
import re
import networkx as nx
import pandas as pd
import community as community_louvain
from pyvis.network import Network

In [23]:
# --- Build mention edges ---
mention_pattern = re.compile(r'@(\w+)')
edge_weights = {}
comment_count = data['username'].value_counts().to_dict()
like_sum = data.groupby('username')['like_count'].sum().to_dict()

for _, row in data.iterrows():
    author = row['username']
    text = str(row['text'])
    mentions = mention_pattern.findall(text)
    for mentioned in mentions:
        if author != mentioned:  # avoid self-loops
            edge_weights[(author, mentioned)] = edge_weights.get((author, mentioned), 0) + 1

In [24]:
# --- Build graph ---
G = nx.DiGraph()
for (a, b), w in edge_weights.items():
    G.add_edge(a, b, weight=w)

# --- Only keep users with at least 1 connection ---
connected_nodes = set(G.nodes())
G = G.subgraph(connected_nodes).copy()

# --- Add node attributes ---
for node in G.nodes():
    G.nodes[node]['comment_count'] = comment_count.get(node, 0)
    G.nodes[node]['like_count'] = like_sum.get(node, 0)

In [25]:
# --- Louvain communities (on undirected version) ---
partition = community_louvain.best_partition(G.to_undirected())
nx.set_node_attributes(G, partition, 'community')

In [26]:
# only keep users that appear in the graph
users_community = [
    {'username': node,
     'community': G.nodes[node]['community'],
     'comment_count': G.nodes[node]['comment_count'],
     'like_count': G.nodes[node]['like_count']}
    for node in G.nodes()
]

df_users = pd.DataFrame(users_community)
df_users.to_csv("../data/users_community.csv", index=False)

In [31]:
# --- Build Pyvis network ---
net = Network(height="800px", width="100%", notebook=True, directed=True)
net.force_atlas_2based()

for node in G.nodes():
    net.add_node(
        node,
        label=node,
        size=5 + G.nodes[node]['comment_count']*2,   # scale node size
        title=f"Comments: {G.nodes[node]['comment_count']}<br>Likes: {G.nodes[node]['like_count']}",
        group=G.nodes[node]['community']
    )

for u, v, d in G.edges(data=True):
    net.add_edge(u, v, value=d['weight'], title=f"Mentions: {d['weight']}")

net.show("mention_network.html", notebook=False)

mention_network.html


In [41]:

import re
import networkx as nx
import pandas as pd
import community as community_louvain
from pyvis.network import Network
import numpy as np

def analyze_instagram_mention_network(data, min_mentions=5, output_dir="../data/"):
    """
    Analyze Instagram comment mention network and detect communities
    
    Parameters:
    data: DataFrame with columns ['username', 'text', 'like_count']
    min_mentions: Minimum number of mentions for a user to be included
    output_dir: Directory to save output files
    """
    
    # --- Build mention edges ---
    print("Building mention network...")
    mention_pattern = re.compile(r'@(\w+)')
    edge_weights = {}
    
    # Calculate user statistics
    comment_count = data['username'].value_counts().to_dict()
    like_sum = data.groupby('username')['like_count'].sum().to_dict()
    
    # Extract mentions and build edge weights
    for _, row in data.iterrows():
        author = row['username']
        text = str(row['text']) if pd.notna(row['text']) else ""
        mentions = mention_pattern.findall(text)
        
        for mentioned in mentions:
            if author != mentioned:  # avoid self-loops
                edge_weights[(author, mentioned)] = edge_weights.get((author, mentioned), 0) + 1
    
    print(f"Found {len(edge_weights)} unique mention relationships")
    
    # --- Build initial graph ---
    G = nx.DiGraph()
    for (a, b), w in edge_weights.items():
        G.add_edge(a, b, weight=w)
    
    # --- Filter users with at least min_mentions total mentions (sent + received) ---
    print(f"Filtering users with at least {min_mentions} mentions...")
    
    # Calculate total mentions per user (both sent and received)
    mention_counts = {}
    for (author, mentioned), weight in edge_weights.items():
        mention_counts[author] = mention_counts.get(author, 0) + weight  # mentions sent
        mention_counts[mentioned] = mention_counts.get(mentioned, 0) + weight  # mentions received
    
    # Filter users
    active_users = {user for user, count in mention_counts.items() if count >= min_mentions}
    print(f"Keeping {len(active_users)} users with >= {min_mentions} mentions")
    
    # Create filtered graph
    G_filtered = nx.DiGraph()
    for (a, b), w in edge_weights.items():
        if a in active_users and b in active_users:
            G_filtered.add_edge(a, b, weight=w)
    
    G = G_filtered
    print(f"Final graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
    
    if G.number_of_nodes() == 0:
        print("No users meet the minimum mention criteria!")
        return None, None
    
    # --- Add node attributes ---
    for node in G.nodes():
        G.nodes[node]['comment_count'] = comment_count.get(node, 0)
        G.nodes[node]['like_count'] = like_sum.get(node, 0)
        G.nodes[node]['total_mentions'] = mention_counts.get(node, 0)
    
    # --- Detect communities using Louvain algorithm ---
    print("Detecting communities...")
    partition = community_louvain.best_partition(G.to_undirected())
    nx.set_node_attributes(G, partition, 'community')
    
    # Print community statistics
    communities = {}
    for node, comm in partition.items():
        if comm not in communities:
            communities[comm] = []
        communities[comm].append(node)
    
    print(f"\nFound {len(communities)} communities:")
    for comm_id, members in communities.items():
        print(f"Community {comm_id}: {len(members)} members")
    
    # --- Save community data to CSV ---
    users_community = []
    for node in G.nodes():
        users_community.append({
            'username': node,
            'community': G.nodes[node]['community'],
            'comment_count': G.nodes[node]['comment_count'],
            'like_count': G.nodes[node]['like_count'],
            'total_mentions': G.nodes[node]['total_mentions']
        })
    
    df_users = pd.DataFrame(users_community)
    csv_path = f"{output_dir}users_community.csv"
    df_users.to_csv(csv_path, index=False)
    print(f"Saved community data to {csv_path}")
    
    # --- Build interactive visualization with Pyvis ---
    print("Creating network visualization...")
    net = Network(height="1000px", width="100%", directed=True, bgcolor="#222222", font_color="white")
    
    # Use force-directed layout
    net.force_atlas_2based()
    
    # Define colors for different communities
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', 
              '#DDA0DD', '#98D8C8', '#F7DC6F', '#BB8FCE', '#85C1E9']
    
    # Add nodes
    for node in G.nodes():
        community_id = G.nodes[node]['community']
        color = colors[community_id % len(colors)]
        
        # Scale node size based on total mentions
        size = max(10, min(50, 10 + G.nodes[node]['total_mentions'] * 2))
        
        net.add_node(
            node,
            label=node,
            size=size,
            title=f"User: {node}<br>Community: {community_id}<br>Comments: {G.nodes[node]['comment_count']}<br>Total Likes: {G.nodes[node]['like_count']}<br>Total Mentions: {G.nodes[node]['total_mentions']}",
            color=color,
            group=community_id
        )
    
    # Add edges
    for u, v, d in G.edges(data=True):
        # Scale edge width based on mention frequency
        width = min(10, max(1, d['weight']))
        net.add_edge(u, v, 
                    value=d['weight'], 
                    title=f"Mentions: {d['weight']}",
                    width=width)
    
    # Set physics options for better layout
    net.set_options("""
    var options = {
      "physics": {
        "forceAtlas2Based": {
          "gravitationalConstant": -50,
          "centralGravity": 0.01,
          "springLength": 100,
          "springConstant": 0.08
        },
        "maxVelocity": 50,
        "solver": "forceAtlas2Based",
        "timestep": 0.35,
        "stabilization": {"iterations": 150}
      }
    }
    """)
    
    # Save visualization
    html_path = "mention_network.html"
    net.show(html_path, notebook=False)
    print(f"Saved network visualization to {html_path}")
    
    return G, df_users

# --- Additional Analysis Functions ---

def print_network_stats(G):
    """Print basic network statistics"""
    if G is None:
        return
        
    print("\n=== NETWORK STATISTICS ===")
    print(f"Nodes: {G.number_of_nodes()}")
    print(f"Edges: {G.number_of_edges()}")
    print(f"Density: {nx.density(G):.4f}")
    
    if G.number_of_nodes() > 0:
        # Most mentioned users
        in_degree = dict(G.in_degree(weight='weight'))
        top_mentioned = sorted(in_degree.items(), key=lambda x: x[1], reverse=True)[:5]
        print(f"\nMost mentioned users:")
        for user, mentions in top_mentioned:
            print(f"  @{user}: {mentions} mentions")
        
        # Most active mentioners
        out_degree = dict(G.out_degree(weight='weight'))
        top_mentioners = sorted(out_degree.items(), key=lambda x: x[1], reverse=True)[:5]
        print(f"\nMost active mentioners:")
        for user, mentions in top_mentioners:
            print(f"  @{user}: {mentions} mentions sent")

def analyze_communities(df_users):
    """Analyze community characteristics"""
    if df_users is None or df_users.empty:
        return
        
    print("\n=== COMMUNITY ANALYSIS ===")
    community_stats = df_users.groupby('community').agg({
        'username': 'count',
        'comment_count': 'sum',
        'like_count': 'sum',
        'total_mentions': 'sum'
    }).rename(columns={'username': 'size'})
    
    print(community_stats)
    
    # Find most influential users in each community
    print("\nTop users per community (by total mentions):")
    for comm_id in df_users['community'].unique():
        comm_users = df_users[df_users['community'] == comm_id]
        top_user = comm_users.loc[comm_users['total_mentions'].idxmax()]
        print(f"Community {comm_id}: @{top_user['username']} ({top_user['total_mentions']} mentions)")

In [42]:
# To run the analysis:
G, df_users = analyze_instagram_mention_network(data, min_mentions=2)

Building mention network...
Found 1200 unique mention relationships
Filtering users with at least 2 mentions...
Keeping 408 users with >= 2 mentions
Final graph: 342 nodes, 419 edges
Detecting communities...

Found 87 communities:
Community 0: 67 members
Community 1: 2 members
Community 2: 5 members
Community 3: 2 members
Community 4: 2 members
Community 5: 13 members
Community 6: 2 members
Community 7: 13 members
Community 8: 26 members
Community 10: 7 members
Community 11: 2 members
Community 12: 2 members
Community 13: 5 members
Community 14: 2 members
Community 15: 2 members
Community 16: 2 members
Community 17: 7 members
Community 18: 20 members
Community 19: 2 members
Community 20: 2 members
Community 21: 2 members
Community 22: 5 members
Community 23: 2 members
Community 24: 2 members
Community 25: 2 members
Community 26: 2 members
Community 27: 5 members
Community 28: 2 members
Community 29: 2 members
Community 30: 2 members
Community 31: 2 members
Community 32: 2 members
Commu

In [45]:
print_network_stats(G)


=== NETWORK STATISTICS ===
Nodes: 342
Edges: 419
Density: 0.0036

Most mentioned users:
  @prabowo: 97 mentions
  @kemenpar: 21 mentions
  @presidenrepublikindonesia: 21 mentions
  @smindrawati: 16 mentions
  @gerindra: 14 mentions

Most active mentioners:
  @prasetyo_emilarie: 9 mentions sent
  @fahria.zxcc: 7 mentions sent
  @rafly.wibowo240101: 7 mentions sent
  @alamsyahsubakri: 7 mentions sent
  @tomsuryapanji: 6 mentions sent


In [46]:
analyze_communities(df_users)


=== COMMUNITY ANALYSIS ===


           size  comment_count  like_count  total_mentions
community                                                 
0            67             69         100             447
1             2              3          31               5
2             5              8          40              14
3             2              4           0               5
4             2              3           2               4
...         ...            ...         ...             ...
82            2              2           1               4
83            2              4           0               6
84            3              4           1               6
85            2              3           1               4
86            2              4          13               5

[87 rows x 4 columns]

Top users per community (by total mentions):
Community 0: @prabowo (235 mentions)
Community 1: @plutonioneon (3 mentions)
Community 2: @syakurhilmy_ (4 mentions)
Community 3: @bang (3 mentions)
Community 4: @w