In [None]:
!pip install pyvis

from google.colab import drive
import networkx as nx
import xml.etree.ElementTree as ET
from pyvis.network import Network
import random
import numpy as np

# Mount Google Drive so the project files under MyDrive are readable
drive.mount('/content/drive')

# Load GraphML data
graphml_file_path = '/content/drive/MyDrive/ed_proj/network.graphml'  # Replace with the correct path

# Create a directed graph
# NOTE: a later cell rebinds `G` with nx.read_graphml(...), which replaces
# the hand-parsed, weight-normalised graph built here.
G = nx.DiGraph()

# Parse the GraphML file
tree = ET.parse(graphml_file_path)
root = tree.getroot()

# GraphML elements live in this XML namespace; 'g' is the prefix used in the XPath queries below
graphml_ns = {'g': 'http://graphml.graphdrawing.org/xmlns'}

# Extract nodes with their 'v_name' and 'v_cluster' data entries
for node in root.findall(".//g:node", namespaces=graphml_ns):
    node_id = node.get('id')
    name = node.find("g:data[@key='v_name']", namespaces=graphml_ns).text
    cluster = int(node.find("g:data[@key='v_cluster']", namespaces=graphml_ns).text)
    G.add_node(node_id, name=name, cluster=cluster)

# Extract edges with their 'e_weight' data entry
for edge in root.findall(".//g:edge", namespaces=graphml_ns):
    source = edge.get('source')
    target = edge.get('target')
    weight = float(edge.find("g:data[@key='e_weight']", namespaces=graphml_ns).text)
    G.add_edge(source, target, weight=weight)

# Find the maximum weight in the graph.
# `default=0` keeps this from raising ValueError when the file has no edges.
max_weight = max((attrs['weight'] for _, _, attrs in G.edges(data=True)), default=0)

# Scale every edge weight by the maximum; if the maximum is not positive, all weights become 0
for u, v, d in G.edges(data=True):
    d['weight'] = d['weight'] / max_weight if max_weight > 0 else 0

# Print the number of nodes and edges in the graph
num_nodes_in_graph = len(G.nodes)
num_edges_in_graph = len(G.edges)
print("Number of nodes in the graph:", num_nodes_in_graph)
print("Number of edges in the graph:", num_edges_in_graph)


Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting jedi>=0.16 (from ipython>=5.3.0->pyvis)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, pyvis
Successfully installed jedi-0.19.1 pyvis-0.3.2
Mounted at /content/drive
Number of nodes in the graph: 6169
Number of edges in the graph: 11056


In [None]:
import pandas as pd
import networkx as nx

# Load the user features data
user_features_file_path = '/content/drive/MyDrive/ed_proj/user_features.csv'  # Replace with the correct path
user_features_df = pd.read_csv(user_features_file_path)

# Percentile cut-off shared by the activity criteria
percentile_threshold = 0.7  # Change this value to experiment with different percentiles
# 'prostr' uses its own, lower percentile cut-off
prostr_percentile_threshold = 0.4

# Calculate the percentile values for each criterion
percentile_status_count = user_features_df['status_count'].quantile(percentile_threshold)
percentile_follower_count = user_features_df['follower_count'].quantile(percentile_threshold)
percentile_statuses_day = user_features_df['statuses_day'].quantile(percentile_threshold)
percentile_retweet_pro = user_features_df['retweet_pro'].quantile(percentile_threshold)
percentile_dmention_pro = user_features_df['dmention_pro'].quantile(percentile_threshold)
percentile_reply_pro = user_features_df['reply_pro'].quantile(percentile_threshold)
percentile_retweet_div = user_features_df['retweet_div'].quantile(percentile_threshold)
percentile_reply_div = user_features_df['reply_div'].quantile(percentile_threshold)
percentile_mention_div = user_features_df['mention_div'].quantile(percentile_threshold)
percentile_prostr = user_features_df['prostr'].quantile(prostr_percentile_threshold)
print(percentile_prostr)

# Column -> cut-off for the criteria of which a user must strictly exceed at least one
activity_thresholds = {
    'status_count': percentile_status_count,
    'follower_count': percentile_follower_count,
    'statuses_day': percentile_statuses_day,
    'retweet_pro': percentile_retweet_pro,
    'dmention_pro': percentile_dmention_pro,
    'reply_pro': percentile_reply_pro,
    'retweet_div': percentile_retweet_div,
    'reply_div': percentile_reply_div,
    'mention_div': percentile_mention_div,
}

# Row-wise "exceeds any activity cut-off" mask, built column by column
exceeds_any_activity_threshold = pd.Series(False, index=user_features_df.index)
for column, threshold in activity_thresholds.items():
    exceeds_any_activity_threshold |= user_features_df[column] > threshold

# A user is selected when they are in cluster 0 (the pro-ED group), reach the
# prostr cut-off, and exceed at least one activity cut-off
is_diffusion_user = (
    (user_features_df['cluster'] == 0)
    & (user_features_df['prostr'] >= percentile_prostr)
    & exceeds_any_activity_threshold
)

# Graph node ids are the user id prefixed with 'n'; row order of the CSV is preserved
diffusion_nodes = [f'n{int(uid)}' for uid in user_features_df.loc[is_diffusion_user, 'uid']]

# Load the network data
network_file_path = '/content/drive/MyDrive/ed_proj/network.graphml'  # Replace with the correct path
G = nx.read_graphml(network_file_path)

# Create a DiGraph for the subgraph
subgraph = nx.DiGraph()

# Add nodes from the selected users to the subgraph
subgraph.add_nodes_from(diffusion_nodes)

# Keep only edges whose two endpoints are both selected users.
# A set makes each membership test O(1) instead of scanning the node list per edge.
diffusion_node_lookup = set(diffusion_nodes)
for source, target, data in G.edges(data=True):
    if source in diffusion_node_lookup and target in diffusion_node_lookup:
        subgraph.add_edge(source, target, **data)

# Output the first ten nodes and the size of the subgraph
print("First 10 nodes in the diffusion subgraph:", diffusion_nodes[:10])
print("Number of nodes in the diffusion subgraph:", len(subgraph.nodes))
print("Number of edges in the diffusion subgraph:", len(subgraph.edges))


0.0013024228400000005
First 10 nodes in the diffusion subgraph: ['n4', 'n44', 'n54', 'n62', 'n86', 'n90', 'n91', 'n142', 'n143', 'n155']
Number of nodes in the diffusion subgraph: 2802
Number of edges in the diffusion subgraph: 3158


In [None]:
# ... [previous code] ...

# Collect the 'weight' attribute of every edge currently in G
weights = [attrs['weight'] for _src, _dst, attrs in G.edges(data=True)]

# Summary statistics of the edge-weight distribution
max_weight, min_weight = max(weights), min(weights)
average_weight = sum(weights) / len(weights)
median_weight = np.median(weights)

# Report each statistic on its own line
for label, value in (
    ("Maximum Edge Weight:", max_weight),
    ("Minimum Edge Weight:", min_weight),
    ("Average Edge Weight:", average_weight),
    ("Median Edge Weight:", median_weight),
):
    print(label, value)


Maximum Edge Weight: 1.0
Minimum Edge Weight: 0.0024937655860349127
Average Edge Weight: 0.0038110223717126718
Median Edge Weight: 0.0024937655860349127


In [None]:
import pandas as pd
import networkx as nx

# Load the user features data
user_features_file_path = '/content/drive/MyDrive/ed_proj/user_features.csv'  # Replace with the correct path
user_features_df = pd.read_csv(user_features_file_path)

# Pro-ED users are the rows with cluster == 0; graph node ids are the user id
# prefixed with 'n'. Row order of the CSV is preserved.
pro_ed_uids = user_features_df.loc[user_features_df['cluster'] == 0, 'uid']
pro_ed_nodes = [f'n{int(uid)}' for uid in pro_ed_uids]

# Load the network data
network_file_path = '/content/drive/MyDrive/ed_proj/network.graphml'  # Replace with the correct path
network_graph = nx.read_graphml(network_file_path)

# Create a DiGraph for the pro-ED subgraph
pro_ed_subgraph = nx.DiGraph()

# Every pro-ED user becomes a node, whether or not it has any edge
pro_ed_subgraph.add_nodes_from(pro_ed_nodes)

# Keep only edges whose two endpoints are both pro-ED users.
# A set makes each membership test O(1) instead of scanning the node list per edge.
pro_ed_node_lookup = set(pro_ed_nodes)
for source, target, data in network_graph.edges(data=True):
    if source in pro_ed_node_lookup and target in pro_ed_node_lookup:
        pro_ed_subgraph.add_edge(source, target, **data)

# Output the first ten nodes and the size of the pro-ED subgraph
num_nodes_in_pro_ed_subgraph = len(pro_ed_subgraph.nodes)
num_edges_in_pro_ed_subgraph = len(pro_ed_subgraph.edges)
print("First 10 nodes in the pro-ED subgraph:", pro_ed_nodes[:10])
print("Number of nodes in the pro-ED subgraph:", num_nodes_in_pro_ed_subgraph)
print("Number of edges in the pro-ED subgraph:", num_edges_in_pro_ed_subgraph)


First 10 nodes in the pro-ED subgraph: ['n2', 'n3', 'n4', 'n10', 'n31', 'n32', 'n39', 'n40', 'n41', 'n43']
Number of nodes in the pro-ED subgraph: 5683
Number of edges in the pro-ED subgraph: 8314


In [None]:
# Calculate degree centrality for nodes in the subgraph
degree_centrality = nx.degree_centrality(pro_ed_subgraph)

# Sort nodes by degree centrality in descending order
sorted_nodes_by_degree = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)

# Select the top 50 nodes with high degree centrality as seed nodes
top_degree_seed_nodes = [node[0] for node in sorted_nodes_by_degree[:50]]

# Queue of activated nodes whose neighbours still have to be tried.
# It is a copy: popping from the queue must not empty `top_degree_seed_nodes`.
queue = list(top_degree_seed_nodes)

# Define the probability of influence (you can adjust this value)
probability = 0.5

# Perform the Independent Cascade Model simulation on the subgraph.
# The outcome depends on the global `random` state, so it varies between runs
# unless that module is seeded beforehand.
activated_nodes = set(top_degree_seed_nodes)
while queue:
    current_node = queue.pop(0)
    neighbors = list(pro_ed_subgraph.neighbors(current_node))

    for neighbor in neighbors:
        # Each not-yet-activated neighbour is activated with the given probability
        if neighbor not in activated_nodes and random.random() < probability:
            activated_nodes.add(neighbor)
            queue.append(neighbor)

# Print the activated nodes (nodes where information has diffused) in the subgraph
print("Activated Nodes (Nodes where information has diffused) in the subgraph:")
print(sorted(activated_nodes))

# Calculate the percentage of activated nodes in the subgraph
percentage_activated = len(activated_nodes) / num_nodes_in_pro_ed_subgraph * 100
print(f"Percentage of activated nodes in the subgraph: {percentage_activated:.2f}%")

Activated Nodes (Nodes where information has diffused) in the subgraph:
['n1002', 'n1003', 'n1004', 'n1005', 'n1010', 'n1012', 'n1013', 'n1014', 'n1015', 'n1017', 'n1021', 'n1028', 'n1031', 'n1036', 'n1043', 'n1044', 'n1048', 'n1050', 'n1051', 'n1052', 'n1053', 'n1055', 'n1056', 'n1057', 'n1059', 'n1060', 'n1062', 'n1065', 'n1070', 'n1072', 'n1074', 'n1075', 'n1078', 'n1079', 'n1090', 'n1092', 'n1098', 'n1111', 'n1122', 'n1125', 'n1138', 'n1160', 'n1191', 'n1197', 'n1253', 'n1281', 'n1300', 'n1315', 'n1320', 'n1321', 'n1323', 'n1325', 'n1350', 'n1356', 'n1408', 'n1413', 'n1414', 'n1415', 'n1416', 'n1429', 'n143', 'n1430', 'n1431', 'n1445', 'n1497', 'n1498', 'n1501', 'n1523', 'n1538', 'n1545', 'n1547', 'n155', 'n1550', 'n1575', 'n1583', 'n1585', 'n1586', 'n1588', 'n1598', 'n1617', 'n1653', 'n1657', 'n1659', 'n1660', 'n1663', 'n1664', 'n1666', 'n1667', 'n1669', 'n1694', 'n1696', 'n1700', 'n1712', 'n172', 'n1729', 'n1743', 'n1754', 'n1794', 'n1795', 'n1806', 'n1855', 'n187', 'n1871', 'n18

In [None]:
# Set view of the feature-selected diffusion nodes, for set operations
diffusion_nodes_set = set(diffusion_nodes)

# Diffusion nodes that are also in `activated_nodes` (the Independent Cascade result)
activated_diffusion_nodes = diffusion_nodes_set.intersection(activated_nodes)

# Share of the diffusion nodes that were activated, expressed as a percentage
activated_share = len(activated_diffusion_nodes) / len(diffusion_nodes_set)
percentage_activated = activated_share * 100

print(f"Percentage of diffusion subgraph nodes activated by Independent cascade model: {percentage_activated:.2f}%")


Percentage of diffusion subgraph nodes activated by Independent cascade model: 10.31%


In [None]:
import random
import networkx as nx

# Perform diffusion with given seed nodes in a network
def perform_diffusion(graph, seed_nodes, probability=0.9):
    """Simulate one Independent Cascade run and return the activated nodes.

    Starting from the seed nodes, every activated node gets a single pass over
    ``graph.neighbors(node)``; each neighbour that is not yet activated becomes
    activated with the given probability and is then processed in turn.
    Edge weights are not consulted.

    Args:
        graph: Object exposing ``neighbors(node)`` (e.g. a networkx graph).
        seed_nodes: Iterable of initially activated nodes. It is not modified.
        probability: Chance in [0, 1] that a single activation attempt
            succeeds. Defaults to 0.9.

    Returns:
        set: The seed nodes plus every node activated during the run. The
        result depends on the state of the global ``random`` module.
    """
    # Local import keeps the function usable without touching the import cell
    from collections import deque

    activated_nodes = set(seed_nodes)
    # deque gives O(1) pops from the front, unlike list.pop(0)
    queue = deque(seed_nodes)
    while queue:
        current_node = queue.popleft()
        for neighbor in graph.neighbors(current_node):
            # The membership test comes first so that no random number is
            # drawn for nodes that are already activated
            if neighbor not in activated_nodes and random.random() < probability:
                activated_nodes.add(neighbor)
                queue.append(neighbor)
    return activated_nodes


In [None]:
# Rank the pro-ED nodes by degree centrality, most central first
degree_centrality_pro_ed = nx.degree_centrality(pro_ed_subgraph)
sorted_nodes_by_degree_pro_ed = sorted(
    degree_centrality_pro_ed.items(),
    key=lambda item: item[1],
    reverse=True,
)

# The 50 highest-ranked nodes become the seed set
top_degree_seed_nodes_pro_ed = [node_id for node_id, _score in sorted_nodes_by_degree_pro_ed[:50]]

# Run the cascade simulation from those seeds
activated_nodes_degree_pro_ed = perform_diffusion(pro_ed_subgraph, top_degree_seed_nodes_pro_ed)
num_activated_degree_pro_ed = len(activated_nodes_degree_pro_ed)

# Report the seeds and how many nodes ended up activated
print("Selected Seed Nodes (Degree Centrality):")
print(top_degree_seed_nodes_pro_ed)
print("Number of Activated Nodes (Degree Centrality):", num_activated_degree_pro_ed)

# Activated nodes as a percentage of all nodes in the pro-ED subgraph
percentage_activated_degree_pro_ed = num_activated_degree_pro_ed / num_nodes_in_pro_ed_subgraph * 100
print(f"Percentage of activated nodes in the pro-ED subgraph (Degree Centrality): {percentage_activated_degree_pro_ed:.2f}%")

Selected Seed Nodes (Degree Centrality):
['n264', 'n172', 'n237', 'n86', 'n32', 'n626', 'n192', 'n218', 'n4295', 'n574', 'n614', 'n493', 'n198', 'n543', 'n1138', 'n437', 'n87', 'n2072', 'n321', 'n1111', 'n255', 'n1974', 'n596', 'n220', 'n1125', 'n2663', 'n194', 'n801', 'n238', 'n568', 'n3270', 'n202', 'n416', 'n1653', 'n4545', 'n5121', 'n155', 'n432', 'n486', 'n1415', 'n5053', 'n565', 'n4850', 'n435', 'n717', 'n1281', 'n1408', 'n1538', 'n505', 'n506']
Number of Activated Nodes (Degree Centrality): 999
Percentage of activated nodes in the pro-ED subgraph (Degree Centrality): 17.58%


In [None]:
# Set view of the feature-selected diffusion nodes, for set operations
diffusion_nodes_set = set(diffusion_nodes)

# Diffusion nodes that are also in `activated_nodes_degree_pro_ed`
# (the cascade started from the degree-centrality seeds)
activated_diffusion_nodes = diffusion_nodes_set.intersection(activated_nodes_degree_pro_ed)

# Share of the diffusion nodes that were activated, expressed as a percentage
activated_share = len(activated_diffusion_nodes) / len(diffusion_nodes_set)
percentage_activated = activated_share * 100

print(f"Percentage of diffusion subgraph nodes activated by activated_nodes_degree_pro_ed: {percentage_activated:.2f}%")


Percentage of diffusion subgraph nodes activated by activated_nodes_degree_pro_ed: 20.77%


In [None]:
# Greedy Influence Maximization Algorithm
def greedy_influence_maximization(graph, k, seed_nodes=None):
    """Greedily grow a seed set of up to ``k`` nodes by simulated spread.

    In every round each node that is not yet a seed is tried as an additional
    seed; the node whose simulated spread exceeds the round's baseline spread
    by the largest positive margin is added. The search stops early when no
    candidate shows a positive gain.

    Spread is measured with single ``perform_diffusion`` runs, so the choice
    is stochastic. Each round costs one simulation per candidate node plus
    one for the baseline.

    Args:
        graph: Graph exposing ``nodes()`` and accepted by ``perform_diffusion``.
        k: Target number of seed nodes.
        seed_nodes: Optional iterable of seeds to start from. It is copied,
            never modified. Defaults to no initial seeds.

    Returns:
        list: The initial seeds followed by the greedily selected ones. When
        ``k`` does not exceed the number of initial seeds, they are returned
        unchanged (as a new list).
    """
    # Copy so that neither the caller's list nor a shared default is mutated
    selected_seeds = list(seed_nodes) if seed_nodes is not None else []
    selected_lookup = set(selected_seeds)  # O(1) "already a seed?" checks

    while len(selected_seeds) < k:
        # Estimate the current spread once per round instead of re-simulating
        # it for every candidate
        baseline_influence = len(perform_diffusion(graph, selected_seeds))

        max_marginal_gain = 0
        next_seed = None
        for node in graph.nodes():
            if node in selected_lookup:
                continue
            current_seeds = selected_seeds + [node]
            current_influence = len(perform_diffusion(graph, current_seeds))
            marginal_gain = current_influence - baseline_influence
            if marginal_gain > max_marginal_gain:
                max_marginal_gain = marginal_gain
                next_seed = node

        if next_seed is None:
            # No candidate improved on the baseline in this round
            break
        selected_seeds.append(next_seed)
        selected_lookup.add(next_seed)

    return selected_seeds

# Use the Greedy Influence Maximization algorithm to find seed nodes in the pro-ED subgraph
k_greedy_pro_ed = 50  # Number of seed nodes to select

# NOTE(review): the degree-centrality seed list is built from the top 50 nodes,
# so with k == 50 the greedy loop has nothing left to add and the seeds come
# back unchanged. Raise k_greedy_pro_ed (or start from fewer seeds) for the
# greedy step to contribute.
# A copy is passed so that the degree-centrality seed list, which is reused by
# later cells, can never be extended in place by the greedy search.
seed_nodes_greedy_pro_ed = greedy_influence_maximization(
    pro_ed_subgraph,
    k_greedy_pro_ed,
    seed_nodes=list(top_degree_seed_nodes_pro_ed),
)

# Perform diffusion with the selected seed nodes from the Greedy algorithm
activated_nodes_greedy_pro_ed = perform_diffusion(pro_ed_subgraph, seed_nodes_greedy_pro_ed)

# Print the selected seed nodes and the number of activated nodes from the Greedy algorithm
print("Selected Seed Nodes (Greedy Algorithm):")
print(seed_nodes_greedy_pro_ed)
print("Number of Activated Nodes (Greedy Algorithm):", len(activated_nodes_greedy_pro_ed))

# Calculate the percentage of activated nodes in the pro-ED subgraph from the Greedy algorithm
percentage_activated_greedy_pro_ed = len(activated_nodes_greedy_pro_ed) / num_nodes_in_pro_ed_subgraph * 100
print(f"Percentage of activated nodes in the pro-ED subgraph (Greedy Algorithm): {percentage_activated_greedy_pro_ed:.2f}%")


Selected Seed Nodes (Greedy Algorithm):
['n264', 'n172', 'n237', 'n86', 'n32', 'n626', 'n192', 'n218', 'n4295', 'n574', 'n614', 'n493', 'n198', 'n543', 'n1138', 'n437', 'n87', 'n2072', 'n321', 'n1111', 'n255', 'n1974', 'n596', 'n220', 'n1125', 'n2663', 'n194', 'n801', 'n238', 'n568', 'n3270', 'n202', 'n416', 'n1653', 'n4545', 'n5121', 'n155', 'n432', 'n486', 'n1415', 'n5053', 'n565', 'n4850', 'n435', 'n717', 'n1281', 'n1408', 'n1538', 'n505', 'n506']
Number of Activated Nodes (Greedy Algorithm): 1032
Percentage of activated nodes in the pro-ED subgraph (Greedy Algorithm): 18.16%


In [None]:
# Set view of the feature-selected diffusion nodes, for set operations
diffusion_nodes_set = set(diffusion_nodes)

# Diffusion nodes that are also in `activated_nodes_greedy_pro_ed`
# (the cascade started from the greedy seed set)
activated_diffusion_nodes = diffusion_nodes_set.intersection(activated_nodes_greedy_pro_ed)

# Share of the diffusion nodes that were activated, expressed as a percentage
activated_share = len(activated_diffusion_nodes) / len(diffusion_nodes_set)
percentage_activated = activated_share * 100

print(f"Percentage of diffusion subgraph nodes activated by activated_nodes_greedy_pro_ed: {percentage_activated:.2f}%")

Percentage of diffusion subgraph nodes activated by activated_nodes_greedy_pro_ed: 21.45%


In [None]:
import random

# Perform diffusion with the Linear Threshold Model
def perform_diffusion_ltm(graph, seed_nodes):
    """Spread activation through ``graph`` using per-node thresholds.

    Starting from the seed nodes, every activated node is processed once: each
    of its not-yet-activated predecessors becomes activated when the mean
    ``'weight'`` of that predecessor's own outgoing edges is at least the
    predecessor's ``'threshold'`` node attribute.

    NOTE(review): this differs from the textbook Linear Threshold Model, where
    a node activates once the summed weight of edges from its *already
    activated* in-neighbours reaches its threshold. Here activation travels
    from a node to its predecessors, and the mean is taken over all outgoing
    edges of the candidate regardless of which neighbours are active. Confirm
    that this rule is the intended one.

    Args:
        graph: Directed graph exposing ``predecessors(node)``,
            ``graph[node]`` (mapping successor -> edge attributes holding
            ``'weight'``) and ``graph.nodes[node]['threshold']``.
        seed_nodes: Iterable of initially activated nodes. It is not modified.

    Returns:
        set: The seed nodes plus every node activated during the run.
    """
    # Local import keeps the function usable without touching the import cell
    from collections import deque

    activated_nodes = set(seed_nodes)
    # deque gives O(1) pops from the front, unlike list.pop(0)
    queue = deque(seed_nodes)

    while queue:
        current_node = queue.popleft()

        for candidate in graph.predecessors(current_node):
            if candidate in activated_nodes:
                continue

            threshold = graph.nodes[candidate]['threshold']
            out_edges = graph[candidate]
            num_out_edges = len(out_edges)
            weighted_sum = sum(edge_attrs['weight'] for edge_attrs in out_edges.values())
            # A predecessor always has at least one outgoing edge; the guard
            # only protects against division by zero
            average_weight = weighted_sum / num_out_edges if num_out_edges > 0 else 0

            if average_weight >= threshold:
                activated_nodes.add(candidate)
                queue.append(candidate)

    return activated_nodes

# Give every node of the pro-ED subgraph its own activation threshold,
# drawn uniformly at random between 0 and 2 (adjustable range)
for node, node_attrs in pro_ed_subgraph.nodes(data=True):
    node_attrs['threshold'] = random.uniform(0, 2)

# Simulate diffusion on the pro-ED subgraph, starting from the degree-centrality seeds
activated_nodes_ltm_pro_ed = perform_diffusion_ltm(pro_ed_subgraph, top_degree_seed_nodes_pro_ed)
num_activated_ltm_pro_ed = len(activated_nodes_ltm_pro_ed)

# Report how many nodes the threshold model activated
print("Number of Activated Nodes (Linear Threshold Model):", num_activated_ltm_pro_ed)

# Activated nodes as a percentage of all nodes in the pro-ED subgraph
percentage_activated_ltm_pro_ed = num_activated_ltm_pro_ed / num_nodes_in_pro_ed_subgraph * 100
print(f"Percentage of activated nodes in the pro-ED subgraph (Linear Threshold Model): {percentage_activated_ltm_pro_ed:.2f}%")


Number of Activated Nodes (Linear Threshold Model): 1404
Percentage of activated nodes in the pro-ED subgraph (Linear Threshold Model): 24.71%


In [None]:
# Set view of the feature-selected diffusion nodes, for set operations
diffusion_nodes_set = set(diffusion_nodes)

# Diffusion nodes that are also in `activated_nodes_ltm_pro_ed` (the LTM result)
activated_diffusion_nodes = diffusion_nodes_set.intersection(activated_nodes_ltm_pro_ed)

# Share of the diffusion nodes that were activated, expressed as a percentage
activated_share = len(activated_diffusion_nodes) / len(diffusion_nodes_set)
percentage_activated = activated_share * 100

print(f"Percentage of diffusion subgraph nodes activated by LTM: {percentage_activated:.2f}%")


Percentage of diffusion subgraph nodes activated by LTM: 27.09%


In [None]:
# Build an interactive pyvis view of the diffusion subgraph
net = Network(height='750px', width='100%', bgcolor='#222222', font_color='white')

# Colour coding: red for diffusion-subgraph nodes that the LTM run activated,
# green for those it did not
for node in subgraph.nodes:
    if node in activated_nodes_ltm_pro_ed:
        node_color = "#ff0000"
    else:
        node_color = "#00ff00"
    net.add_node(node, title=node, color=node_color)

# Copy every edge of the diffusion subgraph into the view
for src_node, dst_node in subgraph.edges:
    net.add_edge(src_node, dst_node)

# Write the visualisation to an HTML file on Drive
file_path = "/content/drive/MyDrive/ed_proj/diffusiongraph_vs_LTMactivated_visualization.html"
net.save_graph(file_path)

# If using in a Jupyter notebook, you can display it inline as well
# net.show("diffusiongraph_vs_LTMactivated_visualization.html")
