In [1]:
# scratch_location = r'/scratch/hmnshpl'
import getpass
import pandas as pd
import numpy as np
import networkx as nx

dataset_name = 'wikipedia'
scratch_location = rf'/scratch/{getpass.getuser()}'

In [2]:
from copy import deepcopy
import os
import sys

In [3]:
# Load data and train val test split
graph_df = pd.read_csv('{}/processed_data/{}/ml_{}.csv'.format(scratch_location,
                                                            dataset_name,
                                                            dataset_name)
                    )
edge_raw_features = np.load('{}/processed_data/{}/ml_{}.npy'.format(scratch_location,
                                                                    dataset_name,
                                                                    dataset_name)
                            )
node_raw_features = np.load('{}/processed_data/{}/ml_{}_node.npy'.format(scratch_location,
                                                                        dataset_name,
                                                                        dataset_name)
                            )

In [4]:
# Extract nodes, edges, and timestamps
edges = graph_df[['u', 'i', 'ts']].values

In [5]:
# get the timestamp of validate and test set
val_ratio = test_ratio = 0.15
val_time, test_time = list(np.quantile(graph_df.ts, [(1 - val_ratio - test_ratio), (1 - test_ratio)]))

In [6]:
# Split data into train, val, test (assuming you have timestamps for this)
train_edges = edges[edges[:, 2] < val_time]
val_edges = edges[(edges[:, 2] >= val_time) & (edges[:, 2] < test_time)]
test_edges = edges[edges[:, 2] >= test_time]

In [7]:
# Create a bipartite temporal graph
G = nx.Graph()
for edge in train_edges:
    source, target, timestamp = edge
    G.add_edge(source, target, timestamp=timestamp)

In [8]:
# Initialize PageRank scores
page_rank_scores = {node: 1.0 / G.number_of_nodes() for node in G.nodes()}
init_page_rank_scores = deepcopy(page_rank_scores)

In [9]:
# Parameters for Temporal PageRank
alpha = 0.85  # Damping factor
max_iter = 100
tol = 1e-6

def temporal_page_rank(G, alpha=0.85, max_iter=100, tol=1e-6):
    nodes = G.nodes()
    num_nodes = G.number_of_nodes()
    
    # Initialize PageRank scores
    pr = {node: 1.0 / num_nodes for node in nodes}
    temp_pr = pr.copy()

    for _ in range(max_iter):
        change = 0
        for node in nodes:
            rank_sum = sum(pr[neighbor] / len(G[neighbor]) for neighbor in G.neighbors(node) if 'timestamp' in G[node][neighbor])
            temp_pr[node] = (1 - alpha) / num_nodes + alpha * rank_sum
        
        # Calculate change for convergence check
        change = sum(abs(temp_pr[node] - pr[node]) for node in nodes)
        
        if change < tol:
            break
        
        pr = temp_pr.copy()

    return pr

In [10]:
# Calculate Temporal PageRank
page_rank_scores = temporal_page_rank(G, alpha=0.85, max_iter=100, tol=1e-6)

# Output the PageRank scores
# print(page_rank_scores)

In [11]:
# Sort nodes by PageRank scores
sorted_nodes = sorted(page_rank_scores.items(), key=lambda item: item[1], reverse=True)

# Calculate the top 30% of nodes
top_30_percent_count = int(len(sorted_nodes) * 0.30)
top_30_percent_nodes = sorted_nodes[:top_30_percent_count]

In [12]:
len(top_30_percent_nodes), len(page_rank_scores)

(2242, 7475)

In [13]:
100* (2242/7475)

29.9933110367893

In [14]:
# Output the top 30% of nodes and their PageRank scores
for node, score in top_30_percent_nodes:
    print(f'Node: {node:.0f}, PageRank Score: {score:.2e}, initial page_rank: {init_page_rank_scores[node]:.2e}')

Node: 234, PageRank Score: 1.12e-02, initial page_rank: 1.34e-04
Node: 8412, PageRank Score: 7.35e-03, initial page_rank: 1.34e-04
Node: 69, PageRank Score: 6.73e-03, initial page_rank: 1.34e-04
Node: 8807, PageRank Score: 5.09e-03, initial page_rank: 1.34e-04
Node: 18, PageRank Score: 4.75e-03, initial page_rank: 1.34e-04
Node: 8888, PageRank Score: 4.69e-03, initial page_rank: 1.34e-04
Node: 9070, PageRank Score: 4.52e-03, initial page_rank: 1.34e-04
Node: 8623, PageRank Score: 4.13e-03, initial page_rank: 1.34e-04
Node: 8889, PageRank Score: 3.87e-03, initial page_rank: 1.34e-04
Node: 8340, PageRank Score: 3.78e-03, initial page_rank: 1.34e-04
Node: 8296, PageRank Score: 3.67e-03, initial page_rank: 1.34e-04
Node: 9167, PageRank Score: 3.56e-03, initial page_rank: 1.34e-04
Node: 8735, PageRank Score: 3.56e-03, initial page_rank: 1.34e-04
Node: 41, PageRank Score: 2.99e-03, initial page_rank: 1.34e-04
Node: 8624, PageRank Score: 2.85e-03, initial page_rank: 1.34e-04
Node: 8261, PageR

In [15]:
# assert 0==1

AssertionError: 

In [16]:
# Set the working directory to the project root
project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))
sys.path.append(project_root)

In [17]:
# os.getcwd()
project_root

'/home2/hmnshpl/projects/DyGLib'

In [22]:
# from TPR.temporal_page_rank import TemporalPageRankComputer, TemporalPageRankParams

In [21]:
from TPR import temporal_pagerank

In [23]:
# Extract nodes, edges, and timestamps
edges = graph_df[['u', 'i', 'ts']].values
nodes = np.unique(edges[:, :2])  # Get unique nodes from edges

In [24]:
# Temporal PageRank Parameters
alpha = 0.85
beta = 0.1
params = temporal_pagerank.TemporalPageRankParams(alpha, beta)

In [26]:
# Initialize TemporalPageRankComputer
tpr_computer = temporal_pagerank.TemporalPageRankComputer(nodes, [params])

In [27]:
# Update PageRank Scores
for edge in edges:
    src, trg, timestamp = edge
    tpr_computer.update((src, trg))
    
# Function to convert results to dictionary
def get_pagerank_scores(tpr_computer):
    scores = {}
    for i, node in enumerate(tpr_computer.active_mass[:, 0]):
        scores[node] = tpr_computer.temp_pr[i, 1]
    return scores

In [28]:
# Get PageRank scores
page_rank_scores = get_pagerank_scores(tpr_computer)

In [29]:
# Sort nodes by PageRank scores
sorted_nodes = sorted(page_rank_scores.items(), key=lambda item: item[1], reverse=True)

# Calculate the top 30% of nodes
new_top_30_percent_count = int(len(sorted_nodes) * 0.30)
new_top_30_percent_count = sorted_nodes[:new_top_30_percent_count]


In [34]:
# Output the top 30% of nodes and their PageRank scores
for node, score in top_30_percent_nodes:
    print(f'Node: {node:.0f}, PageRank Score: {score:.2e}') 

Node: 234, PageRank Score: 1.12e-02
Node: 8412, PageRank Score: 7.35e-03
Node: 69, PageRank Score: 6.73e-03
Node: 8807, PageRank Score: 5.09e-03
Node: 18, PageRank Score: 4.75e-03
Node: 8888, PageRank Score: 4.69e-03
Node: 9070, PageRank Score: 4.52e-03
Node: 8623, PageRank Score: 4.13e-03
Node: 8889, PageRank Score: 3.87e-03
Node: 8340, PageRank Score: 3.78e-03
Node: 8296, PageRank Score: 3.67e-03
Node: 9167, PageRank Score: 3.56e-03
Node: 8735, PageRank Score: 3.56e-03
Node: 41, PageRank Score: 2.99e-03
Node: 8624, PageRank Score: 2.85e-03
Node: 8261, PageRank Score: 2.80e-03
Node: 8424, PageRank Score: 2.66e-03
Node: 9073, PageRank Score: 2.62e-03
Node: 4086, PageRank Score: 2.53e-03
Node: 8589, PageRank Score: 2.47e-03
Node: 9072, PageRank Score: 2.43e-03
Node: 8578, PageRank Score: 2.42e-03
Node: 9049, PageRank Score: 2.40e-03
Node: 8576, PageRank Score: 2.32e-03
Node: 8251, PageRank Score: 2.30e-03
Node: 8371, PageRank Score: 2.14e-03
Node: 8242, PageRank Score: 2.13e-03
Node: 84

In [35]:
top_30_percent_nodes[node]

TypeError: list indices must be integers or slices, not numpy.float64

In [37]:
type(top_30_percent_nodes)

list