In [1]:
# scratch_location = r'/scratch/hmnshpl'
import getpass
import pandas as pd
import numpy as np
import networkx as nx

dataset_name = 'wikipedia'
scratch_location = rf'/scratch/{getpass.getuser()}'

In [2]:
from copy import deepcopy
import os
import sys

In [3]:
# Load data and train val test split
graph_df = pd.read_csv('{}/processed_data/{}/ml_{}.csv'.format(scratch_location,
                                                            dataset_name,
                                                            dataset_name)
                    )
edge_raw_features = np.load('{}/processed_data/{}/ml_{}.npy'.format(scratch_location,
                                                                    dataset_name,
                                                                    dataset_name)
                            )
node_raw_features = np.load('{}/processed_data/{}/ml_{}_node.npy'.format(scratch_location,
                                                                        dataset_name,
                                                                        dataset_name)
                            )

In [4]:
# Extract nodes, edges, and timestamps
edges = graph_df[['u', 'i', 'ts']].values

In [5]:
# get the timestamp of validate and test set
val_ratio = test_ratio = 0.495
val_time, test_time = list(np.quantile(graph_df.ts, [(1 - val_ratio - test_ratio), (1 - test_ratio)]))

In [None]:
# Split data into train, val, test (assuming you have timestamps for this)
train_edges = edges[edges[:, 2] < val_time]
val_edges = edges[(edges[:, 2] >= val_time) & (edges[:, 2] < test_time)]
test_edges = edges[edges[:, 2] >= test_time]

In [None]:
# Create a bipartite temporal graph
G = nx.Graph()
for edge in train_edges:
    source, target, timestamp = edge
    G.add_edge(source, target, timestamp=timestamp)

In [None]:
# Initialize PageRank scores
page_rank_scores = {node: 1.0 / G.number_of_nodes() for node in G.nodes()}
init_page_rank_scores = deepcopy(page_rank_scores)

In [None]:
# Parameters for Temporal PageRank
alpha = 0.85  # Damping factor
max_iter = 100
tol = 1e-6

def temporal_page_rank(G, alpha=0.85, max_iter=100, tol=1e-6):
    nodes = G.nodes()
    num_nodes = G.number_of_nodes()
    
    # Initialize PageRank scores
    pr = {node: 1.0 / num_nodes for node in nodes}
    temp_pr = pr.copy()

    for _ in range(max_iter):
        change = 0
        for node in nodes:
            rank_sum = sum(pr[neighbor] / len(G[neighbor]) for neighbor in G.neighbors(node) if 'timestamp' in G[node][neighbor])
            temp_pr[node] = (1 - alpha) / num_nodes + alpha * rank_sum
        
        # Calculate change for convergence check
        change = sum(abs(temp_pr[node] - pr[node]) for node in nodes)
        
        if change < tol:
            break
        
        pr = temp_pr.copy()

    return pr

In [None]:
# Calculate Temporal PageRank
page_rank_scores = temporal_page_rank(G, alpha=0.85, max_iter=100, tol=1e-6)

# Output the PageRank scores
# print(page_rank_scores)

In [None]:
# Sort nodes by PageRank scores
sorted_nodes = sorted(page_rank_scores.items(), key=lambda item: item[1], reverse=True)

# Calculate the top 30% of nodes
top_30_percent_count = int(len(sorted_nodes) * 0.30)
top_30_percent_nodes = sorted_nodes[:top_30_percent_count]

In [None]:
len(top_30_percent_nodes), len(page_rank_scores)

In [None]:
100* (2242/7475)

In [None]:
# Output the top 30% of nodes and their PageRank scores
for node, score in top_30_percent_nodes:
    print(f'Node: {node:.0f}, PageRank Score: {score:.2e}, initial page_rank: {init_page_rank_scores[node]:.2e}')

In [None]:
# assert 0==1

In [70]:
# Set the working directory to the project root
project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))
sys.path.append(project_root)

In [71]:
# os.getcwd()
project_root

'/home2/hmnshpl/projects/DyGLib'

In [None]:
# from TPR.temporal_page_rank import TemporalPageRankComputer, TemporalPageRankParams

In [72]:
from TPR import temporal_pagerank

In [73]:
# Extract nodes, edges, and timestamps
edges = graph_df[['u', 'i', 'ts']].values
nodes = np.unique(edges[:, :2])  # Get unique nodes from edges

In [74]:
# Temporal PageRank Parameters
alpha = 0.85
beta = 0.1
params = temporal_pagerank.TemporalPageRankParams(alpha, beta)

In [75]:
# Initialize TemporalPageRankComputer
tpr_computer = temporal_pagerank.TemporalPageRankComputer(nodes, [params])

In [76]:
# Update PageRank Scores
for edge in edges:
    src, trg, timestamp = edge
    tpr_computer.update((src, trg))
    
# Function to convert results to dictionary
def get_pagerank_scores(tpr_computer):
    scores = {}
    for i, node in enumerate(tpr_computer.active_mass[:, 0]):
        scores[node] = tpr_computer.temp_pr[i, 1]
    return scores

In [77]:
# Get PageRank scores
page_rank_scores = get_pagerank_scores(tpr_computer)

In [78]:
page_rank_scores

{1.0: 0.15000000000000002,
 2.0: 42.74999999999977,
 3.0: 24.299999999999947,
 4.0: 28.34999999999991,
 5.0: 98.40000000000087,
 6.0: 40.64999999999979,
 7.0: 40.79999999999979,
 8.0: 6.600000000000005,
 9.0: 42.89999999999977,
 10.0: 8.70000000000001,
 11.0: 16.350000000000023,
 12.0: 0.7500000000000001,
 13.0: 41.84999999999978,
 14.0: 22.649999999999963,
 15.0: 7.350000000000007,
 16.0: 1.35,
 17.0: 16.200000000000024,
 18.0: 88.20000000000049,
 19.0: 25.04999999999994,
 20.0: 0.30000000000000004,
 21.0: 0.30000000000000004,
 22.0: 0.6000000000000001,
 23.0: 19.049999999999997,
 24.0: 1.0500000000000003,
 25.0: 16.65000000000002,
 26.0: 10.500000000000014,
 27.0: 1.5,
 28.0: 44.24999999999976,
 29.0: 1.2000000000000002,
 30.0: 0.15000000000000002,
 31.0: 0.6000000000000001,
 32.0: 1.2000000000000002,
 33.0: 0.15000000000000002,
 34.0: 3.449999999999999,
 35.0: 0.7500000000000001,
 36.0: 1.0500000000000003,
 37.0: 49.799999999999706,
 38.0: 0.45000000000000007,
 39.0: 0.1500000000000

In [None]:
# Sort nodes by PageRank scores
sorted_nodes = sorted(page_rank_scores.items(), key=lambda item: item[1], reverse=True)

# Calculate the top 30% of nodes
new_top_30_percent_count = int(len(sorted_nodes) * 0.30)
new_top_30_percent_count = sorted_nodes[:new_top_30_percent_count]


In [None]:
# Output the top 30% of nodes and their PageRank scores
for node, score in top_30_percent_nodes:
    print(f'Node: {node:.0f}, PageRank Score: {score:.2e}') 

In [None]:
top_30_percent_nodes[node]

In [None]:
type(top_30_percent_nodes)

### Incemental code check

In [6]:
graph_df.head()

Unnamed: 0.1,Unnamed: 0,u,i,ts,label,idx
0,0,1,8228,0.0,0.0,1
1,1,2,8229,36.0,0.0,2
2,2,2,8229,77.0,0.0,3
3,3,3,8230,131.0,0.0,4
4,4,2,8229,150.0,0.0,5


In [7]:
train_df = graph_df[graph_df['ts']<val_time]
train_df.head()

Unnamed: 0.1,Unnamed: 0,u,i,ts,label,idx
0,0,1,8228,0.0,0.0,1
1,1,2,8229,36.0,0.0,2
2,2,2,8229,77.0,0.0,3
3,3,3,8230,131.0,0.0,4
4,4,2,8229,150.0,0.0,5


In [8]:
len(train_df)

1575

In [9]:
# Set the working directory to the project root
project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')) # this might cause issue
sys.path.append(project_root)

In [10]:
from preprocess_data import temporal_pr

In [11]:
tpr_scores_ts = temporal_pr.calc_inc_timestamp_pagerank(train_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['u'] = df['u'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['i'] = df['i'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ts'] = pd.to_datetime(df['ts'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http

Calculating PageRank: 100%|██████████| 1543/1543 [54:07<00:00,  2.10s/it]


In [12]:
len(tpr_scores_ts[0]), len(tpr_scores_ts.keys())

KeyError: 0

In [22]:
list(tpr_scores_ts.keys())[0]

Timestamp('1970-01-01 00:00:00')

In [24]:
tpr_scores_ts[list(tpr_scores_ts.keys())[3]]

{1.0: 0.0013928926949274005,
 8228.0: 0.002639169402188598,
 2.0: 0.0018083182640144667,
 8229.0: 0.0018083182640144667,
 3.0: 0.0018083182640144667,
 8230.0: 0.0018083182640144667,
 4.0: 0.0018083182640144667,
 8231.0: 0.0018083182640144667,
 5.0: 0.0018083182640144667,
 8232.0: 0.0018083182640144667,
 6.0: 0.0018083182640144667,
 8233.0: 0.0018083182640144667,
 7.0: 0.0018083182640144667,
 8234.0: 0.0018083182640144667,
 8.0: 0.0013928926949274005,
 9.0: 0.0017165374808758454,
 8235.0: 0.0016840327213946938,
 10.0: 0.0010269718440878144,
 8236.0: 0.004445439519215776,
 11.0: 0.0011445476487514708,
 8237.0: 0.0020548260593158943,
 12.0: 0.0018083182640144667,
 8238.0: 0.0018083182640144667,
 13.0: 0.0018083182640144667,
 8239.0: 0.0018083182640144667,
 14.0: 0.0018083182640144667,
 8240.0: 0.0018083182640144667,
 15.0: 0.0011612577928739702,
 8241.0: 0.0020941397013415504,
 16.0: 0.0018083182640144667,
 8242.0: 0.0018083182640144667,
 17.0: 0.0016267359877439034,
 8243.0: 0.0024667624

In [62]:
len(train_df)
len(train_df['u'].unique()), len(train_df['i'].unique()), len(train_df['ts'].unique()), len(train_df['u'].unique()) + len(train_df['i'].unique())

(324, 229, 1543, 553)

In [45]:
_df = pd.DataFrame.from_dict(tpr_scores_ts, orient='index')

In [43]:
_df.index[0].astype(str)

AttributeError: 'Timestamp' object has no attribute 'astype'

In [47]:
_df.index = _df.index.astype(str)
_df.head()

Unnamed: 0,1.0,8228.0,2.0,8229.0,3.0,8230.0,4.0,8231.0,5.0,8232.0,...,315.0,316.0,317.0,318.0,319.0,320.0,321.0,322.0,323.0,324.0
1970-01-01 00:00:00.000000000,0.001393,0.002639,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,...,0.000993,0.00103,0.001187,0.000993,0.001047,0.001086,0.001185,0.000993,0.001393,0.001165
1970-01-01 00:00:00.000000036,0.001393,0.002639,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,...,0.000993,0.00103,0.001187,0.000993,0.001047,0.001086,0.001185,0.000993,0.001393,0.001165
1970-01-01 00:00:00.000000077,0.001393,0.002639,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,...,0.000993,0.00103,0.001187,0.000993,0.001047,0.001086,0.001185,0.000993,0.001393,0.001165
1970-01-01 00:00:00.000000131,0.001393,0.002639,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,...,0.000993,0.00103,0.001187,0.000993,0.001047,0.001086,0.001185,0.000993,0.001393,0.001165
1970-01-01 00:00:00.000000150,0.001393,0.002639,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,0.001808,...,0.000993,0.00103,0.001187,0.000993,0.001047,0.001086,0.001185,0.000993,0.001393,0.001165


In [64]:
tpr_scores_ts_ = temporal_pr.temporal_page_rank(temporal_pr.build_graph(train_df))
tpr_scores_ts = temporal_pr.optimized_temporal_page_rank(temporal_pr.build_graph(train_df))

In [65]:
tpr_scores_ts

array([2.89164558e+05, 3.40193598e+05, 1.80831826e-03, 1.80831826e-03,
       1.80831826e-03, 1.80831826e-03, 1.80831826e-03, 1.80831826e-03,
       1.80831826e-03, 1.80831826e-03, 1.80831826e-03, 1.80831826e-03,
       1.80831826e-03, 1.80831826e-03, 2.89164558e+05, 1.35716594e+44,
       4.42422552e+44, 2.61121150e+44, 9.40482626e+44, 1.51714041e+22,
       2.85452139e+22, 1.80831826e-03, 1.80831826e-03, 1.80831826e-03,
       1.80831826e-03, 1.80831826e-03, 1.80831826e-03, 6.64022808e+22,
       1.82040442e+23, 1.80831826e-03, 1.80831826e-03, 1.66671611e+46,
       1.42655983e+46, 9.37637429e+46, 2.73498882e+46, 2.71017994e+45,
       9.74975643e+45, 2.46106556e+30, 8.33541910e+29, 2.89164558e+05,
       3.40193598e+05, 1.79164525e+44, 6.44490099e+44, 3.22396556e+46,
       8.69854664e+45, 4.56874339e+46, 1.80831826e-03, 1.80831826e-03,
       2.89164558e+05, 3.40193598e+05, 2.89164558e+05, 3.40193598e+05,
       1.80831826e-03, 1.80831826e-03, 1.80831826e-03, 1.80831826e-03,
      

In [61]:
len(tpr_scores_ts)

553

In [66]:
tpr_df = pd.DataFrame.from_dict(tpr_scores_ts_, orient='index')

In [69]:
tpr_df.sort_index().to_csv('./TPR_basic.csv')

In [None]:
edges = train_df[['u', 'i', 'ts']].values

In [None]:
type(edges)

In [None]:
np.unique(edges[:, 2])

In [None]:
np.random.choice(train_df['ts'])

In [None]:
_ts =  np.random.choice(train_df['ts']) # 1835832.0

In [None]:
edges_up_to_ts = edges[edges[:, 2] <= _ts]