In [1]:
# scratch_location = r'/scratch/hmnshpl'
import getpass
import pandas as pd
import numpy as np
import networkx as nx
import copy
from collections import defaultdict
import heapq


dataset_name = 'wikipedia'
scratch_location = rf'/scratch/{getpass.getuser()}'

In [2]:
from copy import deepcopy
import os
import sys

## GET DATA

In [3]:
# Load data and train val test split
graph_df = pd.read_csv('{}/processed_data/{}/ml_{}.csv'.format(scratch_location,
                                                            dataset_name,
                                                            dataset_name)
                    )
edge_raw_features = np.load('{}/processed_data/{}/ml_{}.npy'.format(scratch_location,
                                                                    dataset_name,
                                                                    dataset_name)
                            )
node_raw_features = np.load('{}/processed_data/{}/ml_{}_node.npy'.format(scratch_location,
                                                                        dataset_name,
                                                                        dataset_name)
                            )

In [4]:
# Set the working directory to the project root
project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')) # this might cause issue
sys.path.append(project_root)

## TPR CALCULATION

### METHOD1 - 

In [None]:
from preprocess_data import temporal_pr


tpr_scores_ts_ = temporal_pr.temporal_page_rank(temporal_pr.build_graph(graph_df))

In [None]:
# pd.DataFrame.from_dict(tpr_scores_ts_, orient='index').sort_index().to_csv('./TPR_basic.csv')

In [None]:
# Sort nodes by PageRank scores
sorted_nodes = sorted(tpr_scores_ts_.items(), key=lambda item: item[1], reverse=True)
sorted_nodes

### METHOD2

In [None]:
from TPR import temporal_pagerank

In [None]:
# Extract nodes, edges, and timestamps
edges = graph_df[['u', 'i', 'ts']].values
nodes = np.unique(edges[:, :2])  # Get unique nodes from edges

# Temporal PageRank Parameters
alpha = 0.85
beta = 0.1
params = temporal_pagerank.TemporalPageRankParams(alpha, beta)

# Initialize TemporalPageRankComputer
tpr_computer = temporal_pagerank.TemporalPageRankComputer(nodes, [params])

In [None]:
# Update PageRank Scores
for edge in edges:
    src, trg, timestamp = edge
    tpr_computer.update((src, trg))
    
# Function to convert results to dictionary
def get_pagerank_scores(tpr_computer):
    scores = {}
    for i, node in enumerate(tpr_computer.active_mass[:, 0]):
        scores[node] = tpr_computer.temp_pr[i, 1]
    return scores

In [None]:
# Get PageRank scores
page_rank_scores = get_pagerank_scores(tpr_computer)

In [None]:
pd.DataFrame.from_dict(page_rank_scores, orient='index').sort_index().to_csv('./TPR_classIimplementation.csv')

In [None]:
# Sort nodes by PageRank scores
sorted_nodes_method_2 = sorted(page_rank_scores.items(), key=lambda item: item[1], reverse=True)
sorted_nodes_method_2

In [None]:
[i[0] for i in sorted_nodes_method_2[:5]]

In [None]:
selected_df = graph_df[graph_df['u'].isin([i[0] for i in sorted_nodes_method_2[:5]])]

In [None]:
# selected_df.to_csv('./selected_df.csv')

In [None]:
graph_df[graph_df['i'] == 8735.0]

In [None]:
len(graph_df['u'].unique()), len(graph_df['i'].unique())

In [None]:
len(sorted_nodes_method_2)

### Method2 - Calculating Incremental TPR at each timestep

In [None]:
# Extract nodes, edges, and timestamps
edges = graph_df[['u', 'i', 'ts']].values
nodes = np.unique(edges[:, :2])  # Get unique nodes from edges

In [None]:
# Temporal PageRank Parameters
from TPR import temporal_pagerank


alpha = 0.85
beta = 0.1
params = temporal_pagerank.TemporalPageRankParams(alpha, beta)

# Initialize TemporalPageRankComputer
tpr_computer = temporal_pagerank.TemporalPageRankComputer(nodes, [params])

In [None]:
# Update PageRank Scores
for edge in edges:
    src, trg, timestamp = edge
    tpr_computer.update((src, trg), timestamp)

In [None]:
ts_tpr_dict = tpr_computer.timestamps_pr

# ts_tpr_df = pd.DataFrame.from_dict(ts_tpr_dict, orient='index')

In [None]:
len(ts_tpr_dict), len(set(edges[:, 2]))

In [None]:
list(ts_tpr_dict.keys())[:5]

In [None]:
lst_of_list = []
index = []
cntr = 0
for key, value in ts_tpr_dict.items():
    if cntr == 100:
        break
    # print(key, value.shape, type(value))
    index.append(int(key))
    lst_of_list.append([list(_v)[0] for _v in value])
    # cntr +=1

In [None]:
# [list(_v)[0] for _v in value]

In [None]:
ts_tpr_df = pd.DataFrame(lst_of_list, index=index)

In [None]:
ts_tpr_df.to_csv('./TS_TPR.csv')

In [None]:
ts_tpr_df.loc[:, 5].unique()

In [None]:
pd.DataFrame.from_dict(page_rank_scores, orient='index').T

In [None]:
assert 0 == 1, 'break here'

In [None]:
# Create a DataFrame from the timestamps_pr dictionary
# The keys (timestamps) will become the index of the DataFrame
# The values (PageRank scores) will become the rows of the DataFrame

pr_df = pd.DataFrame.from_dict(ts_tpr_dict, orient='index')

# Reset the index to make timestamp a column if needed
pr_df.reset_index(inplace=True)
pr_df.rename(columns={'index': 'timestamp'}, inplace=True)

In [None]:
# ts_tpr_dict[None]
# list(ts_tpr_dict.keys())
list(ts_tpr_dict.values())

In [None]:
timestamps_pr = tpr_computer.timestamps_pr

# Flatten the dictionary into a list of tuples
flattened_data = []

for timestamp, scores in timestamps_pr.items():
    for node_index, score in enumerate(scores):
        flattened_data.append((timestamp, node_index, score))

# Create a DataFrame from the flattened data
pr_df = pd.DataFrame(flattened_data, columns=['timestamp', 'node', 'pagerank_score'])

print(pr_df)

In [None]:
pd.DataFrame(index=list(ts_tpr_dict.keys()), columns=(ts_tpr_dict.values()))

In [None]:
# edges
# nodes
'''
create

'''

### From Scratch Implementation

In [None]:
import numpy as np
from collections import defaultdict

def temporal_pagerank(E, beta, alpha):
    r = defaultdict(float)
    s = defaultdict(float)

    for (u, v, t) in E:
        r[u] += (1 - alpha)
        s[u] += (1 - alpha)
        r[v] += s[u] * alpha
        
        if beta > 0 and beta < 1:
            s[v] += s[u] * (1 - beta) * alpha
            s[u] *= beta
        elif beta == 1:
            s[v] += s[u] * alpha
            s[u] = 0

    # Normalize r
    total_r = sum(r.values())
    for key in r:
        r[key] /= total_r

    return r


In [None]:
# Extract nodes, edges, and timestamps
edges = graph_df[['u', 'i', 'ts']].values
nodes = np.unique(edges[:, :2])  # Get unique nodes from edges

# Convert E to a more readable format if needed
edges_new = [(int(u), int(v), float(t)) for u, v, t in edges]

beta = 0.85
alpha = 0.15

r = temporal_pagerank(edges, beta, alpha)
print(r)

In [None]:
# edges_new
# type(edges)

In [None]:
def temporal_pagerank(E, beta, alpha, check_evolution = False):
    r = defaultdict(float)
    s = defaultdict(float)
    
    ts_tpr = {} if check_evolution else None
    
    if check_evolution:
        ts_tpr['None'] = r
    
    # Sort edges by time
    # E.sort(key=lambda x: x[2])
    sorted(edges, key=lambda x: x[2])
    
    for (u, v, t) in E:
        r[u] += (1 - alpha)
        s[u] += (1 - alpha)
        r[v] += s[u] * alpha
        
        if beta > 0 and beta < 1:
            s[v] += s[u] * (1 - beta) * alpha
            s[u] *= beta
        elif beta == 1:
            s[v] += s[u] * alpha
            s[u] = 0
        
        if check_evolution:
            ts_tpr[t] = r

    # Normalize r
    total_r = sum(r.values())
    for key in r:
        r[key] /= total_r
        
    return r, ts_tpr

In [None]:
# Extract nodes, edges, and timestamps
edges = graph_df[['u', 'i', 'ts']].values
nodes = np.unique(edges[:, :2])  # Get unique nodes from edges

# Convert E to a more readable format if needed
edges_new = [(int(u), int(v), float(t)) for u, v, t in edges]

beta = 0.85
alpha = 0.15

r2, ts_tpr= temporal_pagerank(edges, beta, alpha, True)
print(r2)

In [None]:
for key , _ in r.items():
    if r[key] != r2[key]:
        print(key)

In [None]:
# sorted(edges, key=lambda x: x[2])

len(ts_tpr)

In [None]:
len(set(edges[:, 2]))

In [None]:
# ts_tpr

In [None]:
list(ts_tpr.keys())[:5]

In [None]:
# Construct a new dictionary with outer keys as keys and inner dictionaries as values
new_dict = {outer_key: {inner_key: value} for outer_key, inner_dict in ts_tpr.items() for inner_key, value in inner_dict.items()}

# Create a DataFrame from the new dictionary
df = pd.DataFrame(new_dict)

# Transpose the DataFrame to align with the desired structure
df = df.T

# Set the index to the outer keys
# df.index = df.index.names = ['OuterKey']
# Correctly rename the index level
df.rename_axis('OuterKey', inplace=True)

# Limit to the first 10 columns
df = df.iloc[:, :10]

print(df)

In [None]:
ts_tpr_df = pd.DataFrame()
lst_lst = []
ts_lst = []

for ts, values in ts_tpr.items():
    lst_lst.append((pd.DataFrame.from_dict(ts_tpr['None'],
                        orient='index')
                        .sort_index()
                        .T
                        .iloc[0, :]
                        ).to_list())
    ts_lst.append(ts)

In [None]:
# ((pd.DataFrame.from_dict(ts_tpr['None'],
#                         orient='index')
#                         .sort_index()
#                         .T
#                         .iloc[0, :]
#                         ).to_list())

In [None]:
pd.DataFrame(lst_lst, index=ts_lst)

In [None]:
def temporal_pagerank(E, beta, alpha, check_evolution=False):
    r = defaultdict(float)
    s = defaultdict(float)
    
    ts_tpr = {} if check_evolution else None
    
    if check_evolution:
        ts_tpr['None'] = copy.deepcopy(r)
    
    # Sort edges by time
    E = sorted(E, key=lambda x: x[2]) # not necessary
    
    for (u, v, t) in E:
        r[u] += (1 - alpha)
        s[u] += (1 - alpha)
        r[v] += s[u] * alpha
        
        if beta > 0 and beta < 1:
            s[v] += s[u] * (1 - beta) * alpha
            s[u] *= beta
        elif beta == 1:
            s[v] += s[u] * alpha
            s[u] = 0
        
        if check_evolution:
            ts_tpr[t] = copy.deepcopy(r)

    # Normalize r
    total_r = sum(r.values())
    for key in r:
        r[key] /= total_r
        
    return r, ts_tpr

In [None]:
# Extract nodes, edges, and timestamps
edges = graph_df[['u', 'i', 'ts']].values
nodes = np.unique(edges[:, :2])  # Get unique nodes from edges

# Convert E to a more readable format if needed
edges_new = [(int(u), int(v), float(t)) for u, v, t in edges]

beta = 0.85
alpha = 0.15

In [None]:
r2, ts_tpr= temporal_pagerank(edges, beta, alpha, True)
print(r2)

In [None]:
import json

# Step 2: Dump the dictionary to a JSON file
with open(f'{scratch_location}/TS_TPR_real.json', 'w') as json_file:
    json.dump(ts_tpr, json_file, indent=4)

In [None]:
for cntr, i, j in enumerate(ts_tpr.items()):
    print(i, j)
    if cntr == 5:
        break

In [None]:
len(ts_tpr.items())

In [None]:
list(ts_tpr.keys())

In [None]:
graph_df.head()

In [None]:
ts_tpr[0.0], ts_tpr[36.0], ts_tpr[77.0], ts_tpr[131.0], ts_tpr[150.0]

### Optimized AF

In [None]:
from collections import defaultdict
import heapq

def temporal_pagerank_heap(E, beta, alpha, check_evolution=False):
    r = defaultdict(float)
    s = defaultdict(float)
    ts_tpr = {} if check_evolution else None
    
    # Use a heap to efficiently process edges in time order
    heapq.heapify(E)
    
    while E:
        u, v, t = heapq.heappop(E)
        
        # Update r and s values
        r[u] += 1 - alpha
        s[u] += 1 - alpha
        r[v] += s[u] * alpha
        
        if beta < 1:
            s[v] += s[u] * (1 - beta) * alpha
            s[u] *= beta
        else:
            s[v] += s[u] * alpha
            s[u] = 0
        
        # Store evolution if required
        if check_evolution:
            ts_tpr[t] = dict(r)  # Store only r values
    
    # Normalize r
    total_r = sum(r.values())
    for key in r:
        r[key] /= total_r
    
    return r, ts_tpr

In [None]:
# Extract nodes, edges, and timestamps
edges = graph_df[['u', 'i', 'ts']].values
nodes = np.unique(edges[:, :2])  # Get unique nodes from edges

# Convert E to a more readable format if needed
edges_new = [(int(u), int(v), float(t)) for u, v, t in edges]

beta = 0.85
alpha = 0.15

In [None]:
r2, ts_tpr= temporal_pagerank_heap(edges_new, beta, alpha, True)
print(r2)

In [None]:
list(ts_tpr.keys())[:5]

In [None]:
# Collect rows in a list
rows = []

# Iterate over the first 5 keys in ts_tpr
for key in list(ts_tpr.keys())[:5]:
    row = ts_tpr[key]
    # row_df = pd.DataFrame.from_dict(row, orient='index').T
    # row_df['timestamp'] = key  # Add the timestamp as a column
    rows.append(row)

# # Concatenate all rows into a single DataFrame
# df = pd.concat(rows, ignore_index=True)

# # If needed, move the timestamp column to the front
# cols = df.columns.tolist()
# cols = [cols[-1]] + cols[:-1]
# df = df[cols]

# df

In [None]:
rows[0]

In [None]:
def temporal_pagerank_heap_np(E, beta, alpha, check_evolution=False):
    # Convert edges to a NumPy array
    E = np.array(E, dtype=[('u', int), ('v', int), ('t', float)])
    
    # Get unique nodes and create a mapping
    unique_nodes = np.unique(np.concatenate((E['u'], E['v'])))
    node_to_index = {node: idx for idx, node in enumerate(unique_nodes)}
    n = len(unique_nodes)
    
    # Initialize r and s arrays
    r = np.zeros(n)
    s = np.zeros(n)
    
    ts_tpr = [] if check_evolution else None
    
    # Use a heap to efficiently process edges in time order
    heap = [(t, u, v) for u, v, t in E]
    heapq.heapify(heap)
    
    while heap:
        t, u, v = heapq.heappop(heap)
        
        # Get node indices
        u_idx = node_to_index[u]
        v_idx = node_to_index[v]
        
        # Update r and s values
        delta = 1 - alpha
        r[u_idx] += delta
        s[u_idx] += delta
        r[v_idx] += s[u_idx] * alpha
        
        if beta < 1:
            s_v_increment = s[u_idx] * (1 - beta) * alpha
            s[v_idx] += s_v_increment
            s[u_idx] *= beta
        else:
            s[v_idx] += s[u_idx] * alpha
            s[u_idx] = 0
        
        # Store evolution if required
        if check_evolution:
            ts_tpr.append((t, r.copy()))  # Store r values at current timestamp
    
    # Normalize r
    total_r = r.sum()
    if total_r > 0:
        r /= total_r
    
    if check_evolution:
        ts_tpr = np.array(ts_tpr, dtype=[('t', float), ('r', float, n)])
    
    return r, ts_tpr

# Example usage
E = [(1, 2, 0.1), (2, 3, 0.2), (1, 3, 0.3)]
beta = 0.5
alpha = 0.85
r, ts_tpr = temporal_pagerank_heap_np(E, beta, alpha, check_evolution=True)
print("Final PageRank values:", r)
print("Temporal PageRank values:", ts_tpr)


In [None]:
# Extract nodes, edges, and timestamps
edges = graph_df[['u', 'i', 'ts']].values
nodes = np.unique(edges[:, :2])  # Get unique nodes from edges

# Convert E to a more readable format if needed
edges_new = [(int(u), int(v), float(t)) for u, v, t in edges]

beta = 0.85
alpha = 0.15

In [None]:
r2, ts_tpr= temporal_pagerank_heap_np(edges_new, beta, alpha, True)
print(r2)

In [None]:
# edges_new
len(r2)

In [None]:
ts_lst = []
rows = []
for ts, row in ts_tpr:
    ts_lst.append(ts)
    rows.append(row)
    

In [None]:
rows[:5]

In [None]:
# np.argmax(rows[2])
# above implementation makes a node point to a numbered index, I want to update the same index in numpy array as the node instance. i.e. if node lets say is 8223 
# then I want to update the 8223rd index of numpy array with correct tpr value. This way I can have the correct node that I need to remove from the list.
#

In [None]:
def temporal_pagerank_heap_np(E, beta, alpha, check_evolution=False):
    # Convert edges to a NumPy array
    E = np.array(E, dtype=[('u', int), ('v', int), ('t', float)])
    
    # Get the maximum node index to size the r and s arrays appropriately
    max_node = max(E['u'].max(), E['v'].max())
    
    # Initialize r and s arrays
    r = np.zeros(max_node + 1)
    s = np.zeros(max_node + 1)
    
    ts_tpr = [] if check_evolution else None
    
    # Use a heap to efficiently process edges in time order
    heap = [(t, u, v) for u, v, t in E]
    heapq.heapify(heap)
    
    while heap:
        t, u, v = heapq.heappop(heap)
        
        # Update r and s values
        delta = 1 - alpha
        r[u] += delta
        s[u] += delta
        r[v] += s[u] * alpha
        
        if beta < 1:
            s_v_increment = s[u] * (1 - beta) * alpha
            s[v] += s_v_increment
            s[u] *= beta
        else:
            s[v] += s[u] * alpha
            s[u] = 0
        
        # Store evolution if required
        if check_evolution:
            # ts_tpr.append((t, r.copy()))  # Store r values at current timestamp
            # Normalize r before appending
            total_r = r.sum()
            if total_r > 0:
                ts_tpr.append((t, r.copy() / total_r))
    
    # Normalize r
    total_r = r.sum()
    if total_r > 0:
        r /= total_r
    
    if check_evolution:
        ts_tpr = np.array(ts_tpr, dtype=[('t', float), ('r', float, max_node + 1)])
    
    return r, ts_tpr

# Example usage
E = [(1, 2, 0.1), (2, 3, 0.2), (1, 3, 0.3),]
beta = 0.5
alpha = 0.85
r, ts_tpr = temporal_pagerank_heap_np(E, beta, alpha, check_evolution=True)
print("Final PageRank values:", r)
print("Temporal PageRank values:", ts_tpr)


In [None]:
# Extract nodes, edges, and timestamps
edges = graph_df[['u', 'i', 'ts']].values
nodes = np.unique(edges[:, :2])  # Get unique nodes from edges

# Convert E to a more readable format if needed
edges_new = [(int(u), int(v), float(t)) for u, v, t in edges]

beta = 0.85
alpha = 0.15

In [None]:
r2, ts_tpr= temporal_pagerank_heap_np(edges_new, beta, alpha, True)
print(r2)

In [None]:
ts_lst = []
rows = []
for ts, row in ts_tpr:
    ts_lst.append(ts)
    rows.append(row)

row

In [None]:
len(ts_lst), len(rows)

In [None]:
# pd.DataFrame()

In [None]:
# Create a DataFrame
df = pd.DataFrame(rows, index=ts_lst)

# Optionally, reset the index to make timestamp a column
df.reset_index(inplace=True)
df.rename(columns={'index': 'timestamp'}, inplace=True)

df.head()

In [None]:
df.tail()

In [None]:
# check if TPR is strictly increasing
check_df = df.copy(True)
check_df.head()

In [None]:
# def dataframe_size_gb(df):
#     # Get the memory usage of the DataFrame in bytes
#     memory_bytes = df.memory_usage(deep=True).sum()
    
#     # Convert to gigabytes
#     memory_gb = memory_bytes / (1024 ** 3)
    
#     return memory_gb

# # Example usage
# size_gb = dataframe_size_gb(df)
# print(f"The DataFrame is using approximately {size_gb:.2f} GB of memory")

In [None]:
# df.to_csv(f'{scratch_location}/final_ts_tpr.csv')

In [None]:
from joblib import Parallel, delayed

# Transpose the DataFrame
df_transposed = check_df.T

# Convert DataFrame to NumPy array for faster processing
pagerank_matrix = df_transposed.values

def is_strictly_increasing(pageranks):
    return np.all(pageranks[:-1] < pageranks[1:])

# Parallel processing
def process_node(node_idx):
    pageranks = pagerank_matrix[node_idx]
    return df_transposed.index[node_idx] if is_strictly_increasing(pageranks) else None

# Parallelize the node processing
results = Parallel(n_jobs=-1)(delayed(process_node)(i) for i in range(len(df_transposed)))

# Filter out None values and convert to DataFrame
strictly_increasing_nodes = [res for res in results if res is not None]
result_df = pd.DataFrame(strictly_increasing_nodes, columns=['node'])

In [None]:
# result_df

# Sort ts_tpr by timestamp if not already sorted
# ts_tpr.sort(key=lambda x: x[0])
sorted(ts_tpr, key=lambda x: x[0])

# Extract timestamps and PageRank values
timestamps, pagerank_arrays = zip(*ts_tpr)
timestamps = np.array(timestamps)
pagerank_arrays = np.array(pagerank_arrays)

# Calculate mean shifts between consecutive timestamps
mean_shifts = []
for i in range(1, len(timestamps)):
    prev_pagerank = pagerank_arrays[i - 1]
    curr_pagerank = pagerank_arrays[i]
    mean_shift = np.mean(np.abs(curr_pagerank - prev_pagerank))
    mean_shifts.append((timestamps[i], mean_shift))

# Identify timesteps with highest mean shift
mean_shifts.sort(key=lambda x: x[1], reverse=True)
top_mean_shifts = mean_shifts[:10]  # Top 10 timesteps with highest mean shift

# print("Timestamps with highest mean shift:")
for timestep, shift in top_mean_shifts:
    print(f"Timestamp: {timestep}, Mean Shift: {shift}")

In [None]:
# len(mean_shifts)

# result_df

# Sort ts_tpr by timestamp if not already sorted
# ts_tpr.sort(key=lambda x: x[0])
sorted(ts_tpr, key=lambda x: x[0])

# Extract timestamps and PageRank values
timestamps, pagerank_arrays = zip(*ts_tpr)
timestamps = np.array(timestamps)
pagerank_arrays = np.array(pagerank_arrays)

# Calculate mean shifts between consecutive timestamps
mean_shifts = []
for i in range(1, len(timestamps)):
    prev_pagerank_mean = np.mean(pagerank_arrays[i - 1])
    curr_pagerank_mean = np.mean(pagerank_arrays[i])
    mean_shift = (np.abs(prev_pagerank_mean - curr_pagerank_mean))
    mean_shifts.append((timestamps[i], mean_shift))

# Identify timesteps with highest mean shift
mean_shifts.sort(key=lambda x: x[1], reverse=True)
top_mean_shifts = mean_shifts[:10]  # Top 10 timesteps with highest mean shift

# print("Timestamps with highest mean shift:")
for timestep, shift in top_mean_shifts:
    print(f"Timestamp: {timestep}, Mean Shift: {shift}")

In [None]:
top_ts = [ts for ts, _ in top_mean_shifts]
# top_ts

graph_df[graph_df['ts'].isin(top_ts)]

In [None]:
# Sort ts_tpr by timestamp if not already sorted
# ts_tpr.sort(key=lambda x: x[0])
sorted(ts_tpr, key=lambda x: x[0])

# Extract timestamps and PageRank values
timestamps, pagerank_arrays = zip(*ts_tpr)
timestamps = np.array(timestamps)
pagerank_arrays = np.array(pagerank_arrays)

# Calculate mean shifts using vectorized operations
prev_pagerank = pagerank_arrays[:-1]
curr_pagerank = pagerank_arrays[1:]
mean_shifts = np.mean(np.abs(curr_pagerank - prev_pagerank), axis=1)

# Combine timestamps with their corresponding mean shifts
mean_shifts = np.column_stack((timestamps[1:], mean_shifts))

# Determine the threshold to remove top x% of timesteps
x = 10  # percentage
threshold_index = int(len(mean_shifts) * x / 100)

# Sort mean shifts in descending order and select the top x%
mean_shifts_sorted = mean_shifts[mean_shifts[:, 1].argsort()[::-1]]
top_mean_shifts = mean_shifts_sorted[:threshold_index]

# Extract the timestamps to remove
timestamps_to_remove = top_mean_shifts[:, 0]

# Filter out the timesteps to remove from ts_tpr
filtered_ts_tpr = [entry for entry in ts_tpr if entry[0] not in timestamps_to_remove]

# Convert back to array if needed
filtered_timestamps, filtered_pagerank_arrays = zip(*filtered_ts_tpr)
filtered_timestamps = np.array(filtered_timestamps)
filtered_pagerank_arrays = np.array(filtered_pagerank_arrays)

print("Filtered Timestamps:", filtered_timestamps)
print("Filtered PageRank Arrays:", filtered_pagerank_arrays)

In [None]:
# timestamps_to_remove

In [None]:
sorted(ts_tpr, key=lambda x: x[0])

# Extract timestamps and PageRank values
timestamps, pagerank_arrays = zip(*ts_tpr)
timestamps = np.array(timestamps)
pagerank_arrays = np.array(pagerank_arrays)

# Calculate mean shifts between consecutive timestamps
mean_shifts = []
for i in range(1, len(timestamps)):
    prev_pagerank = pagerank_arrays[i - 1]
    curr_pagerank = pagerank_arrays[i]
    mean_shift = np.mean(np.abs(curr_pagerank - prev_pagerank))
    mean_shifts.append((timestamps[i], mean_shift))

# Identify timesteps with highest mean shift
mean_shifts.sort(key=lambda x: x[1], reverse=True)
top_mean_shifts = mean_shifts[:10]  # Top 10 timesteps with highest mean shift

print("Timestamps with highest mean shift:")
for timestep, shift in top_mean_shifts:
    print(f"Timestamp: {timestep}, Mean Shift: {shift}")

In [None]:
# from preprocess_data import temporal_pr

# mean_shifts = temporal_pr.mean_shift_removal(graph_df)
# mean_shifts

In [None]:
! python ../train_link_prediction.py --dataset_name wikipedia --model_name TGN --patch_size 2 --max_input_sequence_length 64 --num_runs 5 --gpu 0 --sparsify True --strategy ts_tpr_remove_MSS --sampling_upto 0.9


### Distance Metric Test

In [5]:
from preprocess_data import temporal_pr

In [6]:
# Randomly sample 30% of the DataFrame
sampled_df = graph_df.sample(frac=0.3, random_state=42)
sampled_df.head()

Unnamed: 0.1,Unnamed: 0,u,i,ts,label,idx
56943,56943,401,9012,1013245.0,0.0,56944
48189,48189,125,8333,870111.0,0.0,48190
30351,30351,2784,8875,587471.0,0.0,30352
37283,37283,616,8973,690685.0,0.0,37284
45560,45560,3603,8785,833643.0,0.0,45561


In [15]:
# mean_shift, euclidean, jaccard, cosine
# jaccard throws error
mean_shifts = temporal_pr.compute_mean_shifts_with_metrics(sampled_df, metric='jaccard')
len(mean_shifts)

Starting mean shift and metrics computation...
Running temporal PageRank computation...
	 inside tpr heap method
	 heapify successful


	 out of loop.
Temporal PageRank computation completed.
Sorting timestamps...
Sorting completed.
Calculating mean shifts and distance metrics...


Processing for jaccard...: 100%|██████████| 47241/47241 [00:08<00:00, 5667.30it/s]

Mean shifts and metrics calculation completed.
Sorting by the selected metric completed.





47241

In [8]:
###  New test -  remove similar edges/timestep based on cosine similarity
# Logic - similar type of interactions would reinforce model's confidence about an edge in future. (i.e. target rote memorization)
# If I remove enough number of interactions then this would lead to model confused
# first need to check if interaction between same nodes have same cosine score

In [9]:
mean_shifts = temporal_pr.compute_mean_shifts_with_metrics(sampled_df, metric='cosine')
len(mean_shifts)

Starting mean shift and metrics computation...
Running temporal PageRank computation...
	 inside tpr heap method
	 heapify successful
	 out of loop.
Temporal PageRank computation completed.
Sorting timestamps...
Sorting completed.
Calculating mean shifts and distance metrics...


Processing for cosine...: 100%|██████████| 47241/47241 [00:02<00:00, 20807.19it/s]

Mean shifts and metrics calculation completed.
Sorting by the selected metric completed.





47241

In [20]:
sorted_mean_shifts = sorted(mean_shifts, key=lambda x: x[1], reverse=True)
sorted_mean_shifts

[(150.0, 0.5),
 (432.0, 0.3333333333333333),
 (563.0, 0.25),
 (578.0, 0.2),
 (742.0, 0.16666666666666666),
 (809.0, 0.14285714285714285),
 (854.0, 0.125),
 (1072.0, 0.1111111111111111),
 (1308.0, 0.1),
 (1314.0, 0.09090909090909091),
 (1451.0, 0.08333333333333333),
 (1543.0, 0.07692307692307693),
 (1771.0, 0.07142857142857142),
 (1805.0, 0.06666666666666667),
 (1925.0, 0.0625),
 (2053.0, 0.05714285714285714),
 (2079.0, 0.05405405405405406),
 (2377.0, 0.05128205128205128),
 (2509.0, 0.04878048780487805),
 (2588.0, 0.046511627906976744),
 (3123.0, 0.0425531914893617),
 (3226.0, 0.04081632653061224),
 (3314.0, 0.0392156862745098),
 (3504.0, 0.03773584905660377),
 (3520.0, 0.03636363636363636),
 (4432.0, 0.03508771929824561),
 (4487.0, 0.03389830508474576),
 (4508.0, 0.03278688524590164),
 (4709.0, 0.03125),
 (2022.0, 0.030303030303030304),
 (4909.0, 0.030303030303030304),
 (5705.0, 0.029411764705882353),
 (5798.0, 0.02857142857142857),
 (6112.0, 0.027777777777777776),
 (6123.0, 0.02702702

In [13]:
def is_strictly_increasing(pageranks):
    return np.all(pageranks[:-1] < pageranks[1:])

In [21]:
ts_list = [ts for ts, _ in sorted_mean_shifts]
is_strictly_increasing(ts_list)

True