In [1]:
# Preliminaries

# scratch_location = r'/scratch/hmnshpl'
import os
import sys
import heapq
import getpass
import numpy as np
import pandas as pd
import networkx as nx
from copy import deepcopy
from collections import defaultdict
from tqdm import tqdm

dataset_name = 'reddit'
scratch_location = rf'/scratch/{getpass.getuser()}'


## Load Data
# Load data and train val test split
graph_df = pd.read_csv('{}/processed_data/{}/ml_{}.csv'.format(scratch_location,
                                                            dataset_name,
                                                            dataset_name)
                    )
edge_raw_features = np.load('{}/processed_data/{}/ml_{}.npy'.format(scratch_location,
                                                                    dataset_name,
                                                                    dataset_name)
                            )
node_raw_features = np.load('{}/processed_data/{}/ml_{}_node.npy'.format(scratch_location,
                                                                        dataset_name,
                                                                        dataset_name)
                            )

# Set the working directory to the project root
project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')) # this might cause issue
sys.path.append(project_root)

In [2]:
from preprocess_data.temporal_pr import temporal_pagerank_with_timestamps, calc_timestamp_pagerank,\
    calc_inc_timestamp_pagerank, optimized_calc_inc_timestamp_pagerank,\
    get_temporal_pagerank, mean_shift_removal, mean_shift_removal2, compute_mean_shifts_with_metrics, \
    calculate_temporal_edge_rank, calculate_combined_temporal_edgerank

In [3]:
# get the timestamp of validate and test set
val_ratio = test_ratio = 0.15
print(val_ratio, test_ratio)
val_time, test_time = list(np.quantile(graph_df.ts, [(1 - val_ratio - test_ratio), (1 - test_ratio)]))
print(val_time, test_time)

0.15 0.15
1882469.2648 2261813.6575999996


In [4]:
train_graph_df = graph_df[graph_df['ts'] < val_time]
train_graph_df.head()

Unnamed: 0.1,Unnamed: 0,u,i,ts,label,idx
0,0,1,10001,0.0,0.0,1
1,1,2,10002,6.32,0.0,2
2,2,3,10003,7.026,0.0,3
3,3,4,10003,13.599,0.0,4
4,4,5,10004,16.811,0.0,5


In [5]:
print(len(train_graph_df))
print(len(train_graph_df) / len(graph_df))  # is 70% of the dataset

470713
0.7000001487106047


### Old Codes

In [None]:
tmp_graph = train_graph_df.copy(deep=True)
upto=0.7
metric='chebyshev'
# wasserstein, kl_divergence
# jensen_shannon_divergence -- 1m47s
# wasserstein -- 3m31s
# kl_divergence -- 32s


# tmp_graph = tmp_graph.sort_values(by=['u', 'i', 'ts'])

# # Exclude the first and last rows based on 'u' and 'i'
# grouped = tmp_graph.groupby(['u', 'i'])
# modified_df = grouped.apply(lambda x: x.iloc[1:-1]).reset_index(drop=True)

In [None]:
# total_groups = len(grouped)
# rows_removed = total_groups * 2 
# total_rows_original = len(tmp_graph)
# percentage_removed = (rows_removed / total_rows_original) * 100
# percentage_removed, len(modified_df) / len(tmp_graph)

# # Group by 'u' and 'i' and capture the first and last interactions
# first_interactions = grouped.first().reset_index()
# last_interactions = grouped.last().reset_index()

# (len(first_interactions) + len(last_interactions)) / len(tmp_graph)

In [None]:
# based on maximum mean shift strategy
mean_shifts = compute_mean_shifts_with_metrics(tmp_graph, metric=metric)

In [None]:
print(f'remove upto: {(1-upto):.2%}', 'length of mean shift is: ', len(mean_shifts), end=' ')

threshold_index = int(len(mean_shifts) * (1-upto))

print(f'{threshold_index=}')
top_mean_shifts = mean_shifts[:threshold_index]
print(len(top_mean_shifts), f'{len(top_mean_shifts) / len(mean_shifts):.2%}' )

top_x_percent_timestamps = [ts for ts, _ in top_mean_shifts]
print(len(top_x_percent_timestamps), f'{len(top_x_percent_timestamps) / len(train_graph_df["ts"]):.2%}')

# sampled_df = modified_df[~modified_df['ts'].isin(top_x_percent_timestamps)]
sampled_df = tmp_graph[~tmp_graph['ts'].isin(top_x_percent_timestamps)]
print(len(sampled_df['ts']), len(sampled_df['ts']) / len(tmp_graph['ts']))
print('data sampling successful.')

In [None]:
len(top_x_percent_timestamps) #  330
len(tmp_graph['ts'])  # 110232

len(set(tmp_graph['ts']).difference(set(top_x_percent_timestamps)))  # 106602
len(set(tmp_graph['ts']).difference(set(top_x_percent_timestamps))) / len(tmp_graph['ts'])  # 106602

In [None]:
filename = f'{scratch_location}/sparsified_data/{dataset_name}_{metric}_sparsified_{upto}.csv'
# sampled_df.drop(['Unnamed: 0'], axis=1).to_csv(filename)
print(filename)

In [None]:
assert 0 == 1

In [None]:
def sparsify_data(tmp_graph, metric, upto, dataset_name, save=False):
    # tmp_graph = tmp_graph.sort_values(by=['u', 'i', 'ts'])

    # # Exclude the first and last rows based on 'u' and 'i'
    # grouped = tmp_graph.groupby(['u', 'i'])
    # modified_df = grouped.apply(lambda x: x.iloc[1:-1]).reset_index(drop=True)
    
    # based on maximum mean shift strategy
    mean_shifts = compute_mean_shifts_with_metrics(tmp_graph, metric=metric)

    print('back to sparsify_data file....')

    threshold_index = int(len(mean_shifts) * (1-upto))
    top_mean_shifts = mean_shifts[:threshold_index]

    top_x_percent_timestamps = [ts for ts, _ in top_mean_shifts]

    sampled_df = tmp_graph[~tmp_graph['ts'].isin(top_x_percent_timestamps)]
    print('data sampling successful.')
    
    if save:
        filename = f'{scratch_location}/sparsified_data/{dataset_name}_{metric}_sparsified_{upto}.csv'
        sampled_df.drop(['Unnamed: 0'], axis=1).to_csv(filename)
        print(filename, ' saved.')
    return sampled_df

In [6]:
list_of_upto = [0.9, 0.8, 0.7]
metrics= ['ts_tpr_remove_combined_ter']  # ['cosine', 'euclidean', 'jaccard','kl_divergence', 'jensen_shannon_divergence', 'wasserstein']
tmp_graph = train_graph_df.copy(deep=True)

for upto in list_of_upto:
    print(f'{upto}')
    for metric in metrics:
        print(f'\t{metric}', end=' ')
        # sampled_df = sparsify_data(tmp_graph, metric, upto, dataset_name, save=True)
        ter_dict = calculate_combined_temporal_edgerank(tmp_graph, dataset_name=dataset_name)
        
        sorted_ter_dict = dict(sorted(ter_dict.items(), key=lambda x: x[1], reverse=True))
        sorted_ter_dict = list(sorted_ter_dict.items())
    
        print('back to sparsify_data file....')

        threshold_index = int(len(sorted_ter_dict) * (1-upto))

        top_mean_shifts = sorted_ter_dict[:threshold_index]

        top_x_percent_timestamps = [ts for ts, _ in top_mean_shifts]

        sampled_df = tmp_graph[~tmp_graph['ts'].isin(top_x_percent_timestamps)]
        sampled_df.drop(['Unnamed: 0'], axis=1).to_csv(filename)
        print('done')

0.9
In temporal_pagerank_heap_np dataset_name='reddit'
mmap_file='/scratch/hmnshpl/reddit_ts_tpr_data.dat'
	 inside tpr heap method
Precomputed memory-mapped array loaded.      
	 heapify successful
	 out of loop.
Calculated TPR      


Calculating cumulative outgoing degree:  25%|██▍       | 115/469 [01:29<05:40,  1.04it/s]

: 

In [None]:
import os
import numpy as np

# mmap_file = 'path/to/your/mmap_file'
mmap_file=f'{scratch_location}/{dataset_name}_ts_tpr_data.dat'

# Convert edges to a NumPy array
E = tmp_graph[['u', 'i', 'ts']]
E = np.array(E, dtype=[('u', int), ('v', int), ('t', float)])

# Get the maximum node index to size the r and s arrays appropriately
max_node = max(E['u'].max(), E['v'].max())

if os.path.exists(mmap_file):
    stats = os.stat(mmap_file)
    mmap_size = stats.st_size
    dtype = np.dtype([('t', float), ('r', float, max_node + 1)])
    dtype_size = dtype.itemsize
    data_size = mmap_size // dtype_size
    print(f"File size: {mmap_size/1024**3} Giga bytes")
    print(f"Data type size: {dtype_size} bytes")
    print(f"Number of items: {data_size}")

In [None]:
# import pandas as pd
# sampled_df=pd.read_csv(filename)
# # Remove columns with 'Unnamed:' in their name
# sampled_df = sampled_df.loc[:, ~sampled_df.columns.str.contains('^Unnamed')]
# sampled_df.head()

In [None]:
def generate_shell_script(result_path, save_folder, email, dataset_name, model_name, patch_size,
                        max_input_sequence_length, num_runs, gpu, sparsify, strategy, sampling_upto,
                        num_cpus, num_gpus, gnode_name):
    
    # set Default variables
    result_path = "/home2/hmnshpl/projects/results" if result_path is None else result_path
    save_folder = "DygLib" if save_folder is None else save_folder
    email = "himanshu.pal@research.iiit.ac.in" if email is None else email
    dataset_name = "wikipedia" if dataset_name is None else dataset_name
    model_name = "TGN" if model_name is None else model_name
    patch_size = 2 if patch_size is None else patch_size
    max_input_sequence_length = 64 if max_input_sequence_length is None else max_input_sequence_length
    num_runs = 5 if num_runs is None else num_runs
    gpu = 0 if gpu is None else gpu
    sparsify = True if sparsify is None else sparsify
    strategy = "ts_tpr_remove_cosine" if strategy is None else strategy
    sampling_upto = 0.7 if sampling_upto is None else sampling_upto
    num_cpus = 9 if num_cpus is None else num_cpus
    num_gpus = 1 if num_gpus is None else num_gpus
    if gnode_name is None or 'gnode' not in gnode_name:
        raise ValueError("Please provide a valid gnode.")

    # Generate shell script content
    script_content = f"""#!/bin/bash
#SBATCH -A research
#SBATCH -n {num_cpus}
#SBATCH --gres=gpu:{num_gpus}
#SBATCH --mem-per-cpu=2G
#SBATCH --output={result_path}/{save_folder}/Link_Prediciton_{strategy}_{sampling_upto}.txt
#SBATCH --nodelist {gnode_name}
#SBATCH --time=96:00:00
#SBATCH --mail-user={email}
#SBATCH --mail-type=ALL

source ~/.bashrc

conda activate tg

python train_link_prediction.py --dataset_name {dataset_name} --model_name {model_name} --patch_size {patch_size} --max_input_sequence_length {max_input_sequence_length} --num_runs {num_runs} --gpu {gpu} --sparsify {sparsify} --strategy {strategy} --sampling_upto {sampling_upto}
    """

    # Specify the output filename dynamically
    output_filename = f"../LP_{strategy}_{sampling_upto}.sh"

    # Write to file
    with open(output_filename, "w") as file:
        file.write(script_content)

    print(f"Shell script '{output_filename}' has been successfully generated.")

# Example usage
# generate_shell_script("/home2/hmnshpl/projects/results", "DygLib", "himanshu.pal@research.iiit.ac.in",
#                     "wikipedia", "TGN", 2, 64, 5, 0, True, "ts_tpr_remove_wasserstein", 0.7, 9, 1, 'gnode085')


In [None]:
available_gnodes =  ['gnode074', 'gnode078', 'gnode067']
strategies = ['ts_tpr_remove_chebyshev'] # ['ts_tpr_remove_MSS', 'ts_tpr_remove_mss_2', 'ts_tpr_remove_kl_divergence', 'ts_tpr_remove_jensen_shannon_divergence',
            # 'ts_tpr_remove_cosine', 'ts_tpr_remove_euclidean', 'ts_tpr_remove_jaccard']
samplings = [0.7, 0.8, 0.9]

# Mapping of strategies to gnodes
strategy_to_gnode = {
    'ts_tpr_remove_wasserstein': 'gnode074',
    'ts_tpr_remove_kl_divergence': 'gnode078',
    'ts_tpr_remove_jensen_shannon_divergence': 'gnode067',
    'ts_tpr_remove_MSS':'gnode074',
    'ts_tpr_remove_mss_2':'gnode078',
    'ts_tpr_remove_cosine': 'gnode067',
    'ts_tpr_remove_euclidean': 'gnode074',
    'ts_tpr_remove_jaccard': 'gnode078', 
    'ts_tpr_remove_chebyshev': 'gnode080', 
}

for strategy in strategies:
    for sampling in samplings:
        gnode = strategy_to_gnode[strategy]
        generate_shell_script("/home2/hmnshpl/projects/results", "DygLib", "himanshu.pal@research.iiit.ac.in",
                    "wikipedia", "TGN", 2, 64, 5, 0, True, strategy, sampling, 9, 1, gnode)

In [None]:
tmp_graph = train_graph_df.copy(deep=True)
# upto=0.7
list_of_upto = [0.9, 0.8, 0.7]

for upto in list_of_upto:
    tmp_graph = tmp_graph.sort_values(by=['u', 'i', 'ts'])

    # Exclude the first and last rows based on 'u' and 'i'
    grouped = tmp_graph.groupby(['u', 'i'])
    modified_df = grouped.apply(lambda x: x.iloc[1:-1]).reset_index(drop=True)

    mean_shifts = mean_shift_removal2(tmp_graph)

    print('back to sparsify_data file....')

    threshold_index = int(len(mean_shifts) * (1-upto))
    top_mean_shifts = mean_shifts[:threshold_index]

    top_x_percent_timestamps = [ts for ts, _ in top_mean_shifts]

    sampled_df = modified_df[~modified_df['ts'].isin(top_x_percent_timestamps)]

    filename = f'{scratch_location}/sparsified_data/{dataset_name}_mss2_sparsified_{upto}.csv'
    sampled_df.drop(['Unnamed: 0'], axis=1).to_csv(filename)
    print(filename, ' saved.')

### TER

In [None]:
tmp_graph = train_graph_df.copy(deep=True)
ter_dict = calculate_temporal_edge_rank(train_graph_df)
len(ter_dict)

In [None]:
sorted_ter_dict = dict(sorted(ter_dict.items(), key=lambda x: x[1], reverse=True))
sorted_ter_dict = list(sorted_ter_dict.items())

In [None]:
upto = 0.7
print(f'remove upto: {(1-upto):.2%}', 'length of mean shift is: ', len(sorted_ter_dict), end=' ')

threshold_index = int(len(sorted_ter_dict) * (1-upto))

print(f'{threshold_index=}')
top_mean_shifts = sorted_ter_dict[:threshold_index]
print(len(top_mean_shifts), f'{len(top_mean_shifts) / len(sorted_ter_dict):.2%}' )

top_x_percent_timestamps = [ts for ts, _ in top_mean_shifts]
print(len(top_x_percent_timestamps), f'{len(top_x_percent_timestamps) / len(train_graph_df["ts"]):.2%}')

# sampled_df = modified_df[~modified_df['ts'].isin(top_x_percent_timestamps)]
sampled_df = tmp_graph[~tmp_graph['ts'].isin(top_x_percent_timestamps)]
print(len(sampled_df['ts']), len(sampled_df['ts']) / len(tmp_graph['ts']))
print('data sampling successful.')

In [None]:
filename = f'{scratch_location}/sparsified_data/{dataset_name}_TER_sparsified_{upto}.csv'
sampled_df.drop(['Unnamed: 0'], axis=1).to_csv(filename)
print(filename)

### Combined TER

In [None]:
tmp_graph = train_graph_df.copy(deep=True)
ter_dict = calculate_combined_temporal_edgerank(train_graph_df)
len(ter_dict)

In [None]:
print(len(train_graph_df['ts']), len(train_graph_df['ts'].unique()))

sorted_ter_dict = dict(sorted(ter_dict.items(), key=lambda x: x[1], reverse=True))
sorted_ter_dict = list(sorted_ter_dict.items())

In [None]:
upto = 0.9
print(f'remove upto: {(1-upto):.2%}', 'length of mean shift is: ', len(sorted_ter_dict), end=' ')

threshold_index = int(len(sorted_ter_dict) * (1-upto))

print(f'{threshold_index=}')
top_mean_shifts = sorted_ter_dict[:threshold_index]
print(len(top_mean_shifts), f'{len(top_mean_shifts) / len(sorted_ter_dict):.2%}' )

top_x_percent_timestamps = [ts for ts, _ in top_mean_shifts]
print(len(top_x_percent_timestamps), f'{len(top_x_percent_timestamps) / len(train_graph_df["ts"]):.2%}')

# sampled_df = modified_df[~modified_df['ts'].isin(top_x_percent_timestamps)]
sampled_df = tmp_graph[~tmp_graph['ts'].isin(top_x_percent_timestamps)]
print(len(sampled_df['ts']), len(sampled_df['ts']) / len(tmp_graph['ts']))
print('data sampling successful.')

filename = f'{scratch_location}/sparsified_data/{dataset_name}_Combined_TER_sparsified_{upto}.csv'
sampled_df.drop(['Unnamed: 0'], axis=1).to_csv(filename)
print(filename)

#### New Incremental Sparsification code

In [None]:
""" Objective: To sparsify a network incrementally as in remove first top 10% timesteps then recalculate top 10% and so on... """

tmp_graph = train_graph_df.copy(deep=True)
upto=0.7
# metric='chebyshev'

# based on maximum mean shift strategy
# mean_shifts = compute_mean_shifts_with_metrics(tmp_graph, metric=metric)
mean_shifts = mean_shift_removal(tmp_graph)

print(f'remove upto: {(1-upto):.2%}', 'length of mean shift is: ', len(mean_shifts), end=' ')

threshold_index = int(len(mean_shifts) * (1-upto))

print(f'{threshold_index=}')
top_mean_shifts = mean_shifts[:threshold_index]
print(len(top_mean_shifts), f'{len(top_mean_shifts) / len(mean_shifts):.2%}' )

top_x_percent_timestamps = [ts for ts, _ in top_mean_shifts]
print(len(top_x_percent_timestamps), f'{len(top_x_percent_timestamps) / len(train_graph_df["ts"]):.2%}')

# sampled_df = modified_df[~modified_df['ts'].isin(top_x_percent_timestamps)]
sampled_df = tmp_graph[~tmp_graph['ts'].isin(top_x_percent_timestamps)]
print(len(sampled_df['ts']), len(sampled_df['ts']) / len(tmp_graph['ts']))
print('data sampling successful.')

In [None]:
sampled_df_simple = sampled_df.copy(deep=True)   # non recursive removal

In [None]:
tmp_graph = train_graph_df.copy(deep=True)
upto = 0.7

# Calculate the percentage of data to be removed
removal = (1 - upto) * 100

# Loop through the specified percentage
for i in tqdm(range(int(removal / 10)), desc='Processing'):
    # Calculate mean shifts for the current graph state
    mean_shifts = mean_shift_removal(tmp_graph)
    
    # Determine the index for the top 10% of mean shifts
    threshold_index = int(len(mean_shifts) * 0.1)
    
    # Select the top 10% of mean shifts
    top_mean_shifts = mean_shifts[:threshold_index]

    # Extract the corresponding timestamps
    top_x_percent_timestamps = [ts for ts, _ in top_mean_shifts]

    # Remove rows corresponding to the top 10% of timestamps
    tmp_graph = tmp_graph[~tmp_graph['ts'].isin(top_x_percent_timestamps)]


In [None]:
print(len(tmp_graph['ts']), len(tmp_graph['ts']) / len(train_graph_df['ts']))

In [None]:
len(tmp_graph['ts'].unique()) / len(train_graph_df['ts']), len(sampled_df_simple['ts']) / len(train_graph_df['ts'])

In [None]:
tmp_graph = train_graph_df.copy(deep=True)
full_data_percent = 1

upto = 0.7
total_removal_target = (full_data_percent - upto)  # Total of 10% removal
removal_rate = 0.1
remaining_steps = steps = int(total_removal_target // removal_rate)  # Number of steps

print(f'To sparsify data {upto:.2%} at removal rate of {removal_rate:.2%}, we need {steps} steps.')

for step in tqdm(range(steps), desc='Processing'):
    current_removal = np.round(total_removal_target / (remaining_steps), decimals=3)
    print(current_removal, remaining_steps)
    full_data_percent -= current_removal
    total_removal_target = (1 - upto/full_data_percent)
    remaining_steps-=1


In [None]:
tmp_graph = train_graph_df.copy(deep=True)
full_data_percent = 1

upto = 0.7
total_removal_target = (full_data_percent - upto)  # Total of 30% removal
removal_rate = 0.1
remaining_steps = steps = int(total_removal_target // removal_rate)  # Number of steps

print(f'To sparsify data to {upto:.2%} at a removal rate of {removal_rate:.2%}, we need {steps} steps.')

for step in tqdm(range(steps), desc='Processing'):
    current_removal = removal_rate / (full_data_percent)  # Adjust the removal percentage
    full_data_percent -= current_removal * full_data_percent  # Apply the removal

    print(f"Step {step + 1}: Removing {current_removal * 100:.3f}% of the original data")
    
    remaining_steps -= 1
full_data_percent

In [None]:
tmp_graph = train_graph_df.copy(deep=True)
upto = 0.7  # Target percentage of data to keep

# Calculate the total percentage of data to be removed
total_removal_target = 1 - upto
remaining_data_fraction = 1.0  # Start with 100% of the data

# Number of steps to remove 10% each time
steps = int(total_removal_target / 0.1)

print(f"Target: Reduce to {upto:.2%} data in {steps} steps.")

for i in tqdm(range(steps), desc='Processing'):
    # Adjust the removal target based on the remaining data
    current_removal = 0.1 / remaining_data_fraction
    
    # Calculate mean shifts for the current graph state
    mean_shifts = mean_shift_removal(tmp_graph)
    
    # Determine the index for the required top percentage of mean shifts
    threshold_index = int(len(mean_shifts) * current_removal)
    
    # Select the top mean shifts
    top_mean_shifts = mean_shifts[:threshold_index]

    # Extract the corresponding timestamps
    top_x_percent_timestamps = [ts for ts, _ in top_mean_shifts]

    # Remove rows corresponding to the top timestamps
    tmp_graph = tmp_graph[~tmp_graph['ts'].isin(top_x_percent_timestamps)]
    
    # Update the remaining data fraction
    remaining_data_fraction -= current_removal


In [None]:
len(tmp_graph['ts'].unique()) / len(train_graph_df['ts']) , len(sampled_df_simple['ts']) / len(train_graph_df['ts'])

In [None]:
rec_sparse_ts = list(tmp_graph['ts'])
non_rec_sparse_ts = list(sampled_df_simple['ts'])

len(rec_sparse_ts), len(non_rec_sparse_ts), len(non_rec_sparse_ts) - len(rec_sparse_ts)

# non-rec-sparsification has more ts than rec-sparsification

In [None]:
len(set(rec_sparse_ts)), len(set(non_rec_sparse_ts))

In [None]:
len(set(non_rec_sparse_ts).difference(set(rec_sparse_ts)))  # since there are 1268-159 new timestamps in new rec-sparsification mean we have change
# in temporal rank of nodes in iteratively sparsified network

In [None]:
filename = f'{scratch_location}/sparsified_data/{dataset_name}_rec_mss_sparsified_{upto}.csv'
sampled_df.drop(['Unnamed: 0'], axis=1).to_csv(filename)
print(filename)