In [1]:
# Preliminaries

# scratch_location = r'/scratch/hmnshpl'
import os
import sys
import heapq
import getpass
import numpy as np
import pandas as pd
import networkx as nx
from copy import deepcopy
from collections import defaultdict

dataset_name = 'wikipedia'
scratch_location = rf'/scratch/{getpass.getuser()}'


## Load Data
# Load data and train val test split
graph_df = pd.read_csv('{}/processed_data/{}/ml_{}.csv'.format(scratch_location,
                                                            dataset_name,
                                                            dataset_name)
                    )
edge_raw_features = np.load('{}/processed_data/{}/ml_{}.npy'.format(scratch_location,
                                                                    dataset_name,
                                                                    dataset_name)
                            )
node_raw_features = np.load('{}/processed_data/{}/ml_{}_node.npy'.format(scratch_location,
                                                                        dataset_name,
                                                                        dataset_name)
                            )

# Set the working directory to the project root
project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')) # this might cause issue
sys.path.append(project_root)

In [2]:
# get the timestamp of validate and test set
val_ratio = test_ratio = 0.15
print(val_ratio, test_ratio)
val_time, test_time = list(np.quantile(graph_df.ts, [(1 - val_ratio - test_ratio), (1 - test_ratio)]))
print(val_time, test_time)

train_graph_df = graph_df[graph_df['ts'] < val_time]
train_graph_df.head()

0.15 0.15
1862652.1 2218288.5999999996


Unnamed: 0.1,Unnamed: 0,u,i,ts,label,idx
0,0,1,8228,0.0,0.0,1
1,1,2,8229,36.0,0.0,2
2,2,2,8229,77.0,0.0,3
3,3,3,8230,131.0,0.0,4
4,4,2,8229,150.0,0.0,5


In [14]:
print(len(train_graph_df) / len(graph_df))

0.700001270050929


In [20]:
# sample filename - /scratch/hmnshpl/sparsified_data/wikipedia_kl_divergence_sparsified_0.9.csv
upto = 0.7
strategy = 'kl_divergence' # metrics=['kl_divergence', 'jensen_shannon_divergence', 'wasserstein']

filename = f'{scratch_location}/sparsified_data/{dataset_name}_{strategy}_sparsified_{upto}.csv'
print(filename)
df = pd.read_csv(filename)
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

/scratch/hmnshpl/sparsified_data/wikipedia_kl_divergence_sparsified_0.7.csv


Unnamed: 0,u,i,ts,label,idx
0,2,8229,126816.0,0.0,4998
1,2,8229,126945.0,0.0,5004
2,2,8229,127097.0,0.0,5013
3,2,8229,127169.0,0.0,5020
4,2,8229,127263.0,0.0,5022


In [21]:
len(df) / len(train_graph_df)

0.6198563030698889

In [12]:
removed_df = train_graph_df[~train_graph_df['ts'].isin(df['ts'])]
removed_df.head()

Unnamed: 0.1,Unnamed: 0,u,i,ts,label,idx
0,0,1,8228,0.0,0.0,1
1,1,2,8229,36.0,0.0,2
2,2,2,8229,77.0,0.0,3
3,3,3,8230,131.0,0.0,4
4,4,2,8229,150.0,0.0,5


In [13]:
len(removed_df['ts']) / len(train_graph_df['ts'])

0.22360113215763117

In [6]:
# sample filename - /scratch/hmnshpl/sparsified_data/wikipedia_kl_divergence_sparsified_0.9.csv
upto = 0.8
strategy = 'kl_divergence' # metrics=['kl_divergence', 'jensen_shannon_divergence', 'wasserstein']

filename = f'{scratch_location}/sparsified_data/{dataset_name}_{strategy}_sparsified_{upto}.csv'
print(filename)
df_twofold = pd.read_csv(filename)
df_twofold.drop(['Unnamed: 0'], axis=1, inplace=True)
df_twofold.head()

/scratch/hmnshpl/sparsified_data/wikipedia_kl_divergence_sparsified_0.8.csv


Unnamed: 0,u,i,ts,label,idx
0,2,8229,102127.0,0.0,3996
1,2,8229,102190.0,0.0,3999
2,2,8229,102255.0,0.0,4002
3,2,8229,102298.0,0.0,4004
4,2,8229,102331.0,0.0,4005


In [7]:
print(len(df_twofold['ts']) / len(train_graph_df['ts']))

0.701057769068873


In [8]:
# sample filename - /scratch/hmnshpl/sparsified_data/wikipedia_kl_divergence_sparsified_0.9.csv
upto = 0.9
strategy = 'kl_divergence' # metrics=['kl_divergence', 'jensen_shannon_divergence', 'wasserstein']

filename = f'{scratch_location}/sparsified_data/{dataset_name}_{strategy}_sparsified_{upto}.csv'
print(filename)
df_threefold = pd.read_csv(filename)
df_threefold.drop(['Unnamed: 0'], axis=1, inplace=True)
df_threefold.head()

/scratch/hmnshpl/sparsified_data/wikipedia_kl_divergence_sparsified_0.9.csv


Unnamed: 0,u,i,ts,label,idx
0,2,8229,97678.0,0.0,3837
1,2,8229,97792.0,0.0,3838
2,2,8229,98069.0,0.0,3844
3,2,8229,99940.0,0.0,3903
4,2,8229,100103.0,0.0,3911


In [9]:
print(len(df_threefold['ts']) / len(train_graph_df['ts']) )

0.7702028449089193
