# Sampling Example

In [1]:
from utils import *
from data_utils import *
from sample import *

## Data Directory

In [2]:
# Seed handed to set_seed() before sampling.
seed = 0

# Location and name of the dataset, forwarded to read_data().
data_folder, dataset = './datasets', 'PubMed'
# NOTE(review): `model` is not referenced by any cell visible in this notebook section.
model = 'DataFusion'

# File names of the full (unsampled) dataset.
read_node_file = 'node.dat'
read_link_file, read_link_test_file = 'link.dat', 'link.dat.test'
read_label_file, read_label_test_file = 'label.dat', 'label.dat.test'
read_meta_file, read_info_file = 'meta.dat', 'info.dat'

# File names of the sampled dataset: the same base names, each prefixed
# with "sampled<sample_id>_" so several samples can coexist side by side.
sample_id = '1'
sampled_prefix = f'sampled{sample_id}_'
sampled_node_file = sampled_prefix + 'node.dat'
sampled_link_file = sampled_prefix + 'link.dat'
sampled_link_test_file = sampled_prefix + 'link.dat.test'
sampled_label_file = sampled_prefix + 'label.dat'
sampled_label_test_file = sampled_prefix + 'label.dat.test'
sampled_meta_file = sampled_prefix + 'meta.dat'
sampled_info_file = sampled_prefix + 'info.dat'

## Sampling Data

In [3]:
# Fix the seed before sampling (presumably seeds the RNGs used by sample();
# confirm against set_seed's definition).
set_seed(seed)

# Opt-in switch for persisting the sample. The statistics cell reads the
# sampled files from disk, so set this to True to (re)generate them; leaving
# it False relies on files already written by an earlier run and never
# overwrites them.
write_sampled_files = False

# Load the full dataset.
data = read_data(
    data_folder=data_folder,
    dataset=dataset,
    node_file=read_node_file,
    link_file=read_link_file,
    test_link_file=read_link_test_file,
    label_file=read_label_file,
    test_label_file=read_label_test_file,
    meta_file=read_meta_file,
    info_file=read_info_file
)
# Unpack the full dataset for interactive inspection. These names are rebound
# to the sampled data by the statistics cell.
node_df, link_df, test_link_df, label_df, test_label_df, node_info_df, link_info_df, label_info_df, node_meta_df, link_meta_df, label_meta_df = data

# Draw the sample. NOTE(review): the exact meaning of each fraction and of
# min_labels is defined by sample() -- confirm there before tuning.
sampled_data = sample(
    data,
    sample_frac=0.33,
    test_link_frac=0.01,
    test_label_frac=0.2,
    min_labels=300
)

if write_sampled_files:
    write_data(
        data=sampled_data,
        data_folder=data_folder,
        dataset=dataset,
        node_file=sampled_node_file,
        link_file=sampled_link_file,
        test_link_file=sampled_link_test_file,
        label_file=sampled_label_file,
        test_label_file=sampled_label_test_file,
        meta_file=sampled_meta_file,
        info_file=sampled_info_file
    )

Num labelled nodes: 306


## Data Statistics

In [4]:
# Load the sampled dataset from disk (not the in-memory result of sample()).
# The sampled files must already exist; they are produced by write_data in the
# sampling cell when that call is enabled.
sampled_data = read_data(
    data_folder=data_folder,
    dataset=dataset,
    node_file=sampled_node_file,
    link_file=sampled_link_file,
    test_link_file=sampled_link_test_file,
    label_file=sampled_label_file,
    test_label_file=sampled_label_test_file,
    meta_file=sampled_meta_file,
    info_file=sampled_info_file
)
node_df, link_df, test_link_df, label_df, test_label_df, node_info_df, link_info_df, label_info_df, node_meta_df, link_meta_df, label_meta_df = sampled_data

# Number of link rows divided by the squared number of node rows. Despite the
# label this is a density ratio (small value = sparse graph).
print(f'Sparsity: {len(link_df) / len(node_df) ** 2}')

# Compare the node table's shape with the number of distinct node ids that
# appear as an endpoint of at least one link.
linked_node_ids = pd.unique(link_df[['node_id_from', 'node_id_to']].values.ravel())
print('Same num nodes?:', node_df.shape, linked_node_ids.shape)

# Header and table are printed separately: passing both to a single print()
# after '\n' would insert the separator space and indent the first table row.
print('node count')
print(node_df['node_type'].value_counts().sort_index())
print()

print('edge count', link_df.shape)
print(link_df['link_type'].value_counts().sort_index())
print()

print('label count', label_df.shape)
print(label_df['node_label'].value_counts().sort_index())
print()

print('test label count')
print(test_label_df['node_label'].value_counts().sort_index())
print()

Sparsity: 0.0001946580480972659
Same num nodes?: (13087, 4) (13087,)
node count
 0    2661
1    4288
2    5546
3     592
Name: node_type, dtype: int64

edge count (33339, 4)
0    4553
1    5163
2    5836
3    4366
4    6478
5    5360
6     491
7     399
8     620
9      73
Name: link_type, dtype: int64

label count (241, 4)
0    35
1    29
2    26
3    22
4    50
5    33
6    22
7    24
Name: node_label, dtype: int64

test label count
0     9
1     8
2     7
3     5
4    13
5     8
6     5
7     6
Name: node_label, dtype: int64

