In [1]:
import numpy as np
from scipy.sparse import csr_matrix

In [2]:
# Directory that holds the dataset archives. The trailing slash matters:
# dataset names are appended to it by plain string concatenation.
path_to_data = './data/'

# Dataset names (file stems; `read_dataset` adds the `.npz` suffix itself).
datasets = [
    'citeseer',
    'cora_ml',
    'cora',
    'dblp',
    'pubmed',
    'polblogs',
]

In [3]:
def read_dataset(path):
    """Load a graph dataset stored as a NumPy ``.npz`` archive.

    The archive must contain the CSR components of two sparse matrices under
    the keys ``adj_data``/``adj_indices``/``adj_indptr``/``adj_shape`` and
    ``attr_data``/``attr_indices``/``attr_indptr``/``attr_shape``. A ``labels``
    array is optional.

    Parameters
    ----------
    path : str
        Path to the archive; ``.npz`` is appended when it is missing.

    Returns
    -------
    tuple
        ``(A, H, y)`` where ``A`` is the adjacency matrix as a dense ndarray,
        ``H`` is a list with one tuple per row of the dense attribute matrix,
        and ``y`` is the ``labels`` array or ``None`` if the archive has none.
    """
    npz_path = path if path.endswith('.npz') else path + '.npz'

    # NOTE(review): allow_pickle=True can execute arbitrary code while loading;
    # only open archives that come from a trusted source.
    with np.load(npz_path, allow_pickle=True) as archive:
        # Materialise every stored array so nothing depends on the open file.
        arrays = dict(archive)

    adjacency = csr_matrix(
        (arrays['adj_data'], arrays['adj_indices'], arrays['adj_indptr']),
        shape=arrays['adj_shape'],
    )
    attributes = csr_matrix(
        (arrays['attr_data'], arrays['attr_indices'], arrays['attr_indptr']),
        shape=arrays['attr_shape'],
    )
    labels = arrays.get('labels')

    attribute_rows = [tuple(row) for row in attributes.toarray()]
    return adjacency.toarray(), attribute_rows, labels

In [4]:
# Load the first dataset of the list: dense adjacency, per-node attribute
# tuples, and labels (None when the archive has no `labels` entry).
dataset_path = path_to_data + datasets[0]
A, H, y = read_dataset(dataset_path)

In [5]:
# adjacency matrix (dense ndarray): one row and one column per node
A.shape

(4230, 4230)

In [6]:
# length of the first node's attribute tuple, i.e. the number of attribute columns
len(H[0])

602

In [7]:
import networkx as nx

# Build a graph from the dense adjacency matrix.
# `nx.convert_matrix.from_numpy_matrix` was removed in NetworkX 3.0;
# `nx.from_numpy_array` is the supported equivalent for ndarray input and is
# also available in NetworkX 2.x.
G = nx.from_numpy_array(A)

# Drop self-loops so that only pairs of distinct nodes remain as edges.
G.remove_edges_from(nx.selfloop_edges(G))

# Complement graph: its edges are exactly the node pairs NOT connected in G.
CG = nx.complement(G)

In [8]:
import pandas as pd


def _pairs_frame(edge_view, label):
    """Tabulate (node1, node2) pairs from `edge_view` with a constant `goal` column."""
    return pd.DataFrame({
        'node1': pd.Series([pair[0] for pair in edge_view]),
        'node2': pd.Series([pair[1] for pair in edge_view]),
        'goal': pd.Series(np.full(len(edge_view), label)),
    })


G_edges = G.edges
CG_edges = CG.edges

# Positive samples: node pairs joined by an edge in G (goal = 1.0).
df1 = _pairs_frame(G_edges, 1.0)
# Negative samples: node pairs joined in the complement graph (goal = 0.0).
df0 = _pairs_frame(CG_edges, 0.0)

# Positives first, then negatives, with a fresh 0..n-1 index.
df = pd.concat([df1, df0], ignore_index=True)

In [9]:
# number of positive samples (edges of G)
len(df1)

5337

In [10]:
# total number of node pairs: edges of G plus edges of its complement CG
len(df)

8944335

In [11]:
# preview; the first rows are positive samples because df1 was concatenated first
df.head()

Unnamed: 0,node1,node2,goal
0,0,3514,1.0
1,0,3617,1.0
2,1,2951,1.0
3,1,3089,1.0
4,1,3131,1.0


In [12]:
# Disabled: re-indexing by `node1` is switched off, so `df` keeps the 0..n-1
# index produced by the concat and this preview matches the previous one.
#df.set_index(['node1'], drop=False, inplace=True)
df.head()

Unnamed: 0,node1,node2,goal
0,0,3514,1.0
1,0,3617,1.0
2,1,2951,1.0
3,1,3089,1.0
4,1,3131,1.0


In [13]:
# Disabled: would attach each endpoint's attribute tuple from H to every row.
# NOTE(review): presumably switched off because of the memory cost on the full
# pair table - confirm before re-enabling.
#df['node_1_attrs'] = df['node1'].apply(lambda x: H[x])
#df['node_2_attrs'] = df['node2'].apply(lambda x: H[x])

In [14]:
# Split configuration.
tr1 = 0.7    # fraction of the balanced samples drawn for the training set
ts1 = 0.15   # fraction of the balanced samples drawn for the first test set
seed = 32    # random_state used for every sampling step (reproducibility)

In [15]:
# balancing the edges and non-edges
min_length = np.min([len(df[df['goal'] == 0]), len(df[df['goal'] == 1])])
df = pd.concat([\
    df[df['goal'] == 0].sample(n=min_length, random_state=seed),\
    df[df['goal'] == 1].sample(n=min_length, random_state=seed)],\
    ignore_index=True)

# randomly choosing tr1% (70% by default) of all edges
df_tr_1 = df.sample(n=int(len(df)*tr1), random_state=seed)
remainder = pd.concat([df, df_tr_1], ignore_index=True).drop_duplicates(keep=False, ignore_index=True)

# randomly choosing ts1% (15% by default) of all edges
df_ts_1 = remainder.sample(n=int(len(df)*ts1), random_state=seed, ignore_index=True)

# the rest is test 2
df_ts_2 = pd.concat([remainder, df_ts_1], ignore_index=True).drop_duplicates(keep=False, ignore_index=True)

7471 5337 1601


In [16]:
len(df_tr_1)

7471

In [17]:
len(df_ts_1)

1601

In [18]:
len(df_ts_2)

1602