In [1]:
import pandas as pd
import os
import pickle
from argparse import ArgumentParser
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

In [2]:
def save_pickle(o, fname):
    """Serialize `o` to the file `fname` with pickle.

    The file is opened in binary write mode (so an existing file of the
    same name is overwritten) and the object is dumped using the highest
    pickle protocol available to the running interpreter.

    Args:
        o: The object to serialize.
        fname: Path of the file to write.
    """
    proto = pickle.HIGHEST_PROTOCOL
    with open(fname, 'wb') as sink:
        pickle.dump(o, sink, protocol=proto)

In [3]:
# Build train/test sparse graphs from `edges.dat`.
#
# Every non-blank line of `edges.dat` must hold exactly three
# whitespace-separated fields: `u v weight`. Raw u/v ids are mapped to dense
# integer indices (numbered by order of first appearance), the edge list is
# randomly split, and each part is stored as a (num_U x num_V) CSR matrix.

np.random.seed(42)  # seeds NumPy's global RNG; the split below is seeded separately via random_seed

test_ratio = 0.4     # fraction of edges held out for the test graph
random_seed = 2020   # passed to train_test_split as random_state

# Output paths are named once so the "save to" log lines always report the
# file that was actually written.
train_path = 'train.csr.pickle'
test_path = 'test.csr.pickle'

cnt_U = Counter()  # number of edges per raw u id
cnt_V = Counter()  # number of edges per raw v id

# Raw id -> dense integer index, assigned in order of first appearance.
u_mapping = {}
v_mapping = {}

src = []
dst = []
w = []

print('> reading files')
# Single pass over the file: count ids, assign indices and collect edges.
with open('edges.dat') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. an empty line at the end of the file)
            # instead of failing on the three-way unpack below.
            continue
        u, v, weight = fields
        cnt_U[u] += 1
        cnt_V[v] += 1
        # setdefault hands out the next free index the first time an id is seen
        # and returns the existing index on every later occurrence.
        src.append(u_mapping.setdefault(u, len(u_mapping)))
        dst.append(v_mapping.setdefault(v, len(v_mapping)))
        w.append(float(weight))

print('> mapping nodes')
num_U = len(u_mapping)
num_V = len(v_mapping)
print('num_U:', num_U)
print('num_V:', num_V)

save_pickle(u_mapping, 'u_mapping.pickle')
save_pickle(v_mapping, 'v_mapping.pickle')

src = np.array(src)
dst = np.array(dst)
# NOTE(review): weights are parsed and kept in `w`, but the matrices built
# below store 1.0 per edge rather than these weights — confirm this is intended.
w = np.array(w)

# split training and test sets
print('> train_test_split')
edges = np.stack((src, dst), axis=-1)  # one (u_index, v_index) row per edge
train, test = train_test_split(edges, test_size=test_ratio, random_state=random_seed, shuffle=True)
print('train edges:', train.shape)
print('test edges:', test.shape)

# Each edge contributes a 1.0 entry at (u_index, v_index).
csr_train = csr_matrix((np.ones(train.shape[0]), (train[:, 0], train[:, 1])), shape=(num_U, num_V))
csr_test = csr_matrix((np.ones(test.shape[0]), (test[:, 0], test[:, 1])), shape=(num_U, num_V))

print('train graph:', csr_train.shape, csr_train.nnz)
print('test graph:', csr_test.shape, csr_test.nnz)

# save files
print('> saving file')
save_pickle(csr_train, train_path)
print('save to', train_path)
save_pickle(csr_test, test_path)
print('save to', test_path)

> reading files
> mapping nodes
num_U: 55187
num_V: 9916
> train_test_split
train edges: (900485, 2)
test edges: (600324, 2)
train graph: (55187, 9916) 900485
test graph: (55187, 9916) 600324
> saving file
save to train.pkl
save to test.pkl
