In [1]:
import numpy as np
import pickle as pkl
import networkx as nx
import sys
import os
import scipy.sparse as sp
from collections import defaultdict

### load_data

In [2]:
names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
objects = []

In [3]:
sys.version_info

sys.version_info(major=3, minor=6, micro=2, releaselevel='final', serial=0)

In [4]:
code_path = os.path.abspath('')
dataset_dir = code_path + '' + '/data'
dataset_str = 'citeseer'

In [5]:
for i in range(len(names)):
    with open(dataset_dir + "/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
        objects.append(pkl.load(f, encoding='latin1'))

In [23]:
objects[0].shape

(120, 3703)

In [25]:
objects[2].shape

(1000, 3703)

In [24]:
objects[4].shape

(2312, 3703)

In [8]:
# sparse adjancency matrix!
print(objects[0][0])

  (0, 184)	1.0
  (0, 257)	1.0
  (0, 362)	1.0
  (0, 560)	1.0
  (0, 565)	1.0
  (0, 597)	1.0
  (0, 600)	1.0
  (0, 601)	1.0
  (0, 637)	1.0
  (0, 729)	1.0
  (0, 805)	1.0
  (0, 816)	1.0
  (0, 942)	1.0
  (0, 1116)	1.0
  (0, 1435)	1.0
  (0, 1545)	1.0
  (0, 1623)	1.0
  (0, 1635)	1.0
  (0, 1846)	1.0
  (0, 2085)	1.0
  (0, 2338)	1.0
  (0, 2343)	1.0
  (0, 2565)	1.0
  (0, 2604)	1.0
  (0, 2696)	1.0
  (0, 2741)	1.0
  (0, 2918)	1.0
  (0, 2970)	1.0
  (0, 3502)	1.0
  (0, 3548)	1.0
  (0, 3647)	1.0


In [None]:
print(objects[0][0])

In [17]:
arr_train = []
arr_test = []
arr = []

with open('movie/train_2008.edge', 'r') as file:
    for line in file.readlines():
        p1, p2 = line.split()
        arr_train.append((int(p1)-1, int(p2)-1))
        arr.append((int(p1)-1, int(p2)-1))

with open('movie/test_2009.edge', 'r') as file:
    for line in file.readlines():
        p1, p2 = line.split()
        arr.append((int(p1)-1, int(p2)-1))
        arr_test.append((int(p1)-1, int(p2)-1))        

In [21]:
len(arr)

473831

In [20]:
len(arr_train)

472848

In [19]:
len(arr_test)

983

### Get graph with edges

In [31]:
graph = defaultdict(list)

In [48]:
for x, y in arr:
    graph[x].append(y)
    graph[y].append(x)

In [57]:
with open('movie/ind.movie.graph','wb') as f:
    pkl.dump(graph,f, pkl.HIGHEST_PROTOCOL)  

### Only want nodes with edges

In [87]:
keys = list(graph.keys())

### Get `y`

In [64]:
tmp = []
with open('movie/labels.csv') as file:
    tmp = file.readlines()
tmp = tmp[1:]

In [168]:
NUM_CLASS = 20
ally = []

for i, line in enumerate(tmp):
    if i in keys:
        t = line.strip().split(',')[-1][1:-1]
        label = t.split()
        label = list(map(int, label))
        a = np.zeros(NUM_CLASS)
        a[label] = 1
        ally.append(a)

In [169]:
ally = np.array(ally)

In [170]:
# 18, 19 are too few
idxs = [i for i in range(0, 18)]
ally = ally[:, idxs]

In [171]:
ally.shape

(2439, 18)

In [172]:
with open('movie/ind.movie.ally','wb') as f:
    pkl.dump(ally,f)  

In [155]:
 np.where(ally[:, 10] == 1)[0]

array([  19,   26,   28,   38,   40,   70,   76,   77,   92,   93,   97,
        189,  253,  269,  283,  348,  385,  389,  392,  425,  426,  428,
        437,  446,  451,  567,  586,  596,  600,  615,  656,  663,  665,
        671,  692,  702,  739,  755,  756,  780,  781,  804,  810,  812,
        820,  855,  909,  948,  961,  963,  965,  972,  994, 1002, 1011,
       1037, 1067, 1072, 1094, 1169, 1173, 1193, 1204, 1211, 1219, 1291,
       1299, 1352, 1357, 1386, 1393, 1410, 1427, 1468, 1497, 1570, 1587,
       1605, 1607, 1608, 1676, 1699, 1702, 1715, 1729, 1730, 1733, 1738,
       1755, 1762, 1765, 1794, 1821, 1831, 1858, 1905, 1914, 1918, 1998,
       2028, 2039, 2040, 2066, 2114, 2115, 2117, 2120, 2150, 2156, 2162,
       2171, 2177, 2180, 2253, 2284, 2303, 2323, 2401, 2417])

In [143]:
ally[:, 0]

array([0., 0., 0., ..., 1., 0., 0.])

### Get `x`

In [104]:
new_index = {key:idx for idx, key in enumerate(keys)}

In [106]:
allx = sp.lil_matrix((len(keys), len(keys)), dtype=int)

for x, y in arr:
    allx[new_index[x], new_index[y]] = 1
      
allx = allx.tocsr()

### Create split data

In [110]:
# start from 2028 is test data
test_idx = new_index[25766]

In [115]:
with open('movie/ind.movie.allx','wb') as f:
    pkl.dump(allx,f)  

In [116]:
with open('movie/ind.movie.tx','wb') as f:
    pkl.dump(allx[:test_idx],f)  
    
with open('movie/ind.movie.ty','wb') as f:
    pkl.dump(ally[:test_idx],f)  

In [121]:
with open('movie/ind.movie.x','wb') as f:
    pkl.dump(allx[test_idx:],f)  
    
with open('movie/ind.movie.y','wb') as f:
    pkl.dump(ally[test_idx:],f)  

In [122]:
def parse_index_file(filename):
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index

In [123]:
idx_file = parse_index_file(dataset_dir + "/ind.{}.test.index".format(dataset_str))