In [4]:
from gensim import corpora
import numpy as np
import unicodecsv as csv
import tensorflow as tf
import math
import os,sys
import random
from scipy.sparse import csr_matrix
from tqdm import tqdm
import json

In [29]:
#ref http://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format
def save_sparse_csr(filename,array):
    """Store the CSR components of ``array`` in an ``.npz`` archive.

    The archive holds four entries -- ``data``, ``indices``, ``indptr`` and
    ``shape`` -- read straight from the attributes of the same name on
    ``array``.

    filename : path (or file object) handed to ``np.savez``.
    array    : object exposing ``data``, ``indices``, ``indptr`` and ``shape``.
    """
    components = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **components)

def load_sparse_csr(filename):
    """Rebuild a ``csr_matrix`` from an ``.npz`` archive.

    The archive must contain the entries ``data``, ``indices``, ``indptr``
    and ``shape`` (the layout written by ``save_sparse_csr``).

    filename : path of the ``.npz`` archive.
    Returns  : ``scipy.sparse.csr_matrix`` with the stored contents.
    """
    # np.load on an .npz returns a lazily-read archive that keeps the file
    # open; the context manager closes it once the arrays have been read.
    with np.load(filename) as loader:
        # Pass the shape as a plain tuple of ints rather than an ndarray.
        shape = tuple(int(dim) for dim in loader['shape'])
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=shape)

In [34]:
# Load the cached co-author adjacency matrix (CSR) from disk.
co_author_matrix=load_sparse_csr("../data/co-author-matrix.npz")
# Load the companion index; JSON object keys are always strings, so they are
# converted back to ints (presumably the matrix row ids -- verify against the
# code that wrote the file).
with open('../data/co-author-index.json', 'r') as f:
    aid2aname={int(k): v for k, v in json.load(f).iteritems()}

In [35]:
# Module-level alias: alpha() below reads the adjacency matrix through this name.
adj_mat_csr_sparse=co_author_matrix

def alpha(p,q,t,x):
    """Unnormalised bias for stepping to ``x`` when the walk arrived from ``t``.

    Returns ``1/p`` when ``x`` is ``t`` itself, ``1.0`` when the global
    ``adj_mat_csr_sparse`` holds a positive entry at ``[t, x]``, and ``1/q``
    in every other case.
    """
    if t!=x:
        # x is not the node we came from: check whether t and x are linked.
        linked_to_t = adj_mat_csr_sparse[t,x]>0
        return 1.0 if linked_to_t else 1.0/q
    return 1.0/p
    
# Walk hyper-parameters: a step back to the previous node is weighted 1/p, a
# step to a node that is not linked to the previous node is weighted 1/q.
p=1.0
q=0.5

# (t, v) -> (candidate next nodes, normalised probabilities) for a walk that
# has just moved from t to v.
transition={}

num_nodes=adj_mat_csr_sparse.shape[0]
indices=adj_mat_csr_sparse.indices
indptr=adj_mat_csr_sparse.indptr
data=adj_mat_csr_sparse.data

# Loop-invariant biases, computed once instead of once per candidate.
return_bias=1.0/p
in_out_bias=1.0/q

#Precompute the transition matrix in advance
for t in tqdm(xrange(num_nodes)):#t is row index
    t_neighbors=indices[indptr[t]:indptr[t+1]]
    # Columns x with a positive stored entry in row t. Membership in this set
    # replaces the per-element sparse lookup `adj_mat_csr_sparse[t,x]>0` done
    # by alpha(); the two agree as long as row t stores each column at most
    # once -- NOTE(review): assumes no duplicate CSR entries, confirm.
    t_linked=t_neighbors[data[indptr[t]:indptr[t+1]]>0]
    for v in t_neighbors:#i.e possible next nodes from t
        pi_vx_indices=indices[indptr[v]:indptr[v+1]]#i.e possible next nodes from v
        # Vectorised form of alpha(p,q,t,x) over every candidate x: the
        # `x == t` case is tested first, exactly as in alpha().
        bias=np.where(pi_vx_indices==t,
                      return_bias,
                      np.where(np.in1d(pi_vx_indices,t_linked),1.0,in_out_bias))
        # Unnormalised probability = search bias * edge weight (v, x).
        pi_vx_values=bias*data[indptr[v]:indptr[v+1]]
        pi_vx_values=pi_vx_values/np.sum(pi_vx_values)
        #now, we have normalized transition probabilities for v traversed from t
        #the probabilities are stored as a sparse vector.
        transition[t,v]=(pi_vx_indices,pi_vx_values)

100%|██████████| 74530/74530 [11:21<00:00, 109.31it/s]


In [36]:
# Globals read by get_random_walk: the raw CSR arrays of the co-author matrix
# and the number of nodes in each generated walk.
adj_mat_csr_sparse=co_author_matrix
indices,indptr,data=(adj_mat_csr_sparse.indices,
                     adj_mat_csr_sparse.indptr,
                     adj_mat_csr_sparse.data)
random_walk_length=100
    
def get_random_walk(p, seed=None):
    """Sample one random walk of ``random_walk_length`` nodes from every node
    that has at least one outgoing edge.

    The first step is drawn in proportion to the edge weights of the start
    node; every later step uses the precomputed ``transition[previous, current]``
    distribution.

    p    : not used by the sampling; it only absorbs the task argument that
           ``pool.map(get_random_walk, ...)`` passes in.
    seed : optional seed for this call's private random generator. ``None``
           (the default) seeds it from fresh OS entropy.

    Returns a list of walks, each a list of node ids starting with the start node.

    Reads the module-level ``num_nodes``, ``indices``, ``indptr``, ``data``,
    ``transition`` and ``random_walk_length``.
    """
    # A private generator per call. Worker processes created by fork inherit a
    # copy of the parent's global np.random state, so drawing from np.random
    # directly makes every worker produce the very same walks.
    rng=np.random.RandomState(seed)
    random_walks=[]
    #get random walks
    for u in tqdm(xrange(num_nodes)):
        possible_next_node=indices[indptr[u]:indptr[u+1]]
        if len(possible_next_node)==0:
            # isolated node: nothing to walk to
            continue
        #first move just depends on the edge weights
        # float64 keeps the normalised weights within the sum-to-one tolerance
        # checked by choice(); float32 rounding can exceed it.
        weight_for_next_move=data[indptr[u]:indptr[u+1]].astype(np.float64)
        weight_for_next_move=weight_for_next_move/np.sum(weight_for_next_move)
        first_walk=rng.choice(possible_next_node, 1, p=weight_for_next_move)
        random_walk=[u,first_walk[0]]
        for _ in xrange(random_walk_length-2):
            previous_node,cur_node=random_walk[-2],random_walk[-1]
            (pi_vx_indices,pi_vx_values)=transition[previous_node,cur_node]
            next_node=rng.choice(pi_vx_indices, 1, p=pi_vx_values)
            random_walk.append(next_node[0])
        random_walks.append(random_walk)

    return random_walks

# random_walks=[]
# adj_mat_csr_sparse=co_author_matrix
# indices=adj_mat_csr_sparse.indices
# indptr=adj_mat_csr_sparse.indptr
# data=adj_mat_csr_sparse.data
# random_walk_length=100

# #get random walks
# for u in tqdm(xrange(num_nodes)):
#     if len(indices[indptr[u]:indptr[u+1]]) !=0:
#         #first move is just depends on weight
#         possible_next_node=indices[indptr[u]:indptr[u+1]]
#         weight_for_next_move=data[indptr[u]:indptr[u+1]]#i.e  possible next ndoes from u
#         weight_for_next_move=weight_for_next_move.astype(np.float32)/np.sum(weight_for_next_move)
#         first_walk=np.random.choice(possible_next_node, 1, p=weight_for_next_move)
#         random_walk=[u,first_walk[0]]
#         for i in xrange(random_walk_length-2):
#             cur_node = random_walk[-1]
#             precious_node=random_walk[-2]
#             (pi_vx_indices,pi_vx_values)=transition[precious_node,cur_node]
#             next_node=np.random.choice(pi_vx_indices, 1, p=pi_vx_values)
#             random_walk.append(next_node[0])
#         random_walks.append(random_walk)

In [37]:
import time
import multiprocessing as mp

start = time.time()

# Number of worker processes; get_random_walk is run once per worker, so the
# result contains `proc` walks for every non-isolated node.
proc = 20  
pool = mp.Pool(proc)
try:
    walks_per_worker = pool.map(get_random_walk, range(proc))
    pool.close()      # normal shutdown: no further tasks will be submitted
except BaseException:
    pool.terminate()  # do not leave workers running after a failure/interrupt
    raise
finally:
    pool.join()       # reap the worker processes in either case

elapsed_time = time.time() - start
# A single parenthesised string prints identically whether `print` is the
# Python 2 statement or a function.
print("elapsed_time:{0}[sec]".format(elapsed_time))

# Flatten the per-worker lists into one list of walks, then into an int32 array.
random_walks=[]
for worker_walks in walks_per_worker:
    random_walks.extend(worker_walks)
del walks_per_worker
np_random_walks=np.array(random_walks,dtype=np.int32)
del random_walks
np.savez('../work/random_walks.npz',np_random_walks)

elapsed_time = time.time() - start
print("elapsed_time:{0}[sec]".format(elapsed_time))

100%|██████████| 74530/74530 [07:58<00:00, 155.87it/s]
100%|██████████| 74530/74530 [07:58<00:00, 155.71it/s]
100%|██████████| 74530/74530 [08:01<00:00, 154.75it/s]
 99%|█████████▉| 74151/74530 [08:03<00:02, 158.70it/s]
100%|██████████| 74530/74530 [08:03<00:00, 154.04it/s]
100%|██████████| 74530/74530 [08:03<00:00, 154.02it/s]
100%|██████████| 74530/74530 [08:05<00:00, 153.47it/s]
100%|██████████| 74530/74530 [08:05<00:00, 153.45it/s]
100%|██████████| 74530/74530 [08:05<00:00, 153.37it/s]
100%|██████████| 74530/74530 [08:06<00:00, 153.23it/s]
100%|██████████| 74530/74530 [08:08<00:00, 152.63it/s]
100%|██████████| 74530/74530 [08:08<00:00, 152.51it/s]
100%|██████████| 74530/74530 [08:08<00:00, 152.42it/s]
100%|██████████| 74530/74530 [08:11<00:00, 151.61it/s]
100%|██████████| 74530/74530 [08:13<00:00, 150.97it/s]
100%|██████████| 74530/74530 [08:14<00:00, 150.81it/s]
100%|██████████| 74530/74530 [08:16<00:00, 150.02it/s]
100%|██████████| 74530/74530 [08:18<00:00, 149.51it/s]
100%|█████

elapsed_time:1350.17567492[sec]
elapsed_time:1369.05380893[sec]


In [46]:
#Computational Graph Definition
tf.reset_default_graph()#remove this if not ipython notebook

num_nodes=adj_mat_csr_sparse.shape[0]
context_size=16
batch_size = None # None leaves the batch dimension of the placeholders unspecified.
embedding_size = 200 # Dimension of the embedding vector.
num_sampled = 64 # Number of negative examples to sample.

global_step = tf.Variable(0, name='global_step', trainable=False)

# Parameters to learn
node_embeddings = tf.Variable(tf.random_uniform([num_nodes, embedding_size], -1.0, 1.0))

# Fixed (not a Variable, hence not trained) all-zero biases for the sampled softmax.
biases=tf.zeros([num_nodes])

# Input data and re-organize size.
with tf.name_scope("context_node") as scope:
    #context nodes to each input node in the batch (e.g [[1,2],[4,6],[5,7]] where batch_size = 3,context_size=2)
    train_context_node= tf.placeholder(tf.int32, shape=[batch_size,context_size],name="context_node")
    #organize prediction labels (skip-gram model predicts context nodes (i.e labels) given an input node)
    #i.e make the column [[1],[2],[4],[6],[5],[7]] given the context above: one label per row.
    train_context_node_flat=tf.reshape(train_context_node,[-1,1])
with tf.name_scope("input_node") as scope:
    #batch input node to the network(e.g [2,1,3] where batch_size = 3)
    train_input_node= tf.placeholder(tf.int32, shape=[batch_size],name="input_node")
    #organize input as flat: repeat every input node once per context node,
    #i.e make [2,2,1,1,3,3] given the input nodes above and context_size=2
    input_ones=tf.ones_like(train_context_node)
    train_input_node_flat=tf.reshape(tf.mul(input_ones,tf.reshape(train_input_node,[-1,1])),[-1])

# Model.
with tf.name_scope("loss") as scope:
    # Look up embeddings for the (repeated) input nodes.
    node_embed = tf.nn.embedding_lookup(node_embeddings, train_input_node_flat)
    # Compute the softmax loss, using a sample of the negative labels each time.
    # Keyword arguments are used because the positional order of `inputs` and
    # `labels` is not the same in every TensorFlow release.
    loss_node2vec = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=node_embeddings,
                                                              biases=biases,
                                                              inputs=node_embed,
                                                              labels=train_context_node_flat,
                                                              num_sampled=num_sampled,
                                                              num_classes=num_nodes))
    loss_node2vec_summary = tf.scalar_summary("loss_node2vec", loss_node2vec)

# Optimizer. Built before the initializer and the saver below: minimize()
# adds the optimizer's own variables to the graph, and both of those ops only
# cover the variables that exist at the moment they are created.
update_loss = tf.train.AdamOptimizer().minimize(loss_node2vec,global_step=global_step)

# Initializing the variables
init = tf.initialize_all_variables()

# Add ops to save and restore all the variables.
saver = tf.train.Saver(max_to_keep=20)

merged = tf.merge_all_summaries()

In [None]:
%ls ../

[0m[01;34mcode[0m/  [01;34mdata[0m/  [01;34mlog_node2vec1[0m/  [01;34mresults[0m/  [01;34mwork[0m/


In [None]:
# hyper parameters
num_random_walks=np_random_walks.shape[0]
checkpoint_every=10000 # save the model every this many walks

# Every walk becomes one training batch: each position that still has
# `context_size` successors is an input node, and those successors are its
# context nodes.
num_inputs_per_walk=random_walk_length-context_size
# Loop-invariant index grid: row j holds the positions j+1 .. j+context_size.
context_positions=np.arange(num_inputs_per_walk)[:,None]+np.arange(1,context_size+1)[None,:]

# Launch the graph
# Initializing the variables (created here so that it covers every variable
# present in the graph at this point).
init = tf.initialize_all_variables()

with tf.Session() as sess:
    log_dir="../log0/"
    model_path=log_dir+"model.ckpt"
    writer = tf.train.SummaryWriter(log_dir, sess.graph)
    sess.run(init)
    try:
        # The loss log is opened once for the whole run instead of once per step.
        with open(log_dir+"loss_value.txt", "a") as f:
            for i in xrange(0,num_random_walks):
                a_random_walk=np_random_walks[i]
                train_input_batch = a_random_walk[:num_inputs_per_walk]
                train_context_batch = a_random_walk[context_positions]
                feed_dict={train_input_node:train_input_batch,
                           train_context_node:train_context_batch,
                          }
                _,loss_value,summary_str=sess.run([update_loss,loss_node2vec,merged], feed_dict)
                writer.add_summary(summary_str,i)

                f.write(str(loss_value)+'\n')
                f.flush() # keep the file current after every step, as before

                # Save the variables to disk.
                if i%checkpoint_every==0:
                    save_path = saver.save(sess, model_path,global_step)
                    print("Model saved in file: %s" % save_path)

        # Make sure the final state is checkpointed even when the last step
        # did not fall on a periodic save.
        if num_random_walks>0 and (num_random_walks-1)%checkpoint_every!=0:
            save_path = saver.save(sess, model_path,global_step)
            print("Model saved in file: %s" % save_path)
    finally:
        # Flush pending summaries and release the event file.
        writer.close()

Model saved in file: ../log0/model.ckpt-1


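To view the training summaries, start TensorBoard. The training cell writes its logs to `../log0/` relative to this notebook, so run the command below from the notebook's parent directory:

tensorboard --logdir=./log0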