In [1]:
# %load imdb_data_to_graph.py
import numpy as np
import dgl
import pickle
import os
from sklearn.preprocessing import MultiLabelBinarizer
import torch
import sys
sys.path.append('../../panrep/') 
import load_data
import argparse
import copy
import itertools

In [2]:
    # Build the PanRep configuration through argparse and parse it from a fixed
    # argument list (see the last two lines), so that the resulting `args`
    # namespace can be handed to load_data in the next cell.
    parser = argparse.ArgumentParser(description='PanRep')
    parser.add_argument("--dropout", type=float, default=0.2,
            help="dropout probability")
    parser.add_argument("--n-hidden", type=int, default=60,
            help="number of hidden units") # use 16, 2 for debug
    parser.add_argument("--gpu", type=int, default=0,
            help="gpu")
    parser.add_argument("--lr", type=float, default=1e-2,
            help="learning rate")
    # NOTE(review): the help text mentions a default of -1 but the default is 20.
    parser.add_argument("--n-bases", type=int, default=20,
            help="number of filter weight matrices, default: -1 [use all]")
    parser.add_argument("--n-layers", type=int, default=3,
            help="number of propagation rounds")
    parser.add_argument("-e", "--n-epochs", type=int, default=50,
            help="number of training epochs for decoder")
    parser.add_argument("-ec", "--n-cepochs", type=int, default=400,
                        help="number of training epochs for classification")
    parser.add_argument("-num_masked", "--n-masked-nodes", type=int, default=100,
                        help="number of masked nodes")
    # Fixed: this option was declared with type=int although its default is 0.5,
    # so any fractional value given on the command line was rejected.
    parser.add_argument("-pct_masked_links", "--pct-masked-links", type=float, default=0.5,
                        help="fraction of masked links")
    parser.add_argument("-negative_rate", "--negative-rate", type=int, default=4,
                        help="number of negative examples per masked link")


    parser.add_argument("-d", "--dataset", type=str, required=True,
            help="dataset to use")
    parser.add_argument("-en", "--encoder", type=str, required=True,
                        help="Encoder to use")
    parser.add_argument("--l2norm", type=float, default=0.0000,
            help="l2 norm coef")
    parser.add_argument("--relabel", default=False, action='store_true',
            help="remove untouched nodes and relabel")
    parser.add_argument("--use-self-loop", default=False, action='store_true',
            help="include self feature as a special relation")
    parser.add_argument("--use-infomax-loss", default=False, action='store_true',
                        help="use infomax task supervision")
    # NOTE(review): the three flags below that combine default=True with
    # action='store_true' (--use-reconstruction-loss, --loss-over-all-nodes,
    # --mask-links) are True whether or not the flag is passed.
    parser.add_argument("--use-reconstruction-loss", default=True, action='store_true',
                        help="use feature reconstruction task supervision")
    parser.add_argument("--node-masking", default=False, action='store_true',
                        help="mask a subset of node features")
    parser.add_argument("--loss-over-all-nodes", default=True, action='store_true',
                        help="compute the feature reconstruction loss over all nods or just the masked")
    parser.add_argument("--link-prediction", default=False, action='store_true',
                       help="use link prediction as supervision task")
    parser.add_argument("--mask-links", default=True, action='store_true',
                       help="mask the links to be predicted")

    parser.add_argument("--batch-size", type=int, default=100,
            help="Mini-batch size. If -1, use full graph training.")
    parser.add_argument("--model_path", type=str, default=None,
            help='path for save the model')
    parser.add_argument("--fanout", type=int, default=10,
            help="Fan-out of neighbor sampling.")

    # --validation / --testing write to the same destination and exclude each other.
    fp = parser.add_mutually_exclusive_group(required=False)
    fp.add_argument('--validation', dest='validation', action='store_true')
    fp.add_argument('--testing', dest='validation', action='store_false')
    parser.set_defaults(validation=True)
    # Parse an explicit argument list instead of sys.argv.
    dataset='wn18'
    args = parser.parse_args(['--dataset', dataset,'--encoder', 'RGCN'])

In [3]:
# Load the link-prediction splits and graphs for the dataset selected in `args`.
(train_edges, test_edges, valid_edges,
 train_g, valid_g, test_g,
 featless_node_types) = load_data.load_hetero_link_pred_data(args)

Using device cuda:0


In [4]:
# Three independent, initially empty dictionaries.
mapping_dict, edge_lists, reverse_mapping_dict = {}, {}, {}

In [5]:
# Work on a deep copy so that train_g itself keeps its 'h_f' node features.
g_without_features=copy.deepcopy(train_g)
for ntype in g_without_features.ntypes:
    del g_without_features.nodes[ntype].data['h_f']
homo_g=dgl.to_homo(g_without_features)
# homo_g.ndata['_ID'] / homo_g.ndata['_TYPE'] are used in later cells to map a
# homogeneous node id back to its (type, id) in the hetero graph, so no explicit
# mapping list is built here.

# Endpoints of every edge as integer numpy arrays.
u,v=(endpoint.data.numpy().astype(int) for endpoint in homo_g.all_edges())
# One row per edge: column 0 is the source id, column 1 the destination id.
edges=np.vstack((u,v)).T


In [8]:
# pickle is already imported in the first cell; this import is a repeat.
import pickle
# Folder the edge list is written to in the next cell.
data_folder="../../data/kg/wn18/"

In [9]:
# Write the edge list, one "src dst" pair of integers per line.
# os.path.join (as used for the pickle files further down) keeps the path valid
# whether or not data_folder ends with a separator.
np.savetxt(os.path.join(data_folder,"lp_edge_list_complete.txt"),edges,fmt='%i')

In [10]:
# NOTE(review): not read anywhere in the visible part of this notebook.
motif_features={}

For IMDB-type graphs, where many relationships connect different entities, the motif representation
will be quite poor. Consider combining several relation types to get more interesting signals...

In [13]:
# Location and naming of the motif files read below.
folder='../../../../PGD-orbit-master/'
etype='complete'
dataset='wn18_lp'

# Row index = original id, value = internal id used by Nasreen's code.
vertex_mapping_2_nasreens_ids=np.loadtxt(folder+"vertex_mapping.txt")
# Inverse lookup: Nasreen's internal id -> original id (row index).
mapping_from_nasreens_ids={
    int(nasreen_id): original_id
    for original_id, nasreen_id in enumerate(vertex_mapping_2_nasreens_ids)
}
# Comma-separated file with one header line, which is skipped.
motif_per_node=np.loadtxt("{}{}_{}-node.txt".format(folder,dataset,etype),skiprows=1,delimiter=',')

# Pre-allocate an all-zero int32 'motifs' feature for every node type; it has
# one column fewer than the motif file.
for ntype in train_g.ntypes:
    train_g.nodes[ntype].data['motifs']=torch.zeros(
        (train_g.number_of_nodes(ntype),motif_per_node.shape[1]-1),dtype=torch.int32)

The following code extracts the motifs per node and then maps them onto the original graph as node features by using an id mapping. It also performs a check to validate that the predicted degree is the same as the actual degree and hence that the vertex mapping is correct.

In [14]:
#To check consistency see if deggrees same.
# To check consistency, see whether the degrees are the same.
# Drop duplicate (src, dst) pairs; each distinct pair is stored once as a key.
edges_dict={}
for e in edges:
    e=tuple(e)
    edges_dict.setdefault(e,1)
edges_li=list(edges_dict)

# Rebuild a single-type graph from the distinct pairs and make it bidirected.
dglonelistgraph=dgl.heterograph(data_dict={('0','1','0'):edges_li})
ntg=dgl.to_networkx(dglonelistgraph)
dgl_grp=dgl.DGLGraph(ntg)
bidercted_g=dgl.transform.to_bidirected(dgl_grp)

Graph(num_nodes={'word': 40943},
      num_edges={('word', '14', 'word'): 24, ('word', '7', 'word'): 39, ('word', '12', 'word'): 1153, ('word', '4', 'word'): 26, ('word', '5', 'word'): 253, ('word', '6', 'word'): 165, ('word', '17', 'word'): 1251, ('word', '1', 'word'): 172, ('word', '8', 'word'): 1074, ('word', '16', 'word'): 278, ('word', '13', 'word'): 56, ('word', '9', 'word'): 122, ('word', '3', 'word'): 37, ('word', '15', 'word'): 114, ('word', '0', 'word'): 111, ('word', '10', 'word'): 108, ('word', '2', 'word'): 14, ('word', '11', 'word'): 3},
      metagraph=[('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word'), ('word', 'word')])


In [16]:

# For every row of the node motif file: compare the degree it reports (column 1)
# with the in-degree in the bidirected graph, then store the row (minus its id
# column) as the 'motifs' feature of the corresponding hetero-graph node.
c=0
for i in range(motif_per_node.shape[0]):
    homo_id=int(mapping_from_nasreens_ids[motif_per_node[i,0]-1])
    dgl_degree=bidercted_g.in_degree(homo_id)
    pgd_degree=int(motif_per_node[i,1])
    if dgl_degree!=pgd_degree:
        # Mismatch: print the DGL degree followed by the degree from the file.
        print(dgl_degree)
        print(pgd_degree)
    else:
        c+=1
    # Translate the homogeneous id back to (node type, id within that type).
    ntype=train_g.ntypes[homo_g.ndata['_TYPE'][homo_id]]
    ntype_id=homo_g.ndata['_ID'][homo_id]
    train_g.nodes[ntype].data['motifs'][ntype_id]=torch.tensor(motif_per_node[i,1:]).int()
# True only when every row passed the degree check.
print(c==motif_per_node.shape[0])

4
3
3
2
4
3
3
2
5
4
3
2
2
1
False


In [24]:
# Folder the graph with motif features is saved to (same value as assigned earlier).
data_folder="../../data/kg/wn18/"


In [25]:
# Inspect the features stored on the 'word' nodes.
train_g.nodes['word'].data


{'h_f': tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.1322, -0.0282,  0.0381],
        [-0.4691,  0.5274,  0.3578,  ...,  0.1310, -0.0541,  0.0083],
        [-0.2716,  0.8514,  0.3872,  ..., -0.1355,  0.1837, -0.0843],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ..., -0.1176,  0.0368, -0.0681],
        [ 0.0667,  0.1855,  0.4597,  ..., -0.2212,  0.0663,  0.1769],
        [-0.3224,  0.2966, -0.7549,  ..., -0.2261,  0.0869, -0.0762]]), 'motifs': tensor([[ 5, 18, 10,  ...,  0,  0,  0],
        [ 2, 44,  1,  ...,  0,  0,  0],
        [ 2,  3,  1,  ...,  0,  0,  0],
        ...,
        [ 2, 77,  1,  ...,  0,  0,  0],
        [ 3, 11,  3,  ...,  0,  0,  0],
        [ 5, 33, 10,  ...,  0,  0,  0]], dtype=torch.int32)}

The following code retrieves the motif characteristics of each edge in the graph

In [26]:
# Per-edge motif file: comma-separated, one header line that is skipped.
etype='complete'
motif_per_edge=np.loadtxt("{}{}_{}-edge.txt".format(folder,dataset,etype),skiprows=1,delimiter=',')


    

Some nodes are disconnected; for these nodes the mapping from Nasreen's code will be -1.

In [27]:
# Sanity checks on the vertex mapping: number of distinct ids, number of
# entries, number of nodes in the homogeneous graph, and number of -1 entries.
print(np.unique(vertex_mapping_2_nasreens_ids).size)
print(vertex_mapping_2_nasreens_ids.shape[0])
print(homo_g.number_of_nodes())
condition = vertex_mapping_2_nasreens_ids==-1
print(np.count_nonzero(condition))

40943
40943
40943
0


Nasreen's code returns a bidirectional graph by considering both directions of the edges of the original directed graph.
Hence the returned graph has some edges that do not exist in the original graph. For now I just skip these.

The following creates a new edge, called motif that has the motif data.

In [29]:
# currently works for one node type

# For every row of the edge motif file, collect the hetero-graph ids of its two
# endpoints and its motif counts (all columns after the two endpoint columns).
edata=[]
src_id=[]
dest_id=[]
for i in range(motif_per_edge.shape[0]):
    # Column 0 is used as the destination and column 1 as the source.
    homo_id_dest=int(mapping_from_nasreens_ids[motif_per_edge[i,0]-1])
    homo_id_src=int(mapping_from_nasreens_ids[motif_per_edge[i,1]-1])

    # Translate each homogeneous id back to (node type, id within that type).
    ntype_src=train_g.ntypes[homo_g.ndata['_TYPE'][homo_id_src]]
    ntype_id_src=homo_g.ndata['_ID'][homo_id_src]
    ntype_dest=train_g.ntypes[homo_g.ndata['_TYPE'][homo_id_dest]]
    ntype_id_dest=homo_g.ndata['_ID'][homo_id_dest]
    # NOTE(review): homo_e_id and n_etype are computed but not read again in this cell.
    homo_e_id=homo_g.edge_id(homo_id_src,homo_id_dest).data.cpu().numpy()
    n_etype=(ntype_src,'motif_edge',ntype_dest)
    src_id.append(int(ntype_id_src.data.cpu().numpy()))
    dest_id.append(int(ntype_id_dest.data.cpu().numpy()))
    edata.append(torch.tensor(motif_per_edge[i,2:]).int())
# Convert the collected lists to int32 tensors.
edata=torch.stack(edata)
src_id=torch.tensor(src_id).int()
dest_id=torch.tensor(dest_id).int()


In [30]:
# Inspect the stacked per-edge motif tensor.
edata

tensor([[ 74, 516, 515,  ...,   0,   0,   0],
        [ 28, 516, 515,  ...,   0,   0,   0],
        [233, 306, 305,  ...,   0,   0,   0],
        ...,
        [  1,   1,   0,  ...,   0,   0,   0],
        [  1,   1,   0,  ...,   0,   0,   0],
        [  1,   1,   0,  ...,   0,   0,   0]], dtype=torch.int32)

In [31]:

# Bundle the splits, the graphs (train_g now carries the 'motifs' node feature)
# and the motif edge lists, and pickle them to data_folder.
data={"train_edges":train_edges,"test_edges":test_edges,"valid_edges":valid_edges, "train_g":train_g,"valid_g":valid_g,
      "test_g":test_g,"featless_node_types": featless_node_types, "src_id":src_id,"dest_id":dest_id,"edata":edata}
# The context manager closes (and thereby flushes) the file even if pickling fails.
with open(os.path.join(data_folder, "data_lp_motifs.pickle"), "wb") as f:
    pickle.dump(data, f, protocol=4)

In [36]:
# Per-column totals of the node motif counts: the built-in sum adds the rows of
# the tensor one after another.
sum(train_g.nodes['word'].data['motifs'])

tensor([   150902,   3115064,   1557532,     15141,  13121164,  13121164,
        338263482, 112754494,    257200,    298711,    597422,    298711,
            28278,     28278,       232], dtype=torch.int32)

In [35]:
# Indices of motif columns that are zero for every 'word' node
# (an empty result means every column has at least one non-zero entry).
np.where(~train_g.nodes['word'].data['motifs'].data.cpu().numpy().any(axis=0))[0]

array([], dtype=int64)

Check that the eid for the first edge in dict_homo_edge is 1 so that the dict_homo_edata are correctly aligned

In [None]:
# Counts the rows of dict_motif_edata[e] that are entirely zero.
# NOTE(review): dict_motif_edata is not defined in the visible notebook (its
# initialisation is commented out in the motif-edge cell), and `e` is whatever
# value an earlier loop left behind — this cell looks stale; confirm before running.
print(len(np.where(~dict_motif_edata[e].data.cpu().numpy()[:,:].any(axis=1))[0]))

Here we have multiple edge ids in the hetero graph that possibly map to the same id in Nasreen's graph. The
following creates the motif edata for each of the existing links. Since the current design treats the graph as homogeneous,
focus on the above.

In [None]:
# Store the per-edge motif counts as a 'motifs' edge feature on the hetero graph
# `g`, and check that the edge ids recovered through the homogeneous graph agree
# with the ids looked up directly in `g`.
# NOTE(review): `g` is not assigned in any earlier cell of the visible notebook
# (it is only loaded from a pickle in a later cell) — confirm where it should come from.
result=True
for etype in g.etypes:
    # Start from all-zero int motif rows; the width is the file's column count minus 2.
    g.edges[etype].data['motifs']=torch.zeros((g.number_of_edges(etype),motif_per_edge.shape[1]-2)).int()
for i in range(motif_per_edge.shape[0]):
    
    # Column 0 is used as the destination and column 1 as the source; 1 is
    # subtracted from the file ids before the lookup (presumably they are 1-based — TODO confirm).
    homo_id_dest=int(mapping_from_nasreens_ids[motif_per_edge[i,0]-1])
    homo_id_src=int(mapping_from_nasreens_ids[motif_per_edge[i,1]-1])
    #print('--'+str(homo_id))
    
    #print(str(homo_id_src)+','+str(homo_id_dest))
    # Translate each homogeneous id back to (node type, id within that type).
    ntype_src=g.ntypes[homo_g.ndata['_TYPE'][homo_id_src]]
    ntype_id_src=homo_g.ndata['_ID'][homo_id_src]
    ntype_dest=g.ntypes[homo_g.ndata['_TYPE'][homo_id_dest]]
    ntype_id_dest=homo_g.ndata['_ID'][homo_id_dest]
    # assumes edge_id returns a (possibly empty) tensor of matching edge ids —
    # TODO confirm for the installed DGL version
    homo_e_id=homo_g.edge_id(homo_id_src,homo_id_dest)
    homo_e_id=homo_e_id.data.cpu().numpy()

    
    # Rows whose src -> dest pair has no edge in the homogeneous graph are skipped.
    if len(homo_e_id)!=0:
        # Only the first matching homogeneous edge decides the edge type.
        homo_e_id=homo_e_id[0]
        #print('homo_id '+str(homo_e_id))
        cetype=g.etypes[homo_g.edata['_TYPE'][homo_e_id]]
        hetero_e_id=homo_g.edata['_ID'][homo_e_id]
        # TODO probably here we need to add the features for 
        # all edge types that may contain this specific src-dest pair
        
        het_e_id=g.edge_id(ntype_id_src,ntype_id_dest,etype=(ntype_src,cetype,ntype_dest))
        het_e_id=het_e_id.data.cpu().numpy()
        
        print('hetero_id '+str(het_e_id))
        #print(cetype)
        if len(het_e_id)<=1:
            # Single match: also verify it equals the id recorded in homo_g.edata['_ID'].
            het_e_id=int(het_e_id)
            result=result and (hetero_e_id==het_e_id)
            #print(result)
            g.edges[cetype].data['motifs'][het_e_id]=torch.tensor(motif_per_edge[i,2:]).int()
        else:
            # for some edge type ( participated by ) we may have multiple edges of the same type among actor-movies.
            # Every parallel edge receives the same motif row; `result` is not updated here.
            for eid in het_e_id:
                g.edges[cetype].data['motifs'][eid]=torch.tensor(motif_per_edge[i,2:]).int()
print(result)

SOS: The same edge that corresponds to different edge types is counted multiple times by the dgl.graph.in_degrees implementation. On the other hand, multiple edges are ignored in Nasreen's code. This may lead to a discrepancy between the degree reported by her code and the one reported by dgl. 

In [None]:
#(torch.sum(,1))
# NOTE(review): 'written_by' is not among the edge types ('0'..'17') shown in the
# graph summary printed earlier; presumably a leftover from the IMDB version —
# confirm against the graph actually held in `g`.
etype='written_by'
# Number of edges of this type.
print(len(g.edges[etype].data['motifs']))
# Number of edges of this type whose motif row is entirely zero.
print(len(np.where(~g.edges[etype].data['motifs'].data.cpu().numpy().any(axis=1))[0]))
# Per-column totals: the built-in sum adds the rows of the array one after another.
sum(g.edges[etype].data['motifs'].data.cpu().numpy())

In [None]:

# Pickle the graph `g` to data_folder; the context manager closes (and thereby
# flushes) the file even if pickling fails.
with open(os.path.join(data_folder, "graph_reduced_m.pickle"), "wb") as f:
    pickle.dump(g, f, protocol=4)

In [None]:
import pickle
# NOTE(review): this path starts with "../" whereas the earlier cells use
# "../../data/kg/wn18/" — confirm which working directory this cell expects.
data_folder="../data/kg/wn18/"

In [None]:
# Load the pickled graph back into `g`; the context manager closes the file.
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
with open(os.path.join(data_folder, "graph_reduced_m.pickle"), "rb") as f:
    g=pickle.load(f)

In [None]:
# Inspect the features stored on edges of type '12'.
g.edges['12'].data