In [4]:
# %load imdb_data_to_graph.py
import numpy as np
import dgl
import pickle
import os
from sklearn.preprocessing import MultiLabelBinarizer
import torch
import sys
sys.path.append('../../panrep/') 
import load_data
import argparse
import copy
from torch.utils.data import DataLoader
import itertools

In [5]:
    # Build the PanRep command-line parser and parse a fixed argument list so the
    # notebook gets an `args` namespace without reading sys.argv.
    parser = argparse.ArgumentParser(description='PanRep')
    parser.add_argument("--dropout", type=float, default=0.2,
            help="dropout probability")
    parser.add_argument("--n-hidden", type=int, default=60,
            help="number of hidden units") # use 16, 2 for debug
    parser.add_argument("--gpu", type=int, default=0,
            help="gpu")
    parser.add_argument("--lr", type=float, default=1e-2,
            help="learning rate")
    parser.add_argument("--n-bases", type=int, default=20,
            help="number of filter weight matrices, default: -1 [use all]")
    parser.add_argument("--n-layers", type=int, default=3,
            help="number of propagation rounds")
    parser.add_argument("-e", "--n-epochs", type=int, default=50,
            help="number of training epochs for decoder")
    parser.add_argument("-ec", "--n-cepochs", type=int, default=400,
                        help="number of training epochs for classification")
    parser.add_argument("-num_masked", "--n-masked-nodes", type=int, default=100,
                        help="number of masked nodes")
    # The default is a fraction (0.5), so the option must parse as float; with
    # type=int a command-line value such as "0.5" is rejected by argparse.
    parser.add_argument("-pct_masked_links", "--pct-masked-links", type=float, default=0.5,
                        help="fraction of links to mask")
    parser.add_argument("-negative_rate", "--negative-rate", type=int, default=4,
                        help="number of negative examples per masked link")


    parser.add_argument("-d", "--dataset", type=str, required=True,
            help="dataset to use")
    parser.add_argument("-en", "--encoder", type=str, required=True,
                        help="Encoder to use")
    parser.add_argument("--l2norm", type=float, default=0.0000,
            help="l2 norm coef")
    parser.add_argument("--relabel", default=False, action='store_true',
            help="remove untouched nodes and relabel")
    parser.add_argument("--use-self-loop", default=False, action='store_true',
            help="include self feature as a special relation")
    parser.add_argument("--use-infomax-loss", default=False, action='store_true',
                        help="use infomax task supervision")
    # NOTE(review): the store_true flags below that default to True cannot be
    # switched off from the command line.
    parser.add_argument("--use-reconstruction-loss", default=True, action='store_true',
                        help="use feature reconstruction task supervision")
    parser.add_argument("--node-masking", default=False, action='store_true',
                        help="mask a subset of node features")
    parser.add_argument("--loss-over-all-nodes", default=True, action='store_true',
                        help="compute the feature reconstruction loss over all nods or just the masked")
    parser.add_argument("--link-prediction", default=False, action='store_true',
                       help="use link prediction as supervision task")
    parser.add_argument("--mask-links", default=True, action='store_true',
                       help="mask the links to be predicted")

    parser.add_argument("--batch-size", type=int, default=100,
            help="Mini-batch size. If -1, use full graph training.")
    parser.add_argument("--model_path", type=str, default=None,
            help='path for save the model')
    parser.add_argument("--fanout", type=int, default=10,
            help="Fan-out of neighbor sampling.")

    # --validation / --testing write to the same destination; validation is the default.
    fp = parser.add_mutually_exclusive_group(required=False)
    fp.add_argument('--validation', dest='validation', action='store_true')
    fp.add_argument('--testing', dest='validation', action='store_false')
    parser.set_defaults(validation=True)
    # `dataset` is reused by later cells to build the motif file names.
    dataset='imdb_pre_xiang'
    args = parser.parse_args(['--dataset', dataset,'--encoder', 'RGCN'])

In [6]:
# Load the dataset selected by `args` through the project-local load_data module.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'dataloader'", so none of these names were
# bound in that run; `g` is instead read from a pickle in the next cell.
train_idx, test_idx, val_idx, labels, g, category, num_classes, masked_node_types=\
        load_data.load_hetero_data(args)

Using device cuda:0


ModuleNotFoundError: No module named 'dataloader'

In [8]:
# Directory that holds the pickled IMDB graph; later cells also write their outputs here.
data_folder = "../../data/imdb_data/xiang/"
# Use a context manager so the file handle is closed once the graph is loaded.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(os.path.join(data_folder, 'graph.pickle'), "rb") as graph_file:
    g = pickle.load(graph_file)

In [9]:
# Three independent empty containers (no cell shown in this notebook fills them).
mapping_dict, edge_lists, reverse_mapping_dict = {}, {}, {}

In [11]:
# Display the feature tensors stored on the 'crew' nodes of the loaded graph.
g.nodes['crew'].data

{'h_f': tensor([[0.0000, 0.3333, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.5000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.3333,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]), 'h_clusters': tensor([[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.]])}

In [12]:
# Strip the node feature tensors from a deep copy (the original `g` keeps them)
# before converting the heterogeneous graph to a homogeneous one.
g_without_features = copy.deepcopy(g)
for ntype in g_without_features.ntypes:
    node_data = g_without_features.nodes[ntype].data
    for feature_name in ("h_f", "h_clusters"):
        if node_data.get(feature_name, None) is not None:
            del node_data[feature_name]
homo_g = dgl.to_homo(g_without_features)
# Mapping between homogeneous node ids and (id, ntype) of the hetero graph;
# probably not needed:
#mapping_dict=[(homo_g.ndata['_ID'][i],g.ntypes[int(homo_g.ndata['_TYPE'][i])]) for i in range(len(homo_g.ndata['_TYPE']))]

# Source and destination ids of every edge, as integer numpy arrays.
u, v = (endpoint.data.numpy().astype(int) for endpoint in homo_g.all_edges())

# One (src, dst) row per edge.
edges = np.vstack((u, v)).T


In [13]:
# NOTE(review): both lines repeat earlier cells; pickle is already imported in the
# first cell and data_folder was already set to this same path.
import pickle
data_folder = "../../data/imdb_data/xiang/"


In [14]:
# Write the (src, dst) edge array as integer text, one edge per line.
np.savetxt(os.path.join(data_folder, "edge_list_complete.txt"), edges, fmt='%i')

In [15]:
# Per-node-type motif tensors; filled in by the cells below.
motif_features = dict()

For IMDB-style graphs, where many relationships connect entities of different types, the motif representation
will be quite poor. Consider combining several relation types to get more interesting signals...

In [17]:
# Location of the PGD-orbit outputs and the edge-type tag used in their file names.
folder = '../../../../PGD-orbit-master/'
etype = 'complete'

# The row index corresponds to the original id and the value corresponds to the internal id used by nasreen.
vertex_mapping_2_nasreens_ids = np.loadtxt(folder + "vertex_mapping.txt")
# Inverse lookup: internal id -> original (row) id.
mapping_from_nasreens_ids = {
    int(internal_id): original_id
    for original_id, internal_id in enumerate(vertex_mapping_2_nasreens_ids)
}
node_motif_file = folder + dataset + '_' + etype + "-node.txt"
motif_per_node = np.loadtxt(node_motif_file, skiprows=1, delimiter=',')

# Allocate one zero-initialised int row per node; the first column of
# motif_per_node is the node id, so it is not counted as a feature column.
n_motif_columns = motif_per_node.shape[1] - 1
for ntype in g.ntypes:
    motif_features[ntype] = torch.zeros((g.number_of_nodes(ntype), n_motif_columns)).int()

The following code extracts the motifs per node and then maps them to node features in the original graph using an id mapping. It also checks that the reported degree matches the actual degree, which validates that the vertex mapping is correct.

In [19]:
# Consistency check preparation: the degrees should agree.
# Collapse duplicate (src, dst) pairs while keeping first-seen order.
edges_dict = {}
for e in map(tuple, edges):
    edges_dict.setdefault(e, 1)
edges_li = list(edges_dict)

# Rebuild a single-relation graph from the unique edges and make it bidirected.
dglonelistgraph = dgl.heterograph(data_dict={('0', '1', '0'): edges_li})
ntg = dgl.to_networkx(dglonelistgraph)
dgl_grp = dgl.DGLGraph(ntg)
bidercted_g = dgl.transform.to_bidirected(dgl_grp)

In [20]:

# Count the rows whose reported degree (column 1) matches the in-degree in the
# bidirected graph, and copy each motif row onto the matching hetero node.
c = 0
n_motif_rows = motif_per_node.shape[0]
for i in range(n_motif_rows):
    node_row = motif_per_node[i]
    homo_id = int(mapping_from_nasreens_ids[node_row[0] - 1])
    actual_degree = bidercted_g.in_degree(homo_id)
    reported_degree = int(node_row[1])
    if actual_degree == reported_degree:
        c += 1
    else:
        # Mismatch: show both values for inspection.
        print(actual_degree)
        print(reported_degree)
    # Translate the homogeneous id back to (node type, per-type id).
    ntype = g.ntypes[homo_g.ndata['_TYPE'][homo_id]]
    ntype_id = homo_g.ndata['_ID'][homo_id]
    motif_features[ntype][ntype_id] = torch.tensor(node_row[1:]).int()
# True only when every row passed the degree check.
print(c == n_motif_rows)

True


In [21]:
# Display the per-node-type motif tensors built above.
motif_features

{'crew': tensor([[  300,  4058, 44850,  ...,     0,     0,     0],
         [  398,  4849, 79003,  ...,     0,     0,     0],
         [  173,  2109, 14878,  ...,     0,     0,     0],
         ...,
         [    1,     9,     0,  ...,     0,     0,     0],
         [    1,     9,     0,  ...,     0,     0,     0],
         [    0,     0,     0,  ...,     0,     0,     0]], dtype=torch.int32),
 'title': tensor([[   3,  875,    3,  ...,    0,    0,    0],
         [   2,    9,    1,  ...,    0,    0,    0],
         [   6,    9,   15,  ...,    0,    0,    0],
         ...,
         [  10, 3545,   45,  ...,    0,    0,    0],
         [   9,   16,   36,  ...,    0,    0,    0],
         [  10, 2976,   45,  ...,    0,    0,    0]], dtype=torch.int32)}

In [22]:
# motif features

# Persist the node motif features; the context manager flushes and closes the file.
with open(os.path.join(data_folder, "node_motifs.pickle"), "wb") as node_motifs_file:
    pickle.dump(motif_features, node_motifs_file, protocol=4)

In [None]:
from statistics import median
 

The following code retrieves the motif characteristics of each edge in the graph

In [None]:
# Load the per-edge motif table (comma separated, header row skipped).
etype = 'complete'
edge_motif_file = folder + dataset + '_' + etype + "-edge.txt"
motif_per_edge = np.loadtxt(edge_motif_file, skiprows=1, delimiter=',')


    

Some nodes are disconnected; for these, the mapping to Nasreen's ids is -1.

In [None]:
# Diagnostics on the vertex mapping: distinct ids, total entries, and graph size.
n_distinct_ids = len(np.unique(vertex_mapping_2_nasreens_ids))
print(n_distinct_ids)
n_mapping_entries = len(vertex_mapping_2_nasreens_ids)
print(n_mapping_entries)
print(homo_g.number_of_nodes())
# Entries equal to -1 in the mapping.
condition = (vertex_mapping_2_nasreens_ids == -1)
unmapped_entries = vertex_mapping_2_nasreens_ids[condition]
print(len(unmapped_entries))

Nasreen's code returns a bidirectional graph by considering both directions of the edges of the original directed graph.
Hence the returned graph has some edges that do not exist in the original graph. For now I just skip these.

The following creates a new edge, called motif that has the motif data.

In [None]:
# Group the motif rows by (src node type, 'motif_edge', dst node type):
# dict_motif_edges collects (src_id, dst_id) pairs and dict_motif_edata the
# matching motif vectors, in the same order.
dict_motif_edges = {}
dict_motif_edata = {}
homo_node_types = homo_g.ndata['_TYPE']
homo_node_ids = homo_g.ndata['_ID']
for i in range(motif_per_edge.shape[0]):
    edge_row = motif_per_edge[i]

    # Column 0 is treated as the destination and column 1 as the source.
    homo_id_dest = int(mapping_from_nasreens_ids[edge_row[0] - 1])
    homo_id_src = int(mapping_from_nasreens_ids[edge_row[1] - 1])

    ntype_src = g.ntypes[homo_node_types[homo_id_src]]
    ntype_id_src = homo_node_ids[homo_id_src]
    ntype_dest = g.ntypes[homo_node_types[homo_id_dest]]
    ntype_id_dest = homo_node_ids[homo_id_dest]
    # Looked up as in the original cell; the value is not used further here.
    homo_e_id = homo_g.edge_id(homo_id_src, homo_id_dest)
    homo_e_id = homo_e_id.data.cpu().numpy()

    n_etype = (ntype_src, 'motif_edge', ntype_dest)
    endpoint_pair = (int(ntype_id_src.data.cpu().numpy()),
                     int(ntype_id_dest.data.cpu().numpy()))
    dict_motif_edges.setdefault(n_etype, []).append(endpoint_pair)
    dict_motif_edata.setdefault(n_etype, []).append(torch.tensor(edge_row[2:]).int())

In [None]:
# Bundle the motif edge data (key 0) and the motif edge lists (key 1) and persist
# them; the context manager flushes and closes the file.
edge_motifs = {0: dict_motif_edata, 1: dict_motif_edges}
with open(os.path.join(data_folder, "edge_motifs.pickle"), "wb") as edge_motifs_file:
    pickle.dump(edge_motifs, edge_motifs_file, protocol=4)

In [None]:
# Display the bundled motif edge data and edge lists.
edge_motifs

Check that the eid for the first edge in dict_homo_edge is 1 so that the dict_homo_edata are correctly aligned

In [None]:
# Count, for every motif edge type, the edges whose motif vector is all zeros.
# dict_motif_edata maps each (src_type, 'motif_edge', dst_type) key to a Python
# list of 1-D tensors, so the list is stacked into a 2-D array first. The previous
# version indexed the dict with `e`, a leftover edge tuple from an earlier loop.
for n_etype, edata_rows in dict_motif_edata.items():
    edata_matrix = torch.stack(edata_rows).cpu().numpy()
    n_all_zero = int((~edata_matrix.any(axis=1)).sum())
    print(n_etype, n_all_zero)

Here multiple edge ids in the hetero graph may map to the same edge id in Nasreen's graph. The
following creates the motif edata for each of the existing links. Since the current design treats the graph as homogeneous,
focus on the cells above.

In [None]:
# Attach a 'motifs' feature to every edge type of `g` and fill it from motif_per_edge.
# `result` accumulates whether the hetero edge id recorded in homo_g.edata['_ID']
# agrees with the id returned by g.edge_id for the same endpoints.
result=True
for etype in g.etypes:
    # Zero-initialised int rows; the first two columns of motif_per_edge are the
    # endpoint ids, so they are not counted as motif columns.
    g.edges[etype].data['motifs']=torch.zeros((g.number_of_edges(etype),motif_per_edge.shape[1]-2)).int()
for i in range(motif_per_edge.shape[0]):
    
    # Column 0 is treated as the destination and column 1 as the source.
    homo_id_dest=int(mapping_from_nasreens_ids[motif_per_edge[i,0]-1])
    homo_id_src=int(mapping_from_nasreens_ids[motif_per_edge[i,1]-1])
    #print('--'+str(homo_id))
    
    #print(str(homo_id_src)+','+str(homo_id_dest))
    # Translate both homogeneous ids back to (node type, per-type id).
    ntype_src=g.ntypes[homo_g.ndata['_TYPE'][homo_id_src]]
    ntype_id_src=homo_g.ndata['_ID'][homo_id_src]
    ntype_dest=g.ntypes[homo_g.ndata['_TYPE'][homo_id_dest]]
    ntype_id_dest=homo_g.ndata['_ID'][homo_id_dest]
    # NOTE(review): assumes edge_id returns an array of ids that is empty when the
    # src->dest edge is absent; confirm for the DGL version in use.
    homo_e_id=homo_g.edge_id(homo_id_src,homo_id_dest)
    homo_e_id=homo_e_id.data.cpu().numpy()

    
    # Rows whose edge is not present in the homogeneous graph are skipped.
    if len(homo_e_id)!=0:
        # Only the first matching homogeneous edge decides the edge type.
        homo_e_id=homo_e_id[0]
        #print('homo_id '+str(homo_e_id))
        cetype=g.etypes[homo_g.edata['_TYPE'][homo_e_id]]
        hetero_e_id=homo_g.edata['_ID'][homo_e_id]
        # TODO probably here we need to add the features for 
        # all edge types that may contain this specific src-dest pair
        
        het_e_id=g.edge_id(ntype_id_src,ntype_id_dest,etype=(ntype_src,cetype,ntype_dest))
        het_e_id=het_e_id.data.cpu().numpy()
        
        # NOTE(review): this prints one line per processed edge.
        print('hetero_id '+str(het_e_id))
        #print(cetype)
        if len(het_e_id)<=1:
            het_e_id=int(het_e_id)
            result=result and (hetero_e_id==het_e_id)
            #print(result)
            g.edges[cetype].data['motifs'][het_e_id]=torch.tensor(motif_per_edge[i,2:]).int()
        else:
            # For some edge types (participated by) there may be multiple edges of the same type between actor and movie.
            # Every parallel edge receives the same motif vector.
            for eid in het_e_id:
                g.edges[cetype].data['motifs'][eid]=torch.tensor(motif_per_edge[i,2:]).int()
print(result)

SOS: An edge that appears under several edge types is counted multiple times by the dgl.graph.in_degrees implementation. On the other hand, multiple edges are ignored in Nasreen's code. This may lead to a discrepancy between the degree reported by her code and by dgl.

In [None]:
#(torch.sum(,1))
# Inspect the motif features of one edge type: number of edges, number of
# all-zero rows, and the column-wise totals.
etype = 'written_by'
written_by_motifs = g.edges[etype].data['motifs'].data.cpu().numpy()
print(len(written_by_motifs))
all_zero_rows = np.where(~written_by_motifs.any(axis=1))[0]
print(len(all_zero_rows))
sum(written_by_motifs)

In [None]:

# Persist the graph, including the 'motifs' edge features; the context manager
# flushes and closes the file.
with open(os.path.join(data_folder, "graph_reduced_m.pickle"), "wb") as graph_file:
    pickle.dump(g, graph_file, protocol=4)

In [None]:
# NOTE(review): pickle is already imported in the first cell, and this repoints
# data_folder at a different dataset directory than the IMDB one used above.
import pickle
data_folder="../data/kg/wn18/"

In [None]:
# Load the pickled graph from the current data_folder, closing the file afterwards.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(os.path.join(data_folder, "graph_reduced_m.pickle"), "rb") as graph_file:
    g = pickle.load(graph_file)

In [None]:
# Display the feature tensors stored on edges of type '12' in the reloaded graph.
g.edges['12'].data