In [3]:
# %load imdb_data_to_graph.py
import numpy as np
import dgl
import pickle
import os
from sklearn.preprocessing import MultiLabelBinarizer
import torch
import sys
sys.path.append('../../panrep/') 
import load_data
import argparse
import itertools

In [4]:
    # Build the PanRep argument parser and parse a fixed argument list, so the
    # notebook gets the same `args` namespace the command-line script would.
    parser = argparse.ArgumentParser(description='PanRep')
    parser.add_argument("--dropout", type=float, default=0.2,
            help="dropout probability")
    parser.add_argument("--n-hidden", type=int, default=60,
            help="number of hidden units") # use 16, 2 for debug
    parser.add_argument("--gpu", type=int, default=0,
            help="gpu")
    parser.add_argument("--lr", type=float, default=1e-2,
            help="learning rate")
    # NOTE(review): the help text mentions a default of -1 but the actual
    # default is 20 -- confirm which one is intended.
    parser.add_argument("--n-bases", type=int, default=20,
            help="number of filter weight matrices, default: -1 [use all]")
    parser.add_argument("--n-layers", type=int, default=3,
            help="number of propagation rounds")
    parser.add_argument("-e", "--n-epochs", type=int, default=50,
            help="number of training epochs for decoder")
    parser.add_argument("-ec", "--n-cepochs", type=int, default=400,
                        help="number of training epochs for classification")
    parser.add_argument("-num_masked", "--n-masked-nodes", type=int, default=100,
                        help="number of masked nodes")
    # The value is a fraction (default 0.5), so it must be parsed as float;
    # with type=int any fractional command-line value was rejected.
    parser.add_argument("-pct_masked_links", "--pct-masked-links", type=float, default=0.5,
                        help="fraction of masked links")
    parser.add_argument("-negative_rate", "--negative-rate", type=int, default=4,
                        help="number of negative examples per masked link")


    parser.add_argument("-d", "--dataset", type=str, required=True,
            help="dataset to use")
    parser.add_argument("-en", "--encoder", type=str, required=True,
                        help="Encoder to use")
    parser.add_argument("--l2norm", type=float, default=0.0000,
            help="l2 norm coef")
    parser.add_argument("--relabel", default=False, action='store_true',
            help="remove untouched nodes and relabel")
    parser.add_argument("--use-self-loop", default=False, action='store_true',
            help="include self feature as a special relation")
    parser.add_argument("--use-infomax-loss", default=False, action='store_true',
                        help="use infomax task supervision")
    # NOTE(review): store_true flags that default to True (this one,
    # --loss-over-all-nodes and --mask-links) can never be switched off from
    # the command line.
    parser.add_argument("--use-reconstruction-loss", default=True, action='store_true',
                        help="use feature reconstruction task supervision")
    parser.add_argument("--node-masking", default=False, action='store_true',
                        help="mask a subset of node features")
    parser.add_argument("--loss-over-all-nodes", default=True, action='store_true',
                        help="compute the feature reconstruction loss over all nods or just the masked")
    parser.add_argument("--link-prediction", default=False, action='store_true',
                       help="use link prediction as supervision task")
    parser.add_argument("--mask-links", default=True, action='store_true',
                       help="mask the links to be predicted")

    parser.add_argument("--batch-size", type=int, default=100,
            help="Mini-batch size. If -1, use full graph training.")
    parser.add_argument("--model_path", type=str, default=None,
            help='path for save the model')
    parser.add_argument("--fanout", type=int, default=10,
            help="Fan-out of neighbor sampling.")

    # --validation / --testing both write to `args.validation`; validation is the default.
    fp = parser.add_mutually_exclusive_group(required=False)
    fp.add_argument('--validation', dest='validation', action='store_true')
    fp.add_argument('--testing', dest='validation', action='store_false')
    parser.set_defaults(validation=True)

    # Explicit argument list: inside a notebook sys.argv belongs to the kernel.
    args = parser.parse_args(['--dataset', 'wn18','--encoder', 'RGCN'])

In [5]:
# Load everything returned by load_data.load_hetero_data for the parsed
# arguments: the three index splits, labels, the graph `g`, the category,
# the class count and the masked node types.
(train_idx, test_idx, val_idx, labels, g, category, num_classes,
 masked_node_types) = load_data.load_hetero_data(args)

Using device cuda:0


FileNotFoundError: [Errno 2] No such file or directory: '../data/kg/wn18/graph_reduced.pickle'

In [14]:
# For every edge type, renumber the nodes that appear in its edges with
# consecutive ids starting at 0 and rewrite the edges with those ids.
#   mapping_dict[etype]          : {new_id: (original_node_id, node_type)}
#   reverse_mapping_dict[etype]  : {node_type: {original_node_id: new_id}}
#   edge_lists[etype]            : [(new_src_id, new_dst_id), ...]
mapping_dict={}
edge_lists={}
reverse_mapping_dict={}
for etype in g.etypes:
    u, v, _eid = g.all_edges(form='all', etype=etype)
    srtype, _rel, destype = g.to_canonical_etype(etype)
    # .cpu() is a no-op for CPU tensors and keeps the conversion working
    # should the graph tensors live on a GPU.
    u = u.cpu().numpy().astype(int)
    v = v.cpu().numpy().astype(int)

    if srtype == destype:
        # Source and destination share one id space: number the union of
        # both endpoints once (np.unique flattens and sorts the pair).
        typed_nodes = [(node, srtype) for node in np.unique((u, v))]
    else:
        # Different node types: sources get the first ids, destinations
        # continue the same numbering right after them.
        typed_nodes = [(node, srtype) for node in np.unique(u)]
        typed_nodes += [(node, destype) for node in np.unique(v)]

    mapping_dict[etype] = dict(enumerate(typed_nodes))

    # Invert the mapping per node type (a single entry when srtype == destype).
    reverse_mapping_dict[etype] = {srtype: {}, destype: {}}
    for new_id, (node, node_type) in mapping_dict[etype].items():
        reverse_mapping_dict[etype][node_type][node] = new_id

    src_ids = reverse_mapping_dict[etype][srtype]
    dst_ids = reverse_mapping_dict[etype][destype]
    edge_lists[etype] = [(src_ids[s], dst_ids[d]) for s, d in zip(u, v)]
        
        
        

In [15]:
import pickle
# Directory prefix (with trailing slash) that the following cells prepend to
# the names of the files they write.
folder="../data/imdb_data/"

In [16]:
# Persist the id mappings; the context manager closes the file even if
# pickling raises.
with open(folder+"mappings.pkl","wb") as f:
    pickle.dump(mapping_dict,f)

In [17]:
# Write one integer-formatted text file per edge type, one edge per line.
for e in g.etypes:
    edge_list_path = folder + "edge_list_" + e + ".txt"
    np.savetxt(edge_list_path, edge_lists[e], fmt='%i')

In [34]:
# Accumulator filled by the motif-loading cell below:
# {node_type: {node_id: [motif feature tensor, ...]}}
motif_features={}

For IMDB-style graphs, where many relationships connect different entities, the motif representation
will be quite poor. Consider combining several relation types to get more interesting signals...

In [35]:
# Read the per-node motif counts produced for one relation and file them
# under motif_features[node_type][node_id].
folder='../../../PGD-orbit-master/'
etype='directed_by'
dataset='imdb'
stype='person'
dtype='movie'
# the row index corresponds to the original id and the value corresponds to the internal id used by nasreen
vertex_mapping_2_nasreens_ids=np.loadtxt(folder+"vertex_mapping.txt")
mapping_from_nasreens_ids=dict(enumerate(vertex_mapping_2_nasreens_ids))
# The first line of the motif file is skipped (skiprows=1); column 0 is
# dropped below, so the features are columns 1 onward.
motif_per_node=np.loadtxt(folder+dataset+'_'+etype+"-node.txt",skiprows=1,delimiter=',')
motif_node_features=torch.zeros((motif_per_node.shape[0],motif_per_node.shape[1]-1))

# Make sure both endpoint node types have a bucket before filling them.
motif_features.setdefault(stype, {})
motif_features.setdefault(dtype, {})
for i in range(motif_per_node.shape[0]):
    # NOTE(review): the comment above says row index = original id and value =
    # nasreen's id, yet the looked-up value indexes both mapping_dict and
    # motif_per_node -- confirm the mapping direction.
    mapped_id = int(mapping_from_nasreens_ids[i])
    internal_id, ntype = mapping_dict[etype][mapped_id]
    node_motifs = torch.tensor(motif_per_node[mapped_id, 1:])
    motif_features[ntype].setdefault(internal_id, []).append(node_motifs)

    

In [36]:
# Display the collected motif features (this renders the whole nested dict).
motif_features

{'person': {21: [tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          dtype=torch.float64)],
  26: [tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          dtype=torch.float64)],
  41: [tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          dtype=torch.float64)],
  70: [tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          dtype=torch.float64)],
  75: [tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          dtype=torch.float64)],
  77: [tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          dtype=torch.float64)],
  80: [tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          dtype=torch.float64)],
  83: [tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          dtype=torch.float64)],
  85: [tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          dtype=torch.float64)],
  87: [te