In [1]:
import numpy as np
import networkx as nx
import gensim
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist, pdist
import pickle
from collections import defaultdict
import random
import pickle
import os
%matplotlib inline

In [2]:
# Import training set

training_set = pickle.load(open("training_data.txt", "rb"))
testing_set = pickle.load(open("testing_data.txt","rb"))


In [3]:
# Create graph from training set
G = nx.DiGraph()
for edge in training_set.keys():
    nodes = edge.split('-')
    G.add_edge(nodes[0],nodes[1])



In [4]:
# Add only the nodes from test set to graph if not already present in generated graph
node_list_conn = G.nodes()
for edge in testing_set.keys():
    nodes = edge.split('-')
    for node in nodes:
        if node in node_list_conn:
            continue
        else:
            G.add_node(node)

In [5]:
# Build new edgelist node2vec can utilize for generating embeddings
nx.write_edgelist(G,'graph/train_n2v.txt')

In [6]:
CmdStr = "python main.py --p 1 --q 0.5 --iter 200 --input graph/train_n2v.txt \
    --output emb/emb_train_n2v.emb --dimensions 64"
os.system(CmdStr)

256

In [None]:
node_list_conn_int = sorted(map(lambda x : int(x),node_list_conn))
node_list_conn = map(lambda x : str(x),node_list_conn_int)

In [9]:
# Sanity check to make sure same number of nodes reappears in new graph generated
G.number_of_nodes()

5242

In [10]:
testing_set

{'22601-4512': 1,
 '1425-19557': 1,
 '1982-4458': 1,
 '15538-2558': 1,
 '10881-3651': 1,
 '18582-24097': 1,
 '1289-9184': 0,
 '6934-13556': 1,
 '9974-25902': 0,
 '12745-5400': 0,
 '12802-1293': 1,
 '26098-15600': 1,
 '7546-25217': 1,
 '19964-10923': 1,
 '21491-7689': 1,
 '6512-9313': 1,
 '12118-4241': 0,
 '20157-19501': 0,
 '8208-20783': 0,
 '1552-2054': 1,
 '3917-19356': 0,
 '17655-23293': 1,
 '5717-7090': 1,
 '8541-18771': 1,
 '16042-715': 1,
 '2250-5621': 1,
 '17182-4164': 1,
 '88-22504': 1,
 '10496-5144': 0,
 '10350-19939': 1,
 '23647-10942': 1,
 '17124-11379': 1,
 '25402-12070': 1,
 '4416-9511': 1,
 '23267-7895': 1,
 '11379-11372': 1,
 '2912-23266': 0,
 '18143-215': 1,
 '7007-9098': 1,
 '8428-18603': 1,
 '22691-2212': 1,
 '9408-9147': 0,
 '10252-23506': 0,
 '25111-12786': 1,
 '20765-19351': 1,
 '14090-11233': 1,
 '26167-2912': 0,
 '18225-2535': 1,
 '4755-4355': 1,
 '15665-13422': 1,
 '624-25201': 1,
 '19640-19870': 1,
 '1818-10514': 0,
 '25359-5067': 1,
 '24620-13813': 1,
 '19216-

In [11]:
G.has_edge('88','22504')

False

In [12]:
testing_set['88-22504']

1

In [48]:
# Read embeddings file from and generate features

model = gensim.models.KeyedVectors.load_word2vec_format('emb/emb_train_n2v.emb')

In [59]:
embeddings = {}

for node in node_list_conn:
    embeddings[node] = model.word_vec(node)

In [60]:
def combine_embedding(method,n_out,n_in):
    if(method == 1):
        #print "Implementing Simple average"
        return (n_out+n_in)/2.0    
    
    elif(method == 2):
        #print "Implementing Hadamard"
        #print n_in,n_out
        return np.multiply(n_in,n_out)
    else:
        print "Invalid Method. Enter 1 or 2"
        return     

In [61]:
# Try Hadamard first
feature = []
label = []
for edge in training_set.keys():
    nodes = edge.split('-')
    feature.append(combine_embedding(1,embeddings[nodes[0]],embeddings[nodes[1]]))
    label.append(training_set[edge])
    

In [62]:
len(feature)

37818

In [63]:
G.number_of_edges()

37818

In [64]:
feature_np = np.asarray(feature)
print feature_np.shape
label_np = np.asarray(label)
print label_np.shape

(37818, 64)
(37818,)


In [65]:
x,residuals,rank,s = np.linalg.lstsq(feature_np,label_np)

In [66]:
def evaluate_perf(data,w,labels):
    label_pred = np.dot(data,w)
    print label_pred.shape
    diff = np.abs(np.subtract(label_pred,labels))
    return np.sum(diff)*1.0/len(labels)

In [67]:
x.shape

(64,)

In [68]:
error = evaluate_perf(feature_np,x,label_np)
error

(37818,)


0.4042766752145282

In [None]:
# Extract test set features

