In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec, KeyedVectors
import pandas as pd
import numpy as np

In [2]:
# Input/output locations, relative to the notebook's working directory.
embeddings_file = './output/facebook_embedding'
train_file = './data/facebook_edgelist'
test_file = './data/facebook_test'
output_file = './output/prediction'

# The embeddings are stored in the textual (non-binary) word2vec format.
model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

# gensim >= 4 exposes the vocabulary as `index_to_key`; older releases call it
# `index2word`. Support both so the cell runs on either version.
indexes = list(map(int, model.index_to_key if hasattr(model, 'index_to_key')
                   else model.index2word))

# Reorder the embedding rows by ascending node id.
# NOTE(review): using these rows as "row i == node i + 1" assumes the file holds
# exactly one vector for every node id 1..N with no gaps -- confirm for new data.
emb_matrix = model.vectors[np.argsort(indexes)]
print(emb_matrix.shape)

# Row indices start at 0 while node numbers start at 1, so shift every id down by one.
test_edgelist = pd.read_csv(test_file, names=[1, 2]).values - 1
edgelist = pd.read_csv(train_file, names=[1, 2]).values - 1
print(edgelist.shape, test_edgelist.shape)

(53918, 64)
(550426, 2) (500000, 2)


# Logistic regression

In [16]:
def get_edge_embeddings(emb_matrix, edge_list):
    """Build one feature vector per edge from the embeddings of its endpoints.

    Args:
        emb_matrix: array indexed by node along its first axis.
        edge_list: 2-D integer array; column 0 and column 1 hold the two
            endpoint indices of each edge.

    Returns:
        The element-wise (Hadamard) product of the two endpoint embeddings,
        one row per edge.
    """
    first_endpoints = edge_list[:, 0]
    second_endpoints = edge_list[:, 1]
    first_vectors = emb_matrix[first_endpoints]
    second_vectors = emb_matrix[second_endpoints]
    return first_vectors * second_vectors

In [18]:
def make_false_edge(edgelist, num_nodes=None, seed=None):
    """Sample random node pairs that are NOT edges, to use as negative examples.

    As many pairs are drawn as there are rows in ``edgelist``. A candidate
    ``(u, v)`` is kept only if ``u != v`` and neither ``(u, v)`` nor ``(v, u)``
    appears in ``edgelist``. The same pair may be sampled more than once.

    Args:
        edgelist: sequence/array of ``(u, v)`` node-index pairs (the true edges).
        num_nodes: node indices are drawn from ``[0, num_nodes)``. When omitted,
            falls back to the module-level ``emb_matrix.shape[0]`` so existing
            calls ``make_false_edge(edgelist)`` keep working.
        seed: optional seed. When given, sampling uses a private
            ``np.random.RandomState(seed)`` and is reproducible; when ``None``,
            numbers come from NumPy's global random state as before.

    Returns:
        Integer array of shape ``(len(edgelist), 2)`` holding the sampled
        non-edges (shape ``(0, 2)`` for an empty ``edgelist``).

    Raises:
        ValueError: if pairs are requested but ``num_nodes < 2``, in which case
            no pair with ``u != v`` exists and the loop could never finish.

    Note:
        The rejection loop only stops once enough pairs were accepted, so it
        does not terminate if the graph has no non-edges left to sample.
    """
    if num_nodes is None:
        # Backward-compatible default: the notebook-level embedding matrix.
        num_nodes = emb_matrix.shape[0]

    n_samples = len(edgelist)
    if n_samples > 0 and num_nodes < 2:
        raise ValueError('need at least 2 nodes to sample a false edge')

    # Same draw order (u, then v) on either generator, so seed=None reproduces
    # the original stream from the global state.
    rng = np.random if seed is None else np.random.RandomState(seed)

    edgeset = {tuple(edge) for edge in edgelist}
    edge_false = []
    while len(edge_false) < n_samples:
        u = rng.randint(0, num_nodes)
        v = rng.randint(0, num_nodes)
        if u != v and (u, v) not in edgeset and (v, u) not in edgeset:
            edge_false.append((u, v))

    # reshape keeps the (n, 2) layout even when nothing was sampled, so the
    # result can always be concatenated with the true edge list.
    return np.array(edge_false, dtype=int).reshape(-1, 2)

# Seed NumPy's global generator so the sampled negative edges -- and with them
# the validation score and the predictions written later -- are reproducible
# on a fresh "Restart & Run All" (the other stochastic steps already pin
# random_state=0).
np.random.seed(0)
edge_false = make_false_edge(edgelist)
edge_all = np.concatenate([edgelist, edge_false], axis=0)

# Sanity check: no sampled pair is a true edge. Done with sets because a
# pairwise broadcast comparison would allocate an (n, n, 2) array.
assert not (set(map(tuple, edge_false.tolist())) & set(map(tuple, edgelist.tolist())))

# Edge features: Hadamard product of the two endpoint embeddings.
test_edge_embeddings = get_edge_embeddings(emb_matrix, test_edgelist)
embeddings = get_edge_embeddings(emb_matrix, edge_all)

# True edges come first in edge_all (label 1), sampled non-edges after (label 0).
labels = np.concatenate([np.ones(len(edgelist)),
                         np.zeros(len(edge_false))], axis=0)

# Hold-out validation (a single 80/20 split, not k-fold cross validation).
X_train, X_val, y_train, y_val = train_test_split(
    embeddings, labels, test_size=0.2, shuffle=True, random_state=0)

classifier = LogisticRegression(random_state=0, solver='lbfgs')
classifier.fit(X_train, y_train)
# Second predict_proba column is used as the link score.
val_preds = classifier.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, val_preds)

0.9500315184799324

In [7]:
def write_results(output_file, test_edgelist, test_preds):
    """Write link scores to ``output_file`` as a two-column CSV.

    The file starts with the header ``NodePair,Score``. Each following line is
    ``<u+1>-<v+1>,<score>``: the stored node indices are incremented by one
    before being written.

    Args:
        output_file: path of the file to create (overwritten if it exists).
        test_edgelist: iterable of ``(u, v)`` index pairs.
        test_preds: iterable of scores, aligned with ``test_edgelist``.
    """
    row_template = '{}-{},{}\n'
    with open(output_file, 'w') as out:
        out.write('NodePair,Score\n')
        out.writelines(
            row_template.format(src + 1, dst + 1, score)
            for (src, dst), score in zip(test_edgelist, test_preds)
        )

# Final model: refit on every labelled pair (no hold-out this time), then
# score the test pairs and save them.
classifier = LogisticRegression(random_state=0, solver='lbfgs').fit(embeddings, labels)

# Keep the second probability column as the link score.
test_preds = classifier.predict_proba(test_edge_embeddings)[:, 1]
write_results(output_file, test_edgelist, test_preds)

# inner product

In [8]:
# Evaluate the raw inner product of the two endpoint embeddings as a link
# score on all labelled pairs (nothing is fitted in this cell).
heads, tails = edge_all[:, 0], edge_all[:, 1]
ip = np.multiply(emb_matrix[heads], emb_matrix[tails]).sum(axis=1)

# Min-max rescale the scores onto [0, 1].
lo, hi = ip.min(), ip.max()
pred = np.interp(ip, (lo, hi), (0, 1))
roc_auc_score(labels, pred)

0.8810307395900614

In [9]:
# Score the test pairs with the same inner-product measure and save them.
heads, tails = test_edgelist[:, 0], test_edgelist[:, 1]
ip = np.multiply(emb_matrix[heads], emb_matrix[tails]).sum(axis=1)

# Min-max rescale the scores onto [0, 1] before writing.
lo, hi = ip.min(), ip.max()
pred = np.interp(ip, (lo, hi), (0, 1))
write_results('./output/deepwalk_inner_product_pred.csv', test_edgelist, pred)