In [None]:
import networkx as nx
import random
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec
import pandas as pd
import time

In [None]:
def add_edges(G, train_G, c, epc, nodes_per_clique=None):
    """Insert random intra-"clique" edges into both graphs.

    Group ``i`` owns the node ids ``nodes_per_clique*i`` up to (and excluding)
    ``nodes_per_clique*(i+1)``. For each of the ``c`` groups, ``epc`` edges
    with two distinct endpoints drawn uniformly from that id range are added
    to ``G`` and mirrored into ``train_G``.

    Repeated draws of the same pair still count towards ``epc``, so the number
    of *distinct* edges per group can be smaller than ``epc``.

    Args:
        G: graph object exposing ``add_edge(u, v)``; modified in place.
        train_G: second graph receiving the same edges; modified in place.
        c: number of groups ("cliques").
        epc: number of edge insertions per group.
        nodes_per_clique: size of each group's id range. When omitted, the
            notebook-level global ``npc`` is used.

    Returns:
        The tuple ``(G, train_G)`` (the same objects that were passed in).

    Raises:
        ValueError: if edges are requested but a group has fewer than two
            nodes, since no edge with distinct endpoints can be drawn.
    """
    if nodes_per_clique is None:
        # Fall back to the global set by the parameter sweep.
        nodes_per_clique = npc
    if epc > 0 and c > 0 and nodes_per_clique < 2:
        raise ValueError("nodes_per_clique must be at least 2 to draw edges")

    for clique in range(c):
        first = nodes_per_clique * clique
        stop = first + nodes_per_clique
        added = 0
        while added < epc:
            # randrange excludes `stop`, so a draw never lands on the first
            # node id of the following group.
            a = random.randrange(first, stop)
            b = random.randrange(first, stop)
            if a == b:
                continue  # self-loops are not allowed; draw again
            G.add_edge(a, b)
            train_G.add_edge(a, b)
            added += 1
    return G, train_G

In [None]:
def singleton_nodes(G, train_G, nodes):
    """Top up under-connected nodes with edges to random nodes.

    Every node in ``nodes`` that has fewer than ``connect_num`` neighbours in
    ``G`` is linked to uniformly drawn node ids in ``[0, n-1]`` until it
    reaches that count. Each accepted edge is added to both ``G`` and
    ``train_G``.

    Reads the notebook-level globals ``connect_num``, ``n`` and ``m``.

    The attempt counter is shared by all nodes. Once it exceeds ``m*20`` a
    message is printed and the current node is abandoned; every later
    under-connected node then gets exactly one more attempt before being
    abandoned the same way.

    Returns:
        The tuple ``(G, train_G)`` (both modified in place).
    """
    started = time.time()

    def neighbour_count(node):
        return len(list(G.neighbors(node)))

    attempts = 0
    for node in nodes:
        while neighbour_count(node) < connect_num:
            candidate = random.randint(0, n - 1)
            attempts += 1
            if candidate != node:
                # Mirror the new edge into the training graph as well.
                for graph in (G, train_G):
                    graph.add_edge(node, candidate)
            if attempts > m * 20:
                print('singleton nodes - had to break')
                break
    elapsed = time.time() - started
    # Timing is measured but not reported; to show it, uncomment:
    # print("Singleton Nodes:", round(elapsed, 4))
    return G, train_G

In [None]:
def split_test_set(train_G, connect_num, test_size):
    """Move randomly chosen edges out of ``train_G`` to form positive test pairs.

    A random existing edge is picked repeatedly. It is removed from
    ``train_G`` and recorded as a positive example only when both endpoints
    currently have more than ``connect_num`` neighbours in ``train_G``.

    Picks that fail this check are counted; when the count exceeds ``m*100``
    (``m`` is a notebook-level global) a message is printed and the search
    stops, possibly with fewer than ``test_size`` edges.

    Returns:
        ``(train_G, test_edges, labels)`` where ``train_G`` is the same
        (mutated) graph, ``test_edges`` is the list of removed ``(u, v)``
        pairs and ``labels`` holds one ``1`` per removed edge.
    """
    started = time.time()
    held_out = []
    labels = []
    misses = 0

    def neighbour_count(node):
        return len(list(train_G.neighbors(node)))

    while len(held_out) < test_size:
        u, v = random.choice(list(train_G.edges))
        weakest = min(neighbour_count(u), neighbour_count(v))
        if weakest > connect_num:
            train_G.remove_edge(u, v)
            held_out.append((u, v))
            labels.append(1)
        else:
            # Only rejected picks consume the retry budget.
            misses += 1
        if misses > m * 100:
            print('split_test_set - had to break', len(held_out), test_size)
            break
    elapsed = time.time() - started
    # Timing is measured but not reported; to show it, uncomment:
    # print("Split test set:", round(elapsed, 4))
    return train_G, held_out, labels

In [None]:
def add_negative_test(G, test_edges, test_size, labels):
    """Append as many negative (non-edge) pairs as there are test edges.

    Node pairs are drawn uniformly from ``[0, n-1]``. A pair is accepted when
    its two ids differ and ``G`` has no edge between them; it is appended to
    ``test_edges`` with label ``0``. Sampling continues until the list has
    doubled in length. The same negative pair may be drawn more than once.

    Rejected draws are counted; when the count exceeds ``m*100`` a message is
    printed and sampling stops early. ``n`` and ``m`` are notebook-level
    globals. ``test_size`` is only used in that message.

    Returns:
        ``(test_edges, labels)``: the same list objects, extended in place.
    """
    started = time.time()
    target_len = 2 * len(test_edges)
    rejected = 0
    while len(test_edges) < target_len:
        a = random.randint(0, n - 1)
        b = random.randint(0, n - 1)
        if a != b and not G.has_edge(a, b):
            test_edges.append((a, b))
            labels.append(0)
        else:
            # Only rejected draws consume the retry budget.
            rejected += 1
        if rejected > m * 100:
            print('negatives - had to break', len(test_edges), test_size)
            break
    elapsed = time.time() - started
    # Timing is measured but not reported; to show it, uncomment:
    # print("Add negatives:", round(elapsed, 4))
    return test_edges, labels

In [None]:
def generate_walks(train_G, nodes, walks_per_node, walk_length):
    """Generate uniform random walks over ``train_G``.

    For every node in ``nodes``, ``walks_per_node`` walks are started at that
    node. Each step moves to a uniformly chosen neighbour of the current node.
    A walk holds at most ``walk_length`` nodes; it ends early if it reaches a
    node without neighbours, so an isolated start node yields a one-element
    walk instead of raising ``IndexError``.

    Args:
        train_G: graph object exposing ``neighbors(node)``; it must not be
            modified while walks are generated, because neighbour lists are
            cached per node.
        nodes: iterable of start nodes.
        walks_per_node: number of walks started from each node.
        walk_length: maximum number of nodes in a walk (start included).

    Returns:
        A list of walks; each walk is a list of node ids converted to ``str``.
    """
    neighbour_cache = {}

    def neighbours_of(node):
        # Build each neighbour list once instead of on every step.
        cached = neighbour_cache.get(node)
        if cached is None:
            cached = neighbour_cache[node] = list(train_G.neighbors(node))
        return cached

    walks = []
    for start in nodes:
        for _ in range(walks_per_node):
            walk = [str(start)]
            current = start
            for _ in range(walk_length - 1):
                options = neighbours_of(current)
                if not options:
                    break  # dead end: nowhere to go from here
                current = random.choice(options)
                walk.append(str(current))
            walks.append(walk)
    return walks

In [None]:
def test_model(walks, dim_order, test_edges, labels):
    """Train a Word2Vec model on the walks and score it on the test pairs.

    The model has ``2**dim_order`` dimensions and uses the notebook-level
    global ``window``. It is trained in 10 rounds of 10 epochs each. After
    every round the value of ``get_latest_training_loss()`` is recorded, and
    the ROC AUC of ``model.wv.similarity`` over ``test_edges`` against
    ``labels`` is computed.

    NOTE(review): ``size=`` is the keyword accepted by gensim 3.x; gensim 4
    renamed it to ``vector_size``. Confirm the installed version.

    Returns:
        ``(losses, aucs)``: two lists with one entry per training round.
    """
    started = time.time()
    embedding_dim = 2 ** dim_order
    model = Word2Vec(
        size=embedding_dim,
        window=window,
        workers=16,
        ns_exponent=-0.5,
        sg=1,
        hs=0,
        negative=5,
    )
    model.build_vocab(walks)

    losses, aucs = [], []
    for _ in range(10):
        model.train(walks, total_examples=len(walks), epochs=10,
                    compute_loss=True)
        losses.append(model.get_latest_training_loss())
        # Node ids were stringified when the walks were built, so look the
        # embeddings up by their string form.
        scores = [model.wv.similarity(str(edge[0]), str(edge[1]))
                  for edge in test_edges]
        aucs.append(roc_auc_score(labels, scores))
    elapsed = time.time() - started
    # Timing is measured but not reported; to show it, uncomment:
    # print("Test model:", round(elapsed, 4))
    return losses, aucs

In [None]:
results = []

# Column labels for the results table; the order mirrors the rows appended
# inside the sweep below.
columns = ["cliques", "density", "nodes per clique", "edges per clique", 
           "connectivity between cliques", "w2v window", "w2v dimensions", 
           "average loss", "min loss", "average auc", "max auc"]

# Random-walk settings shared by every configuration.
walk_length = 80
walks_per_node = 10

# NOTE: the helper functions read npc, connect_num, window, n and m as
# notebook globals, so these names must stay as they are.
for c in range(1, 10, 2): # 5 values
    for connect_num in range(2, 7, 2): # 3 values
        for npc in range(8, 24, 2): # 8 values
            for epc in range(3, 104, 3): # 34 values
                for window in range(1, 11, 3): # 4 values
                    st = time.time()
                    n = c * (npc ** 2)
                    # m is a nominal edge budget derived from the sweep
                    # parameters; it is not measured from the built graph.
                    m = c * (epc ** 3)
                    D = 2 * m / (n * (n-1))
                    test_size = int(m * 0.2)

                    # This check depends only on the sweep parameters, so
                    # reject the configuration before paying for graph
                    # construction and the train/test split.
                    if not (m >= n and D < 1):
                        continue

                    G = nx.Graph()
                    train_G = nx.Graph()

                    nodes = list(range(n))
                    G.add_nodes_from(nodes)
                    train_G.add_nodes_from(nodes)

                    G, train_G = add_edges(G, train_G, c, epc)
                    G, train_G = singleton_nodes(G, train_G, nodes)

                    train_G, test_edges, labels = split_test_set(train_G, connect_num, test_size)
                    test_edges, labels = add_negative_test(G, test_edges, test_size, labels)

                    # Too few test pairs to give a meaningful AUC.
                    if len(test_edges) <= 10:
                        continue

                    walks = generate_walks(train_G, nodes, walks_per_node, walk_length)

                    for dim_order in range(1, 9):
                        losses, aucs = test_model(walks, dim_order, test_edges, labels)
                        results.append([c, D, npc, epc, connect_num, window, 2**dim_order, np.mean(losses[-5:]), min(losses), np.mean(aucs[-5:]), max(aucs)])

                    # Progress report and checkpoint after every evaluated
                    # configuration.
                    run_time = time.time() - st
                    print("Time:", round(run_time, 4))
                    # (number of walks, requested walk length), printed
                    # without materialising the walks as an array.
                    print((len(walks), walk_length))
                    print("Nodes:", n)
                    print("Edges:", m)
                    print("Graph Density:", D)
                    print("Connectivity:", connect_num)
                    print("Cliques:", c)
                    print("Window:", window)
                    print("Test size:", len(test_edges))
                    # Metrics of the last (largest) embedding size tried.
                    print(2**dim_order, np.mean(losses[-5:]), min(losses), np.mean(aucs[-5:]), max(aucs))
                    results_df = pd.DataFrame(results, columns = columns)
                    results_df.to_csv('HyperParameter_Results_2.csv')

In [None]:
# Histogram of the "average auc" column of the results table.
np.histogram(results_df.loc[:, 'average auc'])