In [2]:
import itertools
import json
import os
from pathlib import Path

import networkx as nx
import numpy as np
import stellargraph as sg
from networkx.readwrite import json_graph
from node2vec import Node2Vec
from stellargraph.data import EdgeSplitter

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier 
#from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, StackingClassifier, VotingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score
from sklearn.metrics import precision_score, recall_score,f1_score

In [3]:
from sklearn.naive_bayes import GaussianNB 
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier


In [4]:
# Load the PPI graph from its node-link JSON dump. The context manager closes
# the file as soon as parsing is done instead of leaking the handle.
with open("ppi-G.json") as graph_file:
    G = json_graph.node_link_graph(json.load(graph_file))

# Dump the edges as a plain-text edge list: one "source target" pair per line,
# with no trailing newline after the last pair.
edges = list(G.edges())
ppi_edge_file = "ppi_edge_list.txt"
with open(ppi_edge_file, 'w') as fp:
    fp.write('\n'.join('{} {}'.format(x[0],x[1]) for x in edges))

In [5]:
# Build the dense label matrix `target`: row i holds the label vector stored
# under key str(i) in the class map, so rows are ordered by integer node id.
with open("ppi-class_map.json") as class_map_file:
    class_map = json.load(class_map_file)
n = len(class_map.keys())
m = len(class_map['0'])
# Build all rows at once instead of filling a zero matrix one row at a time.
target = np.array([class_map[str(i)] for i in range(n)], dtype=float)

In [6]:
# Per-node labels keyed by integer id. Despite the `edge_` prefix, the keys are
# looked up with the node ids returned by G.nodes() below.
with open("ppi-class_map.json") as labels_file:
    edge_labels_internal = {int(node_id): labels for node_id, labels in json.load(labels_file).items()}

# Labels ordered like G.nodes(), one row per node.
train_ids = list(G.nodes())
train_labels = np.array([edge_labels_internal[node_id] for node_id in train_ids])
# Promote a flat label vector to a single-column matrix.
if train_labels.ndim == 1:
    train_labels = np.expand_dims(train_labels, 1)

In [7]:
# NOTE(review): not referenced by any other cell visible in this notebook —
# confirm whether it is still needed.
split_size = 2

In [None]:
# Scratch directory (the sweep cells pass "tmp/" to Node2Vec as temp_folder).
# exist_ok=True makes this idempotent and avoids the check-then-create race of
# testing os.path.exists() first.
directory = "tmp"
os.makedirs(directory, exist_ok=True)

In [9]:
# Sweep over the embedding dimension; the other node2vec settings hold one value each.
EMBEDDING_DIMS = [64, 128]
WALK_LENGTHS = [40]
NUM_WALKS = [10]
WORKERS = [4]
Ps = [1]
Qs = [1]
# Fixed seed so the fold assignment and the forests are reproducible across runs.
SEED = 1

# Feature rows follow G.nodes() iteration order, while row i of `target` belongs
# to node id i, so both are indexed by node id to keep samples and labels aligned.
node_ids = list(G.nodes())
num_nodes = len(node_ids)

for EMBEDDING_DIM, WALK_LENGTH, NUM_WALK, WORKER, P, Q in itertools.product(
        EMBEDDING_DIMS, WALK_LENGTHS, NUM_WALKS, WORKERS, Ps, Qs):
    filename = "ppi_node2vec_full_embeddings_"+str(EMBEDDING_DIM)+"_"+str(WALK_LENGTH)+"_"+str(NUM_WALK)+"_"+str(P)+"_"+str(Q)+".emb"
    print(filename)
    # Embeddings are cached on disk; node2vec is only trained when the file is missing.
    if not Path(filename).is_file():
        node2vec = Node2Vec(G, dimensions=EMBEDDING_DIM, walk_length=WALK_LENGTH, num_walks=NUM_WALK, workers=WORKER, p = P, q = Q, temp_folder="tmp/")
        model = node2vec.fit(window=10, min_count=1, batch_words=4)
        model.wv.save_word2vec_format(filename)

    # The first line of the saved file is skipped; every remaining row is read
    # as a node id followed by that node's vector.
    data_emb = np.loadtxt(filename,skiprows=1)
    emb_dim = len(data_emb[0])-1
    embedding = np.zeros((num_nodes,emb_dim))
    embedding[data_emb[:, 0].astype(int)] = data_emb[:, 1:]
    X = embedding[node_ids]
    y = target[node_ids]

    # Seeded shuffle before the (unshuffled) K-fold split.
    s = np.random.RandomState(SEED).permutation(X.shape[0])
    X2 = X[s]
    y2 = y[s]
    roc = []
    prec = []
    rec = []
    f1 = []
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X2):
        X_train2, X_test2 = X2[train_index], X2[test_index]
        y_train2, y_test2 = y2[train_index], y2[test_index]
        forest = RandomForestClassifier(random_state=SEED,verbose=1,n_estimators=10)
        clf = MultiOutputClassifier(forest, n_jobs=-1)
        clf.fit(X_train2,y_train2)

        pred = clf.predict(X_test2)

        # ROC AUC is a ranking metric and needs continuous scores; feeding it the
        # hard 0/1 predictions collapses the curve to a single operating point.
        # Use the positive-class probability of every per-label forest instead.
        label_scores = []
        for estimator, proba in zip(clf.estimators_, clf.predict_proba(X_test2)):
            classes = list(estimator.classes_)
            if 1 in classes:
                label_scores.append(proba[:, classes.index(1)])
            else:
                # Label never positive in this training fold: score it as 0.
                label_scores.append(np.zeros(X_test2.shape[0]))
        scores = np.column_stack(label_scores)

        roc.append(roc_auc_score(y_test2,scores, average='micro'))
        prec.append(precision_score(y_test2,pred, average='micro'))
        rec.append(recall_score(y_test2,pred, average='micro'))
        f1.append(f1_score(y_test2,pred, average='micro'))

    # Append one CSV row per configuration: settings, then fold-averaged metrics.
    result = str(EMBEDDING_DIM)+","+str(WALK_LENGTH)+","+str(NUM_WALK)+","+str(P)+","+str(Q)+","+str(np.mean(roc))+","+str(np.mean(prec))+","+str(np.mean(rec))+","+str(np.mean(f1))+"\n"
    with open("result_multiclass_ppi_node2vec.txt","a+") as f:
        f.write(result)
                        
                        

ppi_node2vec_full_embeddings_64_40_10_1_1.emb


HBox(children=(IntProgress(value=0, description='Computing transition probabilities', max=56944, style=Progres…

Generating walks (CPU: 1):   0%|          | 0/3 [00:00<?, ?it/s]





Generating walks (CPU: 2):   0%|          | 0/3 [00:00<?, ?it/s][A

Generating walks (CPU: 3):   0%|          | 0/2 [00:00<?, ?it/s][A[A


Generating walks (CPU: 4):   0%|          | 0/2 [00:00<?, ?it/s][A[A[A


Generating walks (CPU: 4): 100%|██████████| 2/2 [28:32<00:00, 856.26s/it][A[A[A
Generating walks (CPU: 1):  67%|██████▋   | 2/3 [28:39<14:19, 859.75s/it][A

Generating walks (CPU: 4): 100%|██████████| 2/2 [52:32<00:00, 1576.10s/it][A[A
Generating walks (CPU: 3): 100%|██████████| 2/2 [52:35<00:00, 1577.74s/it]
Generating walks (CPU: 1): 100%|██████████| 3/3 [52:38<00:00, 1033.42s/it]
Generating walks (CPU: 2): 100%|██████████| 3/3 [52:37<00:00, 1033.20s/it][A
Generating walks (CPU: 1): 100%|██████████| 3/3 [1:00:13<00:00, 1204.39s/it]
Generating walks (CPU: 2): 100%|██████████| 3/3 [1:00:15<00:00, 1205.31s/it]


In [8]:
# Sweep over the walk length; the other node2vec settings hold one value each.
EMBEDDING_DIMS = [64]
WALK_LENGTHS = [10,100]
NUM_WALKS = [10]
WORKERS = [4]
Ps = [1]
Qs = [1]
# Fixed seed so the fold assignment and the forests are reproducible across runs.
SEED = 1

# Feature rows follow G.nodes() iteration order, while row i of `target` belongs
# to node id i, so both are indexed by node id to keep samples and labels aligned.
node_ids = list(G.nodes())
num_nodes = len(node_ids)

for EMBEDDING_DIM, WALK_LENGTH, NUM_WALK, WORKER, P, Q in itertools.product(
        EMBEDDING_DIMS, WALK_LENGTHS, NUM_WALKS, WORKERS, Ps, Qs):
    filename = "ppi_node2vec_full_embeddings_"+str(EMBEDDING_DIM)+"_"+str(WALK_LENGTH)+"_"+str(NUM_WALK)+"_"+str(P)+"_"+str(Q)+".emb"
    print(filename)
    # Embeddings are cached on disk; node2vec is only trained when the file is missing.
    if not Path(filename).is_file():
        node2vec = Node2Vec(G, dimensions=EMBEDDING_DIM, walk_length=WALK_LENGTH, num_walks=NUM_WALK, workers=WORKER, p = P, q = Q, temp_folder="tmp/")
        model = node2vec.fit(window=10, min_count=1, batch_words=4)
        model.wv.save_word2vec_format(filename)

    # The first line of the saved file is skipped; every remaining row is read
    # as a node id followed by that node's vector.
    data_emb = np.loadtxt(filename,skiprows=1)
    emb_dim = len(data_emb[0])-1
    embedding = np.zeros((num_nodes,emb_dim))
    embedding[data_emb[:, 0].astype(int)] = data_emb[:, 1:]
    X = embedding[node_ids]
    y = target[node_ids]

    # Seeded shuffle before the (unshuffled) K-fold split.
    s = np.random.RandomState(SEED).permutation(X.shape[0])
    X2 = X[s]
    y2 = y[s]
    roc = []
    prec = []
    rec = []
    f1 = []
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X2):
        X_train2, X_test2 = X2[train_index], X2[test_index]
        y_train2, y_test2 = y2[train_index], y2[test_index]
        forest = RandomForestClassifier(random_state=SEED,verbose=1,n_estimators=10)
        clf = MultiOutputClassifier(forest, n_jobs=-1)
        clf.fit(X_train2,y_train2)

        pred = clf.predict(X_test2)

        # ROC AUC is a ranking metric and needs continuous scores; feeding it the
        # hard 0/1 predictions collapses the curve to a single operating point.
        # Use the positive-class probability of every per-label forest instead.
        label_scores = []
        for estimator, proba in zip(clf.estimators_, clf.predict_proba(X_test2)):
            classes = list(estimator.classes_)
            if 1 in classes:
                label_scores.append(proba[:, classes.index(1)])
            else:
                # Label never positive in this training fold: score it as 0.
                label_scores.append(np.zeros(X_test2.shape[0]))
        scores = np.column_stack(label_scores)

        roc.append(roc_auc_score(y_test2,scores, average='micro'))
        prec.append(precision_score(y_test2,pred, average='micro'))
        rec.append(recall_score(y_test2,pred, average='micro'))
        f1.append(f1_score(y_test2,pred, average='micro'))

    # Append one CSV row per configuration: settings, then fold-averaged metrics.
    result = str(EMBEDDING_DIM)+","+str(WALK_LENGTH)+","+str(NUM_WALK)+","+str(P)+","+str(Q)+","+str(np.mean(roc))+","+str(np.mean(prec))+","+str(np.mean(rec))+","+str(np.mean(f1))+"\n"
    with open("result_multiclass_ppi_node2vec.txt","a+") as f:
        f.write(result)
                        
                        

ppi_node2vec_full_embeddings_64_100_10_1_1.emb


HBox(children=(IntProgress(value=0, description='Computing transition probabilities', max=56944, style=Progres…

Generating walks (CPU: 1):   0%|          | 0/3 [00:00<?, ?it/s]
Generating walks (CPU: 2):   0%|          | 0/3 [00:00<?, ?it/s][A

Generating walks (CPU: 3):   0%|          | 0/2 [00:00<?, ?it/s][A[A







Generating walks (CPU: 4):   0%|          | 0/2 [00:00<?, ?it/s][A[A[A


Generating walks (CPU: 4): 100%|██████████| 2/2 [21:34<00:00, 647.09s/it][A[A[A

Generating walks (CPU: 1):  67%|██████▋   | 2/3 [22:19<11:09, 669.73s/it][A[A
Generating walks (CPU: 4): 100%|██████████| 2/2 [42:05<00:00, 1262.80s/it][A
Generating walks (CPU: 3): 100%|██████████| 2/2 [42:25<00:00, 1272.84s/it]
Generating walks (CPU: 1): 100%|██████████| 3/3 [42:59<00:00, 840.69s/it]
Generating walks (CPU: 2): 100%|██████████| 3/3 [53:58<00:00, 1079.42s/it][A
Generating walks (CPU: 1): 100%|██████████| 3/3 [54:02<00:00, 1080.87s/it]


In [None]:
# Sweep over the number of walks; the other node2vec settings hold one value each.
EMBEDDING_DIMS = [64]
WALK_LENGTHS = [40]
NUM_WALKS = [50,100]
WORKERS = [4]
Ps = [1]
Qs = [1]
# Fixed seed so the fold assignment and the forests are reproducible across runs.
SEED = 1

# Feature rows follow G.nodes() iteration order, while row i of `target` belongs
# to node id i, so both are indexed by node id to keep samples and labels aligned.
node_ids = list(G.nodes())
num_nodes = len(node_ids)

for EMBEDDING_DIM, WALK_LENGTH, NUM_WALK, WORKER, P, Q in itertools.product(
        EMBEDDING_DIMS, WALK_LENGTHS, NUM_WALKS, WORKERS, Ps, Qs):
    filename = "ppi_node2vec_full_embeddings_"+str(EMBEDDING_DIM)+"_"+str(WALK_LENGTH)+"_"+str(NUM_WALK)+"_"+str(P)+"_"+str(Q)+".emb"
    print(filename)
    # Embeddings are cached on disk; node2vec is only trained when the file is missing.
    if not Path(filename).is_file():
        node2vec = Node2Vec(G, dimensions=EMBEDDING_DIM, walk_length=WALK_LENGTH, num_walks=NUM_WALK, workers=WORKER, p = P, q = Q, temp_folder="tmp/")
        model = node2vec.fit(window=10, min_count=1, batch_words=4)
        model.wv.save_word2vec_format(filename)

    # The first line of the saved file is skipped; every remaining row is read
    # as a node id followed by that node's vector.
    data_emb = np.loadtxt(filename,skiprows=1)
    emb_dim = len(data_emb[0])-1
    embedding = np.zeros((num_nodes,emb_dim))
    embedding[data_emb[:, 0].astype(int)] = data_emb[:, 1:]
    X = embedding[node_ids]
    y = target[node_ids]

    # Seeded shuffle before the (unshuffled) K-fold split.
    s = np.random.RandomState(SEED).permutation(X.shape[0])
    X2 = X[s]
    y2 = y[s]
    roc = []
    prec = []
    rec = []
    f1 = []
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X2):
        X_train2, X_test2 = X2[train_index], X2[test_index]
        y_train2, y_test2 = y2[train_index], y2[test_index]
        forest = RandomForestClassifier(random_state=SEED,verbose=1,n_estimators=10)
        clf = MultiOutputClassifier(forest, n_jobs=-1)
        clf.fit(X_train2,y_train2)

        pred = clf.predict(X_test2)

        # ROC AUC is a ranking metric and needs continuous scores; feeding it the
        # hard 0/1 predictions collapses the curve to a single operating point.
        # Use the positive-class probability of every per-label forest instead.
        label_scores = []
        for estimator, proba in zip(clf.estimators_, clf.predict_proba(X_test2)):
            classes = list(estimator.classes_)
            if 1 in classes:
                label_scores.append(proba[:, classes.index(1)])
            else:
                # Label never positive in this training fold: score it as 0.
                label_scores.append(np.zeros(X_test2.shape[0]))
        scores = np.column_stack(label_scores)

        roc.append(roc_auc_score(y_test2,scores, average='micro'))
        prec.append(precision_score(y_test2,pred, average='micro'))
        rec.append(recall_score(y_test2,pred, average='micro'))
        f1.append(f1_score(y_test2,pred, average='micro'))

    # Append one CSV row per configuration: settings, then fold-averaged metrics.
    result = str(EMBEDDING_DIM)+","+str(WALK_LENGTH)+","+str(NUM_WALK)+","+str(P)+","+str(Q)+","+str(np.mean(roc))+","+str(np.mean(prec))+","+str(np.mean(rec))+","+str(np.mean(f1))+"\n"
    with open("result_multiclass_ppi_node2vec.txt","a+") as f:
        f.write(result)
                        
                        

ppi_node2vec_full_embeddings_64_40_50_1_1.emb


HBox(children=(IntProgress(value=0, description='Computing transition probabilities', max=56944, style=Progres…

Generating walks (CPU: 1):   0%|          | 0/13 [00:00<?, ?it/s]





Generating walks (CPU: 2):   0%|          | 0/13 [00:00<?, ?it/s][A

Generating walks (CPU: 3):   0%|          | 0/12 [00:00<?, ?it/s][A[A


Generating walks (CPU: 4):   0%|          | 0/12 [00:00<?, ?it/s][A[A[A
Generating walks (CPU: 2):  15%|█▌        | 2/13 [11:52<1:05:15, 356.00s/it][A

Generating walks (CPU: 3):  17%|█▋        | 2/12 [11:53<59:27, 356.74s/it][A[A


Generating walks (CPU: 1):  15%|█▌        | 2/13 [11:57<1:05:43, 358.53s/it]A[A[A
Generating walks (CPU: 2):  23%|██▎       | 3/13 [21:41<1:11:00, 426.05s/it][A

Generating walks (CPU: 3):  25%|██▌       | 3/12 [21:44<1:04:03, 427.06s/it][A[A


Generating walks (CPU: 1):  23%|██▎       | 3/13 [21:50<1:11:31, 429.11s/it][A[A[A
Generating walks (CPU: 2):  31%|███       | 4/13 [33:43<1:17:12, 514.68s/it][A

Generating walks (CPU: 3):  33%|███▎      | 4/12 [33:52<1:08:57, 517.24s/it][A[A


Generating walks (CPU: 1):  31%|███       | 4/13 [34:14<1:18:30, 523.36s/it][A[A[A
Generating walks (CPU: 2):  

In [10]:
# Sweep over the node2vec `p` parameter; the other settings hold one value each.
# NOTE(review): the saved output beneath this cell lists "..._64_10_40_..." file
# names, which these settings (walk length 40, 10 walks) cannot produce — the
# output looks stale; re-run to confirm.
EMBEDDING_DIMS = [64]
WALK_LENGTHS = [40]
NUM_WALKS = [10]
WORKERS = [4]
Ps = [0.2, 2]
Qs = [1]
# Fixed seed so the fold assignment and the forests are reproducible across runs.
SEED = 1

# Feature rows follow G.nodes() iteration order, while row i of `target` belongs
# to node id i, so both are indexed by node id to keep samples and labels aligned.
node_ids = list(G.nodes())
num_nodes = len(node_ids)

for EMBEDDING_DIM, WALK_LENGTH, NUM_WALK, WORKER, P, Q in itertools.product(
        EMBEDDING_DIMS, WALK_LENGTHS, NUM_WALKS, WORKERS, Ps, Qs):
    filename = "ppi_node2vec_full_embeddings_"+str(EMBEDDING_DIM)+"_"+str(WALK_LENGTH)+"_"+str(NUM_WALK)+"_"+str(P)+"_"+str(Q)+".emb"
    print(filename)
    # Embeddings are cached on disk; node2vec is only trained when the file is missing.
    if not Path(filename).is_file():
        node2vec = Node2Vec(G, dimensions=EMBEDDING_DIM, walk_length=WALK_LENGTH, num_walks=NUM_WALK, workers=WORKER, p = P, q = Q, temp_folder="tmp/")
        model = node2vec.fit(window=10, min_count=1, batch_words=4)
        model.wv.save_word2vec_format(filename)

    # The first line of the saved file is skipped; every remaining row is read
    # as a node id followed by that node's vector.
    data_emb = np.loadtxt(filename,skiprows=1)
    emb_dim = len(data_emb[0])-1
    embedding = np.zeros((num_nodes,emb_dim))
    embedding[data_emb[:, 0].astype(int)] = data_emb[:, 1:]
    X = embedding[node_ids]
    y = target[node_ids]

    # Seeded shuffle before the (unshuffled) K-fold split.
    s = np.random.RandomState(SEED).permutation(X.shape[0])
    X2 = X[s]
    y2 = y[s]
    roc = []
    prec = []
    rec = []
    f1 = []
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X2):
        X_train2, X_test2 = X2[train_index], X2[test_index]
        y_train2, y_test2 = y2[train_index], y2[test_index]
        forest = RandomForestClassifier(random_state=SEED,verbose=1,n_estimators=10)
        clf = MultiOutputClassifier(forest, n_jobs=-1)
        clf.fit(X_train2,y_train2)

        pred = clf.predict(X_test2)

        # ROC AUC is a ranking metric and needs continuous scores; feeding it the
        # hard 0/1 predictions collapses the curve to a single operating point.
        # Use the positive-class probability of every per-label forest instead.
        label_scores = []
        for estimator, proba in zip(clf.estimators_, clf.predict_proba(X_test2)):
            classes = list(estimator.classes_)
            if 1 in classes:
                label_scores.append(proba[:, classes.index(1)])
            else:
                # Label never positive in this training fold: score it as 0.
                label_scores.append(np.zeros(X_test2.shape[0]))
        scores = np.column_stack(label_scores)

        roc.append(roc_auc_score(y_test2,scores, average='micro'))
        prec.append(precision_score(y_test2,pred, average='micro'))
        rec.append(recall_score(y_test2,pred, average='micro'))
        f1.append(f1_score(y_test2,pred, average='micro'))

    # Append one CSV row per configuration: settings, then fold-averaged metrics.
    result = str(EMBEDDING_DIM)+","+str(WALK_LENGTH)+","+str(NUM_WALK)+","+str(P)+","+str(Q)+","+str(np.mean(roc))+","+str(np.mean(prec))+","+str(np.mean(rec))+","+str(np.mean(f1))+"\n"
    with open("result_multiclass_ppi_node2vec.txt","a+") as f:
        f.write(result)
                        
                        
                        

ppi_node2vec_full_embeddings_64_10_40_0.2_1.emb
ppi_node2vec_full_embeddings_64_10_40_1_1.emb
ppi_node2vec_full_embeddings_64_10_40_2_1.emb


In [11]:
# Sweep over the node2vec `q` parameter; the other settings hold one value each.
# NOTE(review): walk length / number of walks are 10 / 40 here, whereas the other
# sweeps in this notebook use 40 / 10 — confirm the swap is intended.
EMBEDDING_DIMS = [64]
WALK_LENGTHS = [10]
NUM_WALKS = [40]
WORKERS = [4]
Ps = [1]
Qs = [0.2, 2]
# Fixed seed so the fold assignment and the forests are reproducible across runs.
SEED = 1

# Feature rows follow G.nodes() iteration order, while row i of `target` belongs
# to node id i, so both are indexed by node id to keep samples and labels aligned.
node_ids = list(G.nodes())
num_nodes = len(node_ids)

for EMBEDDING_DIM, WALK_LENGTH, NUM_WALK, WORKER, P, Q in itertools.product(
        EMBEDDING_DIMS, WALK_LENGTHS, NUM_WALKS, WORKERS, Ps, Qs):
    filename = "ppi_node2vec_full_embeddings_"+str(EMBEDDING_DIM)+"_"+str(WALK_LENGTH)+"_"+str(NUM_WALK)+"_"+str(P)+"_"+str(Q)+".emb"
    print(filename)
    # Embeddings are cached on disk; node2vec is only trained when the file is missing.
    if not Path(filename).is_file():
        node2vec = Node2Vec(G, dimensions=EMBEDDING_DIM, walk_length=WALK_LENGTH, num_walks=NUM_WALK, workers=WORKER, p = P, q = Q, temp_folder="tmp/")
        model = node2vec.fit(window=10, min_count=1, batch_words=4)
        model.wv.save_word2vec_format(filename)

    # The first line of the saved file is skipped; every remaining row is read
    # as a node id followed by that node's vector.
    data_emb = np.loadtxt(filename,skiprows=1)
    emb_dim = len(data_emb[0])-1
    embedding = np.zeros((num_nodes,emb_dim))
    embedding[data_emb[:, 0].astype(int)] = data_emb[:, 1:]
    X = embedding[node_ids]
    y = target[node_ids]

    # Seeded shuffle before the (unshuffled) K-fold split.
    s = np.random.RandomState(SEED).permutation(X.shape[0])
    X2 = X[s]
    y2 = y[s]
    roc = []
    prec = []
    rec = []
    f1 = []
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X2):
        X_train2, X_test2 = X2[train_index], X2[test_index]
        y_train2, y_test2 = y2[train_index], y2[test_index]
        forest = RandomForestClassifier(random_state=SEED,verbose=1,n_estimators=10)
        clf = MultiOutputClassifier(forest, n_jobs=-1)
        clf.fit(X_train2,y_train2)

        pred = clf.predict(X_test2)

        # ROC AUC is a ranking metric and needs continuous scores; feeding it the
        # hard 0/1 predictions collapses the curve to a single operating point.
        # Use the positive-class probability of every per-label forest instead.
        label_scores = []
        for estimator, proba in zip(clf.estimators_, clf.predict_proba(X_test2)):
            classes = list(estimator.classes_)
            if 1 in classes:
                label_scores.append(proba[:, classes.index(1)])
            else:
                # Label never positive in this training fold: score it as 0.
                label_scores.append(np.zeros(X_test2.shape[0]))
        scores = np.column_stack(label_scores)

        roc.append(roc_auc_score(y_test2,scores, average='micro'))
        prec.append(precision_score(y_test2,pred, average='micro'))
        rec.append(recall_score(y_test2,pred, average='micro'))
        f1.append(f1_score(y_test2,pred, average='micro'))

    # Append one CSV row per configuration: settings, then fold-averaged metrics.
    result = str(EMBEDDING_DIM)+","+str(WALK_LENGTH)+","+str(NUM_WALK)+","+str(P)+","+str(Q)+","+str(np.mean(roc))+","+str(np.mean(prec))+","+str(np.mean(rec))+","+str(np.mean(f1))+"\n"
    with open("result_multiclass_ppi_node2vec.txt","a+") as f:
        f.write(result)
                        
                        
                        

ppi_node2vec_full_embeddings_64_10_40_1_0.2.emb
ppi_node2vec_full_embeddings_64_10_40_1_1.emb
ppi_node2vec_full_embeddings_64_10_40_1_2.emb
