In [1]:
import pandas as pd
import networkx as nx
import numpy as np 
import stellargraph as sg
import tensorflow as tf
from matplotlib import pyplot as plt
from gensim.models import Word2Vec

In [85]:
from sklearn.preprocessing import MultiLabelBinarizer, scale, StandardScaler
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, multilabel_confusion_matrix
from skmultilearn.problem_transform import BinaryRelevance

In [3]:
network = pd.read_csv('data/bio-pathways-network.csv')
graph = nx.from_pandas_edgelist(network, 'Gene ID 1', 'Gene ID 2')

In [4]:
G = sg.StellarGraph.from_networkx(graph)

In [5]:
rw = sg.data.BiasedRandomWalk(G)

walks = rw.run(
    nodes=list(G.nodes()),  # root nodes
    length=100,  # maximum length of a random walk
    n=10,  # number of random walks per root node
    p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
    q=2.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
)

In [6]:
str_walks = [[str(n) for n in walk] for walk in walks]
model = Word2Vec(str_walks, size=128, window=5, min_count=0, sg=1, workers=-1, iter=1)

In [7]:
multi_diseases = pd.read_csv('data/all-proteins.csv', index_col=0)
multi_diseases

Unnamed: 0,inherited metabolic disorder,integumentary system disease,urinary system disease,nervous system disease,gastrointestinal system disease,substance-related disorder,immune system disease,musculoskeletal system disease,psoriatic arthritis,cancer,...,chromosomal disease,hypospadias,ciliopathy,developmental disorder of mental health,sleep disorder,bacterial infectious disease,respiratory system disease,polycystic ovary syndrome,reproductive system disease,orofacial cleft
3295,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5189,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5190,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5192,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5193,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81918,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
139378,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18744,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
node_ids = model.wv.index2word 
embedding_set = set(node_ids)
drop_rows = []
for n in multi_diseases.index:
    if str(n) not in embedding_set:
        drop_rows.append(n)
multi_diseases = multi_diseases.drop(drop_rows)

In [9]:
X = (
    model.wv.vectors
)  # numpy.ndarray of size number of nodes times embeddings dimensionality
y = multi_diseases.values

In [11]:
# Use validation and test set splits similar to graphnets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, test_size=None)
print(
    "Array shapes:\n X_train = {}\n y_train = {}\n X_test = {}\n y_test = {}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape
    )
)

Array shapes:
 X_train = (4311, 128)
 y_train = (4311, 30)
 X_test = (17246, 128)
 y_test = (17246, 30)


In [16]:
clf = BinaryRelevance(LogisticRegression())
clf.fit(X_train, y_train)

BinaryRelevance(classifier=LogisticRegression(), require_dense=[True, True])

In [18]:
y_pred = clf.predict(X_test)

In [19]:
multilabel_confusion_matrix(y_test, y_pred)

array([[[17088,     0],
        [  158,     0]],

       [[16980,     0],
        [  266,     0]],

       [[16738,     0],
        [  508,     0]],

       [[16294,     0],
        [  952,     0]],

       [[16915,     0],
        [  331,     0]],

       [[17151,     0],
        [   95,     0]],

       [[16956,     0],
        [  290,     0]],

       [[16851,     0],
        [  395,     0]],

       [[17234,     0],
        [   12,     0]],

       [[15497,     0],
        [ 1749,     0]],

       [[17221,     0],
        [   25,     0]],

       [[17100,     0],
        [  146,     0]],

       [[16592,     0],
        [  654,     0]],

       [[16955,     0],
        [  291,     0]],

       [[17170,     0],
        [   76,     0]],

       [[16825,     0],
        [  421,     0]],

       [[17220,     0],
        [   26,     0]],

       [[17150,     0],
        [   96,     0]],

       [[17161,     0],
        [   85,     0]],

       [[17236,     0],
        [   10,     0]],



In [20]:
print('accuracy: ', accuracy_score(y_test, y_pred))
print('samples: ', f1_score(y_test, y_pred, average='samples'))
print('macro:',f1_score(y_test, y_pred, average='micro'))
print('micro:',f1_score(y_test, y_pred, average='macro'))
print('weighted:',f1_score(y_test, y_pred, average='weighted'))

accuracy:  0.7529282152383161
samples:  0.0
macro: 0.0
micro: 0.0
weighted: 0.0


  _warn_prf(


In [23]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([ 158,  266,  508,  952,  331,   95,  290,  395,   12, 1749,   25,
         146,  654,  291,   76,  421,   26,   96,   85,   10,   81,   13,
          20,  207,   17,   15,  229,  112,  153,   38]))

**Motif embedded results**

In [24]:
motif3_exact = pd.read_csv('motifs location/result3.csv', index_col=0)
motif3_subsampled = pd.read_csv('motifs location/subsampling_3.csv', index_col=1).drop('Unnamed: 0', axis=1)
motif4 = pd.read_csv('motifs location/subsampling_4.csv', index_col=1).drop('Unnamed: 0', axis=1)
motif_orig = pd.read_csv('data/bio-pathways-proteinmotifs.csv', index_col=0)

3-motifs exact

In [41]:
X = np.concatenate((model.wv.vectors, motif3_exact.values), axis=1)

In [42]:
# Use validation and test set splits similar to graphnets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, test_size=None)
print(
    "Array shapes:\n X_train = {}\n y_train = {}\n X_test = {}\n y_test = {}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape
    )
)

Array shapes:
 X_train = (4311, 133)
 y_train = (4311, 30)
 X_test = (17246, 133)
 y_test = (17246, 30)


In [45]:
clf = BinaryRelevance(LogisticRegression())
clf.fit(scale(X_train), y_train)

BinaryRelevance(classifier=LogisticRegression(), require_dense=[True, True])

In [47]:
y_pred = clf.predict(X_test)

In [48]:
multilabel_confusion_matrix(y_test, y_pred)

array([[[ 3825, 13272],
        [   86,    63]],

       [[16985,     0],
        [  261,     0]],

       [[ 1770, 14983],
        [  121,   372]],

       [[ 3076, 13253],
        [  493,   424]],

       [[16931,     6],
        [  309,     0]],

       [[  298, 16853],
        [    8,    87]],

       [[  386, 16592],
        [   25,   243]],

       [[  478, 16382],
        [   49,   337]],

       [[  395, 16839],
        [    0,    12]],

       [[  354, 15182],
        [  184,  1526]],

       [[  774, 16446],
        [    5,    21]],

       [[  299, 16804],
        [    9,   134]],

       [[ 4061, 12564],
        [  393,   228]],

       [[ 1785, 15192],
        [   96,   173]],

       [[  370, 16810],
        [    3,    63]],

       [[ 2369, 14460],
        [  158,   259]],

       [[ 1397, 15828],
        [    7,    14]],

       [[ 1585, 15566],
        [   31,    64]],

       [[  564, 16589],
        [   14,    79]],

       [[17237,     0],
        [    9,     0]],



In [50]:
print('accuracy: ', accuracy_score(y_test, y_pred))
print('samples: ', f1_score(y_test, y_pred, average='samples'))
print('macro:',f1_score(y_test, y_pred, average='micro'))
print('micro:',f1_score(y_test, y_pred, average='macro'))
print('weighted:',f1_score(y_test, y_pred, average='weighted'))

accuracy:  0.0055085237156442075
samples:  0.02211196235307545
macro: 0.022900420730857456


  _warn_prf(


micro: 0.01948433187516241
weighted: 0.06156131427198995


In [51]:
precision_recall_fscore_support(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


(array([0.00472441, 0.        , 0.02422664, 0.03100095, 0.        ,
        0.00513577, 0.01443421, 0.02015671, 0.00071212, 0.09133349,
        0.00127528, 0.00791121, 0.01782364, 0.01125936, 0.00373378,
        0.0175963 , 0.00088373, 0.00409469, 0.00473962, 0.        ,
        0.00361652, 0.00072076, 0.00109217, 0.01038144, 0.00077252,
        0.        , 0.01230953, 0.00577219, 0.00859855, 0.00197498]),
 array([0.42281879, 0.        , 0.75456389, 0.46237732, 0.        ,
        0.91578947, 0.90671642, 0.87305699, 1.        , 0.89239766,
        0.80769231, 0.93706294, 0.36714976, 0.64312268, 0.95454545,
        0.62110312, 0.66666667, 0.67368421, 0.84946237, 0.        ,
        0.72151899, 1.        , 1.        , 0.87939698, 0.92857143,
        0.        , 0.91555556, 0.76271186, 0.90566038, 0.94285714]),
 array([0.00934441, 0.        , 0.04694599, 0.05810607, 0.        ,
        0.01021426, 0.02841607, 0.03940368, 0.00142323, 0.16570746,
        0.00254653, 0.01568995, 0.03399687, 

3-motifs subsampled

In [59]:
X = np.concatenate((model.wv.vectors, motif3_subsampled.values), axis=1)

In [60]:
# Use validation and test set splits similar to graphnets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, test_size=None)
print(
    "Array shapes:\n X_train = {}\n y_train = {}\n X_test = {}\n y_test = {}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape
    )
)

Array shapes:
 X_train = (4311, 131)
 y_train = (4311, 30)
 X_test = (17246, 131)
 y_test = (17246, 30)


In [61]:
clf = BinaryRelevance(LogisticRegression())
clf.fit(scale(X_train), y_train)

BinaryRelevance(classifier=LogisticRegression(), require_dense=[True, True])

In [62]:
y_pred = clf.predict(X_test)

In [63]:
multilabel_confusion_matrix(y_test, y_pred)

array([[[ 3581, 13507],
        [   13,   145]],

       [[ 2447, 14549],
        [   17,   233]],

       [[ 2820, 13925],
        [   11,   490]],

       [[ 1561, 14747],
        [   22,   916]],

       [[ 2574, 14358],
        [   14,   300]],

       [[ 2245, 14919],
        [    3,    79]],

       [[ 2237, 14734],
        [    6,   269]],

       [[ 2035, 14839],
        [   16,   356]],

       [[17233,     0],
        [   13,     0]],

       [[ 1087, 14463],
        [   20,  1676]],

       [[ 3330, 13893],
        [    3,    20]],

       [[ 2929, 14172],
        [    6,   139]],

       [[ 1579, 15031],
        [   10,   626]],

       [[ 2059, 14905],
        [    3,   279]],

       [[ 2711, 14475],
        [    2,    58]],

       [[ 2384, 14450],
        [    8,   404]],

       [[ 5114, 12107],
        [    0,    25]],

       [[ 2393, 14772],
        [    2,    79]],

       [[ 2734, 14420],
        [    8,    84]],

       [[ 5532, 11704],
        [    0,    10]],



In [64]:
print('accuracy: ', accuracy_score(y_test, y_pred))
print('samples: ', f1_score(y_test, y_pred, average='samples'))
print('macro:',f1_score(y_test, y_pred, average='micro'))
print('micro:',f1_score(y_test, y_pred, average='macro'))
print('weighted:',f1_score(y_test, y_pred, average='weighted'))

accuracy:  0.060999652093239015
samples:  0.027656064569995743
macro: 0.03481170214884592


  _warn_prf(


micro: 0.0293706027390281
weighted: 0.08426014641959216


In [65]:
precision_recall_fscore_support(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


(array([0.01062115, 0.01576241, 0.03399237, 0.05848177, 0.02046664,
        0.00526737, 0.01792975, 0.02342876, 0.        , 0.10384782,
        0.0014375 , 0.00971281, 0.03998212, 0.0183746 , 0.00399092,
        0.02719806, 0.00206067, 0.00531951, 0.00579151, 0.00085368,
        0.00617503, 0.00065771, 0.00135227, 0.01147776, 0.00130524,
        0.        , 0.0135866 , 0.0075541 , 0.01020953, 0.00313306]),
 array([0.91772152, 0.932     , 0.97804391, 0.97654584, 0.95541401,
        0.96341463, 0.97818182, 0.95698925, 0.        , 0.98820755,
        0.86956522, 0.95862069, 0.98427673, 0.9893617 , 0.96666667,
        0.98058252, 1.        , 0.97530864, 0.91304348, 1.        ,
        0.98863636, 0.7       , 0.88888889, 0.92146597, 0.82352941,
        0.        , 0.94495413, 0.99107143, 0.98076923, 0.91891892]),
 array([0.02099928, 0.03100053, 0.06570126, 0.1103548 , 0.04007481,
        0.01047745, 0.03521403, 0.04573778, 0.        , 0.18794505,
        0.00287026, 0.01923077, 0.07684282, 

4-motifs

In [66]:
X = np.concatenate((model.wv.vectors, motif4.values), axis=1)

In [67]:
# Use validation and test set splits similar to graphnets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, test_size=None)
print(
    "Array shapes:\n X_train = {}\n y_train = {}\n X_test = {}\n y_test = {}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape
    )
)

Array shapes:
 X_train = (4311, 139)
 y_train = (4311, 30)
 X_test = (17246, 139)
 y_test = (17246, 30)


In [68]:
clf = BinaryRelevance(LogisticRegression())
clf.fit(scale(X_train), y_train)

BinaryRelevance(classifier=LogisticRegression(), require_dense=[True, True])

In [69]:
y_pred = clf.predict(X_test)

In [70]:
multilabel_confusion_matrix(y_test, y_pred)

array([[[ 9799,  7298],
        [   75,    74]],

       [[ 2714, 14284],
        [   16,   232]],

       [[ 1470, 15276],
        [   19,   481]],

       [[ 1069, 15236],
        [   20,   921]],

       [[ 1360, 15567],
        [   21,   298]],

       [[ 2172, 14978],
        [    6,    90]],

       [[ 5661, 11320],
        [   48,   217]],

       [[ 5078, 11777],
        [   94,   297]],

       [[15394,  1840],
        [   10,     2]],

       [[  587, 14971],
        [    8,  1680]],

       [[ 5007, 12212],
        [    4,    23]],

       [[ 4036, 13079],
        [   23,   108]],

       [[  768, 15836],
        [    1,   641]],

       [[  767, 16198],
        [    0,   281]],

       [[ 1747, 15428],
        [    3,    68]],

       [[ 1025, 15787],
        [    3,   431]],

       [[ 2201, 15021],
        [    1,    23]],

       [[ 3029, 14121],
        [    3,    93]],

       [[ 1486, 15665],
        [    4,    91]],

       [[ 2579, 14657],
        [    1,     9]],



In [71]:
print('accuracy: ', accuracy_score(y_test, y_pred))
print('samples: ', f1_score(y_test, y_pred, average='samples'))
print('macro:',f1_score(y_test, y_pred, average='micro'))
print('micro:',f1_score(y_test, y_pred, average='macro'))
print('weighted:',f1_score(y_test, y_pred, average='weighted'))

accuracy:  0.03334106459468862
samples:  0.027897949882251475
macro: 0.03328067149869149


  _warn_prf(


micro: 0.028741676448801504
weighted: 0.08146471207895595


In [72]:
precision_recall_fscore_support(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


(array([0.01003798, 0.01598236, 0.03052612, 0.05700316, 0.01878349,
        0.00597292, 0.01880905, 0.02459831, 0.00108578, 0.10089484,
        0.00187985, 0.00818988, 0.03890271, 0.01705201, 0.00438823,
        0.02657541, 0.00152885, 0.00654285, 0.00577558, 0.00061366,
        0.00545968, 0.00078555, 0.00104418, 0.01258707, 0.00074212,
        0.        , 0.01340699, 0.00695128, 0.0103002 , 0.00330969]),
 array([0.4966443 , 0.93548387, 0.962     , 0.97874601, 0.93416928,
        0.9375    , 0.81886792, 0.75959079, 0.16666667, 0.99526066,
        0.85185185, 0.82442748, 0.99844237, 1.        , 0.95774648,
        0.99308756, 0.95833333, 0.96875   , 0.95789474, 0.9       ,
        1.        , 0.84615385, 1.        , 0.98564593, 0.71428571,
        0.        , 0.97297297, 0.99122807, 0.98064516, 0.82352941]),
 array([0.01967823, 0.0314278 , 0.05917451, 0.1077319 , 0.0368265 ,
        0.01187022, 0.03677343, 0.04765343, 0.0021575 , 0.1832161 ,
        0.00375143, 0.01621865, 0.07488755, 

Original orbitals

In [74]:
drop_rows = []
for n in motif_orig.index:
    if n not in motif4.index:
        drop_rows.append(n)
motif_orig = motif_orig.drop(drop_rows)

In [75]:
X = np.concatenate((model.wv.vectors, motif_orig.values), axis=1)

In [76]:
# Use validation and test set splits similar to graphnets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, test_size=None)
print(
    "Array shapes:\n X_train = {}\n y_train = {}\n X_test = {}\n y_test = {}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape
    )
)

Array shapes:
 X_train = (4311, 201)
 y_train = (4311, 30)
 X_test = (17246, 201)
 y_test = (17246, 30)


In [87]:
clf = BinaryRelevance(LogisticRegression(max_iter=1000))
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
clf.fit(X_scaled, y_train)

BinaryRelevance(classifier=LogisticRegression(max_iter=1000),
                require_dense=[True, True])

In [88]:
y_pred = clf.predict(X_test)

In [89]:
multilabel_confusion_matrix(y_test, y_pred)

array([[[17095,     0],
        [  151,     0]],

       [[10157,  6825],
        [  139,   125]],

       [[16738,     0],
        [  508,     0]],

       [[ 3615, 12693],
        [  211,   727]],

       [[ 9116,  7808],
        [  165,   157]],

       [[17111,    38],
        [   96,     1]],

       [[ 7168,  9811],
        [   96,   171]],

       [[ 4406, 12464],
        [   92,   284]],

       [[   69, 17165],
        [    0,    12]],

       [[15189,   324],
        [ 1693,    40]],

       [[14302,  2919],
        [   22,     3]],

       [[  190, 16915],
        [    2,   139]],

       [[ 4862, 11749],
        [  195,   440]],

       [[  307, 16661],
        [    5,   273]],

       [[ 8615,  8559],
        [   39,    33]],

       [[15328,  1506],
        [  370,    42]],

       [[ 8983,  8240],
        [   14,     9]],

       [[16606,   545],
        [   93,     2]],

       [[ 4077, 13083],
        [   19,    67]],

       [[15130,  2109],
        [    5,     2]],



In [90]:
print('accuracy: ', accuracy_score(y_test, y_pred))
print('samples: ', f1_score(y_test, y_pred, average='samples'))
print('macro:',f1_score(y_test, y_pred, average='micro'))
print('micro:',f1_score(y_test, y_pred, average='macro'))
print('weighted:',f1_score(y_test, y_pred, average='weighted'))

accuracy:  0.0012176736634581932
samples:  0.020696051030747824
macro: 0.02484389254308316
micro: 0.019702732689223897
weighted: 0.04092098370997712


  _warn_prf(


In [91]:
precision_recall_fscore_support(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


(array([0.        , 0.01798561, 0.        , 0.05417288, 0.01971124,
        0.02564103, 0.01713084, 0.022278  , 0.00069861, 0.10989011,
        0.00102669, 0.00815058, 0.03609812, 0.01612141, 0.00384078,
        0.02713178, 0.00109104, 0.00365631, 0.00509506, 0.00094742,
        0.00313152, 0.00069796, 0.0006407 , 0.01156942, 0.0008673 ,
        0.00067889, 0.01470588, 0.00639955, 0.00856414, 0.00218659]),
 array([0.        , 0.47348485, 0.        , 0.7750533 , 0.48757764,
        0.01030928, 0.64044944, 0.75531915, 1.        , 0.02308136,
        0.12      , 0.9858156 , 0.69291339, 0.98201439, 0.45833333,
        0.10194175, 0.39130435, 0.02105263, 0.77906977, 0.28571429,
        0.13333333, 1.        , 0.27777778, 0.90640394, 0.23076923,
        0.23076923, 0.40654206, 0.41071429, 0.57668712, 0.29032258]),
 array([0.        , 0.03465484, 0.        , 0.10126759, 0.03789067,
        0.01470588, 0.03336911, 0.04327949, 0.00139624, 0.03814974,
        0.00203597, 0.01616749, 0.06862133, 