In [2]:
!pip install -r requirements.txt

Collecting pyspark
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
[K     |████████████████████████████████| 204.2 MB 31 kB/s s eta 0:00:01    |████                            | 26.0 MB 12.9 MB/s eta 0:00:14     |████▊                           | 30.4 MB 12.9 MB/s eta 0:00:14     |██████▊                         | 43.1 MB 12.4 MB/s eta 0:00:14     |████████████▋                   | 80.2 MB 10.2 MB/s eta 0:00:13     |████████████████████▏           | 129.0 MB 3.0 MB/s eta 0:00:25     |████████████████████▊           | 132.0 MB 3.0 MB/s eta 0:00:24     |████████████████████████████▉   | 184.0 MB 310 kB/s eta 0:01:06     |██████████████████████████████▏ | 192.8 MB 8.0 MB/s eta 0:00:02
[?25hCollecting findspark
  Downloading findspark-1.4.2-py2.py3-none-any.whl (4.2 kB)
Collecting gensim
  Downloading gensim-3.8.3-cp38-cp38-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 13.0 MB/s eta 0:00:01     |████████████████████▌           | 15.5 MB 8.0 MB/s eta 0:0

In [3]:
import random
import os
import numpy as np
import rdflib
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.manifold import TSNE

from rdf2vec.converters import rdflib_to_kg
from rdf2vec.walkers import RandomWalker
from rdf2vec import RDF2VecTransformer

import warnings
warnings.filterwarnings('ignore')

## Load the RDF file (Turtle, N-Triples, or any other format supported by rdflib)

In [4]:
# Path to the RDF graph to embed (parsed further down with filetype='turtle')
rdf_file ='neuro_dkg11.ttl'
# Alternative inputs kept for reference (disabled):
#rdf_file = 'input/covid19-literature-knowledge-graph/sample_kg.nt'
#fileext = '.nq.gz'

# Predicates for the random walker to follow (per the original note).
# NOTE(review): label_predicates is not passed to any call in the cells shown here —
# confirm where it is meant to be used (e.g. as an argument to rdflib_to_kg).
label_predicates = ['<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>']

In [5]:
kg = rdflib_to_kg(rdf_file, filetype='turtle')

100%|██████████| 1420/1420 [00:00<00:00, 37981.47it/s]


In [6]:
# Random-walk strategy. The walks file reported when fitting is named
# "randwalks_n4_depth2_...", so the arguments presumably mean depth=2 and
# 4 walks per entity — TODO confirm against the RandomWalker signature.
random_walker = RandomWalker(2, 4)

# Build the RDF2Vec transformer that turns the extracted walks into embeddings.
# sg=1 presumably selects the skip-gram variant of word2vec (gensim convention) — verify.
transformer = RDF2VecTransformer(walkers=[random_walker], sg=1)


<SparkContext master=local[10] appName=pyspark-shell>


In [7]:
all_entities = kg.get_all_entities()

In [8]:
all_entities[:10]

['http://www.w3id.org/neurodkg/Instances/context9',
 'http://www.w3id.org/drugbank:DB00715',
 '6-16',
 'http://www.w3id.org/neurodkg/Instances/context146',
 'http://www.w3id.org/doid/12129',
 'Multiple Sclerosis',
 'http://www.w3id.org/doid/1824',
 'http://purl.bioontology.org/ontology/OMIM/MTHU004076',
 'http://www.w3id.org/neurodkg/Instances/context143',
 'http://www.w3id.org/neurodkg/Instances/context49']

In [9]:
walk_embeddings = transformer.fit_transform(kg, all_entities)

./walks/randwalks_n4_depth2_pagerank_uniform.txt
Time elapsed to generate features: 00:00:06
Extracted 0 walks for 966 instances!
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform


In [None]:
walk_embeddings[:10]

In [11]:
len(all_entities)

966

In [12]:
len(walk_embeddings)

966

## Generating a dataframe for entity embeddings

In [19]:
# One row per entity: its identifier next to its embedding vector
df = pd.DataFrame(
    list(zip(all_entities, walk_embeddings)),
    columns=['entity', 'embedding'],
)
    

In [20]:
# Helper for normalising entity names:
# rewrites DrugBank IRIs into CURIE format (e.g. DRUGBANK:DB00012); other entities are left unchanged.
def replace_prefix(entity):
    """Convert a DrugBank IRI into CURIE form, leaving other entities untouched.

    Args:
        entity: Entity identifier (an IRI or a literal value) as a string.

    Returns:
        ``'DRUGBANK:<id>'`` when ``entity`` starts with the DrugBank namespace
        ``http://www.w3id.org/drugbank:``; otherwise ``entity`` unchanged.
    """
    drugbank_iri = 'http://www.w3id.org/drugbank:'
    if entity.startswith(drugbank_iri):
        # Slice off only the leading namespace; str.replace would also rewrite
        # any later occurrence of the same text inside the identifier.
        return 'DRUGBANK:' + entity[len(drugbank_iri):]
    return entity

# Normalise the entity column to CURIE-style names where applicable
df['entity'] = df['entity'].apply(replace_prefix)

In [21]:
df.to_json('neurodkg_embedding.json',orient='records')

In [1]:
import pandas as pd
import numpy as np
df =pd.read_json('neurodkg_embedding.json',orient='records')

In [2]:
df.head()

Unnamed: 0,entity,embedding
0,http://www.w3id.org/neurodkg/Instances/context9,"[-0.054165896000000005, -0.0011995625, -0.0233..."
1,DRUGBANK:DB00715,"[-0.0284365881, -0.00028513540000000003, -0.01..."
2,6-16,"[5.2648e-05, 0.0004987813, -0.0005283586, -0.0..."
3,http://www.w3id.org/neurodkg/Instances/context146,"[-0.0484109446, -0.0004909278, -0.0216075275, ..."
4,http://www.w3id.org/doid/12129,"[-0.0305632632, 0.0001149263, -0.0130742388, -..."


In [None]:
np.array(df['embedding'].values)

In [10]:
# Collect the per-entity embedding vectors into a plain list (one entry per row of df)
embedding_mat = df['embedding'].tolist()
# `emb` (the last embedding) is read by a later cell to size the feature columns;
# keep it defined, including when the frame is empty.
emb = embedding_mat[-1] if embedding_mat else []

In [13]:
entities = df.entity.to_list()

### Alternatively, store the embeddings as a CSV with X feature columns (X is the dimension of the embedding)

In [20]:
# Size the feature columns from the embeddings themselves rather than from `emb`,
# a loop variable left over from an earlier cell; an empty matrix yields no feature columns.
n_features = len(embedding_mat[0]) if embedding_mat else 0
df_emb = pd.DataFrame(embedding_mat, columns=[f'feature{i}' for i in range(n_features)])

In [22]:
df_emb['entity'] = entities

In [23]:
df_emb.to_csv('neurodkg_embedding.csv', index=False)

In [24]:
df_emb.head()

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature491,feature492,feature493,feature494,feature495,feature496,feature497,feature498,feature499,entity
0,-0.054166,-0.0012,-0.023321,-0.025693,0.007637,0.028217,-0.059239,-0.111202,0.058704,0.097079,...,-0.023,0.048445,-0.12615,0.008552,0.045284,-0.042427,0.002812,-0.016124,0.061127,http://www.w3id.org/neurodkg/Instances/context9
1,-0.028437,-0.000285,-0.012574,-0.013913,0.005279,0.015798,-0.031356,-0.057418,0.030944,0.049776,...,-0.012453,0.024986,-0.064281,0.004008,0.024315,-0.021258,0.001715,-0.008117,0.032511,DRUGBANK:DB00715
2,5.3e-05,0.000499,-0.000528,-0.000492,-0.000374,0.0009,7.9e-05,-0.00056,-0.000531,-6.2e-05,...,0.000155,-0.00039,0.000237,0.000389,-0.000486,4.5e-05,0.00066,-0.000614,0.00013,6-16
3,-0.048411,-0.000491,-0.021608,-0.021919,0.007285,0.026013,-0.051671,-0.096173,0.051644,0.085442,...,-0.020548,0.04225,-0.110233,0.006751,0.039705,-0.037478,0.003409,-0.013098,0.053585,http://www.w3id.org/neurodkg/Instances/context146
4,-0.030563,0.000115,-0.013074,-0.01316,0.005226,0.015172,-0.032794,-0.060222,0.032836,0.053627,...,-0.012203,0.02562,-0.069523,0.004061,0.024678,-0.023869,0.001071,-0.009343,0.033984,http://www.w3id.org/doid/12129
