In [1]:
import random
import os
import numpy as np
import rdflib
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.manifold import TSNE

from rdf2vec.converters import rdflib_to_kg
from rdf2vec.walkers import RandomWalker
from rdf2vec import RDF2VecTransformer

import warnings
warnings.filterwarnings('ignore')

In [2]:
rdf_file ='sample/human_interactome.ttl'
#rdf_file = 'input/covid19-literature-knowledge-graph/sample_kg.nt'
#fileext = '.nq.gz'
label_predicates = ['<http://bio2rdf.org/irefindex_vocabulary:source>',
                   '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>']

In [3]:
kg = rdflib_to_kg(rdf_file, filetype='turtle', label_predicates=label_predicates)

<http://bio2rdf.org/irefindex_vocabulary:source> does not look like a valid URI, trying to serialize this will break.
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> does not look like a valid URI, trying to serialize this will break.
100%|██████████| 279723/279723 [00:12<00:00, 22036.59it/s]


In [4]:
# We'll all possible walks of depth 2
random_walker = RandomWalker(2, 4)

# Create embeddings with random walks
transformer = RDF2VecTransformer(walkers=[random_walker], sg=1)


<SparkContext master=local[10] appName=pyspark-shell>


In [5]:
all_entities = kg.get_all_entities()

In [6]:
all_entities[:10]

['http://bio2rdf.org/ncbigene:11183',
 'http://bio2rdf.org/ncbigene:54471',
 'http://bio2rdf.org/ncbigene:9968',
 'http://bio2rdf.org/ncbigene:117166',
 'http://bio2rdf.org/ncbigene:91694',
 'http://bio2rdf.org/ncbigene:66002',
 'http://bio2rdf.org/ncbigene:100505385',
 'http://bio2rdf.org/ncbigene:3028',
 'http://bio2rdf.org/ncbigene:2888',
 'http://bio2rdf.org/ncbigene:5908']

In [7]:
walk_embeddings = transformer.fit_transform(kg, all_entities)

./walks/randwalks_n4_depth2_pagerank_uniform.txt
Time elapsed to generate features: 00:00:11
Extracted 0 walks for 13461 instances!
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform


In [8]:
walk_embeddings[:10]

[array([ 5.69770932e-02,  1.02802336e-01, -5.10475338e-02,  6.29858375e-02,
         8.49409625e-02, -9.82846413e-03, -6.73464835e-02,  9.54058915e-02,
         1.67154714e-01, -4.18573357e-02, -2.58951150e-02,  9.54599679e-02,
        -1.39596788e-02,  4.98061590e-02,  2.35022642e-02, -6.63277805e-02,
         1.80128478e-02,  1.11006081e-01, -1.09437190e-01, -9.69468243e-03,
        -4.84287851e-02, -6.49395958e-02,  5.31608704e-03,  7.90989920e-02,
        -7.64245614e-02,  7.73394257e-02,  4.30092774e-02, -1.42280728e-01,
         4.39886712e-02, -8.94944891e-02, -5.11563011e-02,  1.21121733e-02,
         1.48105472e-02, -3.70060727e-02, -7.13459477e-02, -4.95074466e-02,
        -7.71612003e-02, -5.61206751e-02,  5.49863987e-02, -1.14068627e-01,
        -4.86306362e-02,  1.23226000e-02, -4.11522463e-02,  9.71646607e-03,
         4.38364036e-03, -1.25594452e-01, -1.60344183e-01,  1.03080915e-02,
        -7.05543607e-02,  1.46551982e-01, -2.14029457e-02, -4.06321026e-02,
         4.8