In [None]:
import os
import findspark

# Initialise findspark; when a virtualenv is active, hand it that
# environment's interpreter path.
if 'VIRTUAL_ENV' in os.environ:
    findspark.init(python_path=os.environ['VIRTUAL_ENV']+'/bin/python')
else:
    findspark.init()

# pyspark is imported only after findspark.init() has run.
import pyspark
# Spark context with the 'local[*]' master, and the SQL context built on it.
sc = pyspark.SparkContext('local[*]')
sqlContext = pyspark.sql.SQLContext(sc)

In [None]:
import csv
import math
import random
import re
from collections import Counter, defaultdict
from itertools import izip
from operator import add, itemgetter

import ujson as json

In [None]:
from datetime import datetime
from time import time

import urllib
# Scheme prefix prepended to bucket paths when reading from S3 via Spark.
s3baseuri = "s3a://"

def zip_sum(*x):
  """Element-wise sum of the given sequences (stops at the shortest one)."""
  totals = []
  for column in izip(*x):
    totals.append(sum(column))
  return totals
def trim_link_protocol(s):
    """Return `s` with everything up to and including the first '://' removed.

    Strings that contain no '://' are returned unchanged.
    """
    marker = '://'
    position = s.find(marker)
    if position == -1:
        return s
    return s[position + len(marker):]
def get_timestamp():
  """Current local time as a compact 'YYYYmmddHHMMSS' string."""
  now = datetime.fromtimestamp(time())
  return now.strftime('%Y%m%d%H%M%S')
def write_file_to_s3(localfile, s3_bucket, s3_filename, aws_key=None, aws_secret=None):
    """Upload `localfile` to S3 unless something already exists under that key.

    Args:
        localfile: path of the local file to upload.
        s3_bucket: name of the destination bucket.
        s3_filename: destination key; also used as the prefix for the
            existence check.
        aws_key, aws_secret: optional credentials. When omitted, module-level
            `key` / `secret` variables are used if they exist (the original
            body read those names, which this notebook never defines);
            otherwise None is passed and boto resolves credentials itself.

    The upload is skipped when the prefix listing for `s3_filename` returns
    any key, so an existing key that merely starts with `s3_filename` also
    blocks the upload.
    """
    from boto.s3.connection import S3Connection
    from boto.s3.connection import Key
    if aws_key is None:
        aws_key = globals().get('key')
    if aws_secret is None:
        aws_secret = globals().get('secret')
    conn = S3Connection(aws_key, aws_secret)
    bucket = conn.get_bucket(s3_bucket)
    # Only the first listed key is needed to decide; avoid materialising the
    # whole listing.
    already_present = next(iter(bucket.list(s3_filename)), None) is not None
    if not already_present:
        k = Key(bucket)
        k.key = s3_filename
        k.set_contents_from_filename(localfile)

In [None]:
def get_mention_aligned_links(doc):
  """Yield (link_slice, mention_slice) for mentions that are markdown link anchors.

  `doc` must provide 'full_text' (a string) and 'mentions' (an iterable of
  dicts with integer 'start' / 'stop' offsets into the text). A mention
  qualifies when it is immediately preceded by '[' and followed by '](',
  i.e. it is the whole anchor of `[anchor](target)`. The link slice covers
  the target with a leading 'http://' or 'https://' skipped.
  """
  text = doc['full_text']
  for m in doc['mentions']:
    mention_start, mention_stop = m['start'], m['stop']
    # Filter mentions which occur outside of the document text. A mention at
    # offset 0 cannot be preceded by '['; excluding it also stops
    # text[mention_start-1] from wrapping around to the last character.
    if mention_start <= 0 or mention_stop <= mention_start:
      continue
    if mention_stop > len(text):
      continue
    link_start = mention_stop + 2
    # naively detect whether this mention sits inside a markdown link anchor
    if text[mention_start-1] != '[' or text[mention_stop:link_start] != '](':
      continue
    link_stop = text.find(')', link_start)
    if link_stop == -1:
      # unterminated link target
      continue
    target = text[link_start:link_stop]
    if target.startswith('http://'):
      link_start += 7
    elif target.startswith('https://'):
      link_start += 8
    yield slice(link_start, link_stop), slice(mention_start, mention_stop)

def get_links(doc):
  """Yield (link_slice, anchor_slice) for markdown links found in doc['full_text'].

  Links are skipped when the anchor text starts with 'www' or 'http', or when
  the target contains 'secure.adnxs.com' or 'digg.com'. The link slice covers
  the target without its optional http(s):// prefix.
  """
  pattern = '(?<!\\\\)\[(([^]]|(\\\\]))+)(]\(\s*(http[s]?://)?)([^)]+)\s*\)'
  for match in re.finditer(pattern, doc['full_text']):
    groups = match.groups()
    anchor, opener, uri = groups[0], groups[3], groups[5]
    if not uri:
      continue
    if anchor.startswith('www') or anchor.startswith('http'):
      continue
    if 'secure.adnxs.com' in uri:
      continue
    if 'digg.com' in uri:
      # todo: add check for anchor diversity to filter this kind of thing
      continue
    # +1 skips the opening '[' of the anchor
    anchor_begin = match.start() + 1
    anchor_end = anchor_begin + len(anchor)
    uri_begin = anchor_end + len(opener)
    uri_end = uri_begin + len(uri)
    yield slice(uri_begin, uri_end), slice(anchor_begin, anchor_end)

import base64
import urlparse
def resolve_hardcoded_redirects(l):
  """Unwrap a few known redirect services, then strip the link protocol.

  - 'www.prweb.net' links: everything after the 'Redirect.aspx?id=' prefix
    is base64-decoded.
  - businesswire / marketwire tracking links: the 'url' query parameter is
    used as the target.

  Resolution is best-effort: if decoding or parameter lookup fails, the
  original link is kept. The result is always passed through
  trim_link_protocol.
  """
  try:
    if l.startswith('www.prweb.net'):
      l = base64.b64decode(l[len('www.prweb.net/Redirect.aspx?id='):])
    elif l.startswith('cts.businesswire.com/ct/') or l.startswith('ctt.marketwire.com/'):
      # Parse only the query string; parsing the whole link hides 'url' when
      # it is the first parameter (its key becomes '<path>?url'). Links with
      # no '?' are parsed whole, as before.
      query = l.partition('?')[2] or l
      l = urlparse.parse_qs(query)['url'][0]
  except (TypeError, ValueError, KeyError, IndexError):
    # malformed base64 payload or missing 'url' parameter: keep the link as is
    pass
  return trim_link_protocol(l)

# Lower-cased anchor texts that are excluded wherever anchors are filtered
# against this set.
anchor_filters = {
  'facebook',
  'twitter',
  'zacks investment research',
  'reuters',
  'linkedin',
  'marketbeat',
}

# Disabled (`if False`): this definition is never executed.
if False:
  def get_link_labels(doc):
    """Yield (label, uri) pairs for the links of `doc`.

    Mention-aligned links get label 1.0, unless the uri contains 'search' or
    the anchor text is in anchor_filters (those are skipped). Remaining
    markdown links whose span was not mention-aligned get label 0.0.
    """
    text = doc['full_text']
    # link spans already claimed by a mention-aligned link
    aligned_spans = set()
    for l, a in get_mention_aligned_links(doc):
      aligned_spans.add((l.start, l.stop))
      uri = text[l]
      if not 'search' in uri and not text[a].lower().strip() in anchor_filters:
        yield (1.0, uri)
    for l, a in get_links(doc):
      if (l.start, l.stop) not in aligned_spans:
        yield (0.0, text[l])

def get_anchor_target_pairs(doc):
  """Yield one to_item(...) record per link in `doc`.

  Mention-aligned links come first and are flagged as mentions. Remaining
  markdown links (spans not already emitted) are flagged as mentions only
  when the anchor starts with '@' and contains no space.
  """
  body = doc['full_text']
  emitted_spans = set()
  for link, anchor in get_mention_aligned_links(doc):
    emitted_spans.add((link.start, link.stop))
    yield to_item(anchor, link, body, True)
  for link, anchor in get_links(doc):
    if (link.start, link.stop) in emitted_spans:
      continue
    anchor_text = body[anchor]
    # twitter NER = solved
    looks_like_handle = anchor_text.startswith('@') and ' ' not in anchor_text
    yield to_item(anchor, link, body, looks_like_handle)

def to_item(a, l, text, is_mention, window = 200):
    """Build the (anchor, target, is_mention, left_context, right_context) record.

    Args:
        a: slice of the anchor text within `text`.
        l: slice of the link target within `text`.
        text: full document text.
        is_mention: flag stored unchanged in the record.
        window: context size in characters on either side.

    The left context ends one character before the anchor (skipping the
    '[') and the right context starts one character after the target
    (skipping the ')'). The target is passed through
    resolve_hardcoded_redirects.
    """
    # Clamp at 0: with a.start == 0 the former `a.start-1` became -1 and the
    # slice returned nearly the whole document as left context.
    left_stop = max(a.start - 1, 0)
    return (
        text[a],
        resolve_hardcoded_redirects(text[l]),
        is_mention,
        text[max(a.start - window, 0):left_stop],
        text[l.stop+1:l.stop+window],
    )

In [None]:
# Smoke test: parse the raw corpus and show the first extracted record.
# NOTE(review): raw_corpus_path is only assigned in a later cell, so this
# cell fails on a fresh kernel when the notebook is run top to bottom.
sc\
    .textFile(raw_corpus_path)\
    .map(json.loads)\
    .flatMap(get_anchor_target_pairs)\
    .take(1)

URI Classification

In [None]:
def normalize_uri(uri):
  """Collapse a URI into a template such as 'host/path/<eid>'.

  Steps: lower-case; strip protocol, leading '//' and 'www.'; treat '?' (after
  a letter) and '#' as path separators; drop a trailing 'index*' /
  'default*' segment; then replace the last path segment with '<eid>', or
  append '<nil>' when only the host is left.

  NOTE: a later cell in this notebook redefines `normalize_uri` with
  different semantics; cells run after that one see the other version.
  """
  uri = uri.lower()

  # Trim the protocol first so the '//' and 'www.' checks see the host
  # (previously 'http://www.x.com' kept its 'www.').
  idx = uri.find('://')
  if idx != -1:
    uri = uri[idx+3:]
  if uri.startswith('//'):
    uri = uri[2:]
  if uri.startswith('www.'):
    uri = uri[4:]

  # convert 'blah.com/users.php?id=bob' into 'blah.com/users.php/id=bob'
  uri = re.sub(r'([a-z]+)\?', r"\1/", uri)

  # convert 'blah.com/users#bob' into 'blah.com/users/bob'
  uri = uri.replace('#', '/')

  parts = uri.rstrip('/').split('/')
  suffix = parts[-1]
  # Parenthesised: without it, `A and B or C` dropped the only segment of
  # hosts such as 'default.com', leaving just '<nil>'.
  if len(parts) > 1 and (suffix.startswith('index') or suffix.startswith('default')):
    parts = parts[:-1]
  if len(parts) > 1:
    parts[-1] = '<eid>'
  else:
    parts.append('<nil>')
  return '/'.join(parts)

#normalize_uri('vanityfair.com/index.aspx?rofl')

In [None]:
def get_uri_domain(uri):
  """Return the part of `uri` before the first '/' (all of it if there is none)."""
  domain, _, _ = uri.partition('/')
  return domain

def get_uri_features(uri):
  """Return the list of string features for a protocol-less URI.

  Digits are replaced by 'N' everywhere. Features, in order: adjacent path
  segment pairs (with the host replaced by '<domain>'), the same pairs
  prefixed with '<host>:', the individual segments, and - for hosts with
  three or more dot-separated labels - 'SD:' plus everything before the last
  two labels.
  """
  segments = re.sub('[0-9]', 'N', uri).split('/')
  domain = segments[0]
  segments[0] = "<domain>"

  bigrams = ['/'.join(pair) for pair in izip(segments, segments[1:])]
  domain_bigrams = [domain + ':' + bigram for bigram in bigrams]
  features = bigrams + domain_bigrams + segments

  labels = domain.split('.')
  if len(labels) >= 3:
    features.append('SD:' + '.'.join(labels[:-2]))
  return features

from pyspark.ml.classification import NaiveBayes, LogisticRegression
from pyspark.ml.feature import HashingTF, StringIndexer, CountVectorizer

def balance_dataset(dataset, minor = 1.0, major = 0.0):
  """Down-sample the `major` label so both classes have similar size.

  Args:
      dataset: DataFrame with a `label` column.
      minor: label value of the class that is kept in full.
      major: label value of the class that is sampled without replacement.

  Returns:
      The sampled `major` rows unioned with all `minor` rows. Rows with any
      other label are dropped.
  """
  major_rows = dataset.filter(dataset.label == major)
  minor_rows = dataset.filter(dataset.label == minor)
  major_count = major_rows.count()
  minor_count = minor_rows.count()
  if major_count == 0:
    # nothing to down-sample; also avoids dividing by zero
    return minor_rows
  # Sampling without replacement needs a fraction in [0, 1]; cap it for the
  # case where the "minor" class is actually the larger one.
  fraction = min(1.0, minor_count / float(major_count))
  return major_rows\
    .sample(withReplacement=False, fraction=fraction)\
    .unionAll(minor_rows)

def stats_at_p(r, p):
  """Return (p, (tp, fp, fn)) indicator counts for one scored row at threshold p.

  `r` must expose 'label' and 'probability'; the row counts as predicted
  positive when probability[1] >= p.
  """
  score = r['probability'][1]
  is_positive = r['label'] == 1.0
  is_negative = r['label'] == 0.0
  at_or_above = score >= p
  below = score < p
  tp = 1.0 if (is_positive and at_or_above) else 0.0
  fp = 1.0 if (is_negative and at_or_above) else 0.0
  fn = 1.0 if (is_positive and below) else 0.0
  return p, (tp, fp, fn)

def evaluate(dataset, ps = None):
  """Compute precision / recall / F1 of scored rows at each threshold in `ps`.

  Args:
      dataset: scored rows exposing flatMap; each row needs 'label' and
          'probability' (see stats_at_p).
      ps: iterable of probability thresholds; defaults to [0.5].

  Returns:
      A list of (threshold, (precision, recall, f1)) tuples. Thresholds with
      no predicted positives or no actual positives are omitted.
  """
  if ps is None:
    ps = [0.5]

  def has_defined_scores(item):
    # precision needs tp+fp > 0, recall needs tp+fn > 0
    tp, fp, fn = item[1]
    return (tp + fp) > 0 and (tp + fn) > 0

  def precision_recall_f(counts):
    tp, fp, fn = counts
    precision = float(tp) / (tp + fp)
    recall = float(tp) / (tp + fn)
    # tp == 0 gives precision == recall == 0; report F1 as 0 instead of
    # dividing by zero.
    total = precision + recall
    f1 = 2 * (precision * recall / total) if total > 0 else 0.0
    return precision, recall, f1

  stats_by_p = dataset\
    .flatMap(lambda r: (stats_at_p(r, p) for p in ps))\
    .reduceByKey(lambda a, b: [x+y for x,y in zip(a, b)])\
    .filter(has_defined_scores)\
    .mapValues(precision_recall_f)\
    .collect()
  return stats_by_p
    
# Estimator shared by the dev and full models; it reads the
# "hashed_features" column produced by the HashingTF stages.
classifier = LogisticRegression(featuresCol="hashed_features")

In [None]:
# When True, the train/test link corpus is regenerated from the raw articles
# and saved under link_corpus_path; otherwise the saved copy is only loaded.
REBUILD_CORPUS = False

# Input path of the raw article corpus (one JSON document per line).
raw_corpus_path = 'articles'
# Base path under which the '/train' and '/test' splits are stored.
link_corpus_path = 'links'

In [None]:
# RDD of to_item(...) records, one per link found in the raw corpus.
anchor_target_pairs = sc\
  .textFile(raw_corpus_path)\
  .map(json.loads)\
  .flatMap(get_anchor_target_pairs)

In [None]:
# Peek at a single extracted record.
anchor_target_pairs.take(1)

In [None]:
# Rebuild the labelled link corpus:
#   1. normalise each target and group the is_mention flags per normalised uri
#   2. keep uris seen at least 10 times; label = majority vote of the flags
#   3. group uris by domain and split 90/10, so a domain's uris all land in
#      the same split
#   4. flatten each split to (uri, label, features) rows and save it
if REBUILD_CORPUS:
    train, test = [
        split.flatMap(lambda (prefix, instances): instances)\
             .map(lambda (uri, is_mention): (uri, 1.0 if is_mention else 0.0, get_uri_features(uri)))\
             .repartition(128)\
             .cache()
        for split in
          anchor_target_pairs\
              .map(lambda (anchor, target, is_mention, left, right): (normalize_uri(target), is_mention))\
              .groupByKey()\
              .filter(lambda (k,vs): len(vs) >= 10)\
              .mapValues(Counter)\
              .mapValues(lambda cs: cs[True] > cs[False])\
              .map(lambda (uri, is_mention): (get_uri_domain(uri), (uri, is_mention)))\
              .groupByKey()\
              .randomSplit([0.9, 0.1])
    ]
    sqlContext\
        .createDataFrame(train, ['uri','label','features'])\
        .write.mode('overwrite')\
        .save(link_corpus_path + '/train')
    sqlContext\
        .createDataFrame(test, ['uri','label','features'])\
        .write.mode('overwrite')\
        .save(link_corpus_path + '/test')

# Always reload the splits from storage, whether or not they were just rebuilt.
train = sqlContext.load(link_corpus_path + '/train')
test = sqlContext.load(link_corpus_path + '/test')
# Union of both splits, used to fit the full model.
full = train.unionAll(test)

In [None]:
# (positive count, negative count) in the training split.
train.filter(train['label']==1.0).count(), train.filter(train['label']==0.0).count()

Dev Evaluation

In [None]:
# Hash the string features into a fixed-size vector column and fit the dev
# model on a class-balanced version of the training split.
# NOTE(review): `train` / `test` are rebound to their transformed versions,
# so this cell is not idempotent - re-running it transforms frames that
# already carry a "hashed_features" column.
hashing_tf = HashingTF(inputCol="features", outputCol="hashed_features", numFeatures=500000)
#hashing_tf = CountVectorizer(inputCol="features", outputCol="hashed_features").fit(train)
train = hashing_tf.transform(train)
test = hashing_tf.transform(test)
dev_model = classifier.fit(balance_dataset(train))

In [None]:
def iter_labeled_uris(lines):
  """Yield (uri, is_entity, confidence) for both URIs of each annotated CSV row.

  `lines` is an iterable of CSV lines including the header. Rows whose
  '_golden' column is 'true' are skipped. `is_entity` is True when the
  corresponding answer column equals 'yes'.
  """
  for row in csv.DictReader(lines, skipinitialspace=True):
    if row['_golden'] != 'true':
      yield (row['a'],
             row['is_web_page_a_an_entity_page'] == 'yes',
             float(row['is_web_page_a_an_entity_page:confidence']))
      yield (row['b'],
             row['is_web_page_b_an_entity_page'] == 'yes',
             float(row['is_web_page_b_an_entity_page:confidence']))

# Hand-labelled evaluation set: the annotation CSV is read through Spark,
# collected to the driver, parsed with iter_labeled_uris, then
# re-parallelised and turned into (uri, label, confidence, features) rows.
labeled_uris = sqlContext.createDataFrame(sc\
  .parallelize(
    iter_labeled_uris(sc\
    .textFile(s3baseuri + 'abbrevi8-rnd/web/links/a885525.csv')\
    .map(lambda r: r.encode('utf-8'))\
    .collect()))\
  .map(lambda (uri, entity, conf): (uri, 1.0 if entity else 0.0, conf, get_uri_features(normalize_uri(uri))))
, ['uri', 'label', 'confidence', 'features'])

In [None]:
train_prs = evaluate(dev_model.transform(train), ps=[p/20. for p in xrange(1, 20)])
dev_prs = evaluate(dev_model.transform(test), ps=[p/20. for p in xrange(1, 20)])
test_prs = evaluate(dev_model.transform(hashing_tf.transform(labeled_uris)), ps=[p/20. for p in xrange(1, 20)])

print 'Evaluation @ Confidence >= 0.5'
print 'Train P/R=(%.2f, %.2f), F=%.3f' % dict(train_prs)[0.8]
print '  Dev P/R=(%.2f, %.2f), F=%.3f' % dict(dev_prs)[0.8]
print ' Test P/R=(%.2f, %.2f), F=%.3f' % dict(test_prs)[0.8]

In [None]:
# Table of (threshold, precision, recall, F) on the dev split, by threshold.
# display(sorted([(k, v[0]*100, v[1]*100) for k,v in dev_prs]+[(0., 0., 100.)]+[(1., 100.0, 0.)]))
display(sorted([(k, v[0], v[1], v[2]) for k,v in dev_prs]))

Full Model

In [None]:
# Final model: same hashing configuration as the dev model, fit on the
# class-balanced union of the train and test splits.
full_hashing_tf = HashingTF(inputCol="features", outputCol="hashed_features", numFeatures=500000)
model = classifier.fit(balance_dataset(full_hashing_tf.transform(full)))

In [None]:
# Spot check: positive-class probability of the full model for a few
# hand-picked URIs.
# NOTE: the name `uris` is rebound to a set by a later cell.
uris = [
  'facebook.com/efoim',
  'twitter.com/person',
  'twitter.com/person/status/1231',
  'linkedin.com/company/zcbvx',
  'linkedin.com/in/zcbvx',
  'en.wikipedia.org/wiki/someone',
  'en.wikipedia.org/w/index.php?id=123',
  'www.nytimes.com/topic/person/sheldon-silver',
]
model.transform(
  full_hashing_tf.transform(
    sqlContext.createDataFrame(
      [(u, get_uri_features(normalize_uri(u))) for u in uris], 
      ['uri','features'])))\
  .map(lambda r: (r['uri'], r['probability'][1]))\
  .collect()

In [None]:
# Tab-separated link lines with exactly three fields.
# NOTE(review): cc_target_counts is not referenced again in the cells
# visible here.
cc_target_counts = sc\
  .textFile(s3baseuri + 'abbrevi8-rnd/web/links/cc/')\
  .map(lambda line: line.split('\t'))\
  .filter(lambda ps: len(ps) == 3)

# 10% sample of the same lines, reduced to (normalised target uri, count,
# features). Targets are kept only if they contain one of the listed
# TLD-like substrings.
cc_corpus_links = sqlContext.createDataFrame(
  sc\
    .textFile(s3baseuri + 'abbrevi8-rnd/web/links/cc/')\
    .sample(False, 0.1)\
    .map(lambda line: line.split('\t'))\
    .filter(lambda ps: len(ps) == 3)\
    .map(lambda (source, anchor, target): target)\
    .map(trim_link_protocol)\
    .filter(lambda uri: ('.co' in uri or '.net' in uri or '.org' in uri or '.edu' in uri))\
    .map(lambda t: (normalize_uri(t), 1))\
    .reduceByKey(add)\
    .map(lambda (uri, count): (uri, count, get_uri_features(uri)))
  ,['uri', 'count', 'features'])

# Score every sampled uri template with the full model.
corpus_links = model.transform(full_hashing_tf.transform(cc_corpus_links))

# NOTE(review): the next three expressions are bare expression statements in
# the middle of the cell; as written their results are neither assigned nor
# the cell's last expression.

# Total link count of templates predicted positive with p >= 0.8.
corpus_links\
  .filter(corpus_links['prediction'] == 1.0)\
  .map(lambda r: (r['uri'], r['count'], r['probability'][1]))\
  .filter(lambda (uri, count, p): p >= 0.8)\
  .map(lambda (uri, count, p): count)\
  .sum()
# Total link count over all templates.
corpus_links\
  .map(lambda r: r['count'])\
  .sum()
# Top 500 templates by link count among those predicted positive with
# p >= 0.85.
corpus_links\
  .filter(corpus_links['prediction'] == 1.0)\
  .map(lambda r: (r['uri'], r['count'], r['probability'][1]))\
  .filter(lambda (uri, count, p): p >= 0.85)\
  .map(lambda (u,c,p): (normalize_uri(u), c))\
  .reduceByKey(add)\
  .sortBy(lambda (u,c): c, ascending=False)\
  .take(500)
# Pasted sample of (template, count) output, kept as a string literal.
"""
(u'twitter.com/<eid>', 250606),
 (u'shop.nordstrom.com/c/<eid>', 96401),
 (u'avo.alaska.edu/activity/avoreport.php/<eid>', 29133),
 (u'failblog.cheezburger.com/<eid>', 15984),
 (u'cnbc.com/<eid>', 15504),
 (u'archive.org/details/<eid>', 13158),
 (u'lightology.com/index.php/<eid>', 12381),
 (u'linkedin.com/company/<eid>', 11318),
 (u'wikia.com/<eid>', 9871),
 (u'jjill.com/jjillonline/prodnav/grid.aspx/<eid>', 9388),
"""

In [None]:
# Score every (anchor, target) pair of the article corpus with the full model.
# anchor_target_pairs yields 5-field records (anchor, target, is_mention,
# left context, right context); the former 3-name tuple unpacking failed on
# them. Fields are now taken by position and the contexts are ignored.
# NOTE(review): features are computed from the raw target here, whereas the
# model was trained on features of normalize_uri(target) - confirm intended.
corpus_links = full_hashing_tf\
  .transform(
    sqlContext.createDataFrame(
      anchor_target_pairs\
        .map(lambda item: (item[0], item[1], item[2], get_uri_features(item[1])))
      , ['anchor', 'uri','label','features']))
corpus_links = model.transform(corpus_links).cache()

In [None]:
# Inspect scores of facebook links that do not end in '/'.
corpus_links\
  .rdd\
  .filter(lambda r: 'facebook' in r['uri'] and not r['uri'].endswith('/'))\
  .map(lambda r: (r['uri'], r['anchor'], r['probability']))\
  .take(50)

In [None]:
# Distinct (anchor, uri) pairs whose positive-class probability is >= 0.65.
predicted_pairs = corpus_links\
  .rdd\
  .filter(lambda r: r['probability'][1] >= 0.65)\
  .map(lambda r: (r['anchor'], r['uri']))\
  .distinct()

In [None]:
# Peek at a single predicted pair.
predicted_pairs.take(1)

In [None]:
# Build an undirected anchor <-> uri graph on the driver from all predicted
# pairs. Anchors are lower-cased; pairs whose uri contains no '.' are dropped.
# `uris` collects the uri nodes so they can be told apart from anchor nodes.
uris = set()
graph = defaultdict(set)
for a, u in predicted_pairs.collect():
  a = a.lower()
  if '.' in u:
    graph[a].add(u)
    graph[u].add(a)
    uris.add(u)

In [None]:
# Number of nodes (anchors plus uris) in the graph.
len(graph)

In [None]:
def get_neighbours(graph, node, depth):
  """Return the set of nodes reachable from `node` in at most `depth` hops.

  `graph` maps a node to an iterable of adjacent nodes; `node` itself is
  always part of the result. Expansion stops early once a hop reaches no
  new node.
  """
  visited = set([node])
  frontier = set([node])
  for _ in xrange(depth):
    reached = set()
    for current in frontier:
      reached.update(graph[current])
    frontier = reached - visited
    if not frontier:
      break
    visited = visited | frontier
  return visited

def normalize_uri(u):
  """Lower-case `u`, strip surrounding '/', percent-decode it and drop a leading 'www.'.

  NOTE: this rebinds the name of the template-producing normalize_uri
  defined in an earlier cell; cells run after this one get this version.
  """
  cleaned = urllib.unquote(u.lower().strip('/'))
  return cleaned[4:] if cleaned.startswith('www.') else cleaned

# Sample pairs of uris that are close in the anchor/uri graph, for manual
# annotation. Pairs are rejected when both uris normalise to the same
# string, when either was already sampled, or when their domains match or
# prefix each other. Acceptance gets less likely the more often a domain has
# been sampled, and stops for a domain after 25 samples.
uri_nodes = list(uris)
neighborhood_sz = 2
target_samples = 1000
# Upper bound on draws: once every reachable domain is saturated nothing is
# accepted any more and an unbounded loop would never terminate.
max_attempts = 1000000
# Sampling state; previously only present as commented-out lines, so the
# cell raised NameError on a fresh kernel.
sampled_domains = Counter()
sampled_uris = set()
samples = []
attempts = 0
while uri_nodes and len(samples) < target_samples and attempts < max_attempts:
  attempts += 1
  node = random.choice(uri_nodes)
  neighbours = get_neighbours(graph, node, 4) & uris
  if len(neighbours) > 1:
    neighborhood = random.sample(neighbours, min(neighborhood_sz, len(neighbours)))
    b = neighborhood.pop()
    while neighborhood:
      a, b = b, neighborhood.pop()
      norm_a, norm_b = normalize_uri(a), normalize_uri(b)
      if norm_a == norm_b or norm_a in sampled_uris or norm_b in sampled_uris:
        continue
      dom_a, dom_b = norm_a.split('/')[0], norm_b.split('/')[0]
      if dom_a == dom_b or dom_a.startswith(dom_b) or dom_b.startswith(dom_a):
        continue
      dom_samples = max(sampled_domains[dom_a], sampled_domains[dom_b])
      if random.random() > math.sqrt(dom_samples/25.0):
        samples.append((a,b))
        sampled_uris.add(norm_a)
        sampled_uris.add(norm_b)
        sampled_domains[dom_a] += 1
        sampled_domains[dom_b] += 1
        #print '%s\t%s' % (a,b)

if True:
  with open('/tmp/samples.csv', 'w') as f:
    f.write('a,b\n')
    # csv.writer doubles embedded '"' characters, which the hand-built
    # '"a","b"' rows did not; every field is still quoted as before.
    writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator='\n')
    for s in samples:
      writer.writerow([field.encode('utf-8') for field in s])
  write_file_to_s3('/tmp/samples.csv', 'abbrevi8-rnd', '/web/annotation/samples.%s.csv' % get_timestamp())
samples

In [None]:
# Count sampled pairs whose concatenated uris contain the literal text '"a"'.
sum(['"a"' in a+b for a,b in samples])

In [None]:
# Pick a random uri node and show the uris within 4 hops of it.
#node = random.sample(graph, 1)[0]
#node = 'abcnews.go.com/topics/news/minnesota.htm'
node = random.sample(uris, 1)[0]
print node
#[n for n in get_neighbours(graph, node, 1) if n in uris]
get_neighbours(graph, node, 4) & uris

In [None]:
# Leftover state from the sampling loop: whatever remained in `neighborhood`
# after its last iteration.
neighborhood

In [None]:
# Disabled (`if False`), never executed.
# NOTE(review): tf_train and get_uri_prefix are not defined anywhere in the
# cells visible here.
if False:
  tf_train\
    .filter(tf_train['prediction'] == 1.0)\
    .map(lambda r: (get_uri_prefix(r['uri'])))\
    .reduceByKey(add)\
    .map(lambda (k,v): (v,k))\
    .sortByKey(ascending=False)\
    .filter(lambda (k,v): k <= 500)\
    .take(500)

### Anchor Document Frequency

In [None]:
docs\
  .map(lambda d: (trim_link_protocol(d['source_url']).split('/')[0], set(d['full_text'][ms] for ls, ms in get_mention_aligned_links(d))))\
  .mapValues(lambda anchors: set(a.lower().strip() for a in anchors))\
  .flatMap(lambda (src, anchors): [(a, 1) for a in anchors])\
  .reduceByKey(add)\
  .filter(lambda (k, count): count > 1)\
  .map(lambda (k, count): (count, k))\
  .sortByKey(ascending=False)\
  .take(100)

### Anchors by Source

In [None]:
# Per source domain: the five most frequent entries among its anchors plus
# the (src, None) key, which counts that source's documents. Each entry is
# reported as (anchor, count / largest count in that top five, count).
# Sources are ordered by the count of their top entry.
docs\
  .map(lambda d: (trim_link_protocol(d['source_url']).split('/')[0], set(d['full_text'][ms] for ls, ms in get_mention_aligned_links(d))))\
  .mapValues(lambda anchors: set(a.lower().strip() for a in anchors))\
  .flatMap(lambda (src, anchors): [((src, a), 1) for a in anchors] + [((src, None), 1)])\
  .reduceByKey(add)\
  .filter(lambda (k, count): count > 1)\
  .map(lambda ((src, a), count): (src, (count, a)))\
  .groupByKey()\
  .mapValues(lambda vs: sorted(vs, reverse=True)[:5])\
  .mapValues(lambda vs: [(a, v/float(vs[0][0]), v) for v, a in vs])\
  .sortBy(lambda (k, vs): vs[0][2], ascending=False)\
  .take(100)

### Top Mention-Aligned Endpoints

In [None]:
# 100 most frequent link targets among mention-aligned links, after
# redirect resolution.
docs\
  .flatMap(lambda d: (d['full_text'][ls] for ls, ms in get_mention_aligned_links(d)))\
  .map(resolve_hardcoded_redirects)\
  .map(lambda uri: (uri, 1))\
  .reduceByKey(add)\
  .sortBy(lambda (k, v): v, ascending=False)\
  .take(100)

### Top Endpoints Prefixes

In [None]:
# Same as above, but targets are first cut back to their prefix: everything
# before the last '/', or the whole target when that prefix is empty.
top_prefixes = docs\
  .flatMap(lambda d: (d['full_text'][ls] for ls, ms in get_mention_aligned_links(d)))\
  .map(resolve_hardcoded_redirects)\
  .map(lambda l: '/'.join(l.split('/')[:-1]) or l)\
  .map(lambda uri: (uri, 1))\
  .reduceByKey(add)\
  .sortBy(lambda (k, v): v, ascending=False)\
  .take(100)
top_prefixes

In [None]:
# Emit the prefix counts as LaTeX table rows.
for uri, count in top_prefixes:
  print '\url{%s} & %i \\\\' % (uri, count)

In [None]:
# Distinct (anchor, resolved uri) pairs over all markdown links, excluding
# filtered anchors, scored with the full model. The 'label' column holds the
# lower-cased anchor text here.
# NOTE: `model` was fit on full_hashing_tf output; hashing_tf is configured
# with the same columns and numFeatures.
pairs = docs\
  .flatMap(lambda d: ((d['full_text'][ms].lower().strip(), d['full_text'][ls]) for ls, ms in get_links(d)))\
  .filter(lambda (a, uri): a not in anchor_filters)\
  .mapValues(resolve_hardcoded_redirects)\
  .distinct()\
  .map(lambda (label, uri): (label, uri, get_uri_features(uri)))

pairs = sqlContext.createDataFrame(pairs, ['label','uri', 'features'])
pairs = hashing_tf.transform(pairs)
pairs = model.transform(pairs)

In [None]:
# Number of distinct scored pairs.
pairs.count()

In [None]:
pairs = docs\
  .flatMap(lambda d: ((d['full_text'][ms].lower().strip(), d['full_text'][ls]) for ls, ms in get_mention_aligned_links(d)))\
  .filter(lambda (a, uri): a not in anchor_filters)\
  .mapValues(resolve_hardcoded_redirects)\
  .distinct()\
  .map(lambda (label, uri): (label, uri, get_link_features(uri)))\
  .cache()

In [None]:
# For nytimes topic/person pages: the anchors seen per uri (as a Counter),
# keeping only uris that have more than one anchor in total.
pairs\
  .map(lambda (label, uri, features): (label, uri))\
  .filter(lambda (label, uri): uri.startswith('www.nytimes.com/topic/person/'))\
  .map(lambda (k,v): (v, k))\
  .groupByKey()\
  .mapValues(Counter)\
  .filter(lambda (uri, labels): sum(labels.itervalues()) > 1)\
  .take(5)
  #.map(lambda (k,v): len(v))\
  #.stats()

In [None]:
# Score the mention-aligned pairs with the full model and show 100
# (anchor, uri) pairs whose positive-class probability exceeds 0.5.
pdf = sqlContext.createDataFrame(pairs, ['label','uri', 'features'])
pdf = hashing_tf.transform(pdf)
pdf = model.transform(pdf)
pdf\
  .rdd\
  .filter(lambda r: r['probability'][1] > 0.5)\
  .map(lambda r: (r['label'],r['uri']))\
  .take(100)

Top Classified Endpoints

In [None]:
if False:
  top_classified_endpoints = pdf\
    .rdd\
    .map(lambda r: (r['uri'], r['probability'][1]))\
    .map(lambda (l, p): ('/'.join(l.split('/')[:-1]) or l, p))\
    .map(lambda uri: (uri, 1))\
    .reduceByKey(add)\
    .sortBy(lambda (k, v): v, ascending=False)\
    .take(100)

for uri, count in top_classified_endpoints:
  print '\url{%s} & %.2f & %i \\\\' % (uri[0], uri[1], count)

In [None]:
# Keys are (prefix, probability) pairs, so counts are per distinct pair.
# Keeps keys seen more than 200 times, orders them by probability
# (descending) and prints the top 100 as LaTeX table rows.
if True:
  most_likely_endpoints = pdf\
    .rdd\
    .map(lambda r: (r['uri'], r['probability'][1]))\
    .map(lambda (l, p): ('/'.join(l.split('/')[:-1]) or l, p))\
    .map(lambda uri: (uri, 1))\
    .reduceByKey(add)\
    .filter(lambda (k,v): v > 200)\
    .sortBy(lambda (k, v): k[1], ascending=False)\
    .take(100)
for uri, count in most_likely_endpoints:
  print '\url{%s} & %.2f & %i \\\\' % (uri[0], uri[1], count)

In [None]:
\url{economictimes.indiatimes.com/topic} & 13553 \\
\url{www.huffingtonpost.com/news} & 13483 \\
\url{en.wikipedia.org/wiki} & 7241 \\
\url{www.huffingtonpost.com.au/news} & 6350 \\
\url{data.cnbc.com/quotes} & 3215 \\
\url{www.globenewswire.com/newsroom} & 2410 \\
\url{www.livemint.com/Search/Link/Keyword} & 2342 \\
\url{www.linkedin.com/in} & 2300 \\
\url{www.benzinga.com/stock} & 2176 \\
\url{sports.yahoo.com/soccer/players} & 2022 \\

In [None]:
# Re-score the pairs and collect the (anchor, uri) pairs with probability
# above 0.95 to the driver.
# NOTE: `pdf` ends up bound to a plain Python list, not a DataFrame.
pdf = sqlContext.createDataFrame(pairs, ['label','uri', 'features'])
pdf = hashing_tf.transform(pdf)
pdf = model.transform(pdf)
pdf = pdf\
  .rdd\
  .filter(lambda r: r['probability'][1] > 0.95)\
  .map(lambda r: (r['label'],r['uri']))\
  .collect()

In [None]:
# Five random samples drawn from the graph.
random.sample(graph, 5)