In [1]:
sc

In [2]:
data_path = 'hdfs://schwa11:9000/webkb'

In [3]:
from HTMLParser import HTMLParser
import ujson as json
from operator import add
from collections import Counter
from itertools import combinations
import re

def trim_utm(url):
    if "utm_" not in url:
        return url
    matches = re.findall('(.+\?)([^#]*)(.*)', url)
    if len(matches) == 0:
        return url
    match = matches[0]
    query = match[1]
    sanitized_query = '&'.join([p for p in query.split('&') if not p.startswith('utm_')])
    return match[0]+sanitized_query+match[2]

html_parser = HTMLParser()

def normalize_url(url, base=None):
    if base:
        try:
            url = urlparse.urljoin(base, url)
        except ValueError:
            pass

    url = html_parser.unescape(url)
    url = url.lower()
    if url.startswith('https://'):
        url = url[8:]
    if url.startswith('http://'):
        url = url[7:]
    if url.startswith('www.'):
        url = url[4:]
    url = trim_utm(url)
    url = url.strip('/?#')
    return url

def normalize_endpoint_url(url):
    url = normalize_url(url)
    # convert 'blah.com/users.php?id=bob' into 'blah.com/users.php/id=bob'
    url = re.sub('([a-z]+)\?', r"\1/", url)
    url = re.sub('[0-9]', 'N', url) # todo: maybe remove
    # convert 'blah.com/users#bob' into 'blah.com/users/bob'
    url = url.replace('#', '/')

    parts = url.rstrip('/').split('/')
    suffix = parts[-1].lower()
    if len(parts) > 1 and suffix.startswith('index') or suffix.startswith('default'):
        parts = parts[:-1]
    if len(parts) > 1:
        parts[-1] = '<eid>'
    else:
        parts.append('<nil>')
    return '/'.join(parts)

In [4]:
from sift.models import text
from sift.util import ngrams

In [5]:
# mention count
if False:
    links = sc\
        .textFile(data_path + '/docs')\
        .map(json.loads)\
        .flatMap(lambda d: (l for l in d['links'] if l['endpoint'] >= 0.825))\
        .cache()
    links.count()
    links.map(lambda l: normalize_url(l['target'])).distinct().count()
    links.map(lambda l: normalize_endpoint_url(l['target'])).distinct().count()

In [6]:
bad_anchors = sc\
    .textFile(data_path + '/docs')\
    .map(json.loads)\
    .flatMap(lambda d: (
            (d['text'][l['start']:l['stop']].lower().strip(), l['target']) for l in d['links'] if l['endpoint'] > 0.99))\
    .distinct()\
    .groupByKey()\
    .mapValues(list)\
    .coalesce(128)\
    .mapValues(lambda urls: Counter(normalize_endpoint_url(u) for u in urls))\
    .map(lambda (a, urls): (a, sum(1.0 for k, v in urls.iteritems() if v >= 10)))\
    .sortBy(lambda (k, count): count, ascending=False)\
    .filter(lambda (k, count): count >= 3)\
    .map(lambda (k, c): k)\
    .collect()
bad_anchors = set(bad_anchors)

In [7]:
# todo: exclude anchors that are named entities

In [8]:
def filter_doc_endpoint_links(doc, threshold=0.95):
    links = []
    for l in doc['links']:
        if l['endpoint'] > threshold:
            anchor = doc['text'][l['start']:l['stop']].lower().strip()
            if anchor not in bad_anchors:
                l['target'] = normalize_url(l['target'])
                links.append(l)
    doc['links'] = links
    return doc

In [9]:
if False:
    sc\
        .textFile(data_path + '/docs')\
        .map(json.loads)\
        .flatMap(lambda d: (l['target'] for l in d['links']))\
        .map(normalize_url)\
        .filter(lambda l: '?' in l)\
        .flatMap(lambda l: l.split('?')[-1].split('&'))\
        .map(lambda params: params.split('=')[0])\
        .map(lambda k: (k, 1))\
        .reduceByKey(add)\
        .map(lambda (k, v): (v, k))\
        .sortByKey(ascending=False)\
        .take(1000)

In [10]:
docs = sc\
    .textFile(data_path + '/docs')\
    .map(json.loads)\
    .map(filter_doc_endpoint_links)\
    .filter(lambda d: d['links'])\
    .map(lambda d: (d['_id'], d))\
    .groupByKey()\
    .map(lambda (k, ds): list(ds)[0])\
    .cache()
docs.count()

1712230

In [11]:
from ngram import NGram

In [12]:
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

In [13]:
import random
import math
from collections import defaultdict

def get_ngrams(text):
    return [t for t in ngrams(text, 2) if t not in stop]

def get_entity_rep(mentions):
    anchor_ngrams = set()
    text_ngrams = set()
    sources = set()
    for m in mentions:
        sources.add(m['source'])
        anchor_ngrams = anchor_ngrams.union(get_ngrams(m['text'][slice(*m['span'])]))
        text_ngrams = text_ngrams.union(get_ngrams(m['text']))
    return sources, anchor_ngrams, text_ngrams

def set_features(x, y):
    for f in x.intersection(y):
        yield 'SAME:' + f
    for f in x.symmetric_difference(y):
        yield 'DIFF:' + f

def aggregate_mentions(mentions):
    anchors, texts = Counter(), Counter()
    for _, anchor, text in mentions:
        anchors.update(anchor)
        texts.update(text)
    return anchors, texts

def bucket_normalize_counts(counts, buckets):
    total = float(max(v for v in counts.itervalues()))
    return {k:int(round(buckets*(v/total))) for k, v in counts.iteritems()}

from pyspark.ml.linalg import Vectors
import math

def bow_sim(a, b, norm=True):
    if not a or not b:
        return 0.

    if norm:
        a_sq = 1.0 * math.sqrt(sum(val * val for val in a.values()))
        b_sq = 1.0 * math.sqrt(sum(val * val for val in b.values()))

    if len(b) < len(a):
        a, b = b, a

    cossim = sum(value * b.get(index, 0.0) for index, value in a.items())
    if norm:
        cossim /= a_sq * b_sq

    return cossim

def get_base_features(a, b):
    a_anchor, a_body = a
    b_anchor, b_body = b

    features = []
    features.append(bow_sim(a_anchor, b_anchor))
    features.append(bow_sim(a_body, b_body))
    
    if a_anchor and b_anchor:
        top_a = a_anchor.most_common(1)[0][0]
        top_b = b_anchor.most_common(1)[0][0]
        features.append(NGram.compare(top_a, top_b, N=3))
    else:
        features.append(0.0)

    return Vectors.dense(features)

def similarity_features(a_counts, b_counts):
    if len(a_counts) > len(b_counts):
        a_counts, b_counts = b_counts, a_counts

    bucket_size = 10
    if any(c > bucket_size for c in a_counts.itervalues()):
        a_counts = bucket_normalize_counts(a_counts, bucket_size)
    if any(c > bucket_size for c in b_counts.itervalues()):
        b_counts = bucket_normalize_counts(b_counts, bucket_size)

    for k, a_count in a_counts.iteritems():
        b_count = b_counts.get(k, 0)
        if a_count > 0 and b_count > 0:
            for _ in xrange(min(a_count, b_count)):
                yield k

    for k in set(a_counts.iterkeys()).symmetric_difference(b_counts.iterkeys()):
        n = max(a_counts.get(k, 0), b_counts.get(k, 0))
        if n > 1:
            for _ in xrange(n-1):
                yield 'NOT:' + k

def generate_pair_features(a, b):
    #a_src, a_anchor, a_body = a
    #b_src, b_anchor, b_body = b
    
    a_anchor, a_body = a #aggregate_mentions(a)
    b_anchor, b_body = b #aggregate_mentions(b)

    for f in similarity_features(a_anchor, b_anchor):
        yield 'ANCH:'+f
    for f in similarity_features(a_body, b_body):
        yield 'TEXT:'+f

    #if a_anchor == b_anchor:
    #    yield 'ANCH:EQUAL'
    #if a_anchor.intersection(b_anchor):
    #    yield 'ANCH:OVERLAP'
    #for f in set_features(a_anchor, b_anchor):
    #    yield 'ANCH:' + f
    #for f in set_features(a_body, b_body):
    #    yield 'TEXT:' + f

def get_positives(instances, num=2):
    for _ in xrange(num):
        random.shuffle(instances)

        sources = set()
        filtered_instances = []
        for i in instances:
            src, anchor, text = i
            if src not in sources:
                sources.add(src)
                filtered_instances.append(i)

        split_idx = int((0.5 * random.random() + 0.25) * len(filtered_instances))
        if split_idx == 0 or split_idx == len(filtered_instances):
            split_idx == 1

        a_inst, b_inst = filtered_instances[:split_idx], filtered_instances[split_idx:]
        if a_inst and b_inst:
            yield aggregate_mentions(a_inst), aggregate_mentions(b_inst) #list(generate_pair_features(a_ms, b_ms)), get_base_features(a_ms, b_ms)

def get_positives_by_inst(instances):
    sources = set()
    instances_by_src = defaultdict(list)
    for i in instances:
        src, anchor, text = i
        sources.add(src)
        instances_by_src[src].append(i)

    if len(sources) >= 2:
        for source, inst in instances_by_src.iteritems():
            other_sources = list(sources - set([source]))
            for i in inst:
                a = i
                b = random.choice(instances_by_src[random.choice(other_sources)])
                yield list(generate_pair_features(a, b))

In [14]:
import random

MAX_MENTIONS = 500

def fold_mention(mentions, mention):
    if isinstance(mention, list):
        mentions.extend(mention)
    else:
        mentions.append(mention)
    
    if len(mentions) == MAX_MENTIONS + 1:
        del mentions[random.randint(0, MAX_MENTIONS)]
    elif len(mentions) > MAX_MENTIONS:
        mentions = random.sample(mentions, MAX_MENTIONS)
    return mentions

def fold_aggregate_mentions(agg_mentions, mention):
    a_anchors, a_texts = agg_mentions
    b_anchors, b_texts = mention
    
    if len(b_anchors) < len(a_anchors):
        a_anchors, b_anchors = b_anchors, a_anchors
    if len(b_texts) < len(a_texts):
        a_texts, b_texts = b_texts, a_texts
    
    for k, v in a_anchors.iteritems():
        b_anchors[k] += v
    for k, v in a_texts.iteritems():
        b_texts[k] += v
    
    return b_anchors, b_texts

em = text.EntityMentions(sentence_window=3, lowercase=True, normalize_url=False, strict_sentences=False)

mentions_by_target = em(docs)\
    .map(lambda m: (m['_id'], m))\
    .foldByKey([], fold_mention, 1000)\
    .mapValues(lambda ms: [(m['source'], get_ngrams(m['text'][slice(*m['span'])]), get_ngrams(m['text'])) for m in ms])\
    .repartition(512)\
    .cache()

In [15]:
if False:
    reload(text)
    em = text.EntityMentions(sentence_window=3, lowercase=True, normalize_url=False, strict_sentences=False)

    mentions = em(docs)\
        .map(lambda m: (m['_id'], m))\
        .mapValues(lambda m: (m['source'], get_ngrams(m['text'][slice(*m['span'])]), get_ngrams(m['text'])))\
        .cache()
    mentions.count() #3966013

    mentions_by_target = mentions\
        .groupByKey()\
        .mapValues(list)\
        .repartition(256)\
        .cache()

In [16]:
mentions_by_target.count()

1079730

In [17]:
agg_mentions_by_target = mentions_by_target.mapValues(aggregate_mentions).cache()
num_entities = agg_mentions_by_target.count()

In [18]:
neg_pair_sample_rate = 2
pos_pair_sample_rate = 2

In [19]:
from itertools import repeat

random_pairs = agg_mentions_by_target\
    .flatMap(lambda m: repeat(m, neg_pair_sample_rate))\
    .map(lambda m: (random.randint(0, num_entities), m))\
    .join(agg_mentions_by_target.zipWithUniqueId().map(lambda (k, v): (v, k)))\
    .map(lambda (k, (a, b)): (a, b))\
    .filter(lambda (a, b): (a[0] != b[0]) and len(
            set(re.findall(r"[a-zA-Z]+", a[0].split('/')[-1].lower())).intersection(
                re.findall(r"[a-zA-Z]+", b[0].split('/')[-1].lower()))) == 0
    )\
    .map(lambda (a, b): (random.random(), a, b, 0.0))

positive_pairs =  mentions_by_target\
    .filter(lambda (t, ms): len(ms) >= 2)\
    .mapValues(lambda vs: (random.random(), get_positives(vs, pos_pair_sample_rate)))\
    .flatMap(lambda (target, (split_idx, insts)): ((split_idx, (target, inst[0]), (target, inst[1])) for inst in insts if inst is not None))\
    .map(lambda (split_idx, a, b): (split_idx, a, b, 1.0))\

In [20]:
#dataset = random_pairs.union(positive_pairs)\
#    .map(lambda (split_idx, a, b, label): (split_idx, a[0], b[0], label, get_base_features(a[1], b[1]), list(generate_pair_features(a[1], b[1]))))\
#    .toDF(['split_idx', 'left', 'right', 'label', 'base_features', 'sim_features'])
    
dataset = random_pairs.union(positive_pairs)\
    .map(lambda (split_idx, a, b, label): (split_idx, a[0], b[0], label, get_base_features(a[1], b[1])))\
    .toDF(['split_idx', 'left', 'right', 'label', 'base_features'])

In [21]:
dataset.where(dataset.label == 1.0).count(), dataset.where(dataset.label == 0.0).count()

(486698, 563487)

In [22]:
# todo: with resampling some instance pairs may be repeated, and thereful we can have duplicates across train/test
train = dataset.filter(dataset.split_idx < 0.9)
test = dataset.filter(dataset.split_idx >= 0.9)
train.count(), test.count()

(945286, 105112)

In [23]:
from pyspark.ml.classification import NaiveBayes, LogisticRegression, RandomForestClassifier, MultilayerPerceptronClassifier, GBTClassifier
from pyspark.ml.feature import VectorAssembler, HashingTF, IDF, StringIndexer, CountVectorizer
from pyspark.ml import Pipeline

feature_columns = []
feature_stages = []

feature_columns.append("base_features")

#feature_stages += [
#    HashingTF(inputCol="sim_features", outputCol="hashed_sim_features", numFeatures=int(math.pow(2, 20))),
#    IDF(inputCol="hashed_sim_features", outputCol="idf_hashed_sim_features"),
#]
#feature_columns.append("idf_hashed_sim_features")

pipeline = Pipeline(stages=feature_stages + [
    VectorAssembler(inputCols=feature_columns, outputCol="features"),
    LogisticRegression(featuresCol="features")
])

In [24]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

if False:
    model = pipeline.fit(train)
    
    predictions = model.transform(test).cache()
    
    evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction", metricName="areaUnderROC")
    areaUnderROC = evaluator.evaluate(predictions)
    evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction", metricName="areaUnderPR")
    areaUnderPR = evaluator.evaluate(predictions)
    f1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1").evaluate(predictions)
    print("F1 = %s" % f1)
    print("Area under PR = %s" % areaUnderPR)
    print("Area under ROC = %s" % areaUnderROC)

In [25]:
# Basic
# F1 = 0.990590483548
# Area under PR = 0.994198922704
# Area under ROC = 0.990303278748

# RF Sim
# Area under PR = 0.973362932404
# Area under ROC = 0.953585027892

In [26]:
if False:
    from numpy.random import choice
    from itertools import tee
    import ujson as json
    from collections import Counter

    instances = []
    with open('kbco-annotations.jsonl', 'r') as f:
        for line in f:
            instances.append(json.loads(line))

    decisions = []

    for instance in instances:
        a_valid_link = instance['annotation']['a_link_type'] in ['entity', 'tag']
        b_valid_link = instance['annotation']['b_link_type'] in ['entity', 'tag']
        if a_valid_link and b_valid_link:
            decision = instance['annotation']['decision']
            if decision in ['yes', 'no']:
                decisions.append(decision)
    Counter(decisions)

In [27]:
# evaluate p/r/f at different thresholds
if False:
    def stats_at_p(r, p):
        tp = 1.0 if (r['label'] == 1.0 and r['probability'][1] >= p) else 0.0
        fp = 1.0 if (r['label'] == 0.0 and r['probability'][1] >= p) else 0.0
        fn = 1.0 if (r['label'] == 1.0 and r['probability'][1] < p) else 0.0
        return p, (tp, fp, fn)

    def evaluate(dataset, ps = None):
        if ps == None:
            ps = [0.5]
        stats_by_p = dataset\
            .flatMap(lambda r: (stats_at_p(r, p) for p in ps))\
            .reduceByKey(lambda a, b: [x+y for x,y in zip(a, b)])\
            .filter(lambda (p, (tp, fp, fn)): (tp+fp) > 0 and (tp+fn) > 0)\
            .mapValues(lambda (tp, fp, fn): ((float(tp) / (tp+fp)), (float(tp) / (tp+fn))))\
            .mapValues(lambda (p, r): (p, r, 2 * (p*r/(p+r))))\
            .collect()
        return stats_by_p

    ps = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.995]
    dev_prs = evaluate(model.transform(test).rdd, ps=ps)

    for c in ps:
        p, r, f = dict(dev_prs)[c]
        print '%.2f P/R=(%.2f, %.2f, %.2f)' % (c, p, r, f)

In [28]:
if False:
    sample_mentions = agg_mentions_by_target.filter(lambda (t, ms): ('trump' in t))
    x_target, x_mentions = sample_mentions.filter(lambda (t, _): t == 'uproxx.com/topic/donald-trump').collect()[0]
    model.transform(
        sample_mentions\
            .map(lambda (y_target, y_mentions): (x_target, y_target, list(generate_pair_features(x_mentions, y_mentions))))\
            .toDF(['x', 'y', 'features']))\
        .select(['x', 'y', 'probability', 'features'])\
        .rdd\
        .map(lambda r: (r['x'], r['y'], round(r['probability'][1], 3)))\
        .filter(lambda r: 'photo' not in r[1])\
        .collect()

In [29]:
model = pipeline.fit(train.union(test))

In [3]:
import ujson as json

In [16]:
ats_path = 'hdfs://schwa11:9000/webkb/candidates'

In [23]:
sc.textFile(ats_path).count()

431493

In [27]:
redirects = dict(sc\
    .textFile('hdfs://schwa11:9000/webkb/clusters')\
    .map(json.loads)\
    .collect())

In [28]:
sources_by_target = {}
for src, target in redirects.iteritems():
    sources_by_target.setdefault(target, set()).add(src)

In [32]:
[(t, srcs) for t, srcs in sources_by_target.iteritems() if any('tesla' in url for url in srcs)]

[(u'data.cnbc.com/quotes/tsla',
  {u'abcnews.go.com/topics/news/energy/electric-cars.htm',
   u'baltimoresun.com/topic/business/automotive-industry/tesla-motors-orcrp006147-topic.html',
   u'bloomberg.com/quote/tsla:us',
   u'chicagotribune.com/topic/business/automotive-industry/tesla-motors-orcrp006147-topic.html',
   u'chron.com/search/?action=search&channel=business%2ftechnology&inlinelink=1&searchindex=gsa&query=%22tesla+motors%22',
   u'courant.com/topic/business/automotive-industry/tesla-motors-orcrp006147-topic.html',
   u'dailyrecord.co.uk/all-about/electric-vehicles',
   u'data.cnbc.com/quotes/tsla',
   u'economictimes.indiatimes.com/topic/electric-cars',
   u'firmenpresse.de/directory/515606/tesla-motors-inc.html',
   u'gadgets.ndtv.com/tags/tesla',
   u'globalnews.ca/tag/tesla',
   u'latimes.com/topic/business/automotive-industry/tesla-orcrp006147-topic.html',
   u'leftlanenews.com/new-car-buying/tesla/model-x',
   u'markets.businessinsider.com/stock/tsla-quote',
   u'mashab

In [30]:
sources_by_target[redirects['en.wikipedia.org/w/tesla_inc.']]

KeyError: 'en.wikipedia.org/w/tesla_inc.'

In [24]:
sc\
    .textFile(ats_path)\
    .map(json.loads)\
    .filter(lambda (a, ts): any('tesla' in url for url in ts))\
    .collect()

[[u'flickrtesla club belgium',
  [u'flickr.com/photos/teslaclubbe/12271223586/in/photolist-jgnefs-9bsexs-haf167-ex9yxq-ozwwpo-d83k9o-gjvgtf-haetfc-and1gd-ccdkpb-e2rcql-5ubgnz-z4xlj7-rrmltf-4t8uwz-phopoy-onttsq-mf8frs-jgn7al-dyxsbk-hjbtxg-emx5tu-526a61-rdqdcl-52281c-bcrsgv-8nfnmj-gm9skq-sagpsj-tuwdgy-92ixxj-bnvzh-dtv2y-dylxoz-xvkqqi-qelgz-5yrnyp-6awhmd-hbpkcx-gntfyk-nzskma-uqrbs9-nx97as-nnttpn-hjfr8c-817ujv-mxwghs-e2n95p-6wcd8-4qnk5s',
   u'flickr.com/photos/teslaclubbe/12271217906/in/photolist-jgncyw-jgk6e8-py7dud-jgnefs-9bsexs-haf167-ex9yxq-ozwwpo-d83k9o-hjbtxg-gjvgtf-haetfc-and1gd-ccdkpb-e2rcql-5ubgnz-z4xlj7-rrmltf-4t8uwz-phopoy-qgerku-qvnpvh-onttsq-mf8frs-jgn7al-dyxsbk-emx5tu-526a61-rdqdcl-52281c-bcrsgv-8nfnmj-gm9skq-sagpsj-tuwdgy-92ixxj-bnvzh-dtv2y-dylxoz-xvkqqi-817ujv-qelgz-5yrnyp-6awhmd-hbpkcx-gntfyk-nzskma-uqrbs9-nx97as-nnttpn']],
 [u'electric vehicles',
  [u'independent.co.uk/topic/electric-cars',
   u'abcnews.go.com/topics/business/automotive/tesla-motors.htm',
   u'economicti

In [54]:
import string
strip_punc = {ord(s):u'' for s in string.punctuation}

def filter_doc_endpoint_links(doc):
    links = []
    for l in doc['links']:
        if l['endpoint'] >= 0.9:
            anchor = doc['text'][l['start']:l['stop']].lower().strip().translate(strip_punc)
            if anchor not in bad_anchors and len(anchor) < 50:
                l['anchor'] = anchor
                l['target'] = normalize_url(l['target'])
                links.append(l)
    doc['links'] = links
    return doc

ats_path = 'hdfs://schwa11:9000/commoncrawl/web/201707-ats/'
if False:
    docs = sc\
        .textFile('hdfs://schwa11:9000/commoncrawl/web/201707-endpoints/')\
        .map(json.loads)\
        .map(filter_doc_endpoint_links)\
        .filter(lambda d: d['links'])

    docs\
        .flatMap(lambda d: ((l['anchor'], l['target']) for l in d['links']))\
        .map(json.dumps)\
        .saveAsTextFile(ats_path, compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")

anchor_target_pairs = sc.textFile(ats_path).map(json.loads).cache()
anchor_target_pairs.count()

147823975

In [31]:
if False:
    !hadoop fs -rm -r hdfs://schwa11:9000/commoncrawl/web/201707-mentions
    em = text.EntityMentions(sentence_window=3, lowercase=True, normalize_url=False, strict_sentences=False)
    em(docs)\
        .map(lambda m: (m['_id'], m))\
        .foldByKey([], fold_mention, 2048)\
        .mapValues(lambda ms: [(m['source'], get_ngrams(m['text'][slice(*m['span'])]), get_ngrams(m['text'])) for m in ms])\
        .repartition(2048)\
        .map(json.dumps)\
        .saveAsTextFile(mentions_path, compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")

In [32]:
mentions_path = 'hdfs://schwa11:9000/commoncrawl/web/201707-mentions/'
mentions_by_target = sc.textFile(mentions_path).map(json.loads)

agg_mentions_by_target = mentions_by_target.mapValues(aggregate_mentions).persist()
num_entities = agg_mentions_by_target.count()

In [33]:
num_entities

10970267

# cc-news

In [82]:
if False:
    def filter_doc_endpoint_links(doc):
        links = []
        for l in doc['links']:
            if l['endpoint'] >= 0.95:
                anchor = doc['text'][l['start']:l['stop']].lower().strip().translate(strip_punc)
                if anchor not in bad_anchors and len(anchor) < 50:
                    l['anchor'] = anchor
                    l['target'] = normalize_url(l['target'])
                    links.append(l)
        doc['links'] = links
        return doc

    docs = sc\
        .textFile(data_path + '/docs')\
        .map(json.loads)\
        .map(filter_doc_endpoint_links)\
        .filter(lambda d: d['links'])\
        .map(lambda d: (d['_id'], d))\
        .groupByKey()\
        .map(lambda (k, ds): list(ds)[0])\
        .cache()
    docs.count()

    em = text.EntityMentions(sentence_window=3, lowercase=True, normalize_url=False, strict_sentences=False)

    mentions_by_target = em(docs)\
        .map(lambda m: (m['_id'], m))\
        .foldByKey([], fold_mention, 1000)\
        .mapValues(lambda ms: [(m['source'], get_ngrams(m['text'][slice(*m['span'])]), get_ngrams(m['text'])) for m in ms])\
        .repartition(512)\
        .cache()
    agg_mentions_by_target = mentions_by_target.mapValues(aggregate_mentions).cache()

In [28]:
import string
strip_punc = {ord(s):u'' for s in string.punctuation}

anchor_target_pairs = docs\
    .flatMap(lambda d: (
            (d['text'][l['start']:l['stop']].lower().strip().translate(strip_punc), l['target']) for l in d['links']))\
    .filter(lambda (a, t): a not in bad_anchors)\
    .cache()
anchor_target_pairs.count()

3914438

In [29]:
#MODEL_ID = 'simple_2_2.rf.kbco.model'
#model.write().overwrite().save(MODEL_ID)

## Cluster Entities

In [58]:
from tqdm import tqdm_notebook as tqdm
import xxhash
from pybloom import ScalableBloomFilter

from itertools import izip_longest
def grouper(iterable, n):
    args = [iter(iterable)] * n
    return izip_longest(*args)

def fold_aggregate_mentions(agg_mentions, mention):
    a_anchors, a_texts = agg_mentions
    b_anchors, b_texts = mention
    
    if len(b_anchors) < len(a_anchors):
        a_anchors, b_anchors = b_anchors, a_anchors
    if len(b_texts) < len(a_texts):
        a_texts, b_texts = b_texts, a_texts
    
    for k, v in a_anchors.iteritems():
        b_anchors[k] += v
    for k, v in a_texts.iteritems():
        b_texts[k] += v
    
    return b_anchors, b_texts

def share_an_endpoint(a, b, endpoints_by_target):
    a, b = endpoints_by_target[a], endpoints_by_target[b]
    if len(b) < len(a):
        a, b = b, a
    for i in a:
        if i in b:
            return True
    return False

def get_decisions(redirects, anchor_targets, target_counts, decided_pairs, endpoints_by_target, mulitple_decision_per_entity=False, max_decisions=1000000):
    decision_entities = set()
    decisions = []
    num_subsampled = 0

    anchors = anchor_targets.keys()
    random.shuffle(anchors)
    
    progress = tqdm(anchors)
    for anchor in progress:
        targets = anchor_targets[anchor]
        targets = set(redirects.get(t, t) for t in targets)
        if not mulitple_decision_per_entity:
            targets = [t for t in targets if t not in decision_entities]
        else:
            targets = list(targets)
        if len(targets) >= 2:
            if len(targets) >= 100:
                random.shuffle(targets)
            
            for tgts in grouper(targets, 100):
                if tgts[-1] == None:
                    tgts = [t for t in tgts if t != None]
                    if len(tgts) < 2:
                        break
                #targets = sorted(tgts, key=lambda k: target_counts[k], reverse=True)
                candidates = combinations(tgts, 2) if mulitple_decision_per_entity else grouper(tgts, 2)
                for x, y in candidates:
                    if x is None or y is None:
                        break
                    #for x, y in sorted(combinations(tgts, 2), key=lambda (a,b): min(target_counts[a], target_counts[b]), reverse=True):
                    #if mulitple_decision_per_entity or (x not in decision_entities and y not in decision_entities):
                    key = (x, y) if x < y else (y, x)
                    key = xxhash.xxh64('|'.join([x, y, str(target_counts[x]), str(target_counts[y])]).encode('utf-8')).hexdigest()
                    if key not in decided_pairs:
                        decided_pairs.add(key)

                        if not share_an_endpoint(x, y, endpoints_by_target):
                            decisions.append((x, y))
                            decision_entities.add(x)
                            decision_entities.add(y)
                            if len(decisions) % 10000 == 0:
                                progress.set_description("%i decisions" % len(decisions))
                            if len(decisions) >= max_decisions:
                                break
                if len(decisions) >= max_decisions:
                    break
            if len(decisions) >= max_decisions:
                break

    return decisions, num_subsampled

def light_resolve_decisions(model, redirects, decisions, agg_mentions_by_target):
    decision_entities = {}
    for i, (x, y) in enumerate(decisions):
        decision_entities.setdefault(x, []).append(i)
        decision_entities.setdefault(y, []).append(i)
    
    print('Broadcasting..')
    #decision_entities_bc = sc.broadcast(decision_entities)
    #redirects_bc = sc.broadcast({
    #    k: v for k, v in redirects.iteritems()
    #    if k in decision_entities or v in decision_entities
    #})
    redirects_min = {
        k: v for k, v in redirects.iteritems()
        if k in decision_entities or v in decision_entities
    }

    try:
        print('Clustering..')
        instances = agg_mentions_by_target\
            .map(lambda (t, ms): (redirects_min.get(t, t), ms))\
            .filter(lambda (t, ms): t in decision_entities)\
            .foldByKey((Counter(), Counter()), fold_aggregate_mentions)\
            .flatMap(lambda (t, ms): ((iid, (t, ms)) for iid in decision_entities[t]))\
            .groupByKey()\
            .filter(lambda (iid, ms): len(ms) == 2)\
            .map(lambda (iid, ((x, x_mentions), (y, y_mentions))): (x, y, get_base_features(x_mentions, y_mentions)))\
            .toDF(['x', 'y', 'base_features'])
        results = model.transform(instances).rdd.map(lambda r: (r['x'], r['y'], r['probability'][1])).collect()
    finally:
        pass
        #redirects_bc.unpersist(blocking=True)
        #redirects_bc.destroy()
        #decision_entities_bc.unpersist(blocking=True)
        #decision_entities_bc.destroy()

    return results

def resolve_decisions(model, clusters, decisions):
    instances = sc\
        .parallelize(decisions)\
        .zipWithUniqueId()\
        .flatMap(lambda ((x, y), iid): [(x, iid), (y, iid)])\
        .join(clusters)\
        .map(lambda (target, (iid, mentions)): (iid, (target, mentions)))\
        .groupByKey()\
        .filter(lambda (iid, ms): len(ms) == 2)\
        .map(lambda (iid, ((x, x_mentions), (y, y_mentions))): (x, y, get_base_features(x_mentions, y_mentions)))\
        .toDF(['x', 'y', 'base_features'])

    return model.transform(instances)\
        .rdd\
        .map(lambda r: (r['x'], r['y'], r['probability'][1]))\
        .collect()

#def transitive_resolve(source, redirects):
#    target = redirects[source]
#    while source != target:
#        source = target
#        target = redirects[source]
#    return target

def apply_merges(redirects, merges, target_counts, endpoints_by_target):
    sources_by_target = {}
    for src, target in redirects.iteritems():
        sources_by_target.setdefault(target, set()).add(src)

    for x, y, merge_prob in merges:
        if merge_prob >= 0.5:
            x = redirects.get(x, x)
            y = redirects.get(y, y)
            if x != y and not share_an_endpoint(x, y, endpoints_by_target):
                if target_counts[y] < target_counts[x]:
                    x, y = y, x
                for src in sources_by_target.get(x, set([x])):
                    redirects[src] = y
                    sources_by_target.setdefault(y, set()).add(src)
                    target_counts[y] += target_counts[src]
                    target_counts[src] = 0
                if x in sources_by_target:
                    del sources_by_target[x]
                endpoints_by_target[y].update(endpoints_by_target[x])
                del endpoints_by_target[x]

    return redirects

In [56]:
anchor_targets = dict(anchor_target_pairs\
    .map(tuple)\
    .distinct()\
    .groupByKey()\
    .mapValues(list)\
    .collect())

target_counts = dict(anchor_target_pairs\
    .map(lambda (a, t): (t, 1))\
    .reduceByKey(add)\
    .collect())

agg_cluster_mentions = agg_mentions_by_target

decided_pairs = set() #ScalableBloomFilter(initial_capacity=int(1e8), mode=ScalableBloomFilter.SMALL_SET_GROWTH, error_rate=1e-6)
redirects = {}

#targets = agg_cluster_mentions.map(lambda (k, ms): k).collect()

endpoints_by_target = defaultdict(set)
for target in targets:
    endpoints_by_target[target].add(normalize_endpoint_url(target))

merge_history = {}
merge_count_history = {}

In [None]:
MAX_DECISIONS_PER_ITER = 500000

decided_pairs = set()
merge_history = {}
merge_count_history = {}

for i in xrange(0, 50):
    print 'Iteration:', i
    greedy_merge = i != 0 and (len(decisions) < MAX_DECISIONS_PER_ITER/5.)
    if greedy_merge:
        print 'Greedy', len(merge_history[i-1])
    decisions, num_subsampled = get_decisions(
        redirects,
        anchor_targets,
        target_counts,
        decided_pairs,
        endpoints_by_target,
        mulitple_decision_per_entity=greedy_merge,
        max_decisions=MAX_DECISIONS_PER_ITER)
    if not decisions:
        print 'Done'
        break
    print 'Decisions', len(decisions), len(decided_pairs), num_subsampled
    merges = light_resolve_decisions(model, redirects, decisions, agg_cluster_mentions)
    merge_history[i] = merges
    merge_count_history[i] = sum(1 for x, y, m in merges if m >= 0.5)
    if (merge_count_history[i] + merge_count_history.get(i-1, 0)) == 0:
        if len(decisions) < MAX_DECISIONS_PER_ITER/10:
            print 'Probably Done'
            break
    print 'Merges', merge_count_history[i], '/', len(merge_history[i])

    redirects = apply_merges(redirects, merges, target_counts, endpoints_by_target)
    print 'Redirects', len(redirects)
        #.reduceByKey(lambda x, y: tuple(x[i]+y[i] for i in xrange(len(x))))
    #print 'Clusters', agg_cluster_mentions.count()

Iteration: 0



Decisions 500000 939806 0
Broadcasting..
Clustering..
Merges 464968 / 499977
Redirects 464968
Iteration: 1



Decisions 500000 1819678 0
Broadcasting..
Clustering..
Merges 473206 / 499990
Redirects 896212
Iteration: 2



Decisions 500000 2870570 0
Broadcasting..
Clustering..
Merges 459787 / 499972
Redirects 1285344
Iteration: 3



Decisions 500000 3938260 0
Broadcasting..
Clustering..
Merges 451046 / 499987
Redirects 1634487
Iteration: 4



Decisions 500000 4654842 0
Broadcasting..
Clustering..
Merges 472339 / 499988
Redirects 2064261
Iteration: 5



Decisions 500000 5644448 0
Broadcasting..
Clustering..
Merges 440163 / 499968
Redirects 2274994
Iteration: 6



Decisions 500000 6647858 0
Broadcasting..
Clustering..
Merges 454022 / 499987
Redirects 2577973
Iteration: 7



Decisions 500000 8020990 0
Broadcasting..
Clustering..
Merges 429423 / 499974
Redirects 2793730
Iteration: 8



Decisions 500000 9527787 0
Broadcasting..
Clustering..
Merges 436664 / 499974
Redirects 3008410
Iteration: 9



Decisions 431632 11047928 0
Broadcasting..
Clustering..
Merges 361885 / 431607
Redirects 3198948
Iteration: 10



Decisions 324407 12457639 0
Broadcasting..
Clustering..
Merges 260075 / 324381
Redirects 3328198
Iteration: 11



Decisions 253990 13792817 0
Broadcasting..
Clustering..
Merges 194411 / 253967
Redirects 3427890
Iteration: 12



Decisions 200712 15075136 0
Broadcasting..
Clustering..
Merges 145048 / 200686
Redirects 3500977
Iteration: 13



Decisions 161622 16312977 0
Broadcasting..
Clustering..
Merges 108296 / 161598
Redirects 3556428
Iteration: 14



Decisions 132328 17521563 0
Broadcasting..
Clustering..
Merges 82486 / 132305
Redirects 3598305
Iteration: 15



Decisions 109957 18705847 0
Broadcasting..
Clustering..
Merges 62935 / 109933
Redirects 3630599
Iteration: 16



Decisions 92891 19872776 0
Broadcasting..
Clustering..
Merges 47666 / 92868
Redirects 3655089
Iteration: 17
Greedy 92868


Decisions 500000 27653093 0
Broadcasting..
Clustering..
Merges 167808 / 499997
Redirects 3674381
Iteration: 18





Exception in thread Thread-13:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 80165 28806868 0
Broadcasting..
Clustering..
Merges 37684 / 80142
Redirects 3694463
Iteration: 19
Greedy 80142


Decisions 500000 36561292 0
Broadcasting..
Clustering..
Merges 218770 / 499959
Redirects 3714393
Iteration: 20





Exception in thread Thread-14:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 68630 37701563 0
Broadcasting..
Clustering..
Merges 28636 / 68605
Redirects 3729818
Iteration: 21
Greedy 68605


Decisions 500000 45723528 0
Broadcasting..
Clustering..
Merges 179280 / 499976
Redirects 3747113
Iteration: 22





Exception in thread Thread-15:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 58773 46850207 0
Broadcasting..
Clustering..
Merges 21359 / 58748
Redirects 3758856
Iteration: 23
Greedy 58748


Decisions 500000 53783157 0
Broadcasting..
Clustering..
Merges 200585 / 499895
Redirects 3777961
Iteration: 24





Exception in thread Thread-16:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 51092 54897477 0
Broadcasting..
Clustering..
Merges 16001 / 51068
Redirects 3786836
Iteration: 25
Greedy 51068


Decisions 500000 63388324 0
Broadcasting..
Clustering..
Merges 163272 / 499913
Redirects 3805168
Iteration: 26





Exception in thread Thread-17:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 45572 64493454 0
Broadcasting..
Clustering..
Merges 12718 / 45550
Redirects 3812511
Iteration: 27
Greedy 45550


Decisions 500000 80215285 0
Broadcasting..
Clustering..
Merges 143125 / 499637
Redirects 3832146
Iteration: 28





Exception in thread Thread-18:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 41511 81313096 0
Broadcasting..
Clustering..
Merges 10403 / 41490
Redirects 3838242
Iteration: 29
Greedy 41490


Decisions 500000 95602459 0
Broadcasting..
Clustering..
Merges 114677 / 499973
Redirects 3855090
Iteration: 30





Exception in thread Thread-19:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 38011 96690047 0
Broadcasting..
Clustering..
Merges 8343 / 37992
Redirects 3860037
Iteration: 31
Greedy 37992


Decisions 500000 111921390 0
Broadcasting..
Clustering..
Merges 129874 / 499980
Redirects 3877984
Iteration: 32





Exception in thread Thread-20:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 35210 112997754 0
Broadcasting..
Clustering..
Merges 6820 / 35188
Redirects 3882063
Iteration: 33
Greedy 35188


Decisions 500000 127055569 0
Broadcasting..
Clustering..
Merges 103999 / 499995
Redirects 3896863
Iteration: 34





Exception in thread Thread-21:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 32310 128124774 0
Broadcasting..
Clustering..
Merges 5539 / 32290
Redirects 3900261
Iteration: 35
Greedy 32290


Decisions 500000 156931577 0
Broadcasting..
Clustering..
Merges 100495 / 499757
Redirects 3915205
Iteration: 36





Exception in thread Thread-22:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 30311 157990132 0
Broadcasting..
Clustering..
Merges 4746 / 30289
Redirects 3918157
Iteration: 37
Greedy 30289


Decisions 500000 164700822 0
Broadcasting..
Clustering..
Merges 54264 / 499804
Redirects 3926047
Iteration: 38





Exception in thread Thread-23:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 27326 165751960 0
Broadcasting..
Clustering..
Merges 3182 / 27304
Redirects 3927977
Iteration: 39
Greedy 27304


Decisions 500000 176994658 0
Broadcasting..
Clustering..
Merges 61192 / 499893
Redirects 3937166
Iteration: 40





Exception in thread Thread-24:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 26093 178040753 0
Broadcasting..
Clustering..
Merges 3038 / 26074
Redirects 3939085
Iteration: 41
Greedy 26074


Decisions 500000 195277562 0
Broadcasting..
Clustering..
Merges 82455 / 499040
Redirects 3949132
Iteration: 42





Exception in thread Thread-25:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 24676 196316429 0
Broadcasting..
Clustering..
Merges 2542 / 24657
Redirects 3950749
Iteration: 43
Greedy 24657


Decisions 500000 216552316 0
Broadcasting..
Clustering..
Merges 59686 / 499741
Redirects 3959279
Iteration: 44





Exception in thread Thread-26:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 23793 217583974 0
Broadcasting..
Clustering..
Merges 2262 / 23774
Redirects 3960782
Iteration: 45
Greedy 23774


Decisions 500000 238121292 0
Broadcasting..
Clustering..
Merges 64309 / 499953
Redirects 3970465
Iteration: 46





Exception in thread Thread-27:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 22818 239144643 0
Broadcasting..
Clustering..
Merges 2121 / 22800
Redirects 3971881
Iteration: 47
Greedy 22800


Decisions 500000 256375467 0
Broadcasting..
Clustering..
Merges 59083 / 499893
Redirects 3980565
Iteration: 48





Exception in thread Thread-28:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 21526 257389757 0
Broadcasting..
Clustering..
Merges 1780 / 21509
Redirects 3981761
Iteration: 49
Greedy 21509


Decisions 500000 287209594 0
Broadcasting..
Clustering..
Merges 48852 / 499959
Redirects 3988978
Iteration: 50





Exception in thread Thread-29:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 20996 288213753 0
Broadcasting..
Clustering..
Merges 1623 / 20979
Redirects 3990064
Iteration: 51
Greedy 20979


Decisions 500000 314440369 0
Broadcasting..
Clustering..
Merges 37249 / 499053
Redirects 3994918
Iteration: 52





Exception in thread Thread-30:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 20090 315434552 0
Broadcasting..
Clustering..
Merges 1263 / 20073
Redirects 3995786
Iteration: 53
Greedy 20073


Decisions 500000 332389837 0
Broadcasting..
Clustering..
Merges 35184 / 499208
Redirects 4000393
Iteration: 54





Exception in thread Thread-31:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/n/schwafs/home/andy/repos/sift/ve/local/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/n/schwafs/home/andy/repos/sift/ve/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Decisions 19455 333376068 0
Broadcasting..
Clustering..
Merges 1054 / 19443
Redirects 4001071
Iteration: 55
Greedy 19443


Decisions 500000 364492165 0
Broadcasting..
Clustering..


In [None]:
11

In [61]:
len(redirects)

4001071

In [175]:
#Decisions 3614055 69384167

In [176]:
!hadoop fs -ls $data_path

Found 8 items
drwxr-xr-x   - andy supergroup          0 2017-10-03 12:51 hdfs://schwa11:9000/webkb/candidates
drwxr-xr-x   - andy supergroup          0 2017-10-03 12:51 hdfs://schwa11:9000/webkb/clusters
drwxr-xr-x   - andy supergroup          0 2017-11-22 20:56 hdfs://schwa11:9000/webkb/clusters-simple_2_1.kbco.model
drwxr-xr-x   - andy supergroup          0 2017-11-22 18:51 hdfs://schwa11:9000/webkb/clusters-simple_4_1.kbco.model
drwxr-xr-x   - andy supergroup          0 2017-08-05 14:03 hdfs://schwa11:9000/webkb/dataset
drwxr-xr-x   - andy supergroup          0 2017-08-05 14:19 hdfs://schwa11:9000/webkb/docs
drwxr-xr-x   - andy supergroup          0 2017-08-05 14:05 hdfs://schwa11:9000/webkb/test
drwxr-xr-x   - andy supergroup          0 2017-08-05 14:04 hdfs://schwa11:9000/webkb/train


In [64]:
MODEL_ID = 'simple_2_2.kbco.model'

In [None]:
if True:
    #!hadoop fs -rm -r /webkb/ccweb/candidates
    sc\
        .parallelize(anchor_targets.iteritems())\
        .map(json.dumps)\
        .saveAsTextFile(data_path + '/ccweb/candidates')
if False:
    #!hadoop fs -rm -r /webkb/ccweb/clusters-simple_2_2.kbco.model
    sc\
        .parallelize(redirects.iteritems())\
        .map(json.dumps)\
        .saveAsTextFile(data_path + '/ccweb/clusters-'+ MODEL_ID)

In [14]:
import numpy
from numpy.random import choice
from collections import defaultdict
from itertools import tee
import ujson as json

instances = []
with open('kbco-annotations.jsonl', 'r') as f:
    for line in f:
        instances.append(json.loads(line))

instance_metrics = []

for instance in instances:
    a_valid_link = instance['annotation']['a_link_type'] in ['entity', 'tag']
    b_valid_link = instance['annotation']['b_link_type'] in ['entity', 'tag']
    if a_valid_link and b_valid_link:
        decision = instance['annotation']['decision']
        if decision in ['yes', 'no']:
            decision = decision == 'yes'
            #system_decision = True # baseline
            # todo: exclude cases where instances not in redirects
            system_decision = redirects.get(instance['item']['a'], instance['item']['a']) == redirects.get(instance['item']['b'], instance['item']['b'])
            
            tp = 1.0 if decision and system_decision else 0
            tn = 1.0 if not decision and not system_decision else 0
            fp = 1.0 if system_decision and not decision else 0
            fn = 1.0 if not system_decision and decision else 0
            
            instance_metrics.append({
                'tp': tp,
                'tn': tn,
                'fp': fp,
                'fn': fn
            })
            if len(instance_metrics) == 500:
                break

def compute_aggregate_metrics(metric_samples):
    agg_by_key = {}
    for item in metric_samples:
        for k, v in item.items():
            if k not in agg_by_key:
                agg_by_key[k] = 0
            agg_by_key[k] += v
    tp = agg_by_key['tp']
    tn = agg_by_key['tn']
    fp = agg_by_key['fp']
    fn = agg_by_key['fn']

    p = tp/(tp+fp) if (tp+fp) > 0 else 0.
    r = tp/(tp+fn) if (tp+fp) > 0 else 0.
    n = tp+tn+fn+fp
    a = (tp+tn)/n
    f = 2*p*r/(p+r) if (p+r) > 0 else 0.
    
    return {
        'a': a,
        'p': p,
        'r': r,
        'f': f,
        'n': n
    }

N_RESAMPLES = 10000

metric_samples = []
for _ in range(N_RESAMPLES):
    metric_samples.append(compute_aggregate_metrics(choice(instance_metrics, size=len(instance_metrics), replace=True)))

agg_metric_samples = defaultdict(list)
for s in metric_samples:
    for k, v in s.iteritems():
        agg_metric_samples[k].append(v)

base_metric_estimates = compute_aggregate_metrics(instance_metrics)

#print(MODEL_ID)
for k, samples in agg_metric_samples.iteritems():
    scale = 1 if k == 'n' else 100

    print k.ljust(10), '   %.2f' % (base_metric_estimates[k] * scale)
    print '95% CI'.rjust(10), ' - '.join(['%.1f' % (s*scale) for s in numpy.percentile(samples, [2.5, 97.5])])
    print '99% CI'.rjust(10), ' - '.join(['%.1f' % (s*scale) for s in numpy.percentile(samples, [0.5, 99.5])])

a             22.20
    95% CI 18.6 - 25.8
    99% CI 17.6 - 27.0
p             100.00
    95% CI 0.0 - 100.0
    99% CI 0.0 - 100.0
r             0.51
    95% CI 0.0 - 1.3
    99% CI 0.0 - 1.6
f             1.02
    95% CI 0.0 - 2.6
    99% CI 0.0 - 3.1
n             500.00
    95% CI 500.0 - 500.0
    99% CI 500.0 - 500.0


In [None]:
a             77.20
    95% CI 73.4 - 80.8
    99% CI 72.2 - 81.8
p             82.44
    95% CI 78.7 - 86.0
    99% CI 77.4 - 87.1
r             90.03
    95% CI 86.9 - 92.9
    99% CI 85.8 - 93.8
f             86.06
    95% CI 83.4 - 88.5
    99% CI 82.6 - 89.3
n             500.00
    95% CI 500.0 - 500.0
    99% CI 500.0 - 500.0

In [None]:
if False:
    ms = [merges[91], merges[128]] + [(
        u'news.nationalpost.com/tag/european-court-of-human-rights',
        u'chroniclelive.co.uk/all-about/newcastle-crown-court',
        True
    ), (
        u'data.cnbc.com/quotes/aig',
        u'mirror.co.uk/all-about/court-case',
        True
    )]

    redirects = {t:t for t in agg_cluster_mentions.map(lambda (k, ms): k).collect()}
    redirects = apply_merges(redirects, ms, target_counts)
    sources_by_target = defaultdict(set)
    for src, target in redirects.iteritems():
        sources_by_target[target].add(src)

In [74]:
sources_by_target = defaultdict(set)
for src, target in redirects.iteritems():
    sources_by_target[target].add(src)

In [75]:
print 'Total clusters:', len(sources_by_target)
for s, srcs in sorted(sources_by_target.iteritems(), key=lambda (k, v): len(v), reverse=True)[:10]:
    print len(srcs), '\t', s

Total clusters: 659365
222 	hollywoodreporter.com/heat-vision/stranger-things-breakout-millie-bobby-brown-set-star-godzilla-sequel-969505
168 	rotoworld.com/player/nba/1316/paul-millsap
161 	sun-sentinel.com/topic/sports/basketball/goran-dragic-pespt0000010725-topic.html
150 	rotoworld.com/player/nba/1593/blake-griffin
136 	fortune.com/fortune500/qualcomm-110
133 	abcnews.go.com/topics/sports/tennis/australian-open.htm
132 	celebrityinsider.org/tag/justin-bieber
129 	liverpoolecho.co.uk/all-about/liverpool-fc
126 	espn.com/nba/player/_/id/1975/carmelo-anthony
123 	chicagotribune.com/topic/sports/basketball/jimmy-butler-pespt0011638-topic.html


In [None]:
Total clusters: 704324
194 	justjared.com/tags/2017-oscars
142 	in.bookmyshow.com/plays/agnipankh/et00054692#!svg-balg
114 	itunes.apple.com/gb/app/the-scotsman/id909030944?mt=8
114 	draftexpress.com/profile/isaac-bonga-84711/stats
113 	mirror.co.uk/all-about/andy-murray
112 	timesofindia.indiatimes.com/city/hubballi/dharwad-teen-wakes-up-on-way-to-his-funeral/articleshow/57242895.cms
109 	wwd.com/fashion-news/designer-luxury/pharrell-williams-kristen-stewart-handbag-campaign-to-chanel-duties-10736434
105 	espn.com/mlb/team/_/name/wsh/washington-nationals
104 	globalgrind.com/tag/drake
104 	celebrityinsider.org/tag/brad-pitt

In [68]:
sorted(target_counts.iteritems(), key=lambda (t, c): c, reverse=True)[:10]

[(u'play.google.com/store/apps/details?id=com.indianexpress.android&hl=en',
  37134),
 (u'topix.com/who/donald-trump', 10216),
 (u'indianexpress.com/about/narendra-modi', 8116),
 (u'mirror.co.uk/all-about/manchester-united-fc', 8110),
 (u'chicagotribune.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
  7883),
 (u'express.co.uk/football/teams/17/chelsea', 6345),
 (u'abcnews.go.com/topics/news/donald-trump.htm', 6092),
 (u'express.co.uk/football/teams/8/manchester-united', 5493),
 (u'washingtontimes.com/topics/national-football-league', 5237),
 (u'liverpoolecho.co.uk/all-about/liverpool-fc', 5042)]

In [41]:
#[a for a, ts in anchor_targets.iteritems() if 'radiofreemoscow.org/tag/donald-trump' in ts]
#'radiofreemoscow.org/tag/donald-trump' in anchor_targets['trump']
#for t in sources_by_target[redirects['radiofreemoscow.org/tag/donald-trump']]:
#    print [a for a, ts in anchor_targets.iteritems() if t in ts]

In [70]:
sources_by_target[redirects['nytimes.com/topic/person/donald-trump']]

{u'allure.com/topic/donald-trump',
 u'aurorasentinel.com/tag/president-trump',
 u'baltimoresun.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
 u'breitbart.com/tag/president-trump',
 u'capitalgazette.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
 u'carrollcountytimes.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
 u'chicagotribune.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
 u'citypaper.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
 u'courant.com/topic/business/consumer-goods-industries/clothing-textiles-industry/tommy-hilfiger-pebsl000104-topic.html',
 u'courant.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
 u'dailycollegian.com/tag/president-trump',
 u'dailyegyptian.com/tag/donald',
 u'dailypress.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
 u'darkroom.baltimoresun.com/tag/president-trump',
 u'data.cnbc.com/quotes/pvh',
 u'dazeddigi

In [71]:
sources_by_target[redirects['radiofreemoscow.org/tag/donald-trump']] #'nytimes.com/topic/person/donald-trump']

{u'abcnews.go.com/topics/news/canada.htm',
 u'agcanada.com/tag/british-columbia',
 u'archdaily.com/tag/houses-of-parliament',
 u'beaumontenterprise.com/search/?action=search&channel=news%2ftexas&inlinelink=1&searchindex=gsa&query=%22donald+trump%22',
 u'bristolpost.co.uk/all-about/politics',
 u'catch21.co.uk/tag/vote',
 u'caymannewsservice.com/tag/donald-trump',
 u'chrisd.ca/tag/british-columbia',
 u'chron.com/search/?action=search&channel=sports%2ftexans&inlinelink=1&searchindex=gsa&query=%22donald+trump%22',
 u'chroniclelive.co.uk/all-about/politics',
 u'dailyrecord.co.uk/all-about/uk-government',
 u'dazeddigital.com/tag/general-election-2017',
 u'economictimes.indiatimes.com/topic/polls',
 u'en.wikipedia.org/wiki/british_columbia',
 u'fox61.com/tag/vote',
 u'fr.wikipedia.org/wiki/liste_de_sondages_sur_les_%c3%a9lections_l%c3%a9gislatives_fran%c3%a7aises_de_2017',
 u'gaytravelinformation.com/category/british-columbia',
 u'gethampshire.co.uk/all-about/politics',
 u'glamourmagazine.co.

In [318]:
x = redirects['radiofreemoscow.org/tag/donald-trump']
y = redirects[redirects['nytimes.com/topic/person/donald-trump']]
x, y

(u'globalgrind.com/tag/donald+trunp', u'globalgrind.com/tag/donald-trump')

In [45]:
x = redirects['radiofreemoscow.org/tag/donald-trump']
y = redirects['nytimes.com/topic/person/donald-trump']

#resolve_decisions(model, agg_cluster_mentions, [
#    (x,y)
#])

In [78]:
x = 'en.wikipedia.org/wiki/tesla,_inc.'
y = 'topix.com/who/tesla'
x = 'en.wikipedia.org/wiki/tesla_model_x'
y = 'postnewsgroup.com/blog/tag/tesla'
#x = 'radiofreemoscow.org/tag/donald-trump'
#y = 'nytimes.com/topic/person/donald-trump'
#x, y = redirects[x], redirects[y]
#key = (x, y, target_counts[x], target_counts[y]) if x < y else (y, x)


key = (x, y) if x < y else (y, x)
key = xxhash.xxh64('|'.join([x, y, str(target_counts[x]), str(target_counts[y])]).encode('utf-8')).hexdigest()
key in decided_pairs, x == y, x, y, key

(False,
 False,
 'en.wikipedia.org/wiki/tesla_model_x',
 'postnewsgroup.com/blog/tag/tesla',
 '167c9af6ad6831d89f1e750e8f3505a8fa068055')

In [None]:
{u'en.wikipedia.org/wiki/tesla_model_x'},
 {u'postnewsgroup.com/blog/tag/tesla'},

In [250]:
len(sources_by_target)

666386

In [81]:
[srcs for t, srcs in sorted(sources_by_target.iteritems(),key=lambda (k,v):len(v), reverse=True) if 'trump' in t and not 'see-sweeping' in t]

[{u'allure.com/topic/donald-trump',
  u'aurorasentinel.com/tag/president-trump',
  u'baltimoresun.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
  u'breitbart.com/tag/president-trump',
  u'capitalgazette.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
  u'carrollcountytimes.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
  u'chicagotribune.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
  u'citypaper.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
  u'courant.com/topic/business/consumer-goods-industries/clothing-textiles-industry/tommy-hilfiger-pebsl000104-topic.html',
  u'courant.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
  u'dailycollegian.com/tag/president-trump',
  u'dailyegyptian.com/tag/donald',
  u'dailypress.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
  u'darkroom.baltimoresun.com/tag/president-trump',
  u'data.cnbc.com/quotes/pvh

In [464]:
sources_by_target[redirects['blackamericaweb.com/tag/steve-bannon']]

{u'abcnews.go.com/topics/news/donald-trump.htm',
 u'aurorasentinel.com/tag/president-donald-trump',
 u'blackamericaweb.com/tag/steve-bannon',
 u'boiseweekly.com/boise/articlearchives?tag=president%20donald%20trump',
 u'dailyegyptian.com/tag/donald-john-trump',
 u'economictimes.indiatimes.com/topic/us-president-donald-trump',
 u'electronicintifada.net/tags/muslim-ban',
 u'glamour.com/about/donald-trump',
 u'globalvillageextra.com/en/index.php/tag/president-donald-trump',
 u'golfdigest.com/topic/donald-trump',
 u'haaretz.com/misc/tags/tags-1.679173',
 u'herald.ng/tag/muslim-ban',
 u'independent.co.uk/topic/donaldtrump',
 u'justjared.com/tags/president-donald-trump',
 u'ksfm.cbslocal.com/tag/president-donald-trump',
 u'marquettewire.org/tag/president-donald-trump',
 u'mashable.com/category/donald-trump',
 u'mirror.co.uk/all-about/donald-trump-muslim-ban',
 u'newsone.com/tag/donald-trump',
 u'newsx.com/tags/donald-trump',
 u'pagesix.com/tag/president-donald-trump',
 u'people.com/people/new

In [264]:
x, y = redirects['radiofreemoscow.org/tag/donald-trump'], redirects['nytimes.com/topic/person/donald-trump']
key = (x, y) if x < y else (y, x)
key in decided_pairs

True

In [163]:
sources_by_target[redirects['breitbart.com/tag/trump']]

{u'breitbart.com/tag/trump'}

In [271]:
sources_by_target[redirects['radiofreemoscow.org/tag/donald-trump']]

{u'agcanada.com/tag/european-union',
 u'aljazeera.com/topics/organisations/european-union.html',
 u'arktimes.com/arkansas/articlearchives?tag=trump%20white%20house',
 u'aurorasentinel.com/tag/president-donald-trump',
 u'baltimoresun.com/topic/business/economy/european-union-orgov000067-topic.html',
 u'birminghammail.co.uk/all-about/european-union',
 u'blackamericaweb.com/tag/donald-trump',
 u'breitbart.com/tag/european-union',
 u'calvinayre.com/tag/donald-trump',
 u'capitalgazette.com/topic/politics-government/government/white-house-plcul000110-topic.html',
 u'carrollcountytimes.com/topic/politics-government/government/white-house-plcul000110-topic.html',
 u'chicagotribune.com/topic/business/economy/european-union-orgov000067-topic.html',
 u'chron.com/search/?action=search&channel=news%2fmedical&inlinelink=1&searchindex=gsa&query=%22trump+white+house%22',
 u'citypaper.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
 u'courant.com/topic/business/economy/european-unio

In [270]:
sources_by_target[redirects['nytimes.com/topic/person/donald-trump']]

{u'aurorasentinel.com/tag/president-trump',
 u'blackamericaweb.com/tag/president-trump',
 u'breitbart.com/tag/president-trump',
 u'dailycollegian.com/tag/president-trump',
 u'economictimes.indiatimes.com/topic/president-trump',
 u'herald.ng/tag/president-trump',
 u'krebsonsecurity.com/tag/president-trump',
 u'ksfm.cbslocal.com/tag/president-trump',
 u'northcoastjournal.com/humboldt/articlearchives?tag=president%20trump',
 u'nytimes.com/topic/person/donald-trump',
 u'orlandoweekly.com/orlando/articlearchives?tag=president%20trump',
 u'phillymag.com/tag/president-trump',
 u'rejournals.com/tag/president-trump',
 u'southflorida.com/topic/politics-government/donald-trump-pebsl000163-topic.html',
 u'thedailycougar.com/tag/president-trump',
 u'thefader.com/artist/donald-trump',
 u'universe.byu.edu/tag/president-trump',
 u'washingtontimes.com/topics/president-trump',
 u'wdef.com/tag/president-trump',
 u'wgnradio.com/tag/president-trump',
 u'wzdm.com/tag/president-trump',
 u'wzlx.cbslocal.com/t

In [161]:
redirects['nytimes.com/topic/person/donald-trump']

u'thefader.com/artist/donald-trump'

In [160]:
sources_by_target[redirects['breitbart.com/tag/melania-trump']]

{u'breitbart.com/tag/melania-trump',
 u'capitalgazette.com/topic/politics-government/melania-trump-pcelb00150-topic.html',
 u'carrollcountytimes.com/topic/politics-government/melania-trump-pcelb00150-topic.html',
 u'chicagotribune.com/topic/melania-trump/pcelb00150-topic.html',
 u'dailypress.com/topic/melania-trump/pcelb00150-topic.html',
 u'economictimes.indiatimes.com/topic/melania-trump',
 u'mashable.com/category/melania-trump',
 u'mcall.com/topic/politics-government/melania-trump-pcelb00150-topic.html'}

In [159]:
sources_by_target[redirects['breitbart.com/tag/ivanka-trump']]

{u'breitbart.com/tag/ivanka-trump'}

In [157]:
sources_by_target[redirects['breitbart.com/tag/trump-tower']]

{u'bisnow.com/tags/trump-tower',
 u'breitbart.com/tag/trump-tower',
 u'dailycollegian.com/tag/trump-tower',
 u'gaycitynews.nyc/tag/trump-tower',
 u'thedailybeast.com/content/dailybeast/topics/trump-tower.html',
 u'torontoist.com/tag/trump-tower',
 u'yahoo.com/celebrity/tagged/trump-tower'}

In [156]:
sources_by_target[redirects['thehill.com/people/hillary-clinton']]

{u'thehill.com/people/hillary-clinton'}

In [155]:
sources_by_target[redirects['espn.com/sports/tennis/players/profile?playerid=735']]

{u'chicagotribune.com/topic/sports/tennis/andrea-petkovic-pespt000009196-topic.html',
 u'economictimes.indiatimes.com/topic/andrea-petkovic',
 u'espn.com/sports/tennis/players/profile?playerid=735',
 u'mirror.co.uk/all-about/andrea-petkovic',
 u'sports.yahoo.com/olympics/rio-2016/a/1126203'}

In [154]:
sources_by_target[redirects['globalgrind.cassuislife.com/tag/kanye-west']]

{u'globalgrind.cassuislife.com/tag/kanye-west',
 u'indiewire.com/t/kanye-west',
 u'ksfm.cbslocal.com/tag/kanye-kardashian',
 u'metacritic.com/person/kanye-west?filter-options=music',
 u'ok.co.uk/search/kanye%20west',
 u'power92chicago.com/tag/kanye-west',
 u'radaronline.com/tag/kanye-west',
 u'teenvogue.com/tag/kanye-west',
 u'thesuperficial.com/tag/kanye-west',
 u'wwd.com/tag/kanye-west'}

In [153]:
sources_by_target[redirects['glamour.com/about/jessica-alba']]

{u'allure.com/topic/jessica-alba',
 u'broadwayworld.com/people/jessica-alba',
 u'chicagotribune.com/topic/entertainment/movies/jessica-alba-peclb000042-topic.html',
 u'contactmusic.com/jessica-alba',
 u'en.wikipedia.org/wiki/jessica_alba',
 u'eonline.com/au/news/jessica_alba',
 u'forbes.com/profile/jessica-alba',
 u'glamour.com/about/jessica-alba',
 u'hollywoodlife.com/tag/jessica-alba',
 u'imdb.com/name/nm0004695/?ref_=nv_sr_1',
 u'indiewire.com/t/jessica-alba',
 u'justjared.com/tags/jessica-alba',
 u'latinpost.com/tags/jessica-alba',
 u'macdailynews.com/tag/jessica-alba',
 u'nypost.com/tag/jessica-alba',
 u'self.com/topic/jessica-alba',
 u'tmz.com/person/jessica-alba',
 u'topix.com/who/jessica-alba',
 u'vanityfair.com/people/jessica-alba#intcid=dt-hot-link',
 u'vogue.com/tag/celebrity/jessica-alba',
 u'wwd.com/tag/jessica-alba',
 u'yahoo.com/celebrity/tagged/jessica-alba'}

In [38]:
target_anchors = dict(anchor_target_pairs\
    .map(lambda (k, v): (v, k))\
    .groupByKey()\
    .mapValues(list)\
    .collect())

In [62]:
anchor_targets_rdd = anchor_target_pairs.groupByKey().mapValues(list).coalesce(128).cache()

In [67]:
target_anchors_rdd = anchor_target_pairs.map(lambda (k, v): (v, k)).groupByKey().mapValues(list).coalesce(128).cache()

In [100]:
len(anchor_targets['trump'])

160

In [101]:
len(anchor_targets['donald trump'])

383

In [182]:
set(anchor_targets['trump']) - set(anchor_targets['donald trump'])

{u'41nbc.com/tag/trump',
 u'africasacountry.com/tag/trump',
 u'agcanada.com/tag/trump',
 u'arktimes.com/arkansas/articlearchives?tag=trump',
 u'aurorasentinel.com/tag/trump',
 u'baltimoresun.com/topic/business/ivanka-trump-peclb00000010046-topic.html',
 u'bankerandtradesman.com/tag/trump',
 u'breitbart.com/tag/trump',
 u'capitalgazette.com/topic/business/ivanka-trump-peclb00000010046-topic.html',
 u'carrollcountytimes.com/topic/business/ivanka-trump-peclb00000010046-topic.html',
 u'cccadvocate.com/tag/trump',
 u'chicagotribune.com/topic/business/ivanka-trump-peclb00000010046-topic.html',
 u'christianitytoday.com/ct/topics/d/donald-trump',
 u'citynews.com.au/tag/trump',
 u'collegian.com/tag/trump',
 u'commercialrecord.com/tag/trump',
 u'courant.com/topic/business/ivanka-trump-peclb00000010046-topic.html',
 u'csindy.com/coloradosprings/articlearchives?tag=trump',
 u'dailycollegian.com/tag/trump',
 u'dailyegyptian.com/tag/trump',
 u'dallasvoice.com/tag/trump',
 u'darkroom.baltimoresun.com

In [171]:
anchor_target_pairs.map(lambda (k, v): v).distinct().count()

957573

In [None]:
a -> b
a -> d

b -> c

In [195]:
from itertools import combinations

In [196]:
redirects = anchor_target_pairs\
    .groupByKey()\
    .filter(lambda (k, v): k not in bad_anchors)\
    .mapValues(sorted)\
    .flatMap(lambda (anchor, targets): combinations(targets, 2))

In [None]:
anchor_target_pairs\
    .groupByKey()\
    .filter(lambda (k, v): k not in bad_anchors)\
    .mapValues(sorted)

In [200]:
clusters = .count()

69792476

In [202]:
redirects.take(10)

[(u'tribuneonlineng.com/author/david', u'tribuneonlineng.com/author/toba'),
 (u'baseball-reference.com/player_search.cgi?results=sanchca01,sanche012car,sanche009car',
  u'baseball-reference.com/search/search.fcgi?pid=sanchca01,sanche014car,sanche012car&search=carlos+sanchez'),
 (u'baseball-reference.com/player_search.cgi?results=sanchca01,sanche012car,sanche009car',
  u'baseball-reference.com/search/search.fcgi?results=sanchca01,sanche012car,sanche014car&search=carlos+sanchez'),
 (u'baseball-reference.com/player_search.cgi?results=sanchca01,sanche012car,sanche009car',
  u'baseball-reference.com/search/search.fcgi?results=sanchca01,sanche014car,sanche012car&search=carlos+sanchez'),
 (u'baseball-reference.com/player_search.cgi?results=sanchca01,sanche012car,sanche009car',
  u'cbssports.com/mlb/players/playerpage/1956971/carlos-sanchez'),
 (u'baseball-reference.com/player_search.cgi?results=sanchca01,sanche012car,sanche009car',
  u'espn.com/mlb/player/_/id/32264/carlos-sanchez'),
 (u'base

In [199]:
redirects.distinct().count()

69733175

In [193]:
redirects.filter(lambda (s, t): t == 'breitbart.com/tag/trump').take(1)

[]

In [176]:
redirects = anchor_target_pairs\
    .groupByKey()\
    .filter(lambda (k, v): k not in bad_anchors)\
    .mapValues(sorted)\
    .flatMap(lambda (anchor, targets): ((t, targets[0]) for t in targets))\
    .groupByKey()\
    .mapValues(sorted)\
    .flatMap(lambda (s, targets): ((s, targets[0]) for t in targets))\
    .distinct()
redirects.count()

940321

In [177]:
redirects.lookup('abcnews.go.com/topics/news/donald-trump.htm')

[u'24wrestling.com/tag/donald-trump']

In [179]:
redirects.lookup('africasacountry.com/tag/donald-trump')

[u'24wrestling.com/tag/donald-trump']

In [183]:
redirects.lookup('breitbart.com/tag/trump')

[u'41nbc.com/tag/trump']

In [188]:
redirects2.lookup('24wrestling.com/tag/donald-trump')

[]

In [170]:
redirects = redirects.collect()

In [185]:
redirects2 = redirects\
    .map(lambda (s, t): (t, s))\
    .leftOuterJoin(redirects)\
    .map(lambda (target, (src, redirect)): (src, redirect or target))\
    .filter(lambda (s, t): s != t)\
    .distinct()\
    .cache()
redirects2.count()

583434

In [154]:
redirects3 = redirects\
    .map(lambda (s, t): (t, s))\
    .leftOuterJoin(redirects)\
    .map(lambda (target, (src, redirect)): (src, redirect or target))\
    .filter(lambda (s, t): s != t)\
    .distinct()\
    .cache()
redirects3.count()

640716

In [132]:
731221 - 612503

118718

In [155]:
rd1 = redirects.collect()

In [156]:
rd2 = redirects2.collect()

In [161]:
[v for k, v in rd1 if k == 'liverpoolecho.co.uk/all-about/anfield']

[u'dailypost.co.uk/all-about/liverpool-fc']

In [164]:
[v for k, v in rd2 if k == 'liverpoolecho.co.uk/all-about/anfield']

[u'chroniclelive.co.uk/all-about/jurgen-klopp',
 u'chesterchronicle.co.uk/all-about/liverpool-fc',
 u'archdaily.com/tag/liverpool',
 u'chroniclelive.co.uk/all-about/liverpool-fc']

In [160]:
list(set(rd2) - set(rd1))[0]

(u'liverpoolecho.co.uk/all-about/anfield',
 u'chroniclelive.co.uk/all-about/liverpool-fc')

In [121]:
redirects = redirects\
    .map(lambda (s, t): (t, s))\
    .leftOuterJoin(redirects)\
    .map(lambda (target, (src, redirect)): (src, redirect or target))\
    .cache()
redirects.count()

3400347

In [122]:
redirects.distinct().count()

1329372

In [None]:
local_redirects = dict(redirects.collect())

In [51]:
from collections import Counter
anchors = Counter()
for t in anchor_targets['government']:
    anchors.update(target_anchors[t])

for anchor, count in sorted(anchors.iteritems(), key=lambda (k, v): v, reverse=True)[:30]:
    print count, '\t', anchor

55 	government
6 	government’s
5 	the government
4 	tory government
3 	mps
3 	politician
3 	governments
3 	commons
3 	parliament
3 	tory
3 	prime minister
3 	conservatives
3 	houses of parliament
3 	conservative party
3 	politicians
3 	political
2 	conservative
2 	by-election
2 	government's
2 	whitehall
2 	general election
2 	ministers
2 	local authorities
2 	council
2 	vote
2 	the tories
2 	labour
2 	downing street
2 	elections
2 	uk government


380 	donald trump
69 	donald trump’s
61 	trump
54 	president trump
42 	president donald trump
23 	donald j. trump
18 	donald trump 
18 	 donald trump
16 	president trump’s
13 	donald trump's
9 	trump 
9 	us president donald trump
9 	trump’s
9 	president donald trump’s
8 	donald trump’
8 	donald trump
7 	president
7 	trump administration
7 	donald
6 	donald 
6 	donald trump,
6 	donald trump.
6 	mr trump
5 	president 
5 	president-elect donald trump
5 	 donald trump 
5 	[donald] trump
5 	mr. trump
5 	president trump's
4 	u.s. president donald trump


In [None]:
anchor_targets