In [50]:
import requests
import json
from collections import Counter, defaultdict
from bs4 import BeautifulSoup
from math import sqrt

### Load the dataset

In [21]:
docs = []
with open('entity_lists.multi_json') as f:
    for line in f:
        docs.append(json.loads(line))

In [26]:
docs[0].keys()

[u'url', u'headline', u'timestamp', u'entities']

### Count entity co-occurrence

In [42]:
occurrence = Counter()
N = 0.
for d in docs:
    es = list(set(e['id'] for e in d['entities']))
    N += len(es)
    for i in xrange(len(es)):
        for j in xrange(i, len(es)):
            a, b = es[i], es[j]
            if a < b:
                a, b = b, a
            occurrence[a,b] += 1
            occurrence[a, a] += 1
            occurrence[b, b] += 1

### Use t-test to rank entity pairs

In [44]:
def t_test(c1, c2, c12, n):
    p1 = c1 / n
    p2 = c2 / n
    p12 = c12 / n
    return (p12 - p1 * p2) / sqrt(p12 / n)

In [45]:
scores = {}
for a, b in occurrence.keys():
    if a != b:
        scores[a, b] = t_test(occurrence[a,a], occurrence[b,b], occurrence[a, b], N)

### List the top related entity pairs

In [47]:
sorted(scores.iteritems(), key=lambda (k, v): v, reverse=True)[:10]

[((u'Jelena Doki\u0107', u'Dinara Safina'), 2.330584101453757),
 ((u'Sydney Business Chamber', u'Patricia Forsythe'), 2.1610823258623144),
 ((u'West Metro', u'Infrastructure Australia'), 2.104464521261935),
 ((u'HMAS Sydney (1912)', u'AS.34 Kormoran'), 2.057547197131716),
 ((u'West Metro', u'CBD Metro'), 2.0512912244487085),
 ((u'Paul McLeay', u'Eddie Obeid'), 2.000149801840159),
 ((u'Hassan Nasrallah', u'Beirut'), 1.9853310986479789),
 ((u'Quadrant Bus Station', u'Keith Windschuttle'), 1.9645132907757097),
 ((u'South Yarra, Victoria', u'Bob Jane'), 1.9541724725189658),
 ((u'The Chief-Leader', u'John Hartley (academic)'), 1.9405480724570368)]

### List the least related entity pairs

In [57]:
sorted(scores.iteritems(), key=lambda (k, v): v)[:10]

[((u'New South Wales', u'Gaza'), -383.4599267172421),
 ((u'Palestinian people', u'New South Wales'), -323.77019146410703),
 ((u'Mahmoud Abbas', u'Australia'), -273.1080886964633),
 ((u'New South Wales', u'Hamas'), -267.94862131942654),
 ((u'Gaza Strip', u'Australia'), -265.7657388589313),
 ((u'Sydney', u'Australia'), -260.269639592796),
 ((u'Hamas', u'Australia'), -253.0340493629753),
 ((u'Sydney', u'Middle East'), -250.0596262300864),
 ((u'Israel', u'Australia'), -240.87133407738432),
 ((u'Gaza', u'Australia'), -218.33396118000132)]

### Construct an index of documents by entity ID

In [51]:
index = defaultdict(set)

for d in docs:
    es = list(set(e['id'] for e in d['entities']))
    for e in es:
        index[e].add(d['url'])

In [56]:
index['HMAS Sydney (1912)'].intersection(index['AS.34 Kormoran'])

{u'http://newsstore.smh.com.au/apps/viewDocument.ac?docID=SMH0901032K9AD5DVHTI',
 u'http://newsstore.smh.com.au/apps/viewDocument.ac?docID=SMH090120BM6DA4GSQJO',
 u'http://newsstore.smh.com.au/apps/viewDocument.ac?docID=SMH090121DJ3K657GT8J',
 u'http://newsstore.smh.com.au/apps/viewDocument.ac?docID=SMH090123FH2CI77TI9B',
 u'http://newsstore.smh.com.au/apps/viewDocument.ac?docID=SMH090124E847U50IPKH'}