# Exercise 08: semantic similarity

First, let's create a function to calculate a *MinHash*:

In [3]:
from datasketch import MinHash

def mh_digest(data, num_perm=512):
    """
    Build a MinHash digest from an iterable of strings.

    Each item in `data` is UTF-8 encoded and fed to the MinHash, so the
    resulting digest summarizes the collection as a whole.

    Args:
        data: iterable of `str` items (e.g., a set of keywords).
        num_perm: number of permutation functions used by the MinHash.
            Defaults to 512, the value this exercise has always used.
            Digests should be built with the same `num_perm` if they
            are going to be compared with one another.

    Returns:
        The populated `MinHash` object.
    """
    mh = MinHash(num_perm=num_perm)

    for d in data:
        # MinHash.update() is given bytes, hence the explicit encoding
        mh.update(d.encode('utf8'))

    return mh

Then we'll iterate through each parsed document, adding the keywords to the MinHash:

In [4]:
import pynlp

# Parsed documents to compare, in the order they will be processed.
files = ["a4.json", "a3.json", "a2.json", "a1.json"]

stopwords = pynlp.load_stopwords("stop.txt")

# Per-document results, both keyed by file name:
#   files_set -- the raw keyword set
#   files_mh  -- the MinHash digest built from that keyword set
files_set = {}
files_mh = {}

for json_file in files:
    # Keep the root of every lexeme whose `pos` is not "." and whose root
    # is not a stopword (presumably "." tags punctuation -- confirm
    # against pynlp).
    keywords = {
        lex.root
        for lex in pynlp.lex_iter(json_file)
        if not (lex.pos == "." or lex.root in stopwords)
    }

    files_set[json_file] = keywords
    files_mh[json_file] = mh_digest(keywords)

    print(json_file, keywords)

a4.json {'editorial', 'advice', 'pass', 'step', 'during', 'plan', 'co', 'break', 'location', 'involve', 'physical', 'realize', 'way', 'progress', 'group', 'situation', 'sum', 'exam', 'student', 'guidance', 'online', 'interact', 'everyone', 'friend', 'author', 'classroom', 'conference', 'learning', 'reader', 'early', 'year', 'feedback', 'long', 'whether', '2016', 'loop', 'look', 'shoulder', 'company', 'ongoing', 'instructor', 'person', 'past', 'book', 'course', 'begin', 'standard', 'team', 'bring', 'hang', 'wise', 'training', 'back', 'worker', 'seasoned', 'experienced', '50%', 'always', 'people', 'live', 'attendance', 'introduce', 'expert', 'participate', 'knowledge', "o'reilly", 'media', 'process', 'attend', 'addition', 'learn', 'virtual', 'particular', 'analyze', "mid-'80s", 'individual', 'assess', 'shot', 'instructional', 'together', 'publish', 'video', 'provide', 'apply', 'start', 'experience'}


FileNotFoundError: [Errno 2] No such file or directory: 'a3.json'

Let's compare the HTML documents pairwise, using their MinHash digests to approximate the Jaccard similarity of their keyword sets:

In [None]:
import itertools

# One (estimated Jaccard similarity, file, file) tuple for every unordered
# pair of documents.
sim = [
    (files_mh[left].jaccard(files_mh[right]), left, right)
    for left, right in itertools.combinations(files, 2)
]

# Print the pairs from most similar to least similar.
for jaccard, file1, file2 in sorted(sim, key=lambda pair: pair[0], reverse=True):
    print("%0.4f\t%s\t%s" % (jaccard, file1, file2))

Note the top-ranked ("most similar") pair: `html/article2.html` and `html/article3.html` (parsed here as `a2.json` and `a3.json`) are both about machine learning. Take a look at their overlapping keywords:

In [None]:
# Set intersection: the keywords that appear in both documents.
files_set["a3.json"] & files_set["a2.json"]