In [1]:
# Root directory that load_corpus() walks to find the JSON document files.
DATA_DIR = "./data"

In [2]:
class Document:
    """A single corpus entry, identified by its ``documentId``.

    Two documents compare equal when their ``documentId`` values are equal,
    whatever their other fields hold, so a set of documents keeps at most one
    entry per id.

    Args:
        data: Mapping providing the keys ``documentId``, ``text``, ``title``
            and ``slug``. A missing key raises ``KeyError``.
    """

    def __init__(self, data):
        self.documentId = data['documentId']
        self.text = data['text']
        self.title = data['title']
        self.slug = data['slug']

    def __hash__(self):
        # Hash the id itself rather than int(id): this stays consistent with
        # __eq__ and does not raise for ids that are not numeric strings.
        return hash(self.documentId)

    def __eq__(self, x):
        # Let Python fall back to its default handling for foreign types
        # instead of failing with AttributeError on a missing documentId.
        if not isinstance(x, Document):
            return NotImplemented
        return self.documentId == x.documentId

In [3]:
import os
import json

def load_corpus(data_dir=None):
    """Load the documents stored as JSON files under a directory tree.

    Args:
        data_dir: Root directory to walk. When omitted, the module-level
            ``DATA_DIR`` is used.

    Returns:
        A set of ``Document`` objects, one built from each JSON file read.
        Because ``Document`` equality is based on ``documentId``, files that
        share an id collapse into a single entry.

    Raises:
        json.JSONDecodeError: If a file that is read is not valid JSON.
        KeyError: If a decoded document lacks a field ``Document`` requires.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    corpus = set()
    for root, _dirs, files in os.walk(data_dir, topdown=True):
        if not files:
            continue
        # NOTE(review): only the first file listed for each directory is
        # read; any further files in the same directory are ignored. Confirm
        # that every directory holds exactly one document.
        path = os.path.join(root, files[0])
        # Decode explicitly as UTF-8 so loading does not depend on the
        # platform's default locale encoding.
        with open(path, 'r', encoding='utf-8') as f:
            corpus.add(Document(json.load(f)))
    return corpus

In [4]:
# Read every document from disk once; the cells below reuse this set.
corpus = load_corpus()

In [5]:
# Ids of all loaded documents, collected into a set.
doc_ids = {doc.documentId for doc in corpus}

In [6]:
# Sanity check: corpus is a set, so this counts documents with distinct ids.
print("total documents in corpus", len(corpus))

total documents in corpus 15298


In [7]:
import re

def sanitize(text):
    """Lower-case ``text`` and collapse everything except a-z into spaces.

    Each run of characters outside ``a``-``z`` (digits, punctuation,
    whitespace, non-ASCII letters) is replaced by a single space.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string, containing only ``a``-``z`` and spaces.
    """
    lowered = text.lower()
    non_letters = re.compile('[^a-z]+')
    return non_letters.sub(' ', lowered)

In [8]:
def tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    Args:
        text: The string to split.

    Returns:
        A list of the whitespace-separated pieces of ``text``; empty when
        ``text`` is empty or contains only whitespace.
    """
    tokens = text.split()
    return tokens

In [9]:
from collections import defaultdict, Counter

def build_inverted_index(corpus, field="text"):
    """Build a word -> {documentId: occurrences} mapping for one field.

    Args:
        corpus: Iterable of documents exposing ``documentId`` and the
            attribute named by ``field``.
        field: Name of the document attribute whose words are indexed.

    Returns:
        A ``defaultdict(dict)`` mapping each word found (after ``sanitize``
        and ``tokenize``) to a dict from document id to the number of times
        the word occurs in that document's field.
    """
    postings = defaultdict(dict)
    for document in corpus:
        raw_value = getattr(document, field)
        term_counts = Counter(tokenize(sanitize(raw_value)))
        for term in term_counts:
            postings[term][document.documentId] = term_counts[term]
    return postings

In [10]:
# Word -> {documentId: count} postings built from each document's text field.
inverted_index = build_inverted_index(corpus)

In [11]:
# Same structure, built from the titles only; search() looks words up here.
title_inverted_index = build_inverted_index(corpus, field="title")

In [12]:
from collections import defaultdict, Counter

def build_index(corpus):
    """Build a lookup table from document id to document.

    Args:
        corpus: Iterable of documents exposing a ``documentId`` attribute.

    Returns:
        A dict mapping each ``documentId`` to its document. If several
        documents share an id, the one iterated last wins. Looking up an
        unknown id raises ``KeyError``.
    """
    # A plain dict: a defaultdict without a default factory offers nothing
    # extra and only suggests that missing ids are tolerated.
    return {doc.documentId: doc for doc in corpus}

In [13]:
# documentId -> Document lookup used by search() and plot_word_histogram().
index = build_index(corpus)

In [14]:
def search(text, limit=10):
    """Find documents whose title contains the words of a query.

    The query is normalised with ``sanitize`` and split with ``tokenize``.
    Query words that occur in no title are ignored; a document matches when
    its title contains every remaining word. Matches are ranked by title
    length, shortest first.

    Args:
        text: The free-text query.
        limit: Maximum number of documents to return.

    Returns:
        A list of at most ``limit`` matching documents. The list is empty
        when the query has no words or none of its words occur in any title.
    """
    tokens = tokenize(sanitize(text))
    # Keep only the posting lists of words that actually occur in a title.
    postings = [
        title_inverted_index[token].keys()
        for token in tokens
        if title_inverted_index.get(token)
    ]
    # Without this guard, intersection() with no arguments would return
    # every document id and the search would yield unrelated documents.
    if not postings:
        return []
    ids = doc_ids.intersection(*postings)
    # NOTE(review): titles of equal length keep the iteration order of the
    # id set, so the relative order of such ties is not guaranteed.
    ranked = sorted(ids, key=lambda doc_id: len(index[doc_id].title))
    return [index[doc_id] for doc_id in ranked[:limit]]

In [15]:
# Smoke-test the title search: print (documentId, title) for each hit.
print([(x.documentId, x.title) for x in search("harry potter")])

[('13', 'Harry Potter'), ('3403', 'LEGO Harry Potter'), ('907', 'Harry Potter Wiki'), ('47353', "Harry Potter's car"), ('28786', "Harry Potter's sock"), ('17026', "Harry Potter's wand"), ('6990', 'Harry Potter and Me'), ('25015', "Harry Potter's watch"), ('16393', 'Harry Potter Lexicon'), ('16592', "Harry Potter's scars")]


In [84]:
import plotly.express as px
import pandas as pd

def plot_word_histogram(doc_id, count=100):
    """Display a horizontal bar chart of a document's most frequent words.

    Args:
        doc_id: Id of the document to plot; it is converted with ``str``
            before being looked up in ``index``.
        count: Number of entries kept from the frequency-sorted word list.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    document = index[str(doc_id)]
    frequencies = Counter(tokenize(sanitize(document.text)))
    top_words = frequencies.most_common()[:count]
    frame = pd.DataFrame(top_words, columns=["word", "count"])
    chart_title = "Word histogram for page '%s'" % (document.title)
    figure = px.bar(frame, x="count", y="word", orientation='h', title=chart_title)
    figure.show()

In [83]:
# Chart the 100 most frequent words in the text of document '13'.
plot_word_histogram('13', count=100)