In [1]:
# Directory containing the users.json and messages.json inputs that
# load_corpus() reads; the path is relative to the working directory.
DATA_DIR = "./data"

In [21]:
class Document(object):
    """One message of the corpus, built from a plain mapping.

    The constructor copies five entries of ``data`` onto the instance:
    ``documentId``, ``time``, ``sender`` and ``sender_id`` keep their key
    names, while the ``'message'`` entry is exposed as ``body``.
    """

    def __init__(self, data):
        """Copy the expected entries of ``data`` onto this instance.

        Indexing ``data`` with a key it does not contain propagates the
        lookup error (``KeyError`` for a dict).
        """
        # (attribute name, key in ``data``) pairs, in assignment order.
        field_map = (
            ("documentId", 'documentId'),
            ("time", 'time'),
            ("sender", 'sender'),
            ("sender_id", 'sender_id'),
            ("body", 'message'),
        )
        for attribute, key in field_map:
            setattr(self, attribute, data[key])

In [33]:
import os
import email
import json

def clean_body(body):
    """Truncate ``body`` at known separator markers and trim whitespace.

    The markers are applied one after another, in the order listed: for each
    marker, everything from its first occurrence onward is discarded before
    the next marker is searched for. The surviving text is returned with
    leading and trailing whitespace stripped (possibly an empty string).
    """
    separators = (
        "-----Original Message-----",
        "***************************",
        "----------------------",
    )
    remaining = body
    for separator in separators:
        # partition() returns the text before the first occurrence, or the
        # whole string unchanged when the separator is absent.
        remaining = remaining.partition(separator)[0]
    return remaining.strip()
        

def load_corpus():
    """Build the list of ``Document`` objects from the JSON files in ``DATA_DIR``.

    ``users.json`` is parsed as a mapping and inverted, so each of its values
    can be used to look up the key it was stored under; that key becomes the
    document's ``sender_id``. ``messages.json`` is iterated entry by entry:
    the ``"message"`` text is passed through ``clean_body`` and entries whose
    cleaned text is empty are skipped. Kept documents receive consecutive
    ``documentId`` values starting at 0.

    Returns:
        list of ``Document`` instances, in file order.

    Raises:
        KeyError: if a kept message's ``"sender"`` is not among the values
            of ``users.json`` (or an expected message key is missing).
    """
    users_path = os.path.join(DATA_DIR, "users.json")
    with open(users_path) as users_file:
        # Invert the mapping: stored value -> the key it was stored under.
        sender_ids = {
            name: user_id for user_id, name in json.load(users_file).items()
        }

    documents = []
    messages_path = os.path.join(DATA_DIR, "messages.json")
    with open(messages_path) as messages_file:
        for entry in json.load(messages_file):
            text = clean_body(entry["message"])
            if text:
                documents.append(Document({
                    # Number of documents kept so far == next consecutive id.
                    "documentId": len(documents),
                    "time": entry["time"],
                    "sender": entry["sender"],
                    "sender_id": sender_ids[entry["sender"]],
                    "message": text,
                }))
    return documents

In [34]:
# Load every message whose cleaned body is non-empty as a Document.
corpus = load_corpus()

In [35]:
import re

def sanitize(text):
    """Lower-case ``text`` and replace each run of non ``a``-``z`` characters with one space.

    Runs at the very start or end of the text also become a single space, so
    the result may begin or end with a space.
    """
    lowered = text.lower()
    non_letter_run = '[^a-z]+'
    return re.sub(non_letter_run, ' ', lowered)

In [36]:
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the resulting list of tokens."""
    tokens = text.split()
    return tokens

In [37]:
from collections import defaultdict, Counter

def build_inverted_index(corpus, field="body"):
    """Map every word to the documents containing it and its count in each.

    For each document in ``corpus`` the attribute named ``field`` is read,
    passed through ``sanitize`` and ``tokenize``, and the resulting words are
    counted.

    Args:
        corpus: iterable of objects exposing ``documentId`` and the attribute
            named by ``field``.
        field: name of the attribute holding the text to index.

    Returns:
        ``defaultdict(dict)`` of the form ``{word: {documentId: count}}``.
    """
    postings = defaultdict(dict)
    for document in corpus:
        text = getattr(document, field)
        term_counts = Counter(tokenize(sanitize(text)))
        for term, occurrences in term_counts.items():
            postings[term][document.documentId] = occurrences
    return postings

In [38]:
# word -> {documentId: count}, built from each document's body (the default field).
inverted_index = build_inverted_index(corpus)