In [1]:
from functools import total_ordering

In [2]:
@total_ordering
class Posting:
    """A single entry of a postings list, identified by a document id.

    Postings compare (and hash) by ``doc_id``. A posting may be compared
    either with another ``Posting`` or directly with a raw document id.
    """

    def __init__(self, doc_id):
        self.doc_id = doc_id

    @staticmethod
    def _doc_id_of(other):
        """Return the document id carried by ``other`` (a Posting or a raw id)."""
        return other.doc_id if isinstance(other, Posting) else other

    def __eq__(self, other):
        return self.doc_id == self._doc_id_of(other)

    def __gt__(self, other):
        return self.doc_id > self._doc_id_of(other)

    def __hash__(self):
        # Defining __eq__ disables the inherited hash; restore one that is
        # consistent with equality (a Posting equals its raw doc id).
        return hash(self.doc_id)

    def __repr__(self) -> str:
        return str(self.doc_id)

    def from_corpus(self, corpus):
        """Return the element of ``corpus`` stored under this posting's doc id."""
        return corpus[self.doc_id]

In [3]:
class PostingsList:
    """A sorted list of postings.

    ``intersection`` and ``union`` walk both lists in lockstep and therefore
    rely on each list being sorted, which the constructors and ``merge``
    guarantee.
    """

    def __init__(self) -> None:
        self._postings_list = []

    @classmethod
    def from_postings_list(cls, postings_list: list):
        """Build a postings list from ``postings_list``.

        The input is copied and sorted; the caller's list is left untouched.
        """
        plist = cls()
        # sorted() returns a new list, so the argument is neither reordered
        # nor aliased by the new instance.
        plist._postings_list = sorted(postings_list)
        return plist

    @classmethod
    def from_doc_id(cls, doc_id):
        """Build a postings list holding a single posting for ``doc_id``."""
        plist = cls()
        plist._postings_list = [Posting(doc_id)]
        return plist

    def merge(self, other: "PostingsList"):
        """Merge ``other`` into this list in place, dropping duplicates."""
        merged = []
        for posting in sorted(self._postings_list + other._postings_list):
            # After sorting, equal postings are adjacent: keep the first only.
            if not merged or merged[-1] != posting:
                merged.append(posting)
        self._postings_list = merged

    def __repr__(self) -> str:
        return ", ".join(map(str, self._postings_list))

    def get_from_corpus(self, corpus):
        """Return the corpus entries referenced by every posting, in order."""
        return [posting.from_corpus(corpus) for posting in self._postings_list]

    def intersection(self, other: "PostingsList"):
        """Return a plain list of the postings present in both lists."""
        mine = self._postings_list
        theirs = other._postings_list
        common = []
        i = 0
        j = 0
        while i < len(mine) and j < len(theirs):
            if mine[i] == theirs[j]:
                common.append(mine[i])
                i += 1
                j += 1
            elif mine[i] < theirs[j]:
                i += 1
            else:
                j += 1
        return common

    def union(self, other: "PostingsList"):
        """Return a plain sorted list of the postings present in either list."""
        mine = self._postings_list
        theirs = other._postings_list
        combined = []
        i = 0
        j = 0
        while i < len(mine) and j < len(theirs):
            if mine[i] == theirs[j]:
                combined.append(mine[i])
                i += 1
                j += 1
            elif mine[i] < theirs[j]:
                combined.append(mine[i])
                i += 1
            else:
                combined.append(theirs[j])
                j += 1
        # One list is exhausted; whatever is left of the other belongs to the
        # union as well (at most one of these slices is non-empty).
        combined.extend(mine[i:])
        combined.extend(theirs[j:])
        return combined

In [4]:
class ImpossibleMergeException(Exception):
    """Raised by ``Term.merge`` when the two terms being merged are not equal."""


@total_ordering
class Term:
    """A dictionary term together with the postings list of its documents.

    Terms compare (and hash) by their ``term`` string only; the postings list
    does not take part in comparisons.
    """

    def __init__(self, term: str, doc_id) -> None:
        self.term = term
        self.postings_list = PostingsList.from_doc_id(doc_id)

    def __eq__(self, other) -> bool:
        if not isinstance(other, Term):
            # Let Python fall back to its default handling instead of failing
            # with AttributeError on ``other.term``.
            return NotImplemented
        return self.term == other.term

    def __gt__(self, other) -> bool:
        if not isinstance(other, Term):
            return NotImplemented
        return self.term > other.term

    def __hash__(self) -> int:
        # Defining __eq__ disables the inherited hash; keep it consistent
        # with equality, which looks at ``term`` only.
        return hash(self.term)

    def merge(self, other: "Term"):
        """Merge the postings of ``other`` into this term, in place.

        Raises:
            ImpossibleMergeException: if ``other`` is not equal to this term.
        """
        if self != other:
            raise ImpossibleMergeException(
                f"cannot merge {other!r} into term {self.term!r}"
            )
        self.postings_list.merge(other.postings_list)

    def __repr__(self) -> str:
        return f"{self.term}: {self.postings_list}"

In [45]:
class TrieNode:
    """One node of the trie: child nodes keyed by character, plus an optional
    postings list for the key that ends at this node.
    """

    def __init__(self):
        self.postings_list = None
        # Flipped to True by set_postings_list(); the node may still have children.
        self.is_leaf = False
        self.children = {}

    def set_postings_list(self, postings_list: PostingsList) -> None:
        """Attach ``postings_list`` to this node and flag the node as a leaf."""
        self.is_leaf = True
        self.postings_list = postings_list

    def __repr__(self) -> str:
        # A node with postings contributes ": <postings>\n"; every child then
        # contributes its edge character followed by the child's own repr.
        # Shared prefixes are therefore written once, not once per key.
        pieces = []
        if self.is_leaf:
            pieces.append(": " + str(self.postings_list) + "\n")
        for char, child in self.children.items():
            pieces.append(str(char) + repr(child))
        return "".join(pieces)

In [43]:
class MissingKeyException(Exception):
    """Raised by ``Trie`` lookups when the requested key is not stored."""


class Trie:
    """Character trie mapping term strings to their postings lists."""

    def __init__(self) -> None:
        self.root = TrieNode()

    def insert(self, node: Term):
        """Insert ``node`` into the trie and return the trie itself.

        If the term is already present its postings are merged into the
        stored postings list; otherwise the term's postings list object is
        stored as-is (by reference, not copied).
        """
        current = self.root
        for char in node.term:
            if char not in current.children:
                current.children[char] = TrieNode()
            current = current.children[char]
        if current.postings_list is not None:
            current.postings_list.merge(node.postings_list)
        else:
            current.set_postings_list(node.postings_list)
        return self

    def search(self, key: str):
        """Return the postings list stored under ``key``.

        Raises:
            MissingKeyException: if ``key`` is not stored in the trie (this
                includes keys that are only a prefix of stored terms).
        """
        current = self.root
        for char in key:
            if char not in current.children:
                raise MissingKeyException
            current = current.children[char]
        if current.postings_list is None:
            raise MissingKeyException
        return current.postings_list

    def remove(self, key: str) -> None:
        """Remove ``key`` and its postings list from the trie.

        Nodes left with neither postings nor children are pruned so that no
        dead branches remain.

        Raises:
            MissingKeyException: if ``key`` is not stored in the trie.
        """
        # Remember (parent, edge character) for every step so that empty
        # nodes can be pruned bottom-up afterwards.
        path = []
        current = self.root
        for char in key:
            child = current.children.get(char)
            if child is None:
                raise MissingKeyException
            path.append((current, char))
            current = child
        if current.postings_list is None:
            raise MissingKeyException

        current.postings_list = None
        current.is_leaf = False

        for parent, char in reversed(path):
            child = parent.children[char]
            if child.children or child.postings_list is not None:
                # Still needed by another key; everything above stays too.
                break
            del parent.children[char]

    def merge(self, other: "Trie"):
        """Merge every key of ``other`` into this trie, in place.

        Postings of keys present in both tries are merged. Subtrees and
        postings lists that exist only in ``other`` are adopted by reference,
        so the two tries share those objects afterwards.
        """
        stack = [(self.root, other.root)]
        while stack:
            node_self, node_other = stack.pop()
            if node_other.postings_list is not None:
                if node_self.postings_list is not None:
                    node_self.postings_list.merge(node_other.postings_list)
                else:
                    node_self.set_postings_list(node_other.postings_list)
            for key, value in node_other.children.items():
                if key not in node_self.children:
                    node_self.children[key] = value
                else:
                    stack.append((node_self.children[key], value))

    def __repr__(self) -> str:
        return repr(self.root)

In [48]:
# Demo: build two single-term tries, fold the second into the first,
# then display the merged trie.
cane = Trie()
cane.insert(Term("cani", 3))
cave = Trie()
cave.insert(Term("canino", 2))
cane.merge(cave)
cane

cani: 3
no: 2

In [None]:
def tokenize(content) -> list:
    """Placeholder tokenizer: ignores ``content`` and returns a new empty list."""
    # TODO: real tokenization is not implemented yet.
    tokens = []
    return tokens


class InvertedIndex:
    """Inverted index that stores its terms in a ``Trie``."""

    def __init__(self) -> None:
        self.trie = Trie()

    @classmethod
    def from_corpus(cls, corpus):
        """Build an index from ``corpus``.

        Each document's id is its position in ``corpus``; its content is
        split with ``tokenize`` and every token becomes (or is merged into)
        a term of the index.
        """
        terms = {}
        for doc_id, content in enumerate(corpus):
            for token in tokenize(content):
                term = Term(token, doc_id)
                known = terms.get(token)
                if known is None:
                    terms[token] = term
                else:
                    known.merge(term)

        idx = cls()
        for term in terms.values():
            # Insert the accumulated Term itself; wrapping it in a new Term
            # would discard the merged postings.
            idx.trie.insert(term)
        return idx

    def __getitem__(self, key: str):
        """Return the postings list stored for ``key``.

        Raises:
            MissingKeyException: if ``key`` is not in the index.
        """
        # NOTE: an empty index gets no special handling; the lookup simply
        # raises MissingKeyException like any other missing key.
        return self.trie.search(key)

    def insert(self, term: "Term") -> None:
        """Insert ``term`` into the index, merging postings if it exists."""
        self.trie.insert(term)

    def __repr__(self) -> str:
        # Show the stored terms instead of an empty string.
        return repr(self.trie)