In [49]:
import re
from typing import Tuple, Generator

In [34]:
doc_read_pattern = re.compile("<doc><docno>([^<]*)</docno>([^<]*)</doc>")


def read_doc(text: str) -> Tuple[str, str]:
    matcher = doc_read_pattern.search(text)
    if matcher == None:
        return None, None
    return matcher.group(1), matcher.group(2)


In [35]:
with open("doc.txt", "r") as f:
    lines = f.readlines()


In [53]:

class IndexObject:
    def __init__(self, size: int) -> None:
        self.df = 0
        self.tf = [0 for _ in range(size)]

class IndexStore:
    def __init__(self, size: int) -> None:
        self.corpus_ids = dict()
        self.corpus_name_ids = []
        self.objects = dict()
        self.size = size

    def locate_docid(self, docid: str) -> int:
        oid = len(self.corpus_name_ids)
        self.corpus_ids[docid] = oid
        self.corpus_name_ids.append(docid)
        return oid

    def fetch_or_create_object(self, word: str) -> IndexObject:
        if word in self.objects:
            return self.objects[word]
        wl = IndexObject(self.size)
        self.objects[word] = wl
        return wl
    
    def tf_doc_of_object(self, word: str) -> Generator[Tuple[int, str], None, None]:
        if word not in self.objects:
            return

        tf = self.objects[word].tf

        for i in range(len(tf)):
            if tf[i] != 0:
                yield tf[i], self.corpus_name_ids[i]
        


In [54]:
index = IndexStore(len(lines))

# Building index

for i in range(len(lines)):
    line = lines[i]
    docno, doctext = read_doc(line)
    if docno == None:
        continue
    docno = index.locate_docid(docno)

    words = re.findall('\w+', doctext)
    for w in words:
        word = w.lower()

        wl = index.fetch_or_create_object(word)
        wl.df += 1
        wl.tf[docno] = 1


In [56]:
for word in sorted(index.objects):
    io = index.objects[word]
    print("{0}=df({1})".format(io.df, word))
    for tf, doc in index.tf_doc_of_object(word):
        print("\t{0} {1}".format(tf, doc))


1=df(arabia)
	1 D4
1=df(casablanca)
	1 D1
1=df(citezen)
	1 D0
2=df(godfather)
	1 D2
1=df(gone)
	1 D3
1=df(graduate)
	1 D6
1=df(in)
	1 D9
1=df(kane)
	1 D0
1=df(lawrence)
	1 D4
1=df(list)
	1 D8
3=df(of)
	1 D4
	1 D5
1=df(on)
	1 D7
2=df(oz)
	1 D5
1=df(rain)
	1 D9
1=df(s)
	1 D8
1=df(schindler)
	1 D8
1=df(singin)
	1 D9
8=df(the)
	1 D2
	1 D3
	1 D5
	1 D6
	1 D7
	1 D9
1=df(waterfront)
	1 D7
1=df(wind)
	1 D3
1=df(with)
	1 D3
2=df(wizard)
	1 D5
