In [60]:
import re
from typing import Optional, Tuple

In [61]:
# Matches one record of the form <doc><docno>ID</docno>TEXT</doc>.
# Group 1 captures the ID, group 2 the text; neither may contain '<'.
doc_read_pattern = re.compile("<doc><docno>([^<]*)</docno>([^<]*)</doc>")


def read_doc(text: str) -> Tuple[Optional[str], Optional[str]]:
    """Extract the document number and body text from one record.

    Args:
        text: String searched for ``<doc><docno>ID</docno>TEXT</doc>``.

    Returns:
        ``(docno, doctext)`` for the first record found in ``text``, or
        ``(None, None)`` when ``text`` contains no such record.
    """
    match = doc_read_pattern.search(text)
    # Identity check: None is a singleton, so `is` is the correct comparison.
    if match is None:
        return None, None
    return match.group(1), match.group(2)


In [62]:
# Load the raw corpus: one list entry per line of doc.txt.
with open("doc.txt", "r") as doc_file:
    lines = list(doc_file)

In [63]:
# Build several views of the corpus from the raw lines:
#   words_list          word -> total number of occurrences
#   words_im            word -> list of docnos containing it (inverted index)
#   inci_matrix         one 0/1 row per word, one column per parsed document
#   inci_matrix_wordid  word -> row index into inci_matrix
#   inci_lines          docno of each matrix column, in column order

# Parse first so the matrix is sized and indexed by *document* position.
# Indexing columns by raw line number would shift them away from the
# inci_lines headers as soon as one line fails to parse (e.g. a blank line).
parsed_docs = []
for line in lines:
    docno, doctext = read_doc(line)
    if docno is None:
        continue
    parsed_docs.append((docno, doctext))

words_list = {}
words_im = {}
inci_matrix_wordid = {}
inci_matrix = []
inci_lines = [docno for docno, _ in parsed_docs]

for col, (docno, doctext) in enumerate(parsed_docs):
    # Raw string: '\w' inside a plain literal is an invalid escape sequence.
    for w in re.findall(r'\w+', doctext):
        key = w.lower()

        # Fill word count
        words_list[key] = words_list.get(key, 0) + 1

        # Fill inverted index (each docno is listed at most once per word)
        postings = words_im.setdefault(key, [])
        if docno not in postings:
            postings.append(docno)

        # Fill incidence matrix; a word's row is created on first sight
        if key not in inci_matrix_wordid:
            inci_matrix_wordid[key] = len(inci_matrix)
            inci_matrix.append([0] * len(parsed_docs))

        inci_matrix[inci_matrix_wordid[key]][col] = 1


In [67]:
# Report how often each word occurs, one word per line, sorted by word.
print("## Word counts")

print("{0:16} Count".format("Name"))

for word, count in sorted(words_list.items()):
    print("{0:16} {1}".format(word, count))


## Word counts
Name             Count
arabia           1
casablanca       1
citezen          1
godfather        2
gone             1
graduate         1
in               1
kane             1
lawrence         1
list             1
of               3
on               1
oz               2
rain             1
s                1
schindler        1
singin           1
the              8
waterfront       1
wind             1
with             1
wizard           2


In [None]:
# Report the inverted index: each word followed by the docnos recorded for it.
print("## Inverted index")

print("{0:16} Document(s)".format("Word"))
for word, docnos in sorted(words_im.items()):
    print("{0:16} {1}".format(word, ", ".join(docnos)))


## Inverted index
Word             Document(s)
arabia           {D4}
casablanca       {D1}
citezen          {D0}
godfather        {D2}
gone             {D3}
graduate         {D6}
in               {D9}
kane             {D0}
lawrence         {D4}
list             {D8}
of               {D4, D5}
on               {D7}
oz               {D5}
rain             {D9}
s                {D8}
schindler        {D8}
singin           {D9}
the              {D2, D3, D5, D6, D7, D9}
waterfront       {D7}
wind             {D3}
with             {D3}
wizard           {D5}


In [70]:
# Report the word/document incidence matrix as a fixed-width text table.
print("## Incidence matrix")

# Header row: one right-aligned, 4-wide cell per entry of inci_lines.
header_cells = ["{0:>4}".format(docno) for docno in inci_lines]
print("{0:16} {1}".format("Word \\ Document", " ".join(header_cells)))

# One row per word, sorted by word; each matrix entry is padded to width 4.
for word, row_id in sorted(inci_matrix_wordid.items()):
    row_cells = ["{0:4}".format(flag) for flag in inci_matrix[row_id]]
    print("{0:16} {1}".format(word, " ".join(row_cells)))


## Incidence matrix
Word \ Document    D0   D1   D2   D3   D4   D5   D6   D7   D8   D9
arabia              0    0    0    0    1    0    0    0    0    0
casablanca          0    1    0    0    0    0    0    0    0    0
citezen             1    0    0    0    0    0    0    0    0    0
godfather           0    0    1    0    0    0    0    0    0    0
gone                0    0    0    1    0    0    0    0    0    0
graduate            0    0    0    0    0    0    1    0    0    0
in                  0    0    0    0    0    0    0    0    0    1
kane                1    0    0    0    0    0    0    0    0    0
lawrence            0    0    0    0    1    0    0    0    0    0
list                0    0    0    0    0    0    0    0    1    0
of                  0    0    0    0    1    1    0    0    0    0
on                  0    0    0    0    0    0    0    1    0    0
oz                  0    0    0    0    0    1    0    0    0    0
rain                0    0    0    0    0 