In [2]:
import re
from collections import defaultdict

# Example document list
DOCS = ["doc1.txt", "doc2.txt"]

def read_word_files(docs):
    frequency_data = defaultdict(int)
    total_terms = 0
    unique_terms = set()

    for doc in docs:
        try:
            with open(doc, "r") as f:
                for word in f.read().split():
                    word = word.lower()
                    word = re.sub(r"[^a-zA-Z0-9\s]", "", word)
                    frequency_data[word] += 1
                    total_terms += 1
                    unique_terms.add(word)
        except FileNotFoundError:
            print(f"File {doc} not found!")
            return None, None, None

    return frequency_data, total_terms, len(unique_terms), len(docs)


def print_all_counts(frequency_data):
    print("-----------------------------------------------")
    print("    Occurrences    Word")
    for word in sorted(frequency_data):
        print(f"{frequency_data[word]:15}    {word}")
    print("-----------------------------------------------")


if __name__ == "__main__":
    frequency_data, total, unique_count, docs_count = read_word_files(DOCS)
    if frequency_data:
        print(f"There are {total} terms in the collection.")
        print(f"There are {unique_count} unique terms in the collection.")
        print(f"There are {docs_count} documents in the collection.\n")
        print_all_counts(frequency_data)

There are 2 terms in the collection.
There are 1 unique terms in the collection.
There are 2 documents in the collection.

-----------------------------------------------
    Occurrences    Word
              2    applebananawatermelon
-----------------------------------------------
