In [21]:
import os
from collections import defaultdict, Counter
from nltk.corpus import stopwords
import re

In [22]:

stop_words = set(stopwords.words('english'))
input_dir = 'data'


In [23]:
def clean_text(text):
    text = text.lower()  
    text = re.sub(r'[^\w\s]', '', text)  
    words = text.split()
    cleaned_words = [word for word  in words if word not in stop_words]
    return cleaned_words

In [24]:
def map(filename, doc_id):
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()
        words = clean_text(text)
        word_tuples = [(word, (doc_id, 1)) for word in words]
    return word_tuples

In [25]:
def combine(mapped_data):
    combined_data = defaultdict(lambda: defaultdict(int))
    for word, (doc_id, count) in mapped_data:
        combined_data[word][doc_id] += count
    return combined_data

In [26]:
combined_data_list = []
for doc_id, filename in enumerate(os.listdir(input_dir)):
    file_path = os.path.join(input_dir, filename)
    mapped_data = map(file_path, str(doc_id))
    print(mapped_data)
    combined_data = combine(mapped_data)
    combined_data_list.append(combined_data)

[('project', ('0', 1)), ('gutenberg', ('0', 1)), ('ebook', ('0', 1)), ('symbolic', ('0', 1)), ('logic', ('0', 1)), ('lewis', ('0', 1)), ('carroll', ('0', 1)), ('ebook', ('0', 1)), ('use', ('0', 1)), ('anyone', ('0', 1)), ('anywhere', ('0', 1)), ('cost', ('0', 1)), ('almost', ('0', 1)), ('restrictions', ('0', 1)), ('whatsoever', ('0', 1)), ('may', ('0', 1)), ('copy', ('0', 1)), ('give', ('0', 1)), ('away', ('0', 1)), ('reuse', ('0', 1)), ('terms', ('0', 1)), ('project', ('0', 1)), ('gutenberg', ('0', 1)), ('license', ('0', 1)), ('included', ('0', 1)), ('ebook', ('0', 1)), ('online', ('0', 1)), ('wwwgutenbergnet', ('0', 1)), ('title', ('0', 1)), ('symbolic', ('0', 1)), ('logic', ('0', 1)), ('author', ('0', 1)), ('lewis', ('0', 1)), ('carroll', ('0', 1)), ('release', ('0', 1)), ('date', ('0', 1)), ('may', ('0', 1)), ('5', ('0', 1)), ('2009', ('0', 1)), ('ebook', ('0', 1)), ('28696', ('0', 1)), ('language', ('0', 1)), ('english', ('0', 1)), ('start', ('0', 1)), ('project', ('0', 1)), ('gut

In [27]:
combined_data_list[0]

defaultdict(<function __main__.combine.<locals>.<lambda>()>,
            {'project': defaultdict(int, {'0': 87}),
             'gutenberg': defaultdict(int, {'0': 30}),
             'ebook': defaultdict(int, {'0': 11}),
             'symbolic': defaultdict(int, {'0': 15}),
             'logic': defaultdict(int, {'0': 33}),
             'lewis': defaultdict(int, {'0': 7}),
             'carroll': defaultdict(int, {'0': 6}),
             'use': defaultdict(int, {'0': 42}),
             'anyone': defaultdict(int, {'0': 5}),
             'anywhere': defaultdict(int, {'0': 3}),
             'cost': defaultdict(int, {'0': 5}),
             'almost': defaultdict(int, {'0': 5}),
             'restrictions': defaultdict(int, {'0': 2}),
             'whatsoever': defaultdict(int, {'0': 2}),
             'may': defaultdict(int, {'0': 236}),
             'copy': defaultdict(int, {'0': 16}),
             'give': defaultdict(int, {'0': 32}),
             'away': defaultdict(int, {'0': 8}),
         

In [28]:
def reduce(combined_data_list):
    final_counts = defaultdict(lambda: defaultdict(int))
    for combined_data in combined_data_list:
        for word, doc_counts in combined_data.items():
            for doc_id, count in doc_counts.items():
                final_counts[word][doc_id] += count
    return final_counts

In [29]:
final_counts = reduce(combined_data_list)

In [30]:
for word, doc_counts in final_counts.items():
    doc_list = [(doc_id, count) for doc_id, count in doc_counts.items()]
    print(f'{word}: {doc_list}')

project: [('0', 87), ('1', 87), ('2', 87), ('3', 87), ('4', 87)]
gutenberg: [('0', 30), ('1', 30), ('2', 29), ('3', 30), ('4', 30)]
ebook: [('0', 11), ('1', 11), ('2', 10), ('3', 11), ('4', 11)]
symbolic: [('0', 15)]
logic: [('0', 33), ('1', 15), ('3', 3)]
lewis: [('0', 7), ('1', 5), ('2', 5), ('3', 6), ('4', 6)]
carroll: [('0', 6), ('1', 4), ('2', 5), ('3', 6), ('4', 6)]
use: [('0', 42), ('1', 23), ('2', 13), ('3', 25), ('4', 19)]
anyone: [('0', 5), ('1', 5), ('2', 5), ('3', 5), ('4', 5)]
anywhere: [('0', 3), ('1', 2), ('2', 2), ('3', 4), ('4', 2)]
cost: [('0', 5), ('1', 3), ('2', 3), ('3', 16), ('4', 6)]
almost: [('0', 5), ('1', 2), ('2', 5), ('3', 6), ('4', 6)]
restrictions: [('0', 2), ('1', 2), ('2', 2), ('3', 2), ('4', 2)]
whatsoever: [('0', 2), ('1', 2), ('2', 2), ('3', 2), ('4', 3)]
may: [('0', 236), ('1', 67), ('2', 25), ('3', 48), ('4', 34)]
copy: [('0', 16), ('1', 13), ('2', 12), ('3', 13), ('4', 13)]
give: [('0', 32), ('1', 24), ('2', 6), ('3', 37), ('4', 12)]
away: [('0', 8

In [31]:
final_counts['logic']

defaultdict(int, {'0': 33, '1': 15, '3': 3})