In [95]:
import sys
import re
import os
import math
from operator import itemgetter
from collections import defaultdict

In [96]:
# Read data

path = "testcase/wc_data1.txt"

In [97]:
# Implement the sort algorithm

def hash_function(key):

    if key[0].lower() < 'm':
        return 0
    else:
        return 1

In [98]:
# mapper.py

mapper_output = []

with open(path, 'r') as file:
    for line in file:
        line = line.strip()
        words = line.split()
        
        for word in words:

            partition = hash_function(word)
            mapper_output.append((word, 1, partition))

print("Mapper Output (with partition):")
for output in mapper_output:
    print(f'{output[0]}\t{output[1]}\tPartition: {output[2]}')

Mapper Output (with partition):
I	1	Partition: 0
love	1	Partition: 0
drink	1	Partition: 0
craft	1	Partition: 0
beer	1	Partition: 0
and	1	Partition: 0
larger	1	Partition: 0
beer	1	Partition: 0
in	1	Partition: 0
Germany	1	Partition: 0


In [99]:
# Shuffle and Sort

partitioned_mapper_output = {}

for word, count, partition in mapper_output:
    if partition not in partitioned_mapper_output:
        partitioned_mapper_output[partition] = []
    partitioned_mapper_output[partition].append((word, count))

In [100]:
# Sort by word (key) across partitions
sorted_output = []
for partition in partitioned_mapper_output:
    partitioned_mapper_output[partition].sort(key=itemgetter(0))
    sorted_output.extend(partitioned_mapper_output[partition])

# Sort the final combined output by the word
sorted_output.sort(key=itemgetter(0))

In [101]:
# Reducer

print("\nReducer Output:")
current_word = None
current_count = 0

for word, count in sorted_output:
    if current_word == word:
        current_count += count
    else:
        if current_word:
            print(f'{current_word}\t{current_count}')
        current_count = count
        current_word = word

# Print the last word count
if current_word == word:
    print(f'{current_word}\t{current_count}')


Reducer Output:
Germany	1
I	1
and	1
beer	2
craft	1
drink	1
in	1
larger	1
love	1


In [102]:
def search_pattern_in_files(file_paths, search_pattern):
    """ Search for the pattern in the given files and return the matching lines for each file. """
    
    pattern = re.compile(search_pattern)
    file_matches = {}
    
    # Iterate through the files
    for file_path in file_paths:
        matching_lines = []
        
        with open(file_path, 'r') as file:
            # Go through each line and search for the pattern
            for i, line in enumerate(file, 1):
                if pattern.search(line):
                    matching_lines.append(i)
        
        # Only store files that have matching lines
        if matching_lines:
            file_matches[os.path.basename(file_path)] = matching_lines
    
    return file_matches

In [103]:
# Implement the Searching algorithm

file_paths = ["testcase/wc_data1.txt", "testcase/wc_data2.txt"]
search_pattern = "beer"

matches = search_pattern_in_files(file_paths, search_pattern)

# Print output in the format (file, [l1, l2, ...])
for file, lines in matches.items():
    print(f"{file} -> {lines}")

wc_data1.txt -> [1]


In [104]:
# Implement the TF-IDF computation algorithm

# Reducer Function: Combines counts of the same word
def reducer(sorted_output):
    word_counts = defaultdict(int)
    for word, count in sorted_output:
        word_counts[word] += count
    return word_counts

# Calculate term frequency (TF) for each word in a document
def compute_tf(word, word_count, total_words):
    return word_count[word] / total_words

# Calculate inverse document frequency (IDF) for a word
def compute_idf(word, doc_count, total_docs):
    return math.log(total_docs / (1 + doc_count.get(word, 0)))  # Add 1 to avoid division by zero

# Main function to calculate TF-IDF
def compute_tf_idf(word_counts, total_docs, doc_word_count):
    total_words = sum(word_counts.values())
    tf_idf = {}
    
    # Calculate TF-IDF for each word
    for word in word_counts:
        tf = compute_tf(word, word_counts, total_words)
        idf = compute_idf(word, doc_word_count, total_docs)
        tf_idf[word] = tf * idf
    return tf_idf

word_counts = reducer(sorted_output)

# Count the number of documents each word appears in
doc_word_count = defaultdict(int)
total_docs = len(file_paths)

for file in file_paths:
    with open(file, 'r') as f:
        words_in_file = set(f.read().split())
        for word in words_in_file:
            doc_word_count[word] += 1

# Compute TF-IDF
tf_idf_scores = compute_tf_idf(word_counts, total_docs, doc_word_count)

# Print the TF-IDF Scores
print("\nTF-IDF Scores:")
for word, score in tf_idf_scores.items():
    print(f'{word}: {score:.4f}')



TF-IDF Scores:
Germany: 0.0000
I: -0.0405
and: -0.0405
beer: 0.0000
craft: 0.0000
drink: -0.0405
in: -0.0405
larger: 0.0000
love: -0.0405
