In [None]:
import os
import string
from collections import Counter
from operator import itemgetter

# The corpus: twenty documents named "01.txt.txt" through "20.txt.txt".
# The two-digit prefix is what the index later uses as the file identifier.
files = [f"{number:02d}.txt.txt" for number in range(1, 21)]

In [None]:
def clean_stop_words(filepath, encoding="utf-8"):
    '''
    Read a text file and return its words in normalized form.

    Each line is lower-cased, has every punctuation character except the
    apostrophe deleted, and is then split on whitespace. Despite the name,
    nothing is filtered out here: the function only tokenizes the file (it is
    used to load the stop-word list itself).

    Parameters:
        filepath (str): Path of the file to read.
        encoding (str, optional): File encoding (default: "utf-8").
            Bytes that cannot be decoded are silently dropped.

    Returns:
        list[str]: The normalized words in file order, duplicates included.
    '''
    # Deletion table covering all punctuation except the apostrophe, so that
    # contractions such as "don't" survive intact.
    strip_table = str.maketrans("", "", string.punctuation.replace("'", ""))

    with open(filepath, "r", encoding=encoding, errors="ignore") as handle:
        return [
            token
            for raw_line in handle
            for token in raw_line.lower().strip().translate(strip_table).split()
        ]


{'02': 5, '19': 5, '03': 3, '14': 2, '01': 1, '07': 1, '08': 1, '16': 1, '20': 1}


In [None]:
#without lambda
def index(filepaths, encoding="utf-8"):
    '''
    Build a combined word-frequency index across multiple text files.

    Parameters:
        filepaths (list[str]): Paths of the files to process.
        encoding (str, optional): File encoding (default: "utf-8").
            Bytes that cannot be decoded are silently dropped.

    Returns:
        dict[str, list[list]]: Maps each indexed word to a list of
        [file_id, count] pairs, where file_id is the first two characters of
        the file's name (directory components are ignored) and count is the
        number of times the word occurs in that file. Each list is sorted by
        count in descending order; ties keep the order in which the files
        were given.

    All words are lower-cased and stripped of punctuation (apostrophes are
    kept). Words contained in the module-level `stop_words` collection are
    left out of the index.
    '''
    punctuation = string.punctuation.replace("'", "")
    translator = str.maketrans("", "", punctuation)

    # A set gives constant-time membership tests; every word in every file is
    # checked against it.
    excluded = set(stop_words)

    combined = {}

    for filepath in filepaths:
        # Take the id from the file name so a leading directory does not
        # end up being used as the identifier.
        file_id = os.path.basename(filepath)[:2]

        word_freq = {}
        with open(filepath, "r", encoding=encoding, errors="ignore") as file:
            for line in file:
                for word in line.lower().translate(translator).split():
                    # Skip stop words while counting. Deleting them from a
                    # list that is being iterated skips the element after
                    # each removal, which let consecutive stop words through.
                    if word not in excluded:
                        word_freq[word] = word_freq.get(word, 0) + 1

        for word, count in word_freq.items():
            combined.setdefault(word, []).append([file_id, count])

    # Highest count first; list.sort is stable, so equal counts stay in
    # file order.
    for postings in combined.values():
        postings.sort(key=itemgetter(1), reverse=True)

    return combined

In [None]:
#with lambda functions
def index_with_lambda(filepaths, encoding="utf-8"):
    '''
    Build a combined word-frequency index across multiple text files, using
    map/filter with lambda expressions for the per-file processing.

    Parameters:
        filepaths (list[str]): Paths of the files to process.
        encoding (str, optional): File encoding (default: "utf-8").
            Bytes that cannot be decoded are silently dropped.

    Returns:
        dict[str, list[list]]: Maps each indexed word to a list of
        [file_id, count] pairs, where file_id is the first two characters of
        the file's name (directory components are ignored) and count is the
        number of times the word occurs in that file. Each list is sorted by
        count in descending order; ties keep the order in which the files
        were given.

    All words are lower-cased and stripped of punctuation (apostrophes are
    kept). Words contained in the module-level `stop_words` collection are
    left out of the index.
    '''
    punctuation = string.punctuation.replace("'", "")
    translator = str.maketrans("", "", punctuation)

    # A set gives constant-time membership tests inside the filter below.
    excluded = set(stop_words)

    combined = {}

    for filepath in filepaths:
        # Take the id from the file name so a leading directory does not
        # end up being used as the identifier.
        file_id = os.path.basename(filepath)[:2]

        with open(filepath, "r", encoding=encoding, errors="ignore") as file:
            # One token list per line, produced lazily.
            token_lists = map(
                lambda line: line.lower().translate(translator).split(),
                file,
            )
            # Flatten the per-line lists and drop stop words in a single pass
            # (repeatedly concatenating lists with sum() is quadratic).
            kept_words = filter(
                lambda word: word not in excluded,
                (word for tokens in token_lists for word in tokens),
            )
            # The pipeline reads from the open file, so it has to be consumed
            # before the with-block closes it. Counter tallies in one pass
            # instead of calling list.count once per distinct word.
            word_freq = Counter(kept_words)

        for word, count in word_freq.items():
            combined.setdefault(word, []).append([file_id, count])

    # Highest count first; list.sort is stable, so equal counts stay in
    # file order.
    for postings in combined.values():
        postings.sort(key=lambda pair: pair[1], reverse=True)

    return combined

In [None]:
def search(words, file_dict):
    '''
    Score files against a free-text query using a word index.

    Parameters:
        words (str): Query text. It is lower-cased, stripped of punctuation
            (apostrophes are kept) and split on whitespace, i.e. normalized
            the same way index() and index_with_lambda() normalize file
            contents, so "Microsoft," matches the key "microsoft".
        file_dict (dict): Index mapping each word to a list of
            [file_id, count] pairs.

    Returns:
        dict: Maps file_id to the summed counts of all query words found in
        that file. Files containing none of the query words are absent, and
        query words missing from the index are ignored. A word repeated in
        the query contributes once per repetition.
    '''
    # Same deletion table the index builders use for their keys.
    translator = str.maketrans("", "", string.punctuation.replace("'", ""))

    file_sums = {}
    for word in words.lower().translate(translator).split():
        for file_id, count in file_dict.get(word, ()):
            file_sums[file_id] = file_sums.get(file_id, 0) + count
    return file_sums

# Load the stop-word list once. index() and index_with_lambda() read this
# module-level name, so this line has to run before either of them is called.
stop_words = clean_stop_words("stopwords.txt")

In [None]:
# Build the word index over every file listed in `files`.
word_dicts = index_with_lambda(files)

# Example query: per-file summed counts for the words "microsoft" and "will".
print(search("Microsoft will", word_dicts))