In [31]:
import os
import re
from collections import defaultdict, Counter

In [32]:
def find_vowel_duplications(text):
    pattern = r'\b(\w*([aeiouAEIOU])\2\w*)\b'
    return [match[0] for match in re.findall(pattern, text)]

In [33]:
def counter(vowel_frequencies):
    vowels = 'aeiou'
    for vowel in vowels:
        print(f"Top 3 words with duplication of vowel '{vowel}':")
        sorted_frequencies = sorted(vowel_frequencies.items(), key=lambda x: x[1], reverse=True)
        count = 0
        for word, freq in sorted_frequencies:
            if vowel in word:
                print(f"{word}: {freq}")
                count += 1
            if count >= 3:
                break
        print()

In [34]:
def get_filenames(path = "blogs"):
    return os.listdir(path)

In [35]:
def write_result(filename, tokens_per_file):
    with open(f"{filename}.txt", 'w', encoding='utf-8') as output:
        for file_name, words in tokens_per_file.items():
            output.write(f"{file_name}:\n")
            output.write(f"{words}, ")
            output.write("\n\n")

In [36]:

def normalize(token):
    for vowel in 'aeiouAEIOU':
        token = re.sub(f'{vowel}{{2,}}', f'{vowel}{vowel}', token)
    return token

In [37]:
filenames = os.listdir("blogs")
male_files = [f for f in filenames if f.startswith('M')]
female_files = [f for f in filenames if f.startswith('F')]
vowel_frequencies = defaultdict(int)
male_vowel_frequencies = defaultdict(int)
female_vowel_frequencies = defaultdict(int)

tokens_per_file = {}
male_tokens_per_file = {}
female_tokens_per_file = {}

def process_files(files, tokens_per_file, vowel_frequencies, should_normalize=False):
    results = {vowel: Counter() for vowel in 'aeiou'}
    for file_name in files:
        with open(os.path.join("blogs", file_name), 'r', encoding='utf-8') as file:
            text = file.read()
            tokens = find_vowel_duplications(text)
            if tokens:
                tokens_per_file[file_name] = tokens
                for token in tokens:
                    if should_normalize:
                        normalize(token)
                    token = token.lower()  # Case folding
                    vowel_frequencies[token] += 1
    return results

all_results = process_files(filenames, tokens_per_file, vowel_frequencies)
male_results = process_files(male_files, male_tokens_per_file, male_vowel_frequencies)
female_results = process_files(female_files, female_tokens_per_file, female_vowel_frequencies)

counter(vowel_frequencies)
write_result("results", tokens_per_file)
print("********************************************************************************")
print("Male")
counter(male_vowel_frequencies)
write_result("results_male", tokens_per_file)
print("********************************************************************************")
counter(female_vowel_frequencies)
write_result("results_female", tokens_per_file)
print("Female")

Top 3 words with duplication of vowel 'a':
afternoon: 181
asleep: 146
agree: 106

Top 3 words with duplication of vowel 'e':
been: 2761
see: 2615
feel: 1505

Top 3 words with duplication of vowel 'i':
looking: 598
feeling: 497
seeing: 208

Top 3 words with duplication of vowel 'o':
good: 2875
too: 1980
school: 1379

Top 3 words with duplication of vowel 'u':
queen: 47
volunteers: 33
understood: 27

********************************************************************************
Male
Top 3 words with duplication of vowel 'a':
afternoon: 68
asleep: 67
football: 51

Top 3 words with duplication of vowel 'e':
been: 1389
see: 1286
need: 651

Top 3 words with duplication of vowel 'i':
looking: 305
feeling: 200
seeing: 115

Top 3 words with duplication of vowel 'o':
good: 1514
too: 862
school: 669

Top 3 words with duplication of vowel 'u':
volunteers: 21
queen: 19
understood: 15

********************************************************************************
Top 3 words with duplication of 