In [5]:
import collections
import os
def load_words_by_length(path, encoding="utf-8"):
    """Read a one-word-per-line corpus and group its words by length.

    Each line is stripped of surrounding whitespace and lower-cased. Lines
    that are not purely alphabetic (per ``str.isalpha``, which also rejects
    empty lines) are skipped.

    Args:
        path: Path of the corpus text file.
        encoding: Text encoding used to decode the file. Pinned explicitly so
            the result does not depend on the platform's default locale.

    Returns:
        A ``collections.defaultdict(list)`` mapping word length to the list of
        words of that length, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    grouped = collections.defaultdict(list)
    with open(path, 'r', encoding=encoding) as f:
        for line in f:
            word = line.strip().lower()
            if word.isalpha():
                grouped[len(word)].append(word)
    return grouped


def select_sample_lengths(lengths, edge_count=5):
    """Pick the first and last ``edge_count`` entries of ``lengths``.

    When ``lengths`` has at most ``2 * edge_count`` entries the head and tail
    slices would overlap, so every entry is returned exactly once instead of
    being repeated.

    Args:
        lengths: Sequence of lengths (expected to be sorted by the caller).
        edge_count: How many entries to take from each end.

    Returns:
        A new list containing the selected entries in their original order.
    """
    if edge_count <= 0:
        # Guard: lengths[-0:] would be the whole sequence, not an empty tail.
        return []
    if len(lengths) <= 2 * edge_count:
        return list(lengths)
    return list(lengths[:edge_count]) + list(lengths[-edge_count:])


corpus_file = os.path.join('Data', 'corpus.txt')

# Start from an empty defaultdict so the name is defined (and indexable by any
# length) even when the corpus file is missing.
words_by_length = collections.defaultdict(list)

print(f"Starting to process {corpus_file}...")

# Check if the file exists before trying to open it
if not os.path.exists(corpus_file):
    print(f"Error: The file '{corpus_file}' was not found.")
    print("Please make sure your repository structure is correct.")
else:
    words_by_length = load_words_by_length(corpus_file)

    print("Corpus processing complete.")

    # --- Optional: Print some stats to verify ---
    print(f"Found words for {len(words_by_length)} different lengths.")

    lengths_sorted = sorted(words_by_length.keys())

    if lengths_sorted:
        print("\n--- Word Counts Per Length (Sample) ---")
        # Shortest and longest lengths, without repeating any of them when
        # the corpus has fewer than ten distinct lengths.
        sample_lengths = select_sample_lengths(lengths_sorted, 5)
        for length in sample_lengths:
            print(f"Length {length}: {len(words_by_length[length])} words")

        # Single source of truth for the example length, so the lookup and
        # the printed message can never disagree.
        example_length = 7
        if example_length in words_by_length:
            print(f"\nExample (first 5 words of length {example_length}):")
            print(words_by_length[example_length][:5])

Starting to process Data/corpus.txt...
Corpus processing complete.
Found words for 24 different lengths.

--- Word Counts Per Length (Sample) ---
Length 1: 46 words
Length 2: 84 words
Length 3: 388 words
Length 4: 1169 words
Length 5: 2340 words
Length 20: 40 words
Length 21: 16 words
Length 22: 8 words
Length 23: 3 words
Length 24: 1 words

Example (first 5 words of length 8):
['selamlik', 'unsealed', 'wanworth', 'sonorous', 'reborrow']
