### Project Gutenberg dictionary
Now we want to build a dictionary based on English books between 1600 - 1950 from the Project Gutenberg corpus. The idea is to create an era specific frequency dictionary to enhance the word correction in the pre-processing pipeline.

In [1]:
import glob
import pandas as pd
import re
from collections import Counter
from datasets import load_dataset
import os # To get the number of CPU cores
from tqdm import tqdm
import multiprocessing
import pickle
from functools import partial
import pickle

In [2]:
# This regex will extract words, including those with apostrophes (e.g., "don't", "artist's").
# It excludes standalone punctuation and numbers mixed with symbols.
WORD_REGEX = re.compile(r"\b[a-z0-9]+\b")
POSSESSIVE_REGEX = re.compile(r"'s\b")

GP_DIR = "./Data_project_gutenberg/"
DICTIONARY_DATA_DIR = './Dictionary_data/'

In [3]:
def get_file_list(directory_path, file_extension="csv"):
    """
    Return a list of files matching the given extension in directory_path.
    Exits if no files are found.
    """
    
    file_pattern = os.path.join(directory_path, "*." + file_extension) # e.g., './Data/*.csv'

    print(f"Finding all files matching: {file_pattern}")
    all_files = glob.glob(file_pattern, recursive=True)
    print(f"Found {len(all_files)} files to process.")

    if not all_files:
        print(f"Error: No files found. Check your DATA_DIR_PATH ({directory_path}) and file extension ({file_extension}).")
        exit() # Exit the script if no files are found
    
    return all_files, len(all_files)

In [4]:
def to_upper(text):
    """
    Convert value to string and uppercase it.
    Simpler and more robust than character-by-character handling.
    Accepts None and other non-str inputs without error.
    """
    result = ""   
    for c in str(text):
        if c.islower():
            result += c.upper()
        else:
            result += c
    return result

In [5]:
def read_project_gutenberg_book(file_path=None, title=None, author=None, language=None):
    with open(file_path, 'r') as file_content:
        start_saving    = False
        saved_text      = list()
        found_title     = None
        found_author    = ""
        found_language  = ""
        for line in file_content:
            clean_line = line.strip()
            if clean_line:
                if clean_line.startswith("*** END OF THE PROJECT GUTENBERG EBOOK"):
                    break

                split_line = clean_line.split()

                if split_line[0] == 'Title:':
                    offset = len('Title:')
                    found_title = clean_line[offset:].strip()
                    if title and found_title not in title:
                        return None, None, None, None
                    continue

                if split_line[0] == 'Author:':
                    offset = len('Author:')
                    found_author = clean_line[offset:].strip()
                    if author and found_author not in author:
                        return None, None, None, None
                    continue

                if split_line[0] == 'Language:':
                    offset = len('Language:')
                    found_language = clean_line[offset:].strip()
                    if language and found_language not in language:
                        return None, None, None, None
                    continue

                if found_title and clean_line == found_title.upper():
                    start_saving = True
                    continue

                if start_saving:
                    saved_text.append(clean_line)
                    saved_text.append(" ")

            else:
                if start_saving:
                    saved_text.append("\n")
        return found_title, found_author, found_language, "".join(saved_text)
                
    print(f"Title: {found_title}")
    print(f"Author: {found_author}")
    print(f"Language: {found_language}")

In [6]:
def worker_word_counter(file_batch=None, pg_args=dict()):
    # ... (same setup as before)
    _, _, _, text = read_project_gutenberg_book(file_path=file_batch, 
                                                title=pg_args.get('title'), 
                                                author=pg_args.get('author'), 
                                                language=pg_args.get('language'))
    
    unigram_counter = Counter()
    bigram_counter = Counter()

    if isinstance(text, str):
        # 1. Standardize and find all words
        text = re.sub(r"'s\b", "", text.lower())
        words = re.findall(r"\b[a-z0-9]+\b", text)
        
        # 2. Update Unigrams (standard word counts)
        unigram_counter.update(words)
        
        # 3. Create and update Bigrams (pairs of words)
        # Zip the words list with itself shifted by one to get (w1, w2) pairs
        if len(words) > 1:
            bigrams = [f"{words[i]} {words[i+1]}" for i in range(len(words)-1)]
            bigram_counter.update(bigrams)
            
    # Return both counters as a tuple
    return (unigram_counter, bigram_counter)

In [7]:
def driver_GP_frequency_builder(data_dir_path=None, file_extension="txt", batch_size=50, PG_args=None):
    all_files, file_count = get_file_list(data_dir_path, file_extension=file_extension)
    
    num_cores = max(1, os.cpu_count() - 2) 
    final_unigrams = Counter()
    final_bigrams = Counter()
    partial_worker = partial(worker_word_counter, pg_args=PG_args)

    print(f"Starting processing pool with {num_cores} workers...")

    with multiprocessing.Pool(processes=num_cores) as pool:
        # Using a larger chunksize for efficiency with many small files
        results = pool.imap_unordered(partial_worker, all_files, chunksize=batch_size)
        
        with tqdm(total=file_count, desc="Processing files") as pbar:
            for unigrams, bigrams in results:
                if unigrams:
                    final_unigrams.update(unigrams)
                if bigrams:
                    final_bigrams.update(bigrams)
                pbar.update(1)
            
        print("...Processing complete.")

    return final_unigrams, final_bigrams

In [8]:
unigram_counts, bigram_counts = driver_GP_frequency_builder(
    data_dir_path=GP_DIR + "*", 
    file_extension="txt", 
    PG_args={"language": "English"}
)

Finding all files matching: ./Data_project_gutenberg/*/*.txt
Found 75598 files to process.
Starting processing pool with 4 workers...


Processing files: 100%|██████████| 75598/75598 [09:45<00:00, 129.08it/s]

...Processing complete.





In [9]:
unigram_counts.most_common(200)

[('the', 69939084),
 ('of', 36367417),
 ('and', 34771680),
 ('to', 28947276),
 ('a', 24076281),
 ('in', 19593482),
 ('i', 13693634),
 ('that', 13329369),
 ('he', 13146077),
 ('was', 12606378),
 ('it', 12272177),
 ('his', 9718397),
 ('is', 8591193),
 ('with', 8572143),
 ('for', 8433910),
 ('as', 8381070),
 ('you', 8351348),
 ('had', 7554995),
 ('her', 6811634),
 ('on', 6424446),
 ('but', 6408171),
 ('at', 6394728),
 ('not', 6293547),
 ('she', 6199784),
 ('be', 6098392),
 ('by', 5279325),
 ('have', 4840001),
 ('they', 4766184),
 ('this', 4668595),
 ('from', 4596890),
 ('which', 4565175),
 ('him', 4442740),
 ('all', 4329706),
 ('were', 3977033),
 ('one', 3893614),
 ('we', 3801237),
 ('or', 3628079),
 ('are', 3602511),
 ('so', 3584900),
 ('there', 3524106),
 ('my', 3447950),
 ('an', 3410976),
 ('said', 3284459),
 ('me', 3237021),
 ('no', 3105253),
 ('their', 3010266),
 ('when', 2963006),
 ('t', 2956252),
 ('if', 2904478),
 ('would', 2881075),
 ('been', 2806384),
 ('who', 2734310),
 ('what'

In [10]:
bigram_counts.most_common(200)

[('of the', 9660466),
 ('in the', 5494338),
 ('to the', 3727589),
 ('and the', 2624167),
 ('on the', 2398563),
 ('it was', 1927731),
 ('to be', 1900618),
 ('at the', 1722586),
 ('it is', 1670519),
 ('of a', 1628586),
 ('for the', 1596607),
 ('from the', 1524104),
 ('with the', 1455091),
 ('in a', 1420436),
 ('by the', 1395189),
 ('he was', 1367840),
 ('he had', 1297660),
 ('that the', 1246975),
 ('of his', 1223769),
 ('with a', 1195597),
 ('had been', 1055322),
 ('was a', 996505),
 ('that he', 919143),
 ('don t', 899085),
 ('and i', 846223),
 ('in his', 839217),
 ('into the', 816993),
 ('all the', 793271),
 ('for a', 790144),
 ('one of', 775847),
 ('i have', 766928),
 ('there was', 754192),
 ('as the', 745067),
 ('i am', 721040),
 ('have been', 719225),
 ('out of', 712113),
 ('as a', 711816),
 ('and a', 693940),
 ('the same', 693142),
 ('is a', 690806),
 ('did not', 677127),
 ('was the', 658313),
 ('is the', 655139),
 ('she was', 650203),
 ('and he', 636755),
 ('she had', 634960),
 ('t

In [11]:
output_path = os.path.join(DICTIONARY_DATA_DIR, "project_gutenberg_word_count.txt")
with open(output_path, "w", encoding="utf-8") as f:
    # most_common() already returns (word, count) sorted desc by count
    for word, count in unigram_counts.most_common():
        f.write(f"{word}\t{count}\n")


output_path = os.path.join(DICTIONARY_DATA_DIR, "project_gutenberg_word_count.pkl")
with open(output_path, 'wb') as file:
    pickle.dump(unigram_counts, file)
print(f"Saved {len(unigram_counts)} identified words with frequencies to: {output_path}")

Saved 1337241 identified words with frequencies to: ./Dictionary_data/project_gutenberg_word_count.pkl


In [12]:
# Save Bigrams in SymSpell format
output_path_bigrams = os.path.join(DICTIONARY_DATA_DIR, "project_gutenberg_bigrams.txt")
with open(output_path_bigrams, "w", encoding="utf-8") as f:
    for bigram, count in bigram_counts.most_common():
        # bigram is already "word1 word2"
        f.write(f"{bigram}\t{count}\n")

# Also save as pickle for fast loading in other scripts
with open(os.path.join(DICTIONARY_DATA_DIR, "project_gutenberg_bigrams.pkl"), 'wb') as file:
    pickle.dump(bigram_counts, file)