In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from typing import Any

import h5py

import cProfile
import multiprocessing
from multiprocessing.managers import DictProxy

In [2]:
from utils.samples_and_labels import (
    build_len_to_most_frequent_letters,
    build_n_grams,
    save_data,
)

In [3]:
# Load the dictionary: every line of the file becomes one entry of full_dictionary
dictionary_file_location = "./data/words_250000_train.txt"
# The context manager closes the handle even when reading raises
with open(dictionary_file_location, "r") as text_file:
    full_dictionary = text_file.read().splitlines()

In [4]:
# Hold out 20% of the words for validation; the fixed random_state keeps the split stable
words_train, words_val = train_test_split(
    full_dictionary,
    test_size=0.2,
    random_state=42,
)
print(f"Number of words_train: {len(words_train)}")
print(f"Number of words_val: {len(words_val)}")

Number of words_train: 181840
Number of words_val: 45460


In [5]:
# Guess combinations are built from the most common letters in English
GUESSABLE_CHARS = set("etaoin")
# Set of already-guessed letters handed to save_data; starts out empty
GUESSED_CHARS = set()

# Upper bound on the number of samples created per word
MAX_SAMPLES_PER_WORD = 50

# Lookup structures derived once from the full dictionary
LEN_TO_MOST_FREQUENT_LETTERS = build_len_to_most_frequent_letters(full_dictionary)
N_GRAMS = build_n_grams(full_dictionary)

In [6]:
def process_words(
    words_to_process: list[str],
    filename: str,
    start_from: int,
    end_at: int,
    batch_size: int,
    num_processes: int,
    cache: DictProxy,
    lock: Any,
):
    """Process words and save samples and labels in parallel.

    The index range from ``start_from`` to ``end_at`` is cut into contiguous
    chunks, and one worker process per non-empty chunk runs ``save_data`` on
    its chunk. The call blocks until every worker has exited.

    Args:
        words_to_process: Full word list; every worker receives the whole list
            together with the bounds of its own chunk.
        filename: Output file path forwarded to ``save_data``.
        start_from: First index of the range to process.
        end_at: End of the range; chunk ends are clamped to this value.
        batch_size: Batch size forwarded to ``save_data``.
        num_processes: Maximum number of worker processes; must be at least 1
            and smaller than the machine's CPU count.
        cache: Manager-backed dict shared by all workers.
        lock: Lock object shared by all workers.

    Raises:
        ValueError: If ``num_processes`` is outside the accepted range.
        RuntimeError: If any worker exits with a non-zero exit code.
    """
    # Explicit checks instead of `assert`, which is stripped under `python -O`
    if num_processes < 1:
        raise ValueError(f"num_processes must be at least 1, got {num_processes}")
    available_cpus = multiprocessing.cpu_count()
    if num_processes >= available_cpus:
        raise ValueError(
            f"num_processes ({num_processes}) must be smaller than the CPU count ({available_cpus})"
        )

    chunk_size = (end_at - start_from) // num_processes + 1

    processes = []
    for i in range(num_processes):
        start = start_from + i * chunk_size
        end = min(start + chunk_size, end_at)
        # Because chunk_size is rounded up, trailing chunks can start at or past
        # end_at; they and all later chunks contain no words, so no worker is needed.
        # NOTE(review): assumes save_data treats `end` as exclusive — confirm.
        if start >= end:
            break

        p = multiprocessing.Process(
            target=save_data,
            args=(
                words_to_process,
                GUESSED_CHARS,
                GUESSABLE_CHARS,
                MAX_SAMPLES_PER_WORD,
                filename,
                start,
                end,
                batch_size,
                cache,
                LEN_TO_MOST_FREQUENT_LETTERS,
                N_GRAMS,
                lock,
            ),
        )
        processes.append(p)
        p.start()

    for p in processes:
        p.join()

    # A crashed worker would otherwise leave an incomplete output file unnoticed
    failed = [(p.pid, p.exitcode) for p in processes if p.exitcode != 0]
    if failed:
        raise RuntimeError(f"Worker processes failed (pid, exitcode): {failed}")

In [7]:
# Make sure to use same cache throughout preprocessing
# manager.dict() returns a proxy object, so the same dict can be handed to the
# worker processes started by process_words.
manager = multiprocessing.Manager()
cache = manager.dict()

# Passed to every worker alongside the cache.
# NOTE(review): presumably serializes writes to the shared output file — confirm in save_data.
lock = multiprocessing.Lock()

In [8]:
# HDF5 files holding the generated training and validation samples
FILENAME_TRAIN = "./data/train_data.h5"
FILENAME_VAL = "./data/validation_data.h5"

Training data
```
0       - 60,000 : done
60,000  - 120,000: done
120,000 - 181,840: done
```

Validation data
```
0       - 45,460 : done
```

Update `start_from` and `end_at` in the cells below to choose which range of words is processed.

In [10]:
# Training split: which words to process and which file receives the samples
words_to_process = words_train
filename = FILENAME_TRAIN

# Index range of words_to_process covered by this run
start_from = 0
end_at = 181_840

# Batch size forwarded to the workers, and the number of worker processes
batch_size = 250
num_processes = 9

process_words(
    words_to_process=words_to_process,
    filename=filename,
    start_from=start_from,
    end_at=end_at,
    batch_size=batch_size,
    num_processes=num_processes,
    cache=cache,
    lock=lock,
)

Processing: 100000-100250. End: 109094. Length of all words to process: 181840
Processing: 109094-109344. End: 118188. Length of all words to process: 181840
Processing: 100250-100500. End: 109094. Length of all words to process: 181840
Processing: 118188-118438. End: 127282. Length of all words to process: 181840
Processing: 109344-109594. End: 118188. Length of all words to process: 181840
Processing: 127282-127532. End: 136376. Length of all words to process: 181840
Processing: 100500-100750. End: 109094. Length of all words to process: 181840
Processing: 136376-136626. End: 145470. Length of all words to process: 181840
Processing: 118438-118688. End: 127282. Length of all words to process: 181840
Processing: 145470-145720. End: 154564. Length of all words to process: 181840
Processing: 109594-109844. End: 118188. Length of all words to process: 181840
Processing: 127532-127782. End: 136376. Length of all words to process: 181840
Processing: 154564-154814. End: 163658. Length of al

In [13]:
# Validation split: which words to process and which file receives the samples
words_to_process = words_val
filename = FILENAME_VAL

# Index range of words_to_process covered by this run
start_from = 0
end_at = 45_460

# Batch size forwarded to the workers, and the number of worker processes
batch_size = 250
num_processes = 9

process_words(
    words_to_process=words_to_process,
    filename=filename,
    start_from=start_from,
    end_at=end_at,
    batch_size=batch_size,
    num_processes=num_processes,
    cache=cache,
    lock=lock,
)

Processing: 0-250. End: 5052. Length of all words to process: 45460
Processing: 5052-5302. End: 10104. Length of all words to process: 45460
Processing: 250-500. End: 5052. Length of all words to process: 45460
Processing: 10104-10354. End: 15156. Length of all words to process: 45460
Processing: 5302-5552. End: 10104. Length of all words to process: 45460
Processing: 15156-15406. End: 20208. Length of all words to process: 45460
Processing: 500-750. End: 5052. Length of all words to process: 45460
Processing: 20208-20458. End: 25260. Length of all words to process: 45460
Processing: 10354-10604. End: 15156. Length of all words to process: 45460
Processing: 25260-25510. End: 30312. Length of all words to process: 45460
Processing: 5552-5802. End: 10104. Length of all words to process: 45460
Processing: 15406-15656. End: 20208. Length of all words to process: 45460
Processing: 30312-30562. End: 35364. Length of all words to process: 45460
Processing: 750-1000. End: 5052. Length of all w

In [16]:
# Number of entries in the shared cache after the training and validation runs
len(cache)

9577841

In [17]:
# Read-only sanity check: only dataset handles and shapes are touched, no sample data is read
with h5py.File(FILENAME_TRAIN, "r") as hf:
    X_train_masked_words = hf["masked_words"]
    X_train_guesses = hf["previous_guesses"]
    y_train = hf["next_guess_probs"]

    # Inputs and labels must have the same number of rows
    assert (
        X_train_masked_words.shape[0]
        == X_train_guesses.shape[0]
        == y_train.shape[0]
    )
    LEN_TRAIN = y_train.shape[0]
    print(f"Number of training samples and labels: {LEN_TRAIN}")

Number of training samples and labels: 7665680


In [18]:
# Read-only sanity check: only dataset handles and shapes are touched, no sample data is read
with h5py.File(FILENAME_VAL, "r") as hf:
    X_val_masked_words = hf["masked_words"]
    X_val_guesses = hf["previous_guesses"]
    y_val = hf["next_guess_probs"]

    # Inputs and labels must have the same number of rows
    assert (
        X_val_masked_words.shape[0]
        == X_val_guesses.shape[0]
        == y_val.shape[0]
    )
    LEN_VAL = y_val.shape[0]
    print(f"Number of validation samples and labels: {LEN_VAL}")

Number of validation samples and labels: 1912161


We have 7,665,680 training and 1,912,161 validation samples and labels.

In total: 9,577,841

## Post-processing

- Replace the padding in `masked_words` with -1
- Apply smoothing to the labels
- Shuffle the data

In [19]:
def _append_rows(dataset, rows: np.ndarray) -> None:
    """Grow ``dataset`` along axis 0 and write ``rows`` into the newly added tail."""
    num_new = rows.shape[0]
    if num_new == 0:
        # Nothing to add; a ``[-0:]`` tail slice would address the whole dataset instead
        return
    old_len = dataset.shape[0]
    dataset.resize(old_len + num_new, axis=0)
    dataset[old_len:old_len + num_new] = rows


def update_h5_dataset(
    filename: str,
    masked_words: np.ndarray,
    previous_guesses: np.ndarray,
    next_guess_probs: np.ndarray,
):
    """Create or update an h5 dataset with new samples and labels.

    The file is opened in append mode. If none of the three datasets exist yet
    they are created from the given arrays, resizable along axis 0 and gzip
    compressed (level 2). If all three exist, the new rows are appended to each.

    Args:
        filename: Path of the HDF5 file to create or extend.
        masked_words: Rows for the ``masked_words`` dataset.
        previous_guesses: Rows for the ``previous_guesses`` dataset.
        next_guess_probs: Rows for the ``next_guess_probs`` dataset.

    Raises:
        ValueError: If the three arrays differ in their number of rows, or if
            the file contains only some of the three datasets.
    """
    new_data = {
        "masked_words": masked_words,
        "previous_guesses": previous_guesses,
        "next_guess_probs": next_guess_probs,
    }

    # Samples and labels are matched by row index, so a mismatch would misalign them
    row_counts = {name: rows.shape[0] for name, rows in new_data.items()}
    if len(set(row_counts.values())) != 1:
        raise ValueError(f"Arrays must have the same number of rows, got {row_counts}")

    with h5py.File(filename, "a") as hf:
        present = [name in hf for name in new_data]
        if all(present):
            for name, rows in new_data.items():
                _append_rows(hf[name], rows)
        elif not any(present):
            for name, rows in new_data.items():
                hf.create_dataset(
                    name,
                    data=rows,
                    maxshape=(None, *rows.shape[1:]),
                    compression="gzip",
                    compression_opts=2,
                )
        else:
            # Refuse to touch a half-populated file rather than failing midway
            missing = [name for name, found in zip(new_data, present) if not found]
            raise ValueError(f"{filename} is missing datasets {missing}; expected all or none to exist")

In [20]:
# Additive smoothing constant applied to the labels via apply_smoothing
SMOOTHING_FACTOR = 0.01


def replace_padding_with_minus_one(masked_words: np.ndarray) -> np.ndarray:
    """Set every axis-2 vector whose entries sum to zero to -1.

    The array is modified in place and the same object is returned.
    Presumably the zero-sum vectors are the padding positions of a word — verify
    against the code that builds ``masked_words``.
    """
    is_padding = masked_words.sum(axis=2) == 0
    masked_words[is_padding] = -1
    return masked_words


def apply_smoothing(labels: np.ndarray, smoothing_factor: float) -> np.ndarray:
    """Add ``smoothing_factor`` to every label and rescale.

    The divisor is ``1 + smoothing_factor * labels.shape[1]``, so a row that
    sums to 1 before smoothing still sums to 1 afterwards.
    """
    num_classes = labels.shape[1]
    normalizer = 1 + smoothing_factor * num_classes
    return (labels + smoothing_factor) / normalizer

In [21]:
### train data

In [22]:
# `[:]` reads each dataset fully into memory, so the arrays stay usable after the file is closed
with h5py.File(FILENAME_TRAIN, "r") as hf:
    X_train_masked_words = hf["masked_words"][:]
    X_train_guesses = hf["previous_guesses"][:]
    y_train = hf["next_guess_probs"][:]

    # Inputs and labels must have the same number of rows
    assert (
        X_train_masked_words.shape[0]
        == X_train_guesses.shape[0]
        == y_train.shape[0]
    )
    LEN_TRAIN = y_train.shape[0]
    print(f"Number of training samples and labels: {LEN_TRAIN}")

Number of training samples and labels: 7665680


In [23]:
# NOTE: not idempotent — apply_smoothing rescales its input, so running this cell twice
# on the same y_train smooths the labels twice. Reload the data before re-running.
X_train_masked_words = replace_padding_with_minus_one(X_train_masked_words)
y_train = apply_smoothing(y_train, SMOOTHING_FACTOR)

In [24]:
# Shuffle the training data with a fixed seed so the written file is reproducible
# across runs (same seed as the train/validation split)
indices = np.random.default_rng(42).permutation(LEN_TRAIN)

# The same permutation is applied to inputs and labels to keep rows aligned
X_train_masked_words = X_train_masked_words[indices]
X_train_guesses = X_train_guesses[indices]
y_train = y_train[indices]

In [25]:
FILENAME_TRAIN_POST = "./data/train_data_post.h5"
batch_size = 250

# update_h5_dataset appends to an existing file, so re-running this cell would
# silently duplicate every sample. Creating the file in exclusive mode ("x")
# makes a re-run fail immediately instead; delete the file to regenerate it.
with h5py.File(FILENAME_TRAIN_POST, "x"):
    pass

# Write the post-processed data in slices of batch_size rows
for i in range(0, LEN_TRAIN, batch_size):
    masked_words = X_train_masked_words[i:i+batch_size]
    previous_guesses = X_train_guesses[i:i+batch_size]
    next_guess_probs = y_train[i:i+batch_size]
    update_h5_dataset(FILENAME_TRAIN_POST, masked_words, previous_guesses, next_guess_probs)

In [26]:
### validation data

In [27]:
# `[:]` reads each dataset fully into memory, so the arrays stay usable after the file is closed
with h5py.File(FILENAME_VAL, "r") as hf:
    X_val_masked_words = hf["masked_words"][:]
    X_val_guesses = hf["previous_guesses"][:]
    y_val = hf["next_guess_probs"][:]

    # Inputs and labels must have the same number of rows
    assert (
        X_val_masked_words.shape[0]
        == X_val_guesses.shape[0]
        == y_val.shape[0]
    )
    LEN_VAL = y_val.shape[0]
    print(f"Number of validation samples and labels: {LEN_VAL}")

Number of validation samples and labels: 1912161


In [28]:
# NOTE: not idempotent — apply_smoothing rescales its input, so running this cell twice
# on the same y_val smooths the labels twice. Reload the data before re-running.
X_val_masked_words = replace_padding_with_minus_one(X_val_masked_words)
y_val = apply_smoothing(y_val, SMOOTHING_FACTOR)

In [29]:
# Shuffle the validation data
indices = np.arange(LEN_VAL)
np.random.shuffle(indices)

X_val_masked_words = X_val_masked_words[indices]
X_val_guesses = X_val_guesses[indices]
y_val = y_val[indices]

In [30]:
FILENAME_VAL_POST = "./data/validation_data_post.h5"
batch_size = 250

# update_h5_dataset appends to an existing file, so re-running this cell would
# silently duplicate every sample. Creating the file in exclusive mode ("x")
# makes a re-run fail immediately instead; delete the file to regenerate it.
with h5py.File(FILENAME_VAL_POST, "x"):
    pass

# Write the post-processed data in slices of batch_size rows
for i in range(0, LEN_VAL, batch_size):
    masked_words = X_val_masked_words[i:i+batch_size]
    previous_guesses = X_val_guesses[i:i+batch_size]
    next_guess_probs = y_val[i:i+batch_size]
    update_h5_dataset(FILENAME_VAL_POST, masked_words, previous_guesses, next_guess_probs)