<a href="https://colab.research.google.com/github/branndonm1/branndonm1/blob/main/Hangman_simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random
import torch
from torch.utils.data import Dataset, DataLoader

class MaskedWordDataset(Dataset):
    """Words served as randomly masked character-id sequences.

    Every item is a triple of equal-length tensors, padded with 0 up to the
    length of the longest word:

    * ``input_ids``  - character ids, with masked positions replaced by the
      id of the blank token ``'_'``;
    * ``target_ids`` - the true character id at masked positions, 0 elsewhere;
    * ``mask``       - 1 at masked positions, 0 elsewhere (including padding).

    Id 0 is reserved for padding, letters get ids ``1..len(vocab)`` in sorted
    order, and ``'_'`` takes the next free id.
    """

    def __init__(self, word_list, mask_prob=0.3):
        self.words = word_list
        self.vocab = sorted(set("".join(self.words)))

        # Letters start at 1 so that 0 stays free for padding.
        self.char2idx = {}
        for position, char in enumerate(self.vocab, start=1):
            self.char2idx[char] = position
        self.char2idx['_'] = len(self.char2idx) + 1  # blank token comes last
        self.idx2char = {index: char for char, index in self.char2idx.items()}

        self.mask_prob = mask_prob
        self.max_len = max(map(len, self.words))

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        word = self.words[idx]
        blank_id = self.char2idx.get('_', 0)

        input_ids, target_ids, mask = [], [], []
        for char in word:
            # Each position is masked independently with probability mask_prob.
            if random.random() < self.mask_prob:
                input_ids.append(blank_id)
                target_ids.append(self.char2idx[char])
                mask.append(1)
            else:
                input_ids.append(self.char2idx.get(char, 0))
                target_ids.append(0)
                mask.append(0)

        # Right-pad all three sequences with zeros up to the longest word.
        padding = [0] * (self.max_len - len(input_ids))
        input_ids += padding
        target_ids += padding
        mask += padding

        return torch.tensor(input_ids), torch.tensor(target_ids), torch.tensor(mask)

import torch.nn as nn

class MaskedCharModel(nn.Module):
    """Bidirectional LSTM that scores a character class at every position.

    Args:
        vocab_size: Number of non-padding tokens. The embedding table has
            ``vocab_size + 2`` rows (row 0 is the padding row) and the
            classifier emits ``vocab_size + 1`` scores per position, i.e. one
            for each id in ``0..vocab_size``.
        embedding_dim: Size of each character embedding.
        hidden_dim: Hidden size of the LSTM in each direction.
    """

    def __init__(self, vocab_size, embedding_dim=64, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size + 2,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward hidden states are concatenated, hence 2 * hidden_dim.
        self.classifier = nn.Linear(in_features=2 * hidden_dim, out_features=vocab_size + 1)

    def forward(self, x):
        """Map a batch-first tensor of token ids to per-position logits.

        The result has one trailing axis of size ``vocab_size + 1`` appended
        to the shape of ``x``.
        """
        encoded, _ = self.lstm(self.embedding(x))
        return self.classifier(encoded)


def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and return the mean loss per contributing batch.

    Args:
        model: Module mapping a batch of token ids to per-position logits.
        dataloader: Iterable of ``(input_ids, target_ids, mask)`` batches.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, targets)`` on the
            masked positions only.
        device: Device every batch is moved to before the forward pass.

    Returns:
        The average of the per-batch losses as a float. Batches in which no
        position is masked are skipped (they would hand the criterion an
        empty selection and yield NaN under mean reduction); if no batch
        contributes at all, ``0.0`` is returned.
    """
    model.train()
    total_loss = 0.0
    counted_batches = 0

    for x, y, mask in dataloader:
        x, y, mask = x.to(device), y.to(device), mask.to(device)

        # Flattened boolean selector of the positions that carry a target.
        selected = mask.reshape(-1) == 1
        if not selected.any():
            # Nothing to learn from in this batch; avoid a NaN loss.
            continue

        optimizer.zero_grad()

        logits = model(x)
        logits = logits.reshape(-1, logits.size(-1))  # one row per position
        targets = y.reshape(-1)

        loss = criterion(logits[selected], targets[selected])
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        counted_batches += 1

    if counted_batches == 0:
        return 0.0
    return total_loss / counted_batches

def predict_blanks(model, input_str, char2idx, idx2char, device, max_len=20):
    """Rank candidate letters for the blank (``'_'``) positions of a pattern.

    The pattern is encoded with ``char2idx`` (unknown characters become 0),
    zero-padded to ``max_len`` and scored by ``model``. At every blank
    position, letters that are already visible in the pattern are ruled out
    before the softmax. Probabilities of ids ``1 .. len(idx2char) - 1`` are
    then summed over all blank positions and normalised to sum to one.

    Returns:
        A list of ``(letter, score)`` pairs ordered by descending score, or
        an empty list when nothing received a positive probability (for
        example when the pattern has no blanks).
    """
    model.eval()

    padded_ids = [char2idx.get(ch, 0) for ch in input_str]
    padded_ids.extend([0] * (max_len - len(padded_ids)))

    blank_positions = [pos for pos, ch in enumerate(input_str) if ch == '_']

    with torch.no_grad():
        batch = torch.tensor([padded_ids]).to(device)
        logits = model(batch)[0]  # drop the batch axis: one row per position

        # Ids of letters that are already revealed somewhere in the pattern.
        revealed = {ch for ch in input_str if ch != '_' and ch in char2idx}
        banned_ids = [char2idx[ch] for ch in revealed]

        # A revealed letter cannot also be hiding behind a blank.
        for pos in blank_positions:
            logits[pos][banned_ids] = float('-inf')

        probs = torch.softmax(logits, dim=-1)

    # Accumulate each letter's probability mass across the blank positions.
    scores = {}
    for pos in blank_positions:
        for token_id in range(1, len(idx2char)):
            letter = idx2char[token_id]
            weight = probs[pos][token_id].item()
            if weight > 0:
                scores[letter] = scores.get(letter, 0) + weight

    total = sum(scores.values())
    if total == 0:
        return []

    ranked = sorted(scores.items(), key=lambda item: -item[1])
    return [(letter, weight / total) for letter, weight in ranked]


import torch.optim as optim

# Read the training words: one per line, purely alphabetic entries only, lower-cased.
with open("words_250000_train.txt") as f:
    word_list = [
        stripped.lower()
        for stripped in map(str.strip, f)
        if stripped.isalpha()
    ]

# Wrap the words in the masking dataset and serve them in shuffled mini-batches.
dataset = MaskedWordDataset(word_list)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# The vocabulary size handed to the model counts every entry of char2idx,
# i.e. the letters plus the blank token.
model = MaskedCharModel(vocab_size=len(dataset.char2idx)).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Ten passes over the data, reporting the mean loss after each one.
for epoch in range(10):
    loss = train(model, dataloader, optimizer, criterion, device)
    print(f"Epoch {epoch+1}: Loss = {loss:.4f}")


Epoch 1: Loss = 2.1929
Epoch 2: Loss = 2.0172
Epoch 3: Loss = 1.9607
Epoch 4: Loss = 1.9317
Epoch 5: Loss = 1.9128
Epoch 6: Loss = 1.8956
Epoch 7: Loss = 1.8822
Epoch 8: Loss = 1.8703
Epoch 9: Loss = 1.8612
Epoch 10: Loss = 1.8580


In [None]:
# Ask the model which letters could fill the blanks of a partially revealed word.
# Letters already visible in the pattern (here t, e and r) are excluded by
# predict_blanks, so they never show up in the ranking.
input_str = "te__er"
result = predict_blanks(
    model,
    input_str,
    char2idx=dataset.char2idx,
    idx2char=dataset.idx2char,
    device=device,
)
print(result)  # (letter, normalised score) pairs, best candidate first


[('n', 0.19607991624719115), ('l', 0.1852843710593658), ('k', 0.12297534124521213), ('s', 0.08546447848587933), ('d', 0.055260395119455005), ('a', 0.047677083926141665), ('z', 0.04291503668275719), ('p', 0.04094336223339335), ('m', 0.03568729037386653), ('i', 0.03283926021226334), ('g', 0.027158040980432615), ('h', 0.025884854876116716), ('c', 0.020566631784606458), ('w', 0.017789772830833696), ('v', 0.014827331404303964), ('u', 0.013889362560820176), ('b', 0.012228238827427605), ('f', 0.012087789063027396), ('o', 0.004583039529870631), ('x', 0.0034127418142365605), ('y', 0.0023263112271546663), ('j', 7.607764450745664e-05), ('q', 4.327187113656015e-05)]


In [None]:
def hangman_simulator(target_word, guess_fn, max_attempts=6, verbose=True):
    """Play one game of hangman against ``guess_fn`` and report the outcome.

    Args:
        target_word: The hidden word to be uncovered.
        guess_fn: Callable ``guess_fn(masked_word, guessed_letters)`` that
            returns the next letter to try. ``masked_word`` is the current
            pattern with ``'_'`` for hidden positions; ``guessed_letters``
            is the live set of letters tried so far.
        max_attempts: Number of lives. A life is lost for every guess that
            does not occur in the word and for every guess that repeats an
            earlier one.
        verbose: When true, print a line for every guess and the final result.

    Returns:
        ``True`` if every letter was revealed before the lives ran out,
        ``False`` otherwise.

    Charging a life for repeated guesses guarantees termination: without it,
    a ``guess_fn`` that keeps returning an already-revealed letter would make
    no progress and lose no lives, so the loop would never end.
    """
    guessed_letters = set()
    attempts_left = max_attempts
    current_masked = ['_'] * len(target_word)

    if verbose:
        print(f"Target word: {'_' * len(target_word)} (hidden)")

    while attempts_left > 0 and '_' in current_masked:
        guess = guess_fn("".join(current_masked), guessed_letters)

        if guess in guessed_letters:
            # A repeat cannot reveal anything new; treat it as a wasted life.
            attempts_left -= 1
            if verbose:
                print(f"⚠️ Repeated guess: {guess} → {''.join(current_masked)} | Lives left: {attempts_left}")
            continue
        guessed_letters.add(guess)

        # Compare letter by letter: a substring test would accept "" or
        # multi-letter guesses as "correct" without revealing anything.
        if any(letter == guess for letter in target_word):
            current_masked = [
                guess if letter == guess else shown
                for letter, shown in zip(target_word, current_masked)
            ]
            if verbose:
                print(f"✅ Correct guess: {guess} → {''.join(current_masked)}")
        else:
            attempts_left -= 1
            if verbose:
                print(f"❌ Wrong guess: {guess} → {''.join(current_masked)} | Lives left: {attempts_left}")

    won = '_' not in current_masked
    if verbose:
        if won:
            print(f"🎉 You won! Word was: {target_word}")
        else:
            print(f"💀 You lost. Word was: {target_word}")
    return won

# Module-level log of every letter that `guess` has handed out.
used_guesses = set()

def guess(masked_word, guessed_letters):
    """Pick the next hangman letter for ``masked_word``.

    Candidates are tried in this order: the letters ranked by the model for
    the current pattern, then the plain alphabet. The first candidate that
    is not in ``guessed_letters`` is recorded in the module-level
    ``used_guesses`` set and returned. If every candidate has already been
    guessed, ``'a'`` is returned as a last resort.
    """
    ranked = predict_blanks(
        model,
        masked_word,
        dataset.char2idx,
        dataset.idx2char,
        device,
        max_len=dataset.max_len,
    )

    # Model suggestions first, alphabetical fallback afterwards.
    candidates = [letter for letter, _score in ranked]
    candidates.extend('abcdefghijklmnopqrstuvwxyz')

    for letter in candidates:
        if letter not in guessed_letters:
            used_guesses.add(letter)
            return letter

    return 'a'  # everything has been tried already

# Start from an empty guess log, then play one verbose demonstration game.
used_guesses = set()

hangman_simulator(target_word="principal", guess_fn=guess)


Target word: _________ (hidden)
❌ Wrong guess: e → _________ | Lives left: 5
❌ Wrong guess: s → _________ | Lives left: 4
✅ Correct guess: i → __i__i___
✅ Correct guess: n → __in_i___
❌ Wrong guess: t → __in_i___ | Lives left: 3
✅ Correct guess: a → __in_i_a_
✅ Correct guess: c → __inci_a_
✅ Correct guess: l → __inci_al
✅ Correct guess: r → _rinci_al
✅ Correct guess: p → principal
🎉 You won! Word was: principal


True

In [None]:
def run_hangman_simulation(model, dataset, word_list_1000, num_games=1000, verbose_every=0):
    """Play ``num_games`` hangman games with the model and report the win rate.

    Args:
        model: Trained model used to rank letters; games run on the device
            its parameters live on.
        dataset: Object providing ``char2idx``, ``idx2char`` and ``max_len``
            for encoding patterns.
        word_list_1000: Pool of target words; one is drawn at random (with
            replacement) for every game.
        num_games: Number of games to play.
        verbose_every: When greater than zero, every ``verbose_every``-th
            game (starting with the first) is played verbosely.

    Returns:
        The fraction of games won, or ``0.0`` when no game was played.

    The function is self-contained: it keeps no state between games and
    does not touch any module-level variable, so it works on a fresh kernel
    regardless of which other cells have been executed.
    """
    device = next(model.parameters()).device

    def pick_letter(masked_word, guessed_letters):
        """Best not-yet-guessed letter: model ranking first, alphabet second."""
        ranked = predict_blanks(
            model,
            masked_word,
            dataset.char2idx,
            dataset.idx2char,
            device,
            max_len=dataset.max_len,
        )
        for char, _ in ranked:
            if char not in guessed_letters:
                return char
        for c in 'abcdefghijklmnopqrstuvwxyz':
            if c not in guessed_letters:
                return c
        return 'a'

    wins = 0
    total = 0
    for i in range(num_games):
        target_word = random.choice(word_list_1000)
        verbose = verbose_every > 0 and i % verbose_every == 0
        win = hangman_simulator(target_word, guess_fn=pick_letter, max_attempts=6, verbose=verbose)
        wins += int(win)
        total += 1

    # Guard against num_games == 0, which would otherwise divide by zero.
    win_rate = wins / total if total else 0.0
    print(f"\n✅ Hangman Strategy Win Rate: {wins}/{total} = {win_rate:.2%}")
    return win_rate


# Draw 1000 evaluation words at random from the words the dataset was built on.
# Note: these are words the model has seen during training, not held-out words.
word_list_1000 = random.sample(dataset.words, k=1000)

# Play 1000 games against that sample; the win rate is the cell's displayed value.
run_hangman_simulation(
    model=model,
    dataset=dataset,
    word_list_1000=word_list_1000,
    num_games=1000,
)



✅ Hangman Strategy Win Rate: 504/1000 = 50.40%


0.504

In [None]:
import nltk
nltk.download('words')
from nltk.corpus import words

# Build an external evaluation pool from the NLTK word corpus: purely
# alphabetic entries of at least four characters, lower-cased.
all_words = [
    entry.lower()
    for entry in words.words()
    if len(entry) >= 4 and entry.isalpha()
]
word_list_external = random.sample(all_words, k=1000)  # or fewer if you want

# The sample is NOT restricted to the model's vocabulary. To restrict it,
# keep only the words whose characters all appear in dataset.char2idx:
#   allowed_chars = set(dataset.char2idx.keys())
#   word_list_external = [w for w in word_list_external
#                         if all(c in allowed_chars for c in w)]

# Simulate 1000 games on the external sample.
run_hangman_simulation(
    model=model,
    dataset=dataset,
    word_list_1000=word_list_external,
    num_games=1000,
)


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!



✅ Hangman Strategy Win Rate: 553/1000 = 55.30%


0.553

In [None]:
# Persist the trained weights together with both character/index lookup
# tables, so the model can be rebuilt later from the checkpoint file.
torch.save(
    obj={
        'model_state_dict': model.state_dict(),
        'char2idx': dataset.char2idx,
        'idx2char': dataset.idx2char,
    },
    f='masked_char_model.pth',
)


In [None]:
# Read the checkpoint back, placing its tensors on the active device.
checkpoint = torch.load('masked_char_model.pth', map_location=device)

# Restore the lookup tables; the model's vocabulary size is the number of
# entries in char2idx, the same quantity that was used when it was created.
char2idx, idx2char = checkpoint['char2idx'], checkpoint['idx2char']
vocab_size = len(char2idx)

# Rebuild the architecture, load the saved weights and switch to inference mode.
model = MaskedCharModel(vocab_size=vocab_size).to(device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()


_pp_e
