In [1]:
def build_dictionary(dictionary_file_location):
    """Read a word list from a text file, one entry per line.

    Args:
        dictionary_file_location: Path of the text file to read.

    Returns:
        list[str]: The file's lines, without line terminators, in file order.
    """
    # The context manager closes the handle even if read() raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open(dictionary_file_location, "r") as text_file:
        return text_file.read().splitlines()

In [2]:
# Load the training word list.
# NOTE(review): `d` is not referenced again in the cells shown here; the same
# file is read a second time into `words` further down.
d = build_dictionary("words_250000_train.txt")

In [3]:

from collections import defaultdict, Counter

def most_common_character_by_length(words):
    """Find the most frequent character among words of each length.

    Args:
        words: Iterable of strings.

    Returns:
        dict[int, str]: Maps each word length that occurs in ``words`` to the
        character with the highest total count over all words of that length.
        Keys appear in the order the lengths are first seen.
    """
    # One running character tally per word length, filled in a single pass.
    tallies = defaultdict(Counter)
    for entry in words:
        tallies[len(entry)].update(entry)

    return {
        size: tally.most_common(1)[0][0]
        for size, tally in tallies.items()
    }

In [4]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import random
import string

# Vocabulary: the 26 lowercase letters are numbered 1..26, and index 0 is
# reserved for '_' (an unrevealed position; also used as the padding value).
ALPHABET = string.ascii_lowercase
ALPHABET_IDX = {char: position for position, char in enumerate(ALPHABET, start=1)}
ALPHABET_IDX['_'] = 0
words = build_dictionary("words_250000_train.txt")
# Every encoded word is padded up to the length of the longest dictionary word.
MAX_WORD_LENGTH = max(map(len, words))

class HangmanDataset(Dataset):
    """Synthetic Hangman training samples derived from a word list.

    Each sample is a tuple ``(input_state, guessed_mask, target_letter)``:

    * ``input_state``: the word as ``ALPHABET_IDX`` values, with every letter
      that has not been guessed replaced by the index of ``'_'``.
    * ``guessed_mask``: 0/1 list with one slot per letter of ``ALPHABET``
      (slot 0 is 'a', slot 25 is 'z'), set to 1 for guessed letters.
    * ``target_letter``: ``ALPHABET_IDX`` value (1..26) of one letter of the
      word that has not been guessed yet.

    Args:
        words: Iterable of words made of characters present in ``ALPHABET_IDX``.
        max_samples: Upper bound on the number of samples kept; a random
            subset is drawn when more are generated. ``None`` keeps them all.
    """

    def __init__(self, words, max_samples=10000):
        self.samples = []
        for word in words:
            self.samples.extend(self.create_samples(word))
        if max_samples is not None:
            # Cap the request at the population size: random.sample raises
            # ValueError when asked for more items than are available.
            self.samples = random.sample(self.samples, min(max_samples, len(self.samples)))

    def create_samples(self, word):
        """Build one sample per prefix of ``word`` treated as the guessed letters.

        For ``i`` in ``1 .. len(word) - 1`` the letters of ``word[:i]`` count
        as guessed. A prefix that already covers every distinct letter of the
        word yields no sample, since nothing is left to predict.

        Returns:
            list[tuple[list[int], list[int], int]]: The generated samples.
        """
        samples = []
        word_indices = [ALPHABET_IDX[char] for char in word]
        for i in range(1, len(word)):  # Start from 1 to ensure at least one letter is guessed
            guessed_indices = set(word_indices[:i])
            # Letter indices are 1-based (0 is '_'), so each mask slot must be
            # looked up through ALPHABET_IDX. Iterating 0..25 directly shifted
            # the mask by one slot and could never mark 'z'.
            guessed_mask = [1 if ALPHABET_IDX[char] in guessed_indices else 0 for char in ALPHABET]
            input_state = [idx if idx in guessed_indices else ALPHABET_IDX['_'] for idx in word_indices]
            remaining_letters = list(set(word_indices) - guessed_indices)
            if remaining_letters:
                target_letter = random.choice(remaining_letters)
                samples.append((input_state, guessed_mask, target_letter))
        return samples

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as tensors, with the state padded to MAX_WORD_LENGTH.

        Returns:
            tuple: ``(input_state, guessed_mask, target_letter)`` with dtypes
            long, float and long respectively.
        """
        input_state, guessed_mask, target_letter = self.samples[idx]
        input_state = input_state + [ALPHABET_IDX['_']] * (MAX_WORD_LENGTH - len(input_state))  # Padding
        return (torch.tensor(input_state, dtype=torch.long),
                torch.tensor(guessed_mask, dtype=torch.float),
                torch.tensor(target_letter, dtype=torch.long))

# Word-list loader (re-declares build_dictionary with the same behaviour as the first cell); the dataset is constructed below.
def build_dictionary(file_path):
    """Return the lines of the text file at ``file_path`` as a list of strings.

    Line terminators are stripped and file order is kept.
    """
    with open(file_path, 'r') as handle:
        return handle.read().splitlines()


# Generate samples from every dictionary word; the constructor then keeps a
# random subset of 10000 of them.
dataset = HangmanDataset(words)


In [5]:
# Fraction of the samples used for training; the remainder is held out for testing.
train_ratio = .8
train_size = int(train_ratio * len(dataset))

# NOTE(review): no generator/seed is passed here, so the split is presumably
# different on every run — confirm whether reproducibility is needed.
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, len(dataset) - train_size])

In [6]:
# Batches of 128 samples; only the training loader reshuffles between epochs.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128)

In [7]:
class HangmanLSTM(nn.Module):
    """Two-layer LSTM that scores candidate letters for a Hangman state.

    Args:
        input_dim: Size of the embedding vocabulary.
        embedding_dim: Width of each character embedding.
        hidden_dim: Hidden size of the LSTM layers.
        output_dim: Number of output scores per sample.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        blank_index = ALPHABET_IDX['_']
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=blank_index)
        # Every time step receives the character embedding plus the
        # 26-wide guessed-letter mask.
        lstm_input_size = embedding_dim + 26
        self.lstm = nn.LSTM(lstm_input_size, hidden_dim, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, guessed):
        """Score the output classes for a batch of states.

        Args:
            x: Integer tensor of character indices, indexed (batch, position).
            guessed: Float tensor with one guessed-letter mask per batch row.

        Returns:
            Tensor of ``output_dim`` scores per batch row, computed from the
            LSTM output at the final position.
        """
        embedded = self.embedding(x)
        steps = embedded.size(1)
        # Repeat the per-sample mask along the sequence axis so it can be
        # concatenated with the embedding at every position.
        tiled_mask = guessed.unsqueeze(1).expand(-1, steps, -1)
        sequence_out, _ = self.lstm(torch.cat((embedded, tiled_mask), dim=2))
        return self.fc(sequence_out[:, -1])


# 26 letters plus the '_' entry, which ALPHABET_IDX maps to index 0.
INPUT_DIM = len(ALPHABET) + 1
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
# Output classes use the same numbering as ALPHABET_IDX: training targets are
# the values 1..26, and class 0 corresponds to '_'.
OUTPUT_DIM = len(ALPHABET) + 1

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HangmanLSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)


In [9]:
# Multi-class loss over the model's raw output scores, with integer class targets.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with a learning rate of 0.001.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:
def train(model, data_loader, criterion, optimizer, device, epochs=5):
    """Fit ``model`` on ``data_loader`` and print the mean batch loss per epoch.

    Args:
        model: Module called as ``model(input_state, guessed_mask)``.
        data_loader: Iterable of ``(input_state, guessed_mask, target_letter)``
            batches; must support ``len()``.
        criterion: Loss called as ``criterion(output, target_letter)``.
        optimizer: Optimizer that updates the model's parameters.
        device: Device each batch is moved to before the forward pass.
        epochs: Number of passes over ``data_loader``.
    """
    model.train()
    for epoch_number in range(1, epochs + 1):
        running_loss = 0
        for batch in data_loader:
            states, masks, targets = (tensor.to(device) for tensor in batch)
            optimizer.zero_grad()
            batch_loss = criterion(model(states, masks), targets)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()
        mean_loss = running_loss / len(data_loader)
        print(f'Epoch {epoch_number}/{epochs}, Loss: {mean_loss:.4f}')

# Train on the training split for the default number of epochs (5).
train(model, train_loader, criterion, optimizer, device)


Epoch 1/5, Loss: 2.9322
Epoch 2/5, Loss: 2.7871
Epoch 3/5, Loss: 2.7128
Epoch 4/5, Loss: 2.6532
Epoch 5/5, Loss: 2.6054


In [12]:
def predict_next_letter(model, current_word, guessed_letters, device):
    """Suggest the next letter to guess for a partially revealed word.

    Args:
        model: Module called as ``model(input_state, guessed_mask)`` that
            returns one row of class scores, where class ``k`` corresponds to
            the ``ALPHABET_IDX`` value ``k`` (0 is '_', 1..26 are 'a'..'z').
        current_word: The word so far, with '_' for unrevealed positions.
        guessed_letters: Collection of letters that have been guessed.
        device: Device on which the input tensors are created.

    Returns:
        str: The highest-scoring letter that is neither in ``guessed_letters``
        nor already visible in ``current_word``. If every letter is already
        known, the highest-scoring letter overall is returned.
    """
    model.eval()
    # '_' is itself a key of ALPHABET_IDX, so blanks need no special case.
    word_indices = [ALPHABET_IDX[char] for char in current_word]
    guessed_mask = [1 if char in guessed_letters else 0 for char in ALPHABET]

    word_indices += [ALPHABET_IDX['_']] * (MAX_WORD_LENGTH - len(word_indices))

    input_state = torch.tensor([word_indices], dtype=torch.long).to(device)
    mask_tensor = torch.tensor([guessed_mask], dtype=torch.float).to(device)

    with torch.no_grad():
        scores = model(input_state, mask_tensor)[0].clone()

    # Class 0 is the blank/padding symbol and is never a valid guess.
    excluded = [ALPHABET_IDX['_']]
    known_letters = set(guessed_letters) | set(current_word)
    known_indices = [ALPHABET_IDX[char] for char in ALPHABET if char in known_letters]
    # Only rule out known letters while at least one candidate would remain.
    if len(known_indices) < len(ALPHABET):
        excluded += known_indices
    scores[excluded] = float('-inf')

    predicted_letter_idx = scores.argmax().item()
    # Classes are 1-based (a=1 .. z=26) while ALPHABET is 0-based, hence the -1.
    return ALPHABET[predicted_letter_idx - 1]

# Example query: one unrevealed position, with all revealed letters listed as guessed.
current_word = "ex_mple"
guessed_letters = ['e', 'x', 'm', 'p', 'l']
predicted_letter = predict_next_letter(model, current_word, guessed_letters, device)
print(predicted_letter)

o


In [18]:
# Evaluate the trained model on the held-out split: mean batch loss and
# top-1 accuracy against the sampled target letter.
model.eval()  # Set the model to evaluation mode
test_loss = 0
correct_predictions = 0
total_predictions = 0

with torch.no_grad():  # No need to track gradients for testing
    for inputs, guessed, targets in test_loader:
        # Move each batch to the model's device; without this the forward
        # pass fails with a device mismatch when the model is on the GPU.
        inputs, guessed, targets = inputs.to(device), guessed.to(device), targets.to(device)

        outputs = model(inputs, guessed)
        loss = criterion(outputs, targets)
        test_loss += loss.item()

        # Index of the highest-scoring class for every sample in the batch.
        predicted = outputs.argmax(dim=1)
        total_predictions += targets.size(0)
        correct_predictions += (predicted == targets).sum().item()

# Calculate the average loss and accuracy
average_test_loss = test_loss / len(test_loader)
accuracy = correct_predictions / total_predictions
print(accuracy)
print(average_test_loss)

0.167
2.682900443673134
