## Training a model offline for the task

### Importing libraries needed

In [1]:
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import *
from torch.utils.data import Dataset, DataLoader, random_split
import collections 
import time
from datetime import datetime

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [2]:
def build_dictionary(dictionary_file_location):
    """Read the word list stored at ``dictionary_file_location``.

    Args:
        dictionary_file_location: Path of a text file holding one entry per
            line.

    Returns:
        list[str]: The lines of the file, in file order, without their line
        terminators.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(dictionary_file_location, "r") as text_file:
        return text_file.read().splitlines()

# Relative path of the training word list (resolved against the working directory).
full_dictionary_location = "words_250000_train.txt"
full_dictionary = build_dictionary(full_dictionary_location)       
# (character, count) pairs over all words joined together, most frequent first.
full_dictionary_common_letter_sorted = collections.Counter("".join(full_dictionary)).most_common()

In [3]:
# Number of entries read from the word list.
len(full_dictionary)

227300

### Creating a dataset with random guesses and masked words
- I select a random valid subset of letters that could be a guess with enough lives remaining.\
  In any sample data point,
    - there are at most 5 incorrect guesses
    - there is at least 1 un-guessed letter
- There are 15 random data points created for every word in the given list
- This data can of course be made much larger, but I limited this due to time.

In [4]:
unique_letters = 'abcdefghijklmnopqrstuvwxyz'

# Module-level accumulators filled in place by create_data_small(); entry i of
# both lists describes the same sample.
words_guesses_list = []
remaining_list = []

def create_data_small(dictionary):
    """Append 15 random game states per word to the module-level lists.

    For every word a random set of guessed letters is drawn by rejection
    sampling until it contains at most 5 letters that are not in the word and
    leaves at least 1 letter of the word un-guessed.  Each accepted draw adds
    ``(word, guessed_letters)`` to ``words_guesses_list`` and the list of
    still un-guessed letters to ``remaining_list``.

    Args:
        dictionary: Iterable of words.  Empty entries are skipped.

    Returns:
        None.  Results are appended to ``words_guesses_list`` and
        ``remaining_list``.
    """
    alphabet_size = len(unique_letters)
    for word in dictionary:
        letters = set(word)
        if not letters:
            # An empty entry can never have an un-guessed letter, so the
            # rejection loop below would never terminate.
            continue

        # random.sample() raises ValueError when asked for more items than
        # the alphabet holds, so cap the draw size.
        max_guesses = min(len(letters) + 4, alphabet_size)

        for _ in range(15):
            while True:
                size = random.randint(0, max_guesses)
                random_letters = set(random.sample(unique_letters, size))
                wrong_guesses = len(random_letters.difference(letters))
                unguessed = letters.difference(random_letters)
                if wrong_guesses <= 5 and len(unguessed) >= 1:
                    break

            words_guesses_list.append((word, random_letters))
            remaining_list.append(list(unguessed))

# Time the dataset generation together with its serialization.
a = time.time()

create_data_small(full_dictionary)
# Store both sample lists in a single file so they can be reloaded later.
torch.save({'words_guesses':words_guesses_list,'remaining':remaining_list}, 'dataset_large.pt')

b = time.time()
print(f"Created data set in time: {b-a}")

Created data set in time: 55.85075664520264


In [5]:
start = time.time()

# weights_only=False enables full unpickling of the stored Python containers;
# only load a file that was produced by the dataset-creation step above.
data = torch.load('dataset_large.pt', weights_only=False)

end = time.time()
print("data loaded in time: {}".format(end-start))

data loaded in time: 7.747374057769775


### Helper functions to convert data as input for model
- One-hot encoding of the current state: each position of the word becomes a one-hot vector of size 27 (26 letters plus '_' for a letter that is still hidden)
- The guessed letters are encoded as 0/1 indicator variables in an array of size 26
- The ideal prediction for the model assigns equal probability to every remaining (un-guessed) letter of the word

In [6]:
def OH_current_state(word, guessed):
    """One-hot encode the visible state of ``word`` given the guessed letters.

    Args:
        word: The target word, made of lowercase ASCII letters 'a'-'z'.
        guessed: Iterable of guessed letters, given either as one-character
            strings or as integer alphabet indices (0 for 'a' ... 25 for 'z').

    Returns:
        torch.Tensor: float32 tensor of shape ``(len(word), 27)``.  Row ``p``
        has its single 1 in column ``ord(word[p]) - 97`` when that letter has
        been guessed, and in column 26 (the '_' slot) otherwise.
    """
    # Normalise to integer indices.  Testing an integer index for membership
    # in a set of letter strings is always False, which would mark every
    # position as hidden.
    guessed_indices = {g if isinstance(g, int) else ord(g) - 97 for g in guessed}

    one_hot_state = torch.zeros((len(word), 27), dtype=torch.float32)
    for position, letter in enumerate(word):
        index = ord(letter) - 97
        column = index if index in guessed_indices else 26
        one_hot_state[position, column] = 1
    return one_hot_state

def ID_guessed_letters(guessed):
    """Return the 26-slot indicator vector of the guessed letters.

    Args:
        guessed: Iterable of one-character strings.

    Returns:
        torch.Tensor: float32 tensor of size 26 with 1.0 at index
        ``ord(letter) - 97`` for every letter in ``guessed`` and 0.0 elsewhere.
    """
    letter_indices = [ord(letter) - 97 for letter in guessed]
    id_guesses = torch.zeros(26, dtype=torch.float32)
    for slot in letter_indices:
        id_guesses[slot] = 1.0
    return id_guesses

def correct_probabilities(remaining):
    """Return the uniform target distribution over the remaining letters.

    Args:
        remaining: Iterable of one-character strings; the letters of the word
            that have not been guessed yet.  Duplicates are counted once.

    Returns:
        torch.Tensor: float32 tensor of size 26 that sums to 1, with equal
        mass at index ``ord(letter) - 97`` of every letter in ``remaining``.

    Raises:
        ValueError: If ``remaining`` is empty.  Normalising an all-zero vector
            would otherwise yield a silent all-NaN target.
    """
    indices = [ord(letter) - 97 for letter in remaining]
    if not indices:
        raise ValueError("remaining must contain at least one letter")

    probabilities = torch.zeros(26, dtype=torch.float32)
    for index in indices:
        probabilities[index] = 1.0
    probabilities /= probabilities.sum()
    return probabilities

### Data preprocessing to encode inputs correctly

In [7]:
a = time.time()

# Each entry of X is a (word, guessed_letters) pair from the saved dataset.
X = data['words_guesses']
# X1: one encoded word state per sample; X2: one 26-slot guessed-letter indicator per sample.
X1 = [OH_current_state(*i) for i in X]
X2 = [ID_guessed_letters(i[1]) for i in X]
# Y: the remaining letters of each sample, converted to a target distribution over 26 letters.
Y = data['remaining']
Y = [correct_probabilities(i) for i in Y]

b = time.time()
print(f'Data processed in time: {b-a}')

In [8]:
# Sanity check: the three per-sample lists should have the same length.
len(X1), len(X2), len(Y)

(3409500, 3409500, 3409500)

### Dataset and Dataloader creation

In [9]:
class MaskedWordsDataset(Dataset):
    """Map-style dataset over three parallel, index-aligned sequences.

    Item ``i`` is the tuple ``(X1[i], X2[i], Y[i])``; the dataset length is
    ``len(X1)``.
    """

    def __init__(self, X1, X2, Y):
        # The sequences are stored by reference, not copied.
        self.X1, self.X2, self.Y = X1, X2, Y

    def __len__(self):
        return len(self.X1)

    def __getitem__(self, index):
        return self.X1[index], self.X2[index], self.Y[index]

def my_collate(batch):
    """Collate ``(feat_1, feat_2, answer)`` samples into batched tensors.

    Args:
        batch: Sequence of ``(feat_1, feat_2, answer)`` tuples in which
            ``feat_1`` is a ``(length, features)`` tensor whose length may
            differ between samples, and ``feat_2`` / ``answer`` are tensors of
            a fixed shape.

    Returns:
        tuple: ``(padded_feat_1, feat_2, answer)`` where ``padded_feat_1`` has
        shape ``(batch, max_length, features)`` with zero rows appended to the
        shorter samples, and the other two entries are the stacked inputs.
    """
    feat_1, feat_2, answer = zip(*batch)
    # Always keep the (batch, max_length, features) layout.  Squeezing dim 1
    # would drop the sequence axis whenever every sample has length 1.
    padded_feat_1 = pad_sequence(list(feat_1), batch_first=True, padding_value=0.0)

    return padded_feat_1, torch.stack(feat_2), torch.stack(answer)

In [10]:
dataset = MaskedWordsDataset(X1, X2, Y)

# 90/10 split; test_size is the size of the held-out dev set.
train_size = int(0.9 * len(dataset))
test_size  = len(dataset) - train_size
batch_size = 500

# NOTE(review): random_split is called without a seeded generator, so each run
# draws a different split — confirm that is acceptable when resuming training.
train_set, dev_set = random_split(dataset, [train_size, test_size])

# my_collate pads the variable-length word encodings inside each batch.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True , collate_fn=my_collate, num_workers=8)
dev_loader   = DataLoader(dev_set  , batch_size=batch_size, shuffle=False, collate_fn=my_collate, num_workers=8)


### Model architecture
- I chose a bidirectional LSTM with 3 layers and dropout 0.2
- The fully connected linear layers use a higher dropout rate of 0.4
- The previously guessed letters are concatenated to the LSTM output as input to the second-to-last linear layer

In [11]:
class BI_LSTM_model(nn.Module):
    """Bidirectional 3-layer LSTM followed by a two-layer classifier head.

    The LSTM reads the encoded word state; its output at the last time step is
    concatenated with the guessed-letter vector and passed through
    ``lin1`` -> dropout -> ``lin2`` to produce one score per output class.

    Args:
        size_word: Feature size of every time step of the word input.
        size_guessed: Length of the guessed-letter vector.
        hidden_size_lstm: Hidden size of each LSTM direction.
        out_size: Number of output scores.
    """

    def __init__(self, size_word, size_guessed, hidden_size_lstm, out_size):
        super().__init__()
        self.lstm = nn.LSTM(
            size_word,
            hidden_size_lstm,
            num_layers=3,
            bidirectional=True,
            dropout=0.2,
            batch_first=True,
        )
        # Both directions are concatenated, hence twice the hidden size.
        lstm_features = hidden_size_lstm * 2
        self.lin1 = nn.Linear(lstm_features + size_guessed, 128)
        self.lin2 = nn.Linear(128, out_size)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x1, x2):
        """Return raw (un-normalised) scores of shape ``(batch, out_size)``.

        Args:
            x1: Word input of shape ``(batch, seq_len, size_word)``.
            x2: Guessed-letter input of shape ``(batch, size_guessed)``.
        """
        sequence_out, _ = self.lstm(x1)
        # NOTE(review): index -1 is the last position of the padded batch, so
        # for words shorter than the batch maximum this reads a padding step —
        # confirm this is intended.
        last_step = sequence_out[:, -1, :]
        combined = torch.cat((self.dropout(last_step), x2), dim=1)
        hidden = self.dropout(self.lin1(combined))
        return self.lin2(hidden)

# 26 letters plus one slot for a hidden ('_') position.
size_word = 27
# One indicator per letter of the alphabet.
size_guessed = 26
hidden_size_lstm = 256
# One output score per letter of the alphabet.
out_size = 26

## Hyper-parameters and training details

In [12]:
num_epoch = 200
learning_rate = 1e-3

model     = BI_LSTM_model(size_word, size_guessed, hidden_size_lstm, out_size)
# The targets fed to this loss are per-letter probability vectors, not class indices.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
# Halve the learning rate after every 10 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

In [13]:
def train_epoch(model, criterion, optimizer, train_loader, val_loader):
    """Train ``model`` for one pass over ``train_loader``, then validate it.

    The model and the criterion are moved to the module-level ``device``.
    The mean per-batch training loss, the mean per-batch validation loss and
    the elapsed wall-clock time are printed (the losses multiplied by 100).

    Args:
        model: Module called as ``model(feat_1, feat_2)``.
        criterion: Loss module called as ``criterion(output, target)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        train_loader: Iterable of ``(feat_1, feat_2, answer)`` batches used
            for the parameter updates.
        val_loader: Iterable of ``(feat_1, feat_2, answer)`` batches used for
            evaluation only.

    Returns:
        float: Validation loss averaged over the batches of ``val_loader``.
    """
    model.train()
    model.to(device)
    criterion = criterion.to(device)
    start = time.time()

    train_loss = 0
    length = len(train_loader)

    for feat_1, feat_2, answer in train_loader:
        feat_1 = feat_1.to(device)
        feat_2 = feat_2.to(device)
        answer = answer.to(device)

        optimizer.zero_grad()
        # Call the module itself instead of .forward() so that any registered
        # hooks are run as well.
        output = model(feat_1, feat_2)
        loss = criterion(output, answer)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    print(f"Training Loss %: {100*train_loss/length}")

    val_loss = 0
    length = len(val_loader)

    # eval() turns dropout off; no_grad() skips building the autograd graph.
    model.eval()
    with torch.no_grad():
        for dev_X1, dev_X2, dev_Y in val_loader:
            dev_X1 = dev_X1.to(device)
            dev_X2 = dev_X2.to(device)
            dev_Y  = dev_Y.to(device)

            output = model(dev_X1, dev_X2)
            loss = criterion(output, dev_Y)

            val_loss += loss.item()

    val_loss_epoch = val_loss / length
    print(f"Val Loss of Epoch %: {100*val_loss_epoch}")

    end = time.time()
    print(f"Epoch time: {end - start}")

    return val_loss_epoch


In [14]:
import os
import re

num_epoch = 200
Losses = []

model_path = ('./models/new_lstm/')
# torch.save() does not create missing directories; without this the first
# checkpoint write fails only after a whole epoch has been trained.
os.makedirs(model_path, exist_ok=True)

# Resume from the newest "<epoch>.pt" checkpoint in model_path, if any.
saved_epochs = []
for file_name in os.listdir(model_path):
    name_match = re.fullmatch(r"(\d+)\.pt", file_name)
    if name_match:
        saved_epochs.append(int(name_match.group(1)))
last_epoch = max(saved_epochs, default=0)

if last_epoch:
    # Move the parameters to the training device before restoring the
    # optimizer, so that its state tensors end up on the same device.
    model.to(device)
    # NOTE: weights_only=False unpickles arbitrary objects; only load
    # checkpoints written by this script.
    checkpoint = torch.load(model_path + str(last_epoch) + '.pt', map_location=device, weights_only=False)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    last_epoch = checkpoint['epoch']
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    print(f"Resuming training from epoch {last_epoch + 1}")
else:
    print("No checkpoint found, starting training from scratch")
    # Start an empty log only for a fresh run; a resumed run keeps appending
    # to the existing log.
    with open("val_loss_log.txt", 'w'):
        pass

for i in range(last_epoch, num_epoch):
    curr_time = datetime.now()
    pretty_time = curr_time.strftime("%d/%m/%y %H:%M:%S")
    print("------------------------------------")
    print(f"Epoch {i+1} begins at {pretty_time}")
    print("------------------------------------")

    val_loss_per_batch = train_epoch(model, criterion, optimizer, train_loader, dev_loader)
    scheduler.step()

    with open("val_loss_log.txt",'a') as g:
        g.write(f"Epoch {i+1} val loss : {val_loss_per_batch}\n")

    # One checkpoint per epoch, named after the 1-based epoch number.
    print(f"\n Saving model at {model_path}{i+1}.pt\n")
    torch.save({
        'epoch': i + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'val_loss': val_loss_per_batch,
    }, model_path + str(i+1) + '.pt')

    Losses.append(val_loss_per_batch)
    