# Prepare training sample

In [1]:
# Build the training inputs and their labels from a one-word corpus so the
# generated samples are small enough to inspect by hand in the cells below.
from hangmanai.preprocess import create_input_sample
sample = ['abc']
# `features` and `labels` are indexed in parallel: labels[i] belongs to features[i].
features, labels = create_input_sample(sample)

Every way of masking at least one letter of a word is generated as a training sample. For example, the word 'abc' yields 7 masked variants ('#bc', 'a#c', 'ab#', 'a##', '#b#', '##c', '###').


In [2]:
# Number of training samples generated from the one-word sample list.
len(features)

7

Each training sample has an input length of 29, which is the maximum word length in the training file.

In [3]:
# Length of one encoded input sequence (first generated sample).
len(features[0])

29

0-25 represent the letters A-Z
<br>26 represents the masked value
<br>27 represents the padding value
<br>For example, for 'a#c' the input value is [0, 26, 2] + [27]*26

In [4]:
# Encoded input of the first generated sample.
features[0]

[26,
 1,
 2,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27]

The label holds the actual value of each letter that is masked.
<br>It is aligned with the input, so the length of the label is also 29.

In [5]:
# Length of the label sequence belonging to the first generated sample.
len(labels[0])

29

A label value of -1 marks a position that is ignored and does not contribute to the input gradient.
<br>For example, for 'a#c' the label is [-1, 1, -1, ..., -1].

In [6]:
# Label sequence belonging to the first generated sample.
labels[0]

[0,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1]

# Model

A bidirectional LSTM model is used.
<br>One-hot encoding is used to encode the input values.

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as Fun

class LSTMModel(nn.Module):
    """Bidirectional LSTM over one-hot encoded token ids, followed by a linear layer.

    Args:
        input_dim: Number of distinct token ids; this is also the width of the
            one-hot vectors fed to the LSTM.
        hidden_dim: Hidden size of each LSTM direction.
        layer_dim: Number of stacked LSTM layers.
        output_dim: Output size of the final linear layer.
        isTag: If True, produce an output for every time step; if False,
            produce an output only from the last time step.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, isTag=True):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.input_dim = input_dim
        self.isTag = isTag
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, bidirectional=True, batch_first=True)
        # Both directions are concatenated, hence hidden_dim * 2 input features.
        self.fc = nn.Linear(hidden_dim*2, output_dim)

    def forward(self, x):
        """Run the model on a batch of token-id sequences.

        Args:
            x: Integer (int64) tensor of shape (batch, seq_len) holding token
                ids in the range [0, input_dim).

        Returns:
            Tensor of shape (batch, seq_len, output_dim) when ``isTag`` is True,
            otherwise (batch, output_dim).
        """
        # Cast to the model's own parameter dtype instead of a hard-coded CPU
        # FloatTensor, so the model keeps working after .to(device)/.double()/.half().
        encoded = Fun.one_hot(x, num_classes=self.input_dim).to(self.fc.weight.dtype)
        # With no initial state given, nn.LSTM starts from zeros on the input's
        # device and dtype, which matches the previous explicit zero h0/c0.
        out, _ = self.lstm(encoded)
        if not self.isTag:
            out = out[:, -1, :]
        return self.fc(out)

# Inference

The output probabilities of the masked tokens are calculated.
<br>Then a Probability Score is calculated for each letter by summing its probabilities over the masked tokens.
<br>Finally, the Probability Scores are sorted in descending order.

In [24]:
from hangmanai.model import load_model
from hangmanai.preprocess import preprocess_feature, output_to_char
from hangmanai.config import MAX_LEN, COVER_VALUE

from config import MODEL_PATH
model_name = 'bidiretionLSTM_0.torch'

# Partially revealed word; after preprocessing, the positions whose encoded
# value equals COVER_VALUE are the ones the model has to fill in.
word = 'aggr_gat_'
model = load_model(f'{MODEL_PATH}/{model_name}')
feature = preprocess_feature(word)
featuresTest = torch.tensor(feature).to(torch.int64)
featuresTest = featuresTest.reshape(1, MAX_LEN)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    logits = model(featuresTest)
    # Keep only the masked positions, turn each into a distribution over
    # letters, then sum across positions to get one score per letter.
    proba = torch.nn.functional.softmax(logits[featuresTest == COVER_VALUE], dim=1).sum(dim=0)

_, indices = torch.sort(proba, descending=True)
for i in indices:
    guess_letter = output_to_char(i)
    # Index the tensor defined above (the previous `proba_sum` was never defined).
    probit = proba[i].item()
    print(f"Guess Letter : {guess_letter}, Probability Score: {probit}")

Guess Letter : e, Probability Score: 0.9788843393325806
Guess Letter : i, Probability Score: 0.5304452180862427
Guess Letter : o, Probability Score: 0.19663839042186737
Guess Letter : h, Probability Score: 0.11021000891923904
Guess Letter : u, Probability Score: 0.08457174152135849
Guess Letter : y, Probability Score: 0.05383685231208801
Guess Letter : s, Probability Score: 0.019202565774321556
Guess Letter : l, Probability Score: 0.008124561980366707
Guess Letter : n, Probability Score: 0.006252141669392586
Guess Letter : d, Probability Score: 0.004193069878965616
Guess Letter : m, Probability Score: 0.0027985151391476393
Guess Letter : c, Probability Score: 0.002747895661741495
Guess Letter : f, Probability Score: 0.0008452110923826694
Guess Letter : w, Probability Score: 0.00045734710874967277
Guess Letter : r, Probability Score: 0.0003708497970364988
Guess Letter : k, Probability Score: 0.00013670269981957972
Guess Letter : p, Probability Score: 5.347983096726239e-05
Guess Letter :