In [1]:
import torch
import torch.nn.functional as F
from tqdm import tqdm

In [2]:
##### One-Hot Encoding #####
def one_hot_encode(indices, vocab_size):
    tensor = torch.zeros(vocab_size)
    for idx in indices:
        if idx < vocab_size:
            tensor[idx] = 1
    return tensor.view(1, -1)  # Reshape to (1, vocab_size)


##### Xavier Normalized Initialization #####
def init_weights(input_size, output_size):
    return torch.FloatTensor(output_size, input_size).uniform_(-1, 1) * torch.sqrt(torch.tensor(6.0) / (input_size + output_size))

##### Activation Functions #####
def sigmoid(input, derivative=False):
    if derivative:
        return input * (1 - input)
    return 1 / (1 + torch.exp(-input))

def tanh(input, derivative=False):
    if derivative:
        return 1 - input ** 2
    return torch.tanh(input)

def softmax(input):
    return F.softmax(input, dim=0)

In [None]:
import torch.nn as nn
##### Long Short-Term Memory Network Class #####
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_epochs, learning_rate):
        super(LSTM, self).__init__()
        # Hyperparameters
        self.learning_rate = learning_rate
        self.hidden_size = hidden_size
        self.num_epochs = num_epochs

        # Forget Gate
        self.wf = torch.nn.Parameter(init_weights(input_size + hidden_size, hidden_size))
        self.bf = torch.nn.Parameter(torch.zeros(hidden_size, 1))

        # Input Gate
        self.wi = torch.nn.Parameter(init_weights(input_size + hidden_size, hidden_size))
        self.bi = torch.nn.Parameter(torch.zeros(hidden_size, 1))

        # Candidate Gate
        self.wc = torch.nn.Parameter(init_weights(input_size + hidden_size, hidden_size))
        self.bc = torch.nn.Parameter(torch.zeros(hidden_size, 1))

        # Output Gate
        self.wo = torch.nn.Parameter(init_weights(input_size + hidden_size, hidden_size))
        self.bo = torch.nn.Parameter(torch.zeros(hidden_size, 1))

        # Final Gate
        self.wy = torch.nn.Parameter(init_weights(hidden_size, output_size))
        self.by = torch.nn.Parameter(torch.zeros(output_size, 1))


    # Reset Network Memory
    def reset(self):
        self.concat_inputs = {}

        self.hidden_states = {-1: torch.zeros((self.hidden_size, 1))}
        self.cell_states = {-1: torch.zeros((self.hidden_size, 1))}

        self.activation_outputs = {}
        self.candidate_gates = {}
        self.output_gates = {}
        self.forget_gates = {}
        self.input_gates = {}
        self.outputs = {}

    # Forward Propagation
    def forward(self, inputs):
        self.reset()

        outputs = []
        for q in range(len(inputs)):
            # Move self.hidden_states[q - 1] to the same device as input
            self.hidden_states[q - 1] = self.hidden_states[q - 1].to(inputs[q].device)

            # Reshape input to match the size of hidden state
            input_reshaped = inputs[q].view(-1, 1)

            self.concat_inputs[q] = torch.cat((self.hidden_states[q - 1], input_reshaped))

            self.forget_gates[q] = sigmoid(torch.matmul(self.wf, self.concat_inputs[q]) + self.bf)
            self.input_gates[q] = sigmoid(torch.matmul(self.wi, self.concat_inputs[q]) + self.bi)
            self.candidate_gates[q] = tanh(torch.matmul(self.wc, self.concat_inputs[q]) + self.bc)
            self.output_gates[q] = sigmoid(torch.matmul(self.wo, self.concat_inputs[q]) + self.bo)

            # Move self.cell_states[q - 1] to the same device as input
            self.cell_states[q - 1] = self.cell_states[q - 1].to(inputs[q].device)

            # Perform computation on the same device
            self.cell_states[q] = self.forget_gates[q] * self.cell_states[q - 1] + self.input_gates[q] * self.candidate_gates[q]
            self.hidden_states[q] = self.output_gates[q] * tanh(self.cell_states[q])

            outputs.append(torch.matmul(self.wy, self.hidden_states[q]) + self.by)

        return outputs

    # Backward Propagation
    def backward(self, errors, inputs):
        dwf, dbf = 0, 0
        dwi, dbi = 0, 0
        dwc, dbc = 0, 0
        dwo, dbo = 0, 0
        dwy, dby = 0, 0

        dh_next, dc_next = torch.zeros_like(self.hidden_states[0]), torch.zeros_like(self.cell_states[0])
        for q in reversed(range(len(inputs))):
            error = errors[q]

            # Final Gate Weights and Biases Errors
            dwy += torch.matmul(error, self.hidden_states[q].T)
            dby += error

            # Hidden State Error
            d_hs = torch.matmul(self.wy.T, error) + dh_next

            # Output Gate Weights and Biases Errors
            d_o = tanh(self.cell_states[q]) * d_hs * sigmoid(self.output_gates[q], derivative=True)
            dwo += torch.matmul(d_o, inputs[q].T)
            dbo += d_o

            # Cell State Error
            d_cs = tanh(tanh(self.cell_states[q]), derivative=True) * self.output_gates[q] * d_hs + dc_next

            # Forget Gate Weights and Biases Errors
            d_f = d_cs * self.cell_states[q - 1] * sigmoid(self.forget_gates[q], derivative=True)
            dwf += torch.matmul(d_f, inputs[q].T)
            dbf += d_f

            # Input Gate Weights and Biases Errors
            d_i = d_cs * self.candidate_gates[q] * sigmoid(self.input_gates[q], derivative=True)
            dwi += torch.matmul(d_i, inputs[q].T)
            dbi += d_i

            # Candidate Gate Weights and Biases Errors
            d_c = d_cs * self.input_gates[q] * tanh(self.candidate_gates[q], derivative=True)
            dwc += torch.matmul(d_c, inputs[q].T)
            dbc += d_c

            # Concatenated Input Error (Sum of Error at Each Gate!)
            d_z = torch.matmul(self.wf.T, d_f) + torch.matmul(self.wi.T, d_i) + torch.matmul(self.wc.T, d_c) + torch.matmul(self.wo.T, d_o)

            # Error of Hidden State and Cell State at Next Time Step
            dh_next = d_z[:self.hidden_size, :]
            dc_next = self.forget_gates[q] * d_cs

        for d_ in (dwf, dbf, dwi, dbi, dwc, dbc, dwo, dbo, dwy, dby):
            torch.clamp_(d_, -1, 1)

        self.wf.data += dwf * self.learning_rate
        self.bf.data += dbf * self.learning_rate

        self.wi.data += dwi * self.learning_rate
        self.bi.data += dbi * self.learning_rate

        self.wc.data += dwc * self.learning_rate
        self.bc.data += dbc * self.learning_rate

        self.wo.data += dwo * self.learning_rate
        self.bo.data += dbo * self.learning_rate

        self.wy.data += dwy * self.learning_rate
        self.by.data += dby * self.learning_rate

    # Train
    def train(self, inputs, labels):
        for _ in tqdm(range(self.num_epochs)):
            predictions = self.forward(inputs)

            errors = []
            for q in range(len(predictions)):
                error = -softmax(predictions[q])
                label_index = labels[q].argmax().item()  # Convert label tensor to integer
                error[label_index] += 1
                errors.append(error)

            self.backward(errors, self.concat_inputs)

    # Test
    def test(self, inputs, labels):
        accuracy = 0
        probabilities = self.forward(inputs)

        output = ''
        for q in range(len(labels)):
            prediction_index = torch.multinomial(softmax(probabilities[q].reshape(-1)), 1).item()
            output += str(prediction_index)  # Store prediction index instead of character

            # Convert label tensor to integer for comparison
            label_index = labels[q].item()
            if prediction_index == label_index:
                accuracy += 1

        print(f'Ground Truth:\n{labels}\n')
        print(f'Predictions:\n{output}\n')

        print(f'Accuracy: {round(accuracy * 100 / len(inputs), 2)}%')


In [4]:
import nltk
from nltk.corpus import treebank
import random
from collections import Counter

In [5]:
sentences = treebank.tagged_sents()
sentences = list(sentences)

random.seed(7)
random.shuffle(sentences)
train_size = int(0.8 * len(sentences))
train_sentences = sentences[:train_size]
test_sentences = sentences[train_size:]

In [6]:
train_words, train_tags = zip(*[(word, tag) for sent in train_sentences for word, tag in sent])
test_words, test_tags = zip(*[(word, tag) for sent in test_sentences for word, tag in sent])

In [7]:
# Create vocabulary and tag set
word_counts = Counter(train_words)
tag_counts = Counter(train_tags)
word_to_idx = {word: idx for idx, (word, _) in enumerate(word_counts.most_common(), 1)}
tag_to_idx = {tag: idx for idx, (tag, _) in enumerate(tag_counts.most_common(), 1)}
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

In [8]:
# Add a default index for the 'O' tag
tag_to_idx.setdefault('O', len(tag_to_idx) + 1)

# Convert words and tags into numerical indices
train_input_indices = [[word_to_idx.get(word, 0) for word in sent] for sent in train_words]
train_label_indices = [[tag_to_idx.get(tag, 0) for tag in sent] for sent in train_tags]
test_input_indices = [[word_to_idx.get(word, 0) for word in sent] for sent in test_words]
test_label_indices = [[tag_to_idx.get(tag, 0) for tag in sent] for sent in test_tags]


In [9]:
# Determine the size of the vocabulary for tags
tag_vocab_size = len(tag_to_idx)

# Convert indices into one-hot encoded vectors
train_inputs = [torch.tensor(one_hot_encode(indices, len(word_to_idx))).float() for indices in train_input_indices]
train_labels = [torch.tensor(one_hot_encode(indices, tag_vocab_size)).float() for indices in train_label_indices]
test_inputs = [torch.tensor(one_hot_encode(indices, len(word_to_idx))).float() for indices in test_input_indices]
test_labels = [torch.tensor(one_hot_encode(indices, tag_vocab_size)).float() for indices in test_label_indices]

  train_inputs = [torch.tensor(one_hot_encode(indices, len(word_to_idx))).float() for indices in train_input_indices]
  train_labels = [torch.tensor(one_hot_encode(indices, tag_vocab_size)).float() for indices in train_label_indices]
  test_inputs = [torch.tensor(one_hot_encode(indices, len(word_to_idx))).float() for indices in test_input_indices]
  test_labels = [torch.tensor(one_hot_encode(indices, tag_vocab_size)).float() for indices in test_label_indices]


In [10]:
# Initialize and train the LSTM model
input_size = len(word_to_idx)
output_size = len(tag_to_idx)
hidden_size = 128
num_epochs = 10
learning_rate = 0.01

pos_tagger = LSTM(input_size, hidden_size, output_size, num_epochs, learning_rate)

In [11]:
input_size

11016

In [12]:
# Move the model to the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pos_tagger.to(device)

LSTM()

In [13]:
# Convert inputs and labels to CUDA tensors
train_inputs = [input_tensor.to(device) for input_tensor in train_inputs]
train_labels = [label_tensor.to(device) for label_tensor in train_labels]
test_inputs = [input_tensor.to(device) for input_tensor in test_inputs]
test_labels = [label_tensor.to(device) for label_tensor in test_labels]

In [14]:
pos_tagger.train(train_inputs, train_labels)

 70%|███████   | 7/10 [28:17<12:52, 257.55s/it]

In [None]:
# After training
torch.save(pos_tagger.state_dict(), 'pos_tagger_model.pth')

In [None]:
# Load the model
pos_tagger = LSTM(input_size, hidden_size, output_size, num_epochs, learning_rate)
pos_tagger.load_state_dict(torch.load('pos_tagger_model.pth'))

RuntimeError: Error(s) in loading state_dict for LSTM:
	size mismatch for wf: copying a param with shape torch.Size([128, 11176]) from checkpoint, the shape in current model is torch.Size([128, 11098]).
	size mismatch for wi: copying a param with shape torch.Size([128, 11176]) from checkpoint, the shape in current model is torch.Size([128, 11098]).
	size mismatch for wc: copying a param with shape torch.Size([128, 11176]) from checkpoint, the shape in current model is torch.Size([128, 11098]).
	size mismatch for wo: copying a param with shape torch.Size([128, 11176]) from checkpoint, the shape in current model is torch.Size([128, 11098]).

In [None]:
# Test the model
pos_tagger.test(test_inputs, test_labels)

In [None]:
import torch
saved_model = torch.load('pos_tagger_model.pth')
for name, param in saved_model.items():
    print(name, param.size())


wf torch.Size([128, 11176])
bf torch.Size([128, 1])
wi torch.Size([128, 11176])
bi torch.Size([128, 1])
wc torch.Size([128, 11176])
bc torch.Size([128, 1])
wo torch.Size([128, 11176])
bo torch.Size([128, 1])
wy torch.Size([47, 128])
by torch.Size([47, 1])
