### Text classification using LSTM

In this coding exercise, you will create a simple LSTM model using PyTorch to perform text classification on a dataset of short phrases. Your task is to fill in the missing parts of the code marked with `# TODO`.

You need to:

- Create a vocabulary to represent words as indices.
- Tokenize, encode, and pad the phrases.
- Convert the phrases and categories to PyTorch tensors.
- Instantiate the LSTM model with the vocabulary size, embedding dimensions, hidden dimensions, and output dimensions.
- Define the loss function and optimizer.
- Train the model for a number of epochs.
- Test the model on new phrases and print the category predictions.

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim

In [18]:
# Phrases (textual data) and their category labels (0 for sports, 1 for technology, 2 for food)
# Note: this data is extremely less for realistically training an LSTM model. Feel free to use
# a relevant data source or create your own dummy data for this exercise.
phrases = ["great goal scored", "amazing touchdown", "new phone release", "latest laptop model", "tasty pizza", "delicious burger"]
categories = [0, 0, 1, 1, 2, 2]

# TODO: Create a vocabulary to represent words as indices
vocab = {
    "<PAD>": 0,
    "great": 1, "goal": 2, "scored": 3,
    "amazing": 4, "touchdown": 5,
    "new": 6, "phone": 7, "release": 8,
    "latest": 9, "laptop": 10, "model": 11,
    "tasty": 12, "pizza": 13,
    "delicious": 14, "burger": 15
}

# TODO: Tokenize, encode, and pad phrases
encoded_sentences = [[vocab[word] for word in phrase.split()] for phrase in phrases]
max_length = max([len(sentence) for sentence in encoded_sentences])
padded_sentences = [sentence + [vocab["<PAD>"]] * (max_length - len(sentence)) for sentence in encoded_sentences]

print(f"vocab: {vocab}")
print(f"encoded_sentences: {encoded_sentences}")
print(f"max_length: {max_length}")
print(f"padded_sentences: {padded_sentences}")

# TODO: Convert phrases and categories to PyTorch tensors
inputs = torch.LongTensor(padded_sentences)
labels = torch.LongTensor(categories)

vocab: {'<PAD>': 0, 'great': 1, 'goal': 2, 'scored': 3, 'amazing': 4, 'touchdown': 5, 'new': 6, 'phone': 7, 'release': 8, 'latest': 9, 'laptop': 10, 'model': 11, 'tasty': 12, 'pizza': 13, 'delicious': 14, 'burger': 15}
encoded_sentences: [[1, 2, 3], [4, 5], [6, 7, 8], [9, 10, 11], [12, 13], [14, 15]]
max_length: 3
padded_sentences: [[1, 2, 3], [4, 5, 0], [6, 7, 8], [9, 10, 11], [12, 13, 0], [14, 15, 0]]


In [19]:
# Define LSTM model
class PhraseClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(PhraseClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, _) = self.lstm(embedded)
        logits = self.fc(hidden.squeeze(0))
        return logits

In [20]:
# TODO: Instantiate model and define loss and optimizer
vocab_size = len(vocab)
emedding_dim = 10
hidden_dim = 20
output_dim = 3 # bc of 3 categories

model = PhraseClassifier(vocab_size, emedding_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# TODO: Train the model for a number of epochs
epochs = 1000
for epoch in range(epochs):
    optimizer.zero_grad()
    predictions = model(inputs.t())
    loss = criterion(predictions, labels)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")

# TODO: Test the model on new phrases
with torch.no_grad():
    # test input
    test_sentences = ["new laptop", "burger"]

    # preprocessing
    encoded_test_sentences = [[vocab[word] for word in sentence.split()] for sentence in test_sentences]
    padded_test_sentences = [sentence + [vocab["<PAD>"]] * (max_length - len(sentence)) for sentence in encoded_test_sentences]
    test_inputs = torch.LongTensor(padded_test_sentences)

    # make prediction
    test_predictions = torch.sigmoid(model(test_inputs.t()).squeeze(1))
    print("Test predictions:", test_predictions)

Epoch: 100, Loss: 0.360628217458725
Epoch: 200, Loss: 0.046063899993896484
Epoch: 300, Loss: 0.016064921393990517
Epoch: 400, Loss: 0.008466260507702827
Epoch: 500, Loss: 0.005360834300518036
Epoch: 600, Loss: 0.0037694808561354876
Epoch: 700, Loss: 0.0028331864159554243
Epoch: 800, Loss: 0.002227128716185689
Epoch: 900, Loss: 0.0018071482190862298
Epoch: 1000, Loss: 0.0015011801151558757
Test predictions: tensor([[0.8854, 0.6162, 0.1435],
        [0.1195, 0.0986, 0.9916]])
