In [9]:
# !pip install torch==2.2.0 torchtext==0.17.0 -f https://download.pytorch.org/whl/torch_stable.html
# !pip install -U scikit-learn
!nvidia-smi

162.22s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Tue Dec 10 17:44:04 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.12              Driver Version: 550.90.12      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  |   00000000:1A:00.0 Off |                    0 |
| N/A   29C    P0             40W /  300W |       1MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla V100-SXM2-32GB           On  |   00

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import Vocab, vocab
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import re
from collections import Counter
from typing import List, Tuple, Dict, Optional, Any

## Long Short Term Memory (LSTM)


### Data Loading

We will use the same dataset for named entity recognition in Assignment #2. First download the data and take a look at the first 50 lines:


Each line corresponds to a word. Different sentences are separated by an additional line break. Take "EU NNP I-NP ORG" as an example. "EU" is a word. "NNP" and "I-NP" are tags for POS tagging and chunking, which we will ignore. "ORG" is the tag for NER, which is our prediction target. There are 5 possible values for the NER tag: ORG, PER, LOC, MISC, and O.


In [11]:
# A sentence is a list of (word, tag) tuples.
# For example, [("hello", "O"), ("world", "O"), ("!", "O")]
Sentence = List[Tuple[str, str]]


def read_data_file(
    datapath: str,
) -> Tuple[List[Sentence], Dict[str, int], Dict[str, int]]:
    """
    Read and preprocess input data from the file `datapath`.
    Example:
    ```
        sentences, word_cnt, tag_cnt = read_data_file("eng.train")
    ```
    Return values:
        `sentences`: a list of sentences, including words and NER tags
        `word_cnt`: a Counter object, the number of occurrences of each word
        `tag_cnt`: a Counter object, the number of occurences of each NER tag
    """
    sentences: List[Sentence] = []
    word_cnt: Dict[str, int] = Counter()
    tag_cnt: Dict[str, int] = Counter()

    for sentence_txt in open(datapath).read().split("\n\n"):
        if "DOCSTART" in sentence_txt:
            # Ignore dummy sentences at the begining of each document.
            continue
        # Read a new sentence
        sentences.append([])
        for token in sentence_txt.split("\n"):
            w, _, _, t = token.split()
            # Replace all digits with "0" to reduce out-of-vocabulary words
            w = re.sub("\d", "0", w)
            word_cnt[w] += 1
            tag_cnt[t] += 1
            sentences[-1].append((w, t))

    return sentences, word_cnt, tag_cnt

In [12]:
# Some helper code
def get_device() -> torch.device:
    """
    Use GPU when it is available; use CPU otherwise.
    """
    return torch.device("gpu") if torch.cuda.is_available() else torch.device("cpu")


print(get_device())

cpu


In [13]:
def eval_metrics(ground_truth: List[int], predictions: List[int]) -> Dict[str, Any]:
    """
    Calculate various evaluation metrics such as accuracy and F1 score
    Parameters:
        `ground_truth`: the list of ground truth NER tags
        `predictions`: the list of predicted NER tags
    """
    f1_scores = f1_score(ground_truth, predictions, average=None)
    return {
        "accuracy": accuracy_score(ground_truth, predictions),
        "average f1": np.mean(f1_scores),
        "f1": f1_scores,
        "confusion matrix": confusion_matrix(ground_truth, predictions),
    }

## Long Short-term Memory (LSTM)

Now we implement an one-layer LSTM for the same task and compare it to FFNNs.


### Data Loading **(4 points)**

Like before, we first implement the data loader. But unlike before, each data example is now a variable-length sentence. How can we pack multiple sentences with different lengths into the same batch? One possible solution is to pad them to the same length using a special token.

> Padding ensures that all sentences in the batch have the same length, making it possible to process them simultaneously in a neural network.


In [14]:
# 3 sentences with different lengths
sentence_1 = torch.tensor([6, 1, 2])
sentence_2 = torch.tensor([4, 2, 7, 7, 9])
sentence_3 = torch.tensor([3, 4])
# Form a batch by padding 0
sentence_batch = torch.tensor(
    [
        [6, 1, 2, 0, 0],
        [4, 2, 7, 7, 9],
        [3, 4, 0, 0, 0],
    ]
)

We implement the above idea in a customized batching function `form_batch`. Optionally, see [here](https://pytorch.org/docs/stable/data.html#loading-batched-and-non-batched-data) for how batching works in PyTorch.


In [15]:
class SequenceDataset(Dataset):
    """
    Each data example is a sentence, including its words and NER tags.
    """

    def __init__(
        self,
        datapath: str,
        words_vocab: Optional[Vocab] = None,
        tags_vocab: Optional[Vocab] = None,
    ) -> None:
        """
        Initialize the dataset by reading from datapath.
        """
        super().__init__()
        self.sentences: List[Sentence] = []
        UNKNOWN = "<UNKNOWN>"
        PAD = "<PAD>"  # Special token used for padding

        print("Loading data from %s" % datapath)
        self.sentences, word_cnt, tag_cnt = read_data_file(datapath)
        print("%d sentences loaded." % len(self.sentences))

        if words_vocab is None:
            words_vocab = vocab(word_cnt, specials=[PAD, UNKNOWN])
            words_vocab.set_default_index(words_vocab[UNKNOWN])

        self.words_vocab = words_vocab

        self.unknown_idx = self.words_vocab[UNKNOWN]
        self.pad_idx = self.words_vocab[PAD]

        if tags_vocab is None:
            tags_vocab = vocab(tag_cnt, specials=[])
        self.tags_vocab = tags_vocab

    def __getitem__(self, idx: int) -> Sentence:
        """
        Get the idx'th sentence in the dataset.
        """
        return self.sentences[idx]

    def __len__(self) -> int:
        """
        Return the number of sentences in the dataset.
        """
        # TODO: Implement this method
        return len(self.sentences)

    def form_batch(self, sentences: List[Sentence]) -> Dict[str, Any]:
        """
        A customized function for batching a number of sentences together.
        Different sentences have different lengths. Let max_len be the longest length.
        When packing them into one tensor, we need to pad all sentences to max_len.
        Return values:
            `words`: a list in which each element itself is a list of words in a sentence
            `word_idxs`: a batch_size x max_len tensor.
                       word_idxs[i][j] is the index of the j'th word in the i'th sentence .
            `tags`: a list in which each element itself is a list of tags in a sentence
            `tag_idxs`: a batch_size x max_len tensor
                      tag_idxs[i][j] is the index of the j'th tag in the i'th sentence.
            `valid_mask`: a batch_size x max_len tensor
                        valid_mask[i][j] is True if the i'th sentence has the j'th word.
                        Otherwise, valid[i][j] is False.
        """
        words: List[List[str]] = []
        tags: List[List[str]] = []
        max_len = -1  # length of the longest sentence
        for sent in sentences:
            words.append([])
            tags.append([])
            for w, t in sent:
                words[-1].append(w)
                tags[-1].append(t)
            max_len = max(max_len, len(words[-1]))

        batch_size = len(sentences)
        word_idxs = torch.full(
            (batch_size, max_len), fill_value=self.pad_idx, dtype=torch.int64
        )
        tag_idxs = torch.full_like(word_idxs, fill_value=self.tags_vocab["O"])
        valid_mask = torch.zeros_like(word_idxs, dtype=torch.bool)

        ## TODO: Fill in the values in word_idxs, tag_idxs, and valid_mask
        ## Caveat: There may be out-of-vocabulary words in validation data
        ## See torchtext.vocab.Vocab: https://pytorch.org/text/stable/vocab.html#torchtext.vocab.Vocab

        for i, (w, t) in enumerate(zip(words, tags)):
            for j, (word, tag) in enumerate(zip(w, t)):
                word_idxs[i][j] = self.words_vocab[word]
                tag_idxs[i][j] = self.tags_vocab[tag]
                valid_mask[i][j] = True

        return {
            "words": words,
            "word_idxs": word_idxs,
            "tags": tags,
            "tag_idxs": tag_idxs,
            "valid_mask": valid_mask,
        }


def create_sequence_dataloaders(
    batch_size: int, shuffle: bool = True
) -> Tuple[DataLoader, DataLoader, Vocab]:
    """
    Create the dataloaders for training and validaiton.
    """
    ds_train = SequenceDataset("eng.train")
    ds_val = SequenceDataset(
        "eng.val", words_vocab=ds_train.words_vocab, tags_vocab=ds_train.tags_vocab
    )
    loader_train = DataLoader(
        ds_train,
        batch_size,
        shuffle,
        collate_fn=ds_train.form_batch,  # customized function for batching
        drop_last=True,
        pin_memory=True,
    )
    loader_val = DataLoader(
        ds_val, batch_size, collate_fn=ds_val.form_batch, pin_memory=True
    )
    return loader_train, loader_val, ds_train

Here is a simple sanity-check. Try to understand its output.


In [16]:
def check_sequence_dataloader() -> None:
    loader_train, _, _ = create_sequence_dataloaders(batch_size=3, shuffle=False)
    print("Iterating on the training data..")
    for i, data_batch in enumerate(loader_train):
        if i == 0:
            print(data_batch)
    print("Done!")


check_sequence_dataloader()

Loading data from eng.train
14041 sentences loaded.
Loading data from eng.val
3490 sentences loaded.
Iterating on the training data..
{'words': [['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn'], ['BRUSSELS', '0000-00-00']], 'word_idxs': tensor([[ 2,  3,  4,  5,  6,  7,  8,  9, 10],
        [11, 12,  0,  0,  0,  0,  0,  0,  0],
        [13, 14,  0,  0,  0,  0,  0,  0,  0]]), 'tags': [['ORG', 'O', 'MISC', 'O', 'O', 'O', 'MISC', 'O', 'O'], ['PER', 'PER'], ['LOC', 'O']], 'tag_idxs': tensor([[0, 1, 2, 1, 1, 1, 2, 1, 1],
        [3, 3, 1, 1, 1, 1, 1, 1, 1],
        [4, 1, 1, 1, 1, 1, 1, 1, 1]]), 'valid_mask': tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True],
        [ True,  True, False, False, False, False, False, False, False],
        [ True,  True, False, False, False, False, False, False, False]])}
Done!


### Implement the Model **(8 points)**

Next, implement LSTM for predicting NER tags from input words. [nn.LSTM](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM) is definitely useful. Further, it is tricky to handle sentences in the same batch with different lengths. Please read the PyTorch documentation in detail!


In [17]:
class LSTM(nn.Module):
    """
    Long short-term memory for NER
    """

    def __init__(
        self,
        words_vocab: Vocab,
        tags_vocab: Vocab,
        d_emb: int,
        d_hidden: int,
        bidirectional: bool,
    ) -> None:
        """
        Initialize an LSTM
        Parameters:
            `words_vocab`: vocabulary of words
            `tags_vocab`: vocabulary of tags
            `d_emb`: dimension of word embeddings (D)
            `d_hidden`: dimension of the hidden layer (H)
            `bidirectional`: true if LSTM should be bidirectional
        """
        super().__init__()
        # TODO: Create the word embeddings (nn.Embedding),
        #       the LSTM (nn.LSTM) and the output layer (nn.Linear).
        #       Read the torch docs for additional guidance : https://pytorch.org/docs/stable
        #       Note: Pay attention to the LSTM output shapes!

        self.words_vocab = words_vocab
        self.tags_vocab = tags_vocab
        self.d_emb = d_emb
        self.d_hidden = d_hidden
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(len(words_vocab), d_emb)
        self.lstm = nn.LSTM(
            d_emb, d_hidden, bidirectional=bidirectional, batch_first=True
        )
        self.output = nn.Linear(
            d_hidden * 2 if bidirectional else d_hidden, len(tags_vocab)
        )

    def forward(
        self, word_idxs: torch.Tensor, valid_mask: torch.Tensor
    ) -> torch.Tensor:
        """
        Given words in sentences, predict the logits of the NER tag.
        Parameters:
            `word_idxs`: a batch_size x max_len tensor
            `valid_mask`: a batch_size x max_len tensor
        Return values:
            `logits`: a batch_size x max_len x 5 tensor
        """
        # TODO: Implement the forward pass

        context_emb = self.embedding(word_idxs)
        packed_input = nn.utils.rnn.pack_padded_sequence(
            context_emb, valid_mask.sum(dim=1), batch_first=True, enforce_sorted=False
        )
        packed_output, _ = self.lstm(packed_input)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        logits = self.output(lstm_out)

        return logits

We do a sanity-check by loading a batch of data examples and pass it through the network.


> The **valid_mask** is used to indicate which positions in the input sequences are valid (i.e., correspond to actual words) and which are padding. <br> This is important for sequence models like LSTMs because padding tokens should not contribute to the model's predictions or loss calculations.

In [18]:
def check_lstm() -> None:
    # Hyperparameters
    batch_size = 4
    d_emb = 64
    d_hidden = 128
    bidirectional = True
    # Create the dataloaders and the model
    loader_train, _, ds_train = create_sequence_dataloaders(batch_size)
    model = LSTM(
        ds_train.words_vocab, ds_train.tags_vocab, d_emb, d_hidden, bidirectional
    )
    device = get_device()
    model.to(device)
    print(model)
    # Get the first batch
    data_batch = next(iter(loader_train))
    # Move data to GPU
    word_idxs = data_batch["word_idxs"].to(device, non_blocking=True)
    tag_idxs = data_batch["tag_idxs"].to(device, non_blocking=True)
    valid_mask = data_batch["valid_mask"].to(device, non_blocking=True)
    # Calculate the model
    print("Input word_idxs shape:", word_idxs.size())
    print("Input valid_mask shape:", valid_mask.size())
    logits = model(word_idxs, valid_mask)
    print("Output logits shape:", logits.size())


check_lstm()

Loading data from eng.train
14041 sentences loaded.
Loading data from eng.val
3490 sentences loaded.
LSTM(
  (words_vocab): Vocab()
  (tags_vocab): Vocab()
  (embedding): Embedding(20102, 64)
  (lstm): LSTM(64, 128, batch_first=True, bidirectional=True)
  (output): Linear(in_features=256, out_features=5, bias=True)
)
Input word_idxs shape: torch.Size([4, 20])
Input valid_mask shape: torch.Size([4, 20])
Output logits shape: torch.Size([4, 20, 5])


### Training and Validation **(6 points)**

Complete the functions for training and validating the LSTM model. When calculating the loss function, you only want to include values from valid positions (where `valid_mask` is `True`). The `reduction` parameter in [F.cross_entropy](https://pytorch.org/docs/stable/nn.functional.html#torch.nn.functional.cross_entropy) may be useful.


In [19]:
def train_lstm(
    model: nn.Module,
    loader: DataLoader,
    optimizer: optim.Optimizer,
    device: torch.device,
    silent: bool = False,  # whether to print the training loss
) -> Tuple[float, Dict[str, Any]]:
    """
    Train the LSTM model.
    Return values:
        1. the average training loss
        2. training metrics such as accuracy and F1 score
    """
    model.train()
    ground_truth = []
    predictions = []
    losses = []
    report_interval = 100

    for i, data_batch in enumerate(loader):
        word_idxs = data_batch["word_idxs"].to(device, non_blocking=True)
        tag_idxs = data_batch["tag_idxs"].to(device, non_blocking=True)
        valid_mask = data_batch["valid_mask"].to(device, non_blocking=True)

        # TODO: Do the same tasks as train_ffnn
        logits = model(word_idxs, valid_mask)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), tag_idxs.view(-1), ignore_index=-1, reduction='mean')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

        # we get (unmasked) predictions by getting argmax of logits along last dimension (You will need to define logits!)
        net_predictions = torch.argmax(logits, -1)

        # flattening a tensor simply converts it from a multi-dimensional to a single-dimensional tensor; we flatten here to make it easier to extract ground truths and predictions
        tag_idxs_flat = tag_idxs.flatten()
        valid_mask_flat = valid_mask.flatten()
        net_predictions_flat = net_predictions.flatten()

        ground_truth.extend(tag_idxs_flat[valid_mask_flat].tolist())
        predictions.extend(net_predictions_flat[valid_mask_flat].tolist())

        if not silent and i > 0 and i % report_interval == 0:
            print(
                "\t[%06d/%06d] Loss: %f"
                % (i, len(loader), np.mean(losses[-report_interval:]))
            )

    return np.mean(losses), eval_metrics(ground_truth, predictions)


def validate_lstm(
    model: nn.Module, loader: DataLoader, device: torch.device
) -> Tuple[float, Dict[str, Any]]:
    """
    Validate the model.
    Return the validation loss and metrics.
    """
    model.eval()
    ground_truth = []
    predictions = []
    losses = []

    with torch.no_grad():

        for data_batch in loader:
            word_idxs = data_batch["word_idxs"].to(device, non_blocking=True)
            tag_idxs = data_batch["tag_idxs"].to(device, non_blocking=True)
            valid_mask = data_batch["valid_mask"].to(device, non_blocking=True)

            # TODO: Do the same tasks as validate_ffnn
            logits = model(word_idxs, valid_mask)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), tag_idxs.view(-1), ignore_index=-1, reduction='mean')

            losses.append(loss.item())

            # we get (unmasked) predictions by getting argmax of logits (You will need to define logits!)
            net_predictions = torch.argmax(logits, -1)

            # flattening a tensor simply converts it from a multi-dimensional to a single-dimensional tensor; we flatten here to make it easier to extract ground truths and predictions
            tag_idxs_flat = tag_idxs.flatten()
            valid_mask_flat = valid_mask.flatten()
            net_predictions_flat = net_predictions.flatten()

            ground_truth.extend(tag_idxs_flat[valid_mask_flat].tolist())
            predictions.extend(net_predictions_flat[valid_mask_flat].tolist())

    return np.mean(losses), eval_metrics(ground_truth, predictions)


def train_val_loop_lstm(hyperparams: Dict[str, Any]) -> None:
    """
    Train and validate the LSTM model for a number of epochs.
    """
    print("Hyperparameters:", hyperparams)
    # Create the dataloaders
    loader_train, loader_val, ds_train = create_sequence_dataloaders(
        hyperparams["batch_size"]
    )
    # Create the model
    model = LSTM(
        ds_train.words_vocab,
        ds_train.tags_vocab,
        hyperparams["d_emb"],
        hyperparams["d_hidden"],
        hyperparams["bidirectional"],
    )
    device = get_device()
    model.to(device)
    print(model)
    # Create the optimizer
    optimizer = optim.RMSprop(
        model.parameters(), hyperparams["learning_rate"], weight_decay=hyperparams["l2"]
    )

    # Train and validate
    for i in range(hyperparams["num_epochs"]):
        print("*" * 80)
        print(f"Epoch #{i+1}")

        print("Training..")
        loss_train, metrics_train = train_lstm(model, loader_train, optimizer, device)
        print("Training loss: ", loss_train)
        print("Training metrics:")
        for k, v in metrics_train.items():
            print("\t", k, ": ", v)

        print("Validating..")
        loss_val, metrics_val = validate_lstm(model, loader_val, device)
        print("Validation loss: ", loss_val)
        print("Validation metrics:")
        for k, v in metrics_val.items():
            print("\t", k, ": ", v)

    print("************ Training Done! ************")

Run the experiment:


In [20]:
train_val_loop_lstm(
    {
        "bidirectional": True,
        "batch_size": 512,
        "d_emb": 64,
        "d_hidden": 128,
        "num_epochs": 15,
        "learning_rate": 0.005,
        "l2": 1e-6,
    }
)

Hyperparameters: {'bidirectional': True, 'batch_size': 512, 'd_emb': 64, 'd_hidden': 128, 'num_epochs': 15, 'learning_rate': 0.005, 'l2': 1e-06}
Loading data from eng.train
14041 sentences loaded.
Loading data from eng.val
3490 sentences loaded.
LSTM(
  (words_vocab): Vocab()
  (tags_vocab): Vocab()
  (embedding): Embedding(20102, 64)
  (lstm): LSTM(64, 128, batch_first=True, bidirectional=True)
  (output): Linear(in_features=256, out_features=5, bias=True)
)
********************************************************************************
Epoch #1
Training..
Training loss:  1.0985378269796018
Training metrics:
	 accuracy :  0.8017554615189079
	 average f1 :  0.3318232403255778
	 f1 :  [0.19571409 0.89244979 0.04499586 0.26417311 0.26178334]
	 confusion matrix :  [[  1443   7565     88    459    272]
 [  2933 154886   1169   5748   2462]
 [   122   3888    136    199    175]
 [   252   7722     75   2698    188]
 [   169   5844     57    387   1694]]
Validating..
Validation loss:  0.837

We were using bidirectional LSTMs. Please re-run the experiment with a regular (unidirectional) LSTM.


In [21]:
## TODO: Re-run with unidirectional LSTMs
train_val_loop_lstm(
    {
        "bidirectional": False,
        "batch_size": 512,
        "d_emb": 64,
        "d_hidden": 128,
        "num_epochs": 15,
        "learning_rate": 0.005,
        "l2": 1e-6,
    }
)

Hyperparameters: {'bidirectional': False, 'batch_size': 512, 'd_emb': 64, 'd_hidden': 128, 'num_epochs': 15, 'learning_rate': 0.005, 'l2': 1e-06}
Loading data from eng.train
14041 sentences loaded.
Loading data from eng.val
3490 sentences loaded.
LSTM(
  (words_vocab): Vocab()
  (tags_vocab): Vocab()
  (embedding): Embedding(20102, 64)
  (lstm): LSTM(64, 128, batch_first=True)
  (output): Linear(in_features=128, out_features=5, bias=True)
)
********************************************************************************
Epoch #1
Training..
Training loss:  1.0759901978351452
Training metrics:
	 accuracy :  0.8070387199425411
	 average f1 :  0.322689427796722
	 f1 :  [0.10858011 0.89992691 0.05757488 0.23694338 0.31042187]
	 confusion matrix :  [[   715   7542    149    565    914]
 [  2014 156374   1710   3961   2927]
 [   152   3503    198    232    430]
 [   225   7975    165   2155    431]
 [   179   5146    141    326   2362]]
Validating..
Validation loss:  0.8399518047060285
Valida

### Questions **(2 points)**

(a) How does the final performance of LSTMs compare to FFNNs? Is it better? What is a possible explanation?

(b) How does bidirectional LSTMs compare to unidirectional LSTMs? Why?

**TODO: Please fill in your answer here**


### (a)

**Performance Comparison**:
LSTMs (Long Short-Term Memory networks) generally outperform FFNNs (Feedforward Neural Networks) on sequence-based tasks such as Natural Language Processing (NLP), time series prediction, and speech recognition.

**Possible Explanation**:
- **Sequential Data Handling**: LSTMs are specifically designed to handle sequential data and can capture temporal dependencies and patterns over time. They maintain a memory of previous inputs, which allows them to understand context and relationships in sequences.
- **Vanishing Gradient Problem**: LSTMs mitigate the vanishing gradient problem through their gating mechanisms (input, forget, and output gates), allowing them to learn long-term dependencies more effectively than FFNNs.
- **Contextual Understanding**: In tasks like NLP, understanding the context of a word within a sentence is crucial. LSTMs can maintain context over long sequences, whereas FFNNs treat each input independently and lack this capability.

### (b)

**Performance Comparison**:
Bidirectional LSTMs (BiLSTMs) often outperform unidirectional LSTMs on tasks where understanding the context from both past and future inputs is beneficial.

**Possible Explanation**:
- **Context from Both Directions**: Bidirectional LSTMs process the input sequence in both forward and backward directions. This allows them to capture information from both past and future contexts, providing a more comprehensive understanding of the sequence.
- **Enhanced Feature Representation**: By combining the outputs from both directions, BiLSTMs can create richer and more informative feature representations, which can lead to better performance on tasks like named entity recognition, machine translation, and sentiment analysis.
- **Improved Accuracy**: In many NLP tasks, the meaning of a word can depend on both preceding and succeeding words. BiLSTMs can leverage this bidirectional context to improve accuracy and performance.

**Example**:
Consider the sentence "The cat sat on the mat." To understand the word "sat," it is helpful to know both the preceding context ("The cat") and the succeeding context ("on the mat"). A bidirectional LSTM can utilize information from both directions to better understand the word "sat" in this context.