In [None]:
!pip install datasets
!pip install seqeval

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader,TensorDataset
from seqeval.metrics import classification_report
import tensorflow as tf
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# CoNLL-2003 is distributed with a loading script. Without trust_remote_code=True,
# load_dataset stops at an interactive "[y/N]" prompt, which prevents an unattended
# "Restart & Run All". NOTE: this flag executes code from the dataset repository
# (https://hf.co/datasets/conll2003); inspect it if that is a concern.
dataset = load_dataset('conll2003', trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
# Pull the three predefined splits out of the loaded dataset in one go.
training, validation, test = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [None]:
# For every split, keep the token sequences and the matching NER tag sequences
# side by side; the tags are later padded and one-hot encoded as integer ids.
training_tokens, training_tags = training['tokens'], training['ner_tags']
validation_tokens, validation_tags = validation['tokens'], validation['ner_tags']
test_tokens, test_tags = test['tokens'], test['ner_tags']

In [None]:
# Build the word -> index vocabulary from the training split only, so validation
# and test words never leak into it; words unseen at fit time map to '<UNK>'.
tokenizer = Tokenizer(oov_token='<UNK>')
tokenizer.fit_on_texts(training_tokens)

In [None]:
# Replace every token with its vocabulary index, split by split.
train_sequences, validation_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split_tokens)
    for split_tokens in (training_tokens, validation_tokens, test_tokens)
)

In [None]:
# Bring every index sequence to a fixed length of 128 by appending padding at the end.
train_padded_sequences, validation_padded_sequences, test_padded_sequences = (
    pad_sequences(split_sequences, padding='post', maxlen=128)
    for split_sequences in (train_sequences, validation_sequences, test_sequences)
)

In [None]:
# Forward (token -> index) map comes straight from the fitted tokenizer;
# the reverse (index -> token) map is derived by swapping keys and values.
token2index = tokenizer.word_index
index2token = dict(zip(token2index.values(), token2index.keys()))

In [None]:
# Tag names as declared by the dataset's "ner_tags" feature; a tag's position in
# this list is its integer id.
tag_names = dataset["train"].features["ner_tags"].feature.names
tag2index = dict(zip(tag_names, range(len(tag_names))))
index2tag = dict(zip(tag2index.values(), tag2index.keys()))

In [None]:
# Pad the tag-id sequences with the same settings as the token sequences so that
# tokens and tags stay aligned position by position.
training_padded_tags, validation_padded_tags, test_padded_tags = (
    pad_sequences(split_tags, padding='post', maxlen=128)
    for split_tags in (training_tags, validation_tags, test_tags)
)

In [None]:
# One-hot encode the padded tag ids, one vector of len(tag2index) entries per position.
training_categorical, validation_categorical, test_categorical = (
    tf.keras.utils.to_categorical(padded_tags, num_classes=len(tag2index))
    for padded_tags in (training_padded_tags, validation_padded_tags, test_padded_tags)
)

In [None]:
# Wrap each split's (padded token ids, one-hot labels) pair as a TensorDataset.
training_dataset, validation_dataset, test_dataset = (
    TensorDataset(torch.tensor(features), torch.tensor(labels))
    for features, labels in (
        (train_padded_sequences, training_categorical),
        (validation_padded_sequences, validation_categorical),
        (test_padded_sequences, test_categorical),
    )
)

In [None]:
# Mini-batches of 32 for every split; only the training split is reshuffled each epoch.
train_loader, validation_loader, test_loader = (
    DataLoader(split_dataset, batch_size=32, shuffle=reshuffle)
    for split_dataset, reshuffle in (
        (training_dataset, True),
        (validation_dataset, False),
        (test_dataset, False),
    )
)

Simple RNN Model

In [None]:
class simple_rnn(nn.Module):
    """Sequence tagger: embedding -> two stacked bidirectional RNN layers -> per-token linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        hidden_units1: hidden size (per direction) of the first RNN layer.
        hidden_units2: hidden size (per direction) of the second RNN layer.
        output_dim: number of scores produced for every token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_units1, hidden_units2, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # A bidirectional layer concatenates both directions, so whatever consumes
        # its output sees twice its hidden size.
        self.rnn1 = nn.RNN(embedding_dim, hidden_units1, bidirectional=True, batch_first=True)
        self.rnn2 = nn.RNN(2 * hidden_units1, hidden_units2, bidirectional=True, batch_first=True)
        self.output = nn.Linear(2 * hidden_units2, output_dim)

    def forward(self, x):
        """Map a batch of token indices (batch, seq_len) to scores (batch, seq_len, output_dim)."""
        embedded = self.embedding(x)
        first_pass, _ = self.rnn1(embedded)
        second_pass, _ = self.rnn2(first_pass)
        return self.output(second_pass)

In [None]:
# Vocabulary size is len(token2index) + 1 so that index 0, which pad_sequences
# uses as the fill value, also has a row in the embedding table.
model = simple_rnn(
    vocab_size=len(token2index) + 1,
    embedding_dim=300,
    hidden_units1=128,
    hidden_units2=128,
    output_dim=len(tag2index),
)

In [None]:
# Multi-class cross-entropy over the tag scores, optimised with Adam on the RNN model.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=3e-4)

In [None]:
# Train the RNN tagger for 10 epochs, printing per epoch the mean batch loss and
# the accuracy measured on entity tokens (tokens whose gold tag is not "O").
for epoch in range(10):
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_tokens = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)  # (batch, seq_len, num_tags)

        # Labels are stored one-hot; CrossEntropyLoss expects class indices.
        targets = torch.argmax(y_batch, dim=-1)  # (batch, seq_len)

        # pad_sequences fills with 0 and the Keras tokenizer reserves index 0, so
        # X_batch != 0 selects the real tokens. Padded positions are left out of the
        # loss; otherwise most of it would come from trivially predicting the
        # padding label for the filler at the end of each sequence.
        token_mask = X_batch != 0
        loss = loss_fn(outputs[token_mask], targets[token_mask])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Accuracy is restricted to entity tokens. Padded labels are 0, so they are
        # excluded as well as long as "O" is tag index 0 -- TODO confirm.
        preds = torch.argmax(outputs, dim=-1)  # (batch, seq_len)
        entity_mask = targets != tag2index["O"]
        total_correct += ((preds == targets) & entity_mask).sum().item()
        total_tokens += entity_mask.sum().item()

    # Report the mean loss per batch so the number does not scale with dataset size.
    mean_loss = total_loss / max(len(train_loader), 1)
    accuracy = total_correct / total_tokens if total_tokens > 0 else 0.0
    print(f"Epoch {epoch+1} - Loss: {mean_loss:.4f} - Accuracy: {accuracy:.4f}")

Epoch 1 - Loss: 4.9724 - Accuracy: 0.8601
Epoch 2 - Loss: 3.6864 - Accuracy: 0.8999
Epoch 3 - Loss: 2.6800 - Accuracy: 0.9288
Epoch 4 - Loss: 1.8839 - Accuracy: 0.9523
Epoch 5 - Loss: 1.3172 - Accuracy: 0.9687
Epoch 6 - Loss: 0.8988 - Accuracy: 0.9796
Epoch 7 - Loss: 0.6294 - Accuracy: 0.9871
Epoch 8 - Loss: 0.4314 - Accuracy: 0.9918
Epoch 9 - Loss: 0.3086 - Accuracy: 0.9947
Epoch 10 - Loss: 0.2396 - Accuracy: 0.9957


In [None]:
# Evaluate the RNN tagger on the validation split. The held-out test split
# (test_loader) is not touched here.
model.eval()
total_loss = 0.0
total_correct = 0
total_tokens = 0
all_preds, all_labels = [], []  # per-sentence tag-name lists for seqeval
with torch.no_grad():
    for X_batch, y_batch in validation_loader:
        outputs = model(X_batch)  # (batch, seq_len, num_tags)
        targets = torch.argmax(y_batch, dim=-1)  # one-hot -> class indices

        # Index 0 is the padding fill value, so X_batch != 0 marks real tokens;
        # only those contribute to the loss and to the seqeval report.
        token_mask = X_batch != 0
        loss = loss_fn(outputs[token_mask], targets[token_mask])
        total_loss += loss.item()

        # Accuracy on entity tokens only (gold tag is not "O").
        preds = torch.argmax(outputs, dim=-1)  # (batch, seq_len)
        entity_mask = targets != tag2index["O"]
        total_correct += ((preds == targets) & entity_mask).sum().item()
        total_tokens += entity_mask.sum().item()

        # Strip padding and convert ids to tag names, one list per sentence.
        for pred_row, target_row, mask_row in zip(preds, targets, token_mask):
            all_preds.append([index2tag[i] for i in pred_row[mask_row].tolist()])
            all_labels.append([index2tag[i] for i in target_row[mask_row].tolist()])

mean_loss = total_loss / max(len(validation_loader), 1)
accuracy = total_correct / total_tokens if total_tokens > 0 else 0.0
print(f"Validation Loss: {mean_loss:.4f} - Validation Accuracy: {accuracy:.4f}")
# Entity-level precision / recall / F1.
print(classification_report(all_labels, all_preds))


Test Loss: 4.0155 - Test Accuracy: 0.7496


Bidirectional GRU Model

In [None]:
class gru(nn.Module):
    """Sequence tagger: embedding -> two stacked bidirectional GRU layers -> per-token linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        hidden_units1: hidden size (per direction) of the first GRU layer.
        hidden_units2: hidden size (per direction) of the second GRU layer.
        output_dim: number of scores produced for every token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_units1, hidden_units2, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # A bidirectional layer concatenates both directions, so whatever consumes
        # its output sees twice its hidden size.
        self.gru1 = nn.GRU(embedding_dim, hidden_units1, bidirectional=True, batch_first=True)
        self.gru2 = nn.GRU(2 * hidden_units1, hidden_units2, bidirectional=True, batch_first=True)
        self.output = nn.Linear(2 * hidden_units2, output_dim)

    def forward(self, x):
        """Map a batch of token indices (batch, seq_len) to scores (batch, seq_len, output_dim)."""
        embedded = self.embedding(x)
        first_pass, _ = self.gru1(embedded)
        second_pass, _ = self.gru2(first_pass)
        return self.output(second_pass)

In [None]:
# Same hyper-parameters as the RNN model; the + 1 gives padding index 0 its own
# embedding row.
model1 = gru(
    vocab_size=len(token2index) + 1,
    embedding_dim=300,
    hidden_units1=128,
    hidden_units2=128,
    output_dim=len(tag2index),
)

In [None]:
# Fresh loss and optimizer for the GRU model; this rebinds `loss_fn` and
# `optimizer`, so from here on the optimizer updates model1's parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model1.parameters(), lr=3e-4)

In [None]:
# Train the GRU tagger for 10 epochs, printing per epoch the mean batch loss and
# the accuracy measured on entity tokens (tokens whose gold tag is not "O").
for epoch in range(10):
    # Switch the model being optimised (model1) to training mode.
    model1.train()
    total_loss = 0.0
    total_correct = 0
    total_tokens = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model1(X_batch)  # (batch, seq_len, num_tags)

        # Labels are stored one-hot; CrossEntropyLoss expects class indices.
        targets = torch.argmax(y_batch, dim=-1)  # (batch, seq_len)

        # pad_sequences fills with 0 and the Keras tokenizer reserves index 0, so
        # X_batch != 0 selects the real tokens. Padded positions are left out of the
        # loss; otherwise most of it would come from trivially predicting the
        # padding label for the filler at the end of each sequence.
        token_mask = X_batch != 0
        loss = loss_fn(outputs[token_mask], targets[token_mask])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Accuracy is restricted to entity tokens. Padded labels are 0, so they are
        # excluded as well as long as "O" is tag index 0 -- TODO confirm.
        preds = torch.argmax(outputs, dim=-1)  # (batch, seq_len)
        entity_mask = targets != tag2index["O"]
        total_correct += ((preds == targets) & entity_mask).sum().item()
        total_tokens += entity_mask.sum().item()

    # Report the mean loss per batch so the number does not scale with dataset size.
    mean_loss = total_loss / max(len(train_loader), 1)
    accuracy = total_correct / total_tokens if total_tokens > 0 else 0.0
    print(f"Epoch {epoch+1} - Training Loss: {mean_loss:.4f} - Train Accuracy: {accuracy:.4f}")

Epoch 1 - Training Loss: 22.3269 - Train Accuracy: 0.2560
Epoch 2 - Training Loss: 13.5196 - Train Accuracy: 0.5784
Epoch 3 - Training Loss: 8.5283 - Train Accuracy: 0.7464
Epoch 4 - Training Loss: 5.5459 - Train Accuracy: 0.8408
Epoch 5 - Training Loss: 3.5640 - Train Accuracy: 0.9022
Epoch 6 - Training Loss: 2.2249 - Train Accuracy: 0.9432
Epoch 7 - Training Loss: 1.3407 - Train Accuracy: 0.9699
Epoch 8 - Training Loss: 0.7969 - Train Accuracy: 0.9840
Epoch 9 - Training Loss: 0.4762 - Train Accuracy: 0.9925
Epoch 10 - Training Loss: 0.3045 - Train Accuracy: 0.9958


In [None]:
# Evaluate the GRU tagger on the validation split. The held-out test split
# (test_loader) is not touched here.
model1.eval()
total_loss = 0.0
total_correct = 0
total_tokens = 0
all_preds, all_labels = [], []  # per-sentence tag-name lists for seqeval
with torch.no_grad():
    for X_batch, y_batch in validation_loader:
        outputs = model1(X_batch)  # (batch, seq_len, num_tags)
        targets = torch.argmax(y_batch, dim=-1)  # one-hot -> class indices

        # Index 0 is the padding fill value, so X_batch != 0 marks real tokens;
        # only those contribute to the loss and to the seqeval report.
        token_mask = X_batch != 0
        loss = loss_fn(outputs[token_mask], targets[token_mask])
        total_loss += loss.item()

        # Accuracy on entity tokens only (gold tag is not "O").
        preds = torch.argmax(outputs, dim=-1)  # (batch, seq_len)
        entity_mask = targets != tag2index["O"]
        total_correct += ((preds == targets) & entity_mask).sum().item()
        total_tokens += entity_mask.sum().item()

        # Strip padding and convert ids to tag names, one list per sentence.
        for pred_row, target_row, mask_row in zip(preds, targets, token_mask):
            all_preds.append([index2tag[i] for i in pred_row[mask_row].tolist()])
            all_labels.append([index2tag[i] for i in target_row[mask_row].tolist()])

mean_loss = total_loss / max(len(validation_loader), 1)
accuracy = total_correct / total_tokens if total_tokens > 0 else 0.0
print(f"Validation Loss: {mean_loss:.4f} - Validation Accuracy: {accuracy:.4f}")
# Entity-level precision / recall / F1.
print(classification_report(all_labels, all_preds))


Test Loss: 3.5430 - Test Accuracy: 0.7320


Similarly, an LSTM can be used in place of the GRU layers in the model's architecture.