In [59]:
# Import necessary libraries
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertModel
import datasets
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the LSTM layer
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        # Add an extra dimension for the batch (if not present)
        if x.dim() == 2:
            x = x.unsqueeze(1)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        return out



# Define the Mixture-of-Experts layer
class MoE(nn.Module):
    def __init__(self, num_experts, input_size, hidden_size):
        super(MoE, self).__init__()
        self.num_experts = num_experts
        self.gating_network = nn.Linear(input_size, num_experts)
        self.experts = nn.ModuleList([nn.Linear(input_size, hidden_size) for i in range(num_experts)])

    def forward(self, x):
        gate_outputs = F.softmax(self.gating_network(x), dim=2)
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=2)
        final_output = (gate_outputs.unsqueeze(3) * expert_outputs).sum(dim=2)
        return final_output


In [60]:
# Define the complete model
class LSTM_MoE_Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_experts, num_labels):
        super(LSTM_MoE_Model, self).__init__()
        self.lstm1 = LSTM(input_size, hidden_size, num_layers)
        self.moe = MoE(num_experts, hidden_size, hidden_size)
        self.lstm2 = LSTM(hidden_size, hidden_size, num_layers)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, x):
        x = self.lstm1(x)
        x = self.moe(x)
        x = self.lstm2(x)
        x = self.classifier(x)
        return x

# Define the model
model = LSTM_MoE_Model(input_size=512, hidden_size=256, num_layers=2, num_experts=10, num_labels=9)
model = model.to(device)  # Move the model to the device

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [45]:
# Define the training function
def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batch in dataloader:
        inputs = batch['input_ids'].to(device).float()  # Convert inputs to float
        labels = batch['labels'].to(device)
        outputs = model(inputs)
        # Reshape outputs and labels
        outputs = outputs.view(-1, outputs.shape[-1])  # (batch_size * sequence_length, num_labels)
        _, labels = labels.max(dim=-1)  # (batch_size * sequence_length,) -> (batch_size,)
        # Calculate loss
        loss = criterion(outputs.view(-1, outputs.shape[-1]), labels.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)



In [46]:
# Define the evaluation function
def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['input_ids'].to(device).float()  # Convert inputs to float
            labels = batch['labels'].to(device)
            outputs = model(inputs)
            # Reshape outputs and labels
            outputs = outputs.view(-1, outputs.shape[-1])  # (batch_size * sequence_length, num_labels)
            # Reshape labels to expected size (batch_size, num_labels)
            _, labels = labels.max(dim=-1)  # (batch_size * sequence_length,) -> (batch_size,)
            # Calculate loss
            loss = criterion(outputs.view(-1, outputs.shape[-1]), labels.view(-1))

            total_loss += loss.item()
    return total_loss / len(dataloader)

In [52]:
from datasets import load_dataset

# Load the 'conll2003' dataset
raw_datasets = load_dataset("conll2003")
print(raw_datasets["train"][0])

# Split the 'train' set into 'train' and 'validation' sets
split_datasets = raw_datasets['train'].train_test_split(test_size=0.2)

# Assign the split datasets back into 'train' and 'validation'
raw_datasets['train'] = split_datasets['train']
raw_datasets['validation'] = split_datasets['test']

# Tokenize the datasets
def tokenize_function(examples):
    # Join the list of tokens into a single string for each example in the batch
    sentences = [" ".join(tokens) for tokens in examples["tokens"]]
    return tokenizer(sentences, padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Define your batch size
batch_size = 16

from torch.nn.utils.rnn import pad_sequence
# Apply the function on the entire dataset

def collate_fn(batch):
    inputs = pad_sequence([torch.tensor(item['input_ids']) for item in batch], batch_first=True)
    labels = pad_sequence([torch.tensor(item['ner_tags']) for item in batch], batch_first=True)
    return {'input_ids': inputs, 'labels': labels}

train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=batch_size, collate_fn=collate_fn)

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


Map:   0%|          | 0/11232 [00:00<?, ? examples/s]

Map:   0%|          | 0/2809 [00:00<?, ? examples/s]

In [47]:
import torch.optim as optim
import torch.nn as nn

# Define the loss function (criterion)
criterion = nn.CrossEntropyLoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [49]:
print(raw_datasets["train"][0])

{'id': '2377', 'tokens': ['4.', 'Mary', 'Onyali', '(', 'Nigeria', ')', '11.09'], 'pos_tags': [11, 22, 22, 4, 22, 5, 11], 'chunk_tags': [11, 11, 12, 0, 11, 0, 11], 'ner_tags': [0, 1, 2, 0, 5, 0, 0]}


In [50]:
def collate_fn(batch):
    inputs = pad_sequence([torch.tensor(item['input_ids']) for item in batch], batch_first=True)
    labels = pad_sequence([torch.tensor(item['ner_tags']) for item in batch], batch_first=True)
    return {'input_ids': inputs, 'labels': labels}


In [55]:
# Train the model
num_epochs=10
# Train the model
for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, criterion, optimizer)
    val_loss = evaluate(model, val_dataloader, criterion)
    print(f'Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}')


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
