In [17]:
# Import necessary libraries
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable
from datasets import load_dataset
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import numpy as np
import os
import urllib.request
from torch.optim import Adam
from torch.utils.data import DataLoader


class MoE(nn.Module):
    def __init__(self, num_experts, in_features, out_features):
        super(MoE, self).__init__()
        self.num_experts = num_experts
        self.in_features = in_features
        self.out_features = out_features

        # Define the experts and the gating network
        self.experts = nn.ModuleList([nn.Linear(in_features, out_features) for _ in range(self.num_experts)])
        self.gating_network = nn.Sequential(
            nn.Linear(in_features, self.num_experts),
            nn.Linear(self.num_experts, out_features)
        )

    def forward(self, x):
        weights = torch.softmax(self.gating_network(x), dim=1)
        return sum(w * expert(x) for w, expert in zip(weights.split(1, dim=1), self.experts))


# Define the LSTM with MoE layer model
class LSTMwithMoE(nn.Module):
    def __init__(self, hidden_dim, output_dim, num_experts):
        super(LSTMwithMoE, self).__init__()
        self.hidden_dim = hidden_dim

        # Define the LSTM layers
        self.lstm1 = nn.LSTM(768, hidden_dim)  # 768 is the dimensionality of BERT's hidden states
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim)

        # Define the MoE layer
        self.moe = MoE(num_experts, hidden_dim, hidden_dim)

        # Define the output layer
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm1(x)
        moe_out = self.moe(lstm_out)
        lstm_out, _ = self.lstm2(moe_out)
        output = self.linear(lstm_out)
        return output

In [78]:
def load_data(dataset_name):
    # Load the dataset
    dataset = load_dataset(dataset_name, split='train', download_mode='force_redownload')

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # Define the correct key for the labels in your dataset
    if dataset_name == 'conll2003':
        label_key = 'ner_tags'
    elif dataset_name == 'squad':
        label_key = 'answers'
    else:
        raise ValueError(f'Unknown dataset: {dataset_name}')

    # Tokenize the dataset
    def tokenize_and_encode(examples):
        # Replace 'text' with the correct key in your dataset
        if dataset_name == 'conll2003':
            text_key = 'tokens'
            text = [' '.join(tokens) for tokens in examples[text_key]]
            labels = [label + [0]*(512-len(label)) for label in examples[label_key]]
        elif dataset_name == 'squad':
            text_key = 'context'  # or 'question'
            text = examples[text_key]
            labels = [label['text'][0] + ' '*(512-len(label['text'][0])) for label in examples[label_key]]
        return {**tokenizer(text, truncation=True, padding='longest', max_length=512), label_key: labels}

    dataset = dataset.map(tokenize_and_encode, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', label_key])
    return dataset


# Define the training function
def train(model, data, epochs=10, lr=0.001):
    model.train()
    optimizer = Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    # Create a DataLoader
    data_loader = DataLoader(data, batch_size=32)

    for epoch in range(epochs):
        print(f'Starting epoch {epoch+1}')
        epoch_loss = 0
        for batch in data_loader:
            # Extract the tensors from the batch
            input_tensor = batch['input_ids'].float()  # Convert the input tensor to Float
            if 'ner_tags' in batch:
                target_tensor = batch['ner_tags']
            elif 'answers' in batch:
                target_tensor = batch['answers']
            else:
                raise ValueError('Unknown label key in batch')
            
            optimizer.zero_grad()
            output = model(input_tensor)
            # Reshape the output and target tensor before calculating the loss
            loss = loss_fn(output.view(-1, output.size(-1)), target_tensor.view(-1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch {epoch+1} loss: {epoch_loss/len(data_loader)}')



In [79]:
def evaluate(model, data):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in data:
            output = model(batch['input_ids'])
            predictions.extend(output.argmax(dim=-1).tolist())
    return predictions

In [80]:

# Load the data
data_conll2003 = load_data('conll2003')
data_squad = load_data('squad')

# Initialize the model
num_labels_conll2003 = 9  # Replace with the number of unique labels in the 'conll2003' dataset
num_labels_squad = 2  # Replace with the number of unique labels in the 'squad' dataset
model_conll2003 = LSTMwithMoE(hidden_dim=100, output_dim=num_labels_conll2003, num_experts=10)
model_squad = LSTMwithMoE(hidden_dim=100, output_dim=num_labels_squad, num_experts=10)


Downloading data: 100%|██████████| 1.23M/1.23M [00:00<00:00, 4.31MB/s]
Downloading data: 100%|██████████| 312k/312k [00:00<00:00, 1.29MB/s]
Downloading data: 100%|██████████| 283k/283k [00:00<00:00, 1.23MB/s]


Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (1000,) + inhomogeneous part.

In [40]:
# Train and evaluate the model on each dataset
print('Training on CoNLL-2003')
train(model_conll2003, data_conll2003)
print('Evaluating on CoNLL-2003')
f1, precision, recall = evaluate(model_conll2003, data_conll2003)
print(f"F1 Score on CoNLL-2003: {f1}")
print(f"Precision on CoNLL-2003: {precision}")
print(f"Recall on CoNLL-2003: {recall}")
print('Training on SQuAD 1.1')
train(model_squad, data_squad)
print('Evaluating on SQuAD 1.1')
f1, precision, recall = evaluate(model_squad, data_squad)
print(f"F1 Score on SQuAD 1.1: {f1}")
print(f"Precision on SQuAD 1.1: {precision}")
print(f"Recall on SQuAD 1.1: {recall}")


Training on CoNLL-2003
Starting epoch 1


ValueError: Target size (torch.Size([32, 512])) must be the same as input size (torch.Size([32, 9]))

In [25]:
# URLs for the datasets
datasets = {
    'conll2003': 'https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train',
    'squad1.1': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json'
}

# Specify the model name
model_name = 'distilbert-base-uncased'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the data downloading function
def download_data(url, filename):
    if not os.path.exists(filename):
        urllib.request.urlretrieve(url, filename)
    return filename  # Add this line

# Define the data loading function
def load_data(dataset_path):
    # Load the dataset
    dataset = load_dataset('text', data_files=dataset_path)

    # Tokenize the dataset
    def tokenize_and_encode(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length')

    dataset = dataset.map(tokenize_and_encode, batched=True)
    return dataset

# Download and load the data
data = {name: load_data(download_data(url, f'{name}.txt')) for name, url in datasets.items()}

# Initialize the model
model = LSTMwithMoE(input_dim=300, hidden_dim=100, output_dim=10, num_experts=10)

# Train and evaluate the model on each dataset
for name, dataset in data.items():
    print(f'Training on {name}')
    
    # Print the keys in the first item of the dataset
    print(dataset['train'][0].keys())
    
    train(model, dataset)
    print(f'Evaluating on {name}')
    predictions = evaluate(model, dataset)
    print(f'Accuracy on {name}: {accuracy_score(dataset, predictions)}')


# Define the training function
def train(model, data, epochs=10, lr=0.001):
    model.train()
    optimizer = Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    # Select the 'train' split
    data = data['train']

    # Create a DataLoader
    data_loader = DataLoader(data, batch_size=32)

    for epoch in range(epochs):
        print(f'Starting epoch {epoch+1}')
        epoch_loss = 0
        for batch in data_loader:
            # Extract the tensors from the batch
            input_tensor = batch['input_ids']
            
            optimizer.zero_grad()
            output = model(input_tensor)
            # As it's an unsupervised task, we don't calculate loss and backpropagate
            # loss = loss_fn(output, target_tensor)
            # loss.backward()
            optimizer.step()
            # epoch_loss += loss.item()
        # print(f'Epoch {epoch+1} loss: {epoch_loss/len(data_loader)}')


# Define the evaluation function
def evaluate(model, data):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in data:
            output = model(batch)
            predictions.extend(output.argmax(dim=-1).tolist())
    return predictions

# Download and load the data
data = {name: load_data(download_data(url, f'{name}.txt')) for name, url in datasets.items()}

# Initialize the model
model = LSTMwithMoE(input_dim=300, hidden_dim=100, output_dim=10, num_experts=10)

# Train and evaluate the model on each dataset
for name, dataset in data.items():
    print(f'Training on {name}')
    train(model, dataset)
    print(f'Evaluating on {name}')
    predictions = evaluate(model, dataset)
    print(f'Accuracy on {name}: {accuracy_score(dataset, predictions)}')

Training on conll2003
dict_keys(['text', 'input_ids', 'attention_mask'])
Starting epoch 1


KeyError: 'labels'