<a href="https://colab.research.google.com/github/blindTissue/NLP-Project/blob/main/denser_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets
from datasets import load_dataset
import torch
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import math
import numpy as np


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://

In [None]:
# Number of leading rows taken from the train / validation splits further down.
training_length, validation_length = 8000, 2000

# Load the "boolq" dataset through `datasets.load_dataset`.
dataset = load_dataset("boolq")

class BoolQADataset(torch.utils.data.Dataset):
    """
    Map-style dataset over parallel sequences of BoolQ passages, questions and answers.

    Each item is tokenized on access with the supplied tokenizer and padded or
    truncated to ``max_len``.
    """

    def __init__(self, passages, questions, answers, tokenizer, max_len):
        # The three sequences are indexed in parallel by __getitem__.
        self.passages, self.questions, self.answers = passages, questions, answers
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # The number of answers defines the dataset length.
        return len(self.answers)

    def __getitem__(self, index):
        """
        Build the model inputs for one example.

        :param index: position of the example in the stored sequences
        :return: dict with 'input_ids', 'attention_mask' and 'labels' tensors
        """
        passage = str(self.passages[index])
        question = self.questions[index]
        answer = self.answers[index]

        # The question goes first so that, when the passage is too long,
        # truncation does not cut the question.
        text = question + " [SEP] " + passage

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
        )
        encoding = self.tokenizer.encode_plus(text, **tokenizer_options)

        # The encoding carries a leading axis holding a single example;
        # index 0 drops that axis.
        item = {
            'input_ids': encoding['input_ids'][0],
            # the attention mask tells the model which tokens are padding
            'attention_mask': encoding['attention_mask'][0],
        }
        # the label is the yes/no answer stored as an integer tensor
        item['labels'] = torch.tensor(answer, dtype=torch.long)
        return item

# Keep only the leading `training_length` / `validation_length` rows of each split.
# The sliced objects are indexed by column name below.
dataset_train_subset = dataset['train'][:training_length]
dataset_dev_subset = dataset['validation'][:validation_length]

# Every example is padded or truncated to this many tokens.
max_len = 512

mytokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

train_dataset = BoolQADataset(
    passages=list(dataset_train_subset['passage']),
    questions=list(dataset_train_subset['question']),
    answers=list(dataset_train_subset['answer']),
    tokenizer=mytokenizer,
    max_len=max_len,
)

validation_dataset = BoolQADataset(
    passages=list(dataset_dev_subset['passage']),
    questions=list(dataset_dev_subset['question']),
    answers=list(dataset_dev_subset['answer']),
    tokenizer=mytokenizer,
    max_len=max_len,
)

# NOTE: both loader names are reassigned with different batch sizes in the training cell.
train_dataloader = DataLoader(train_dataset, 1)
validation_dataloader = DataLoader(validation_dataset, 32)



Downloading builder script:   0%|          | 0.00/3.38k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.60k [00:00<?, ?B/s]

Downloading and preparing dataset boolq/default to /root/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.53M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

Dataset boolq downloaded and prepared to /root/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
class Embeddings(nn.Module):
    """
    Token-embedding lookup followed by LayerNorm and dropout.

    Only word embeddings are used here; no position or token-type embeddings
    are added to the result.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size,
            config.hidden_size,
            padding_idx=config.pad_token_id,
        )
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, input_ids):
        """Embed ``input_ids``, then normalize and apply dropout."""
        normalized = self.LayerNorm(self.word_embeddings(input_ids))
        return self.dropout(normalized)
    
class SelfAttention(nn.Module):
    """
    Multi-head self-attention with two query/key/value projection sets.

    For each query set, attention probabilities are computed against BOTH key
    sets and averaged; the averaged probabilities weight the value set with the
    same index. The two resulting contexts are concatenated along the last
    axis, so the output feature size is ``2 * all_head_size``.

    No attention mask is applied: every position attends to every position.
    """

    def __init__(self, config):
        super().__init__()
        assert config.hidden_size % config.num_attention_heads == 0
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Created in this fixed order so random initialization consumes the
        # RNG stream in the same sequence on every construction.
        for projection_name in ("query1", "query2", "key1", "key2", "value1", "value2"):
            setattr(self, projection_name, nn.Linear(config.hidden_size, self.all_head_size))
        self.softmax = nn.Softmax(dim=-1)

    def transpose_for_scores(self, x):
        """Split the last axis into (heads, head_size) and swap axes 1 and 2."""
        per_head_shape = (*x.size()[:-1], self.num_attention_heads, self.attention_head_size)
        return x.view(per_head_shape).permute(0, 2, 1, 3)

    def forward(self, hidden_states):
        """Return the concatenation of the two attention contexts."""
        split = self.transpose_for_scores
        q1, q2 = split(self.query1(hidden_states)), split(self.query2(hidden_states))
        k1, k2 = split(self.key1(hidden_states)), split(self.key2(hidden_states))
        v1, v2 = split(self.value1(hidden_states)), split(self.value2(hidden_states))

        scale = math.sqrt(self.attention_head_size)

        def probabilities(queries, keys):
            # scaled dot-product scores, normalized over the last axis
            return self.softmax(torch.matmul(queries, keys.transpose(-1, -2)) / scale)

        # Each query set averages its probabilities over both key sets.
        mixed_1 = (probabilities(q1, k1) + probabilities(q1, k2)) / 2
        mixed_2 = (probabilities(q2, k1) + probabilities(q2, k2)) / 2

        contexts = []
        for weights, values in ((mixed_1, v1), (mixed_2, v2)):
            per_head = torch.matmul(weights, values).permute(0, 2, 1, 3).contiguous()
            # merge the (heads, head_size) axes back into a single feature axis
            contexts.append(per_head.view(*per_head.size()[:-2], self.all_head_size))

        # stack context 1 and context 2 along the feature axis
        return torch.cat(contexts, dim=2)
    
class SelfOutput(nn.Module):
    """
    Projects the attention output back to ``hidden_size`` and applies the
    residual connection.

    The input projection expects ``2 * hidden_size`` features.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size * 2, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, input_tensor):
        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
    


class mid(nn.Module):
    """
    Embedding lookup followed by a single attention sub-layer
    (self-attention plus its residual output projection).
    """

    def __init__(self, config):
        super().__init__()
        self.emb = Embeddings(config)
        self.sa = SelfAttention(config)
        self.sa_output = SelfOutput(config)

    def forward(self, input_tensor):
        """Embed ``input_tensor`` and run it through the attention sub-layer."""
        embedded = self.emb(input_tensor)
        # the embeddings serve as the residual input of the output projection
        return self.sa_output(self.sa(embedded), embedded)
    
class Attention(nn.Module):
    """Self-attention followed by its residual output projection."""

    def __init__(self, config):
        super().__init__()
        self.self = SelfAttention(config)
        self.output = SelfOutput(config)

    def forward(self, input_tensor):
        """Attend over ``input_tensor`` and combine the result with it residually."""
        attended = self.self(input_tensor)
        return self.output(attended, input_tensor)
    
class Intermediate(nn.Module):
    """
    Feed-forward expansion from ``hidden_size`` to ``intermediate_size``
    followed by GELU. The activation is fixed to GELU in this class.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)

    def forward(self, hidden_states):
        """Return ``gelu(dense(hidden_states))``."""
        return nn.functional.gelu(self.dense(hidden_states))
    
class Output(nn.Module):
    """
    Feed-forward contraction from ``intermediate_size`` back to ``hidden_size``
    with dropout, a residual connection and LayerNorm.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, input_tensor):
        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
        contracted = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(contracted + input_tensor)

class Layer(nn.Module):
    """One encoder block: attention sub-layer, then the feed-forward sub-layer."""

    def __init__(self, config):
        super().__init__()
        self.attention = Attention(config)
        self.intermediate = Intermediate(config)
        self.output = Output(config)

    def forward(self, hidden_states):
        """Run ``hidden_states`` through attention and the feed-forward network."""
        attended = self.attention(hidden_states)
        expanded = self.intermediate(attended)
        # the attention output is the residual input of the feed-forward output
        return self.output(expanded, attended)
    
class Encoder(nn.Module):
    """A stack of ``config.num_hidden_layers`` ``Layer`` blocks applied in order."""

    def __init__(self, config):
        super().__init__()
        self.layer = nn.ModuleList(Layer(config) for _ in range(config.num_hidden_layers))

    def forward(self, hidden_states):
        """Feed ``hidden_states`` through every block and return the final states."""
        states = hidden_states
        for block in self.layer:
            states = block(states)
        return states
    
class Pooler(nn.Module):
    """
    Summarizes a sequence by taking index 0 along axis 1 and passing it
    through a dense layer and tanh.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return ``tanh(dense(hidden_states[:, 0]))``."""
        leading_state = hidden_states[:, 0]
        return self.activation(self.dense(leading_state))

class AttentionModel(nn.Module):
    """Full encoder model: embeddings, encoder stack, then the pooler."""

    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.encoder = Encoder(config)
        self.pooler = Pooler(config)

    def forward(self, input_ids):
        """Return the pooled representation computed from ``input_ids``."""
        encoded = self.encoder(self.embeddings(input_ids))
        return self.pooler(encoded)
    
class BinaryClassification(nn.Module):
    """
    ``AttentionModel`` with a dropout + single-output linear head.

    ``forward`` returns the raw head output; no sigmoid is applied here.
    """

    def __init__(self, config):
        super().__init__()
        self.attentionModel = AttentionModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

    def forward(self, input_ids):
        """Return one logit per pooled representation."""
        pooled = self.dropout(self.attentionModel(input_ids))
        return self.classifier(pooled)



In [None]:
#make a class same as bertConfig

class Config():
    """
    Plain container of model hyperparameters.

    Every constructor argument is stored unchanged as an attribute of the
    same name.
    """

    def __init__(self, attention_probs_dropout_prob = 0.1,
                 classifier_dropout = None,
                 hidden_act = "gelu",
                 hidden_dropout_prob = 0.1,
                 hidden_size = 768,
                 initializer_range = 0.02,
                 intermediate_size = 3072,
                 layer_norm_eps = 1e-12,
                 max_position_embeddings = 512,
                 model_type = "bert",
                 num_attention_heads = 12,
                 num_hidden_layers = 12,
                 pad_token_id = 0,
                 position_embedding_type = "absolute",
                 transformers_version = "4.27.3",
                 type_vocab_size = 2,
                 use_cache = True,
                 vocab_size = 30522):
        settings = {
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "classifier_dropout": classifier_dropout,
            "hidden_act": hidden_act,
            "hidden_dropout_prob": hidden_dropout_prob,
            "hidden_size": hidden_size,
            "initializer_range": initializer_range,
            "intermediate_size": intermediate_size,
            "layer_norm_eps": layer_norm_eps,
            "max_position_embeddings": max_position_embeddings,
            "model_type": model_type,
            "num_attention_heads": num_attention_heads,
            "num_hidden_layers": num_hidden_layers,
            "pad_token_id": pad_token_id,
            "position_embedding_type": position_embedding_type,
            "transformers_version": transformers_version,
            "type_vocab_size": type_vocab_size,
            "use_cache": use_cache,
            "vocab_size": vocab_size,
        }
        # Store each setting as an instance attribute, in the order listed above.
        for setting_name, setting_value in settings.items():
            setattr(self, setting_name, setting_value)

# Configuration used by this notebook: all defaults except a 5-layer encoder.
config = Config(num_hidden_layers=5)

In [None]:
from tqdm import tqdm
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """
    Run one optimization pass over ``data_loader``.

    For every batch the model output is passed through a sigmoid, thresholded
    at 0.5 for the prediction count, and scored with ``loss_fn``. Gradients are
    clipped to a max norm of 1.0 before each optimizer step, and the scheduler
    is stepped once per batch. The model is in train mode while predictions are
    counted.

    Only the "input_ids" and "labels" entries of each batch are read.

    :param model: module called as ``model(input_ids=...)``
    :param data_loader: iterable of batches (dicts with "input_ids" and "labels")
    :param loss_fn: callable applied to (sigmoid outputs, float labels)
    :param optimizer: optimizer stepped once per batch
    :param device: device the batch tensors are moved to
    :param scheduler: learning-rate scheduler stepped once per batch
    :param n_examples: denominator used for the accuracy
    :return: ``(accuracy, mean_loss)``; accuracy is a 0-dim double tensor on the
        CPU, and mean_loss is the mean of the per-batch losses (NaN if the
        loader yielded no batches)
    """
    model.train()
    batch_losses = []
    # A tensor accumulator (instead of the int 0) keeps the return expression
    # valid even when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    for batch in tqdm(data_loader):
        input_ids = batch["input_ids"].to(device)
        targets = batch["labels"].to(device).unsqueeze(1)

        outputs = torch.sigmoid(model(input_ids=input_ids))
        # Calculate the predictions by thresholding at 0.5
        preds = (outputs > 0.5).float()

        # loss_fn receives probabilities, not logits
        loss = loss_fn(outputs, targets.float())
        correct_predictions += torch.sum(preds == targets)

        optimizer.zero_grad()
        batch_losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    # np.mean([]) would emit a RuntimeWarning; report NaN explicitly instead.
    mean_loss = np.mean(batch_losses) if batch_losses else float("nan")
    return correct_predictions.cpu().double() / n_examples, mean_loss

from transformers import AdamW
from transformers import get_scheduler


def evaluate(model, data_loader, loss_fn, device, n_examples):
    """
    Score ``model`` on ``data_loader`` without updating it.

    The model is switched to eval mode and gradients are disabled. For every
    batch the model output is passed through a sigmoid, thresholded at 0.5 for
    the prediction count, and scored with ``loss_fn``.

    Only the "input_ids" and "labels" entries of each batch are read.

    :param model: module called as ``model(input_ids=...)``
    :param data_loader: iterable of batches (dicts with "input_ids" and "labels")
    :param loss_fn: callable applied to (sigmoid outputs, float labels)
    :param device: device the batch tensors are moved to
    :param n_examples: denominator used for the accuracy
    :return: ``(accuracy, mean_loss)``; accuracy is a 0-dim double tensor on the
        CPU, and mean_loss is the mean of the per-batch losses (NaN if the
        loader yielded no batches)
    """
    model.eval()
    batch_losses = []
    # A tensor accumulator (instead of the int 0) keeps the return expression
    # valid even when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            targets = batch["labels"].to(device).unsqueeze(1)

            outputs = torch.sigmoid(model(input_ids=input_ids))
            preds = (outputs > 0.5).float()

            # loss_fn receives probabilities, not logits
            loss = loss_fn(outputs, targets.float())
            correct_predictions += torch.sum(preds == targets)
            batch_losses.append(loss.item())

    # np.mean([]) would emit a RuntimeWarning; report NaN explicitly instead.
    mean_loss = np.mean(batch_losses) if batch_losses else float("nan")
    return correct_predictions.cpu().double() / n_examples, mean_loss

def train(model, train_data_loader, val_data_loader, loss_fn, optimizer, device, scheduler, n_examples_train, n_examples_val, n_epochs):
    """
    Train for ``n_epochs`` epochs and collect per-epoch metrics.

    After each training pass the model is re-scored with ``evaluate`` on both
    the training and the validation loader; those scores (not the values
    returned by ``train_epoch``) are what gets recorded and printed.

    :return: dict with the lists "train_accuracies", "train_losses",
        "val_accuracies" and "val_losses", one entry per epoch
    """
    history = {
        "train_accuracies": [],
        "train_losses": [],
        "val_accuracies": [],
        "val_losses": []
    }

    for epoch in range(n_epochs):
        print(f"Epoch {epoch + 1}/{n_epochs}")
        print("-" * 10)

        # The metrics returned by the training pass are intentionally discarded.
        train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, n_examples_train)

        train_accuracy, train_loss = evaluate(model, train_data_loader, loss_fn, device, n_examples_train)
        val_accuracy, val_loss = evaluate(model, val_data_loader, loss_fn, device, n_examples_val)

        epoch_metrics = (
            ("train_accuracies", train_accuracy),
            ("train_losses", train_loss),
            ("val_accuracies", val_accuracy),
            ("val_losses", val_loss),
        )
        for metric_name, metric_value in epoch_metrics:
            history[metric_name].append(metric_value)

        print(f"Train Accuracy: {train_accuracy:.4f}, Train Loss: {train_loss:.4f}")
        print(f"Val Accuracy: {val_accuracy:.4f}, Val Loss: {val_loss:.4f}")

    return history

# Rebuild the loaders with the batch sizes used for this training run.
train_dataloader = DataLoader(train_dataset, 120)
validation_dataloader = DataLoader(validation_dataset, 10)

# Number of batches per pass over each loader.
print(len(train_dataloader))
print(len(validation_dataloader))

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BinaryClassification(config).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 20
# The scheduler is stepped once per batch, so it spans every batch of every epoch.
num_training_steps = epochs * len(train_dataloader)
# BCELoss expects probabilities; the training/evaluation loops apply the sigmoid.
loss_fn = nn.BCELoss().to(device)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Accuracy denominators come from the datasets themselves rather than from a
# re-declared constant, so they cannot drift from the data actually loaded.
history = train(
    model,
    train_dataloader,
    validation_dataloader,
    loss_fn,
    optimizer,
    device,
    lr_scheduler,
    len(train_dataset),
    len(validation_dataset),
    epochs,
)

import matplotlib.pyplot as plt



In [None]:
def plot_metrics(history, title='Denser Attention Bert 512'):
    """
    Plot per-epoch accuracy and loss curves side by side.

    :param history: dict with the lists 'train_accuracies', 'train_losses',
        'val_accuracies' and 'val_losses' (one entry per epoch)
    :param title: figure-level title; defaults to the title this notebook
        has always used
    """
    def as_floats(values):
        # Entries may be 0-dim tensors or numpy scalars; plain floats plot
        # the same regardless of where the values came from.
        return [float(value) for value in values]

    train_accuracies = as_floats(history['train_accuracies'])
    train_losses = as_floats(history['train_losses'])
    val_accuracies = as_floats(history['val_accuracies'])
    val_losses = as_floats(history['val_losses'])
    epochs = range(1, len(train_accuracies) + 1)

    fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle(title, fontsize=16, y=1.05)

    # Plot accuracies
    accuracy_ax.plot(epochs, train_accuracies, label='Train', marker='o')
    accuracy_ax.plot(epochs, val_accuracies, label='Validation', marker='o')
    accuracy_ax.set_xlabel('Epochs')
    accuracy_ax.set_ylabel('Accuracy')
    accuracy_ax.legend()
    accuracy_ax.set_title('Training and Validation Accuracy')

    # Plot losses
    loss_ax.plot(epochs, train_losses, label='Train', marker='o')
    loss_ax.plot(epochs, val_losses, label='Validation', marker='o')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()
    loss_ax.set_title('Training and Validation Loss')

    plt.show()

plot_metrics(history)