<a href="https://colab.research.google.com/github/aquibjaved/BitsAndPieces-Computation/blob/main/long_seq_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import torch
import torch.nn as nn
from transformers import DistilBertModel, DistilBertConfig
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizerFast
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import datasets

In [4]:
class SequenceProjector(nn.Module):
    def __init__(self, input_dim, projection_dim, kernel_size=3, stride=1):
        super(SequenceProjector, self).__init__()
        self.conv1d = nn.Conv1d(in_channels=input_dim, out_channels=projection_dim, kernel_size=kernel_size, stride=stride, padding='same')
        self.relu = nn.ReLU()
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)

    def forward(self, x):
        x = x.transpose(1, 2)
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.global_avg_pool(x)
        x = x.squeeze(2)
        return x

class Classifier(nn.Module):
    def __init__(self, projection_dim, num_classes):
        super(Classifier, self).__init__()
        self.embedding_layer = nn.Linear(projection_dim, 768)
        self.position_embeddings = nn.Embedding(1, 768)  # Positional embeddings
        distilbert_config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
        self.distilbert = DistilBertModel(config=distilbert_config)
        for param in self.distilbert.parameters():
            param.requires_grad = False
        self.classifier = nn.Linear(768, num_classes)

    def forward(self, projected_sequence):
        embeddings = self.embedding_layer(projected_sequence)
        positional_embedding = self.position_embeddings(torch.tensor([0], device=embeddings.device))
        embeddings = embeddings + positional_embedding
        outputs = self.distilbert(inputs_embeds=embeddings.unsqueeze(1))
        pooled_output = outputs[0][:, 0, :]
        logits = self.classifier(pooled_output)
        return logits


In [5]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [16]:
import torch

input_seq_dim = 10
projection_dim = 2
projector = SequenceProjector(input_seq_dim, projection_dim)

# Example inputs with different sequence lengths
input1 = torch.randn(32, 100, 768)  # batch_size=32, seq_len=100
input2 = torch.randn(32, 200, 768)  # batch_size=32, seq_len=200
input3 = torch.randn(32, 512, 768) # batch_size=32, seq_len=512

output1 = projector(input1)
output2 = projector(input2)
output3 = projector(input3)

print(output1.shape)  # Output: torch.Size([32, 512])
print(output2.shape)  # Output: torch.Size([32, 512])
print(output3.shape) # Output: torch.Size([32, 512])

RuntimeError: Given groups=1, weight of size [2, 10, 3], expected input[32, 768, 100] to have 10 channels, but got 768 channels instead

In [14]:
output1

tensor([[0.2236, 0.2764, 0.1906,  ..., 0.2081, 0.2185, 0.1807],
        [0.2425, 0.2660, 0.2164,  ..., 0.2458, 0.2917, 0.1963],
        [0.2446, 0.2512, 0.2150,  ..., 0.2129, 0.2323, 0.2093],
        ...,
        [0.2219, 0.2036, 0.1811,  ..., 0.2234, 0.2973, 0.2822],
        [0.3122, 0.2579, 0.1754,  ..., 0.2133, 0.2497, 0.2055],
        [0.2802, 0.3510, 0.2332,  ..., 0.2160, 0.1953, 0.1559]],
       grad_fn=<SqueezeBackward1>)

In [7]:
# Hyperparameters
input_dim = 768
projection_dim = 512
max_len = 512
batch_size = 32
num_classes = 2
num_epochs = 3
learning_rate = 1e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load IMDB dataset
imdb = datasets.load_dataset("imdb")
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Create datasets and dataloaders
train_dataset = TextDataset(imdb['train']['text'], imdb['train']['label'], tokenizer, max_len)
test_dataset = TextDataset(imdb['test']['text'], imdb['test']['label'], tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Initialize model, loss, and optimizer
projector = SequenceProjector(input_dim, projection_dim).to(device)
classifier = Classifier(projection_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(list(projector.parameters()) + list(classifier.embedding_layer.parameters()) + list(classifier.classifier.parameters()) + list(classifier.position_embeddings.parameters()), lr=learning_rate)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [8]:
# import datasets
# import matplotlib.pyplot as plt
# import numpy as np

# imdb = datasets.load_dataset("imdb")

# lengths = []
# for split in imdb:
#     for text in imdb[split]['text']:
#         lengths.append(len(text.split())) # Length including whitespace

# # Plotting the distribution
# plt.figure(figsize=(10, 6))
# plt.hist(lengths, bins=50, color='skyblue', edgecolor='black')  # Adjust bins as needed
# plt.title('Distribution of Text Lengths (including whitespace) in IMDB Dataset')
# plt.xlabel('Text Length')
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)

# # Add some statistics to the plot
# mean_length = np.mean(lengths)
# median_length = np.median(lengths)
# max_length = np.max(lengths)

# plt.axvline(mean_length, color='red', linestyle='dashed', linewidth=1, label=f'Mean: {mean_length:.2f}')
# plt.axvline(median_length, color='green', linestyle='dashed', linewidth=1, label=f'Median: {median_length:.2f}')
# plt.text(max_length*0.7, plt.ylim()[1]*0.9, f'Max: {max_length}', fontsize=10) # Position text appropriately
# plt.legend()
# plt.tight_layout() # Adjust layout to prevent labels from overlapping
# plt.show()

# # Print some statistics
# print(f"Mean length: {mean_length:.2f}")
# print(f"Median length: {median_length:.2f}")
# print(f"Maximum length: {max_length}")
# print(f"Minimum length: {min(lengths)}")

# # Print some percentiles
# percentiles = [1, 5, 10, 25, 50, 75, 90, 95, 99]
# for p in percentiles:
#     print(f"{p}th percentile: {np.percentile(lengths, p)}")

In [9]:
# Training loop
for epoch in range(num_epochs):
    projector.train()
    classifier.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        with torch.no_grad():
            embeddings = classifier.distilbert.embeddings(input_ids).to(device)

        optimizer.zero_grad()
        projected_sequence = projector(embeddings)
        logits = classifier(projected_sequence)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

Epoch 1/3, Loss: 0.8474201560020447
Epoch 2/3, Loss: 0.6944388151168823
Epoch 3/3, Loss: 0.6909394264221191


In [10]:


# Evaluation
projector.eval()
classifier.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)
        with torch.no_grad():
            embeddings = classifier.distilbert.embeddings(input_ids).to(device)
        projected_sequence = projector(embeddings)
        logits = classifier(projected_sequence)
        _, predicted = torch.max(logits.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the model on the test set: {100 * correct // total}%')


Accuracy of the model on the test set: 60%


In [11]:
# Evaluation
projector.eval()
classifier.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)
        with torch.no_grad():
            embeddings = classifier.distilbert.embeddings(input_ids).to(device)
        projected_sequence = projector(embeddings)
        logits = classifier(projected_sequence)
        _, predicted = torch.max(logits.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the model on the test set: {100 * correct // total}%')

Accuracy of the model on the test set: 60%


In [12]:
model = nn.Sequential(projector, classifier)
print(model)

Sequential(
  (0): SequenceProjector(
    (conv1d): Conv1d(768, 512, kernel_size=(3,), stride=(1,), padding=same)
    (relu): ReLU()
    (global_avg_pool): AdaptiveAvgPool1d(output_size=1)
  )
  (1): Classifier(
    (embedding_layer): Linear(in_features=512, out_features=768, bias=True)
    (position_embeddings): Embedding(1, 768)
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): DistilBertSdpaAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=

In [3]:
from transformers import DistilBertTokenizer, DistilBertModel


In [10]:
# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Load the pre-trained DistilBERT model
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# Example input text
text = "Hello, how are you?"

# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt")

# Pass the tokenized input through the model
outputs = model(**inputs)

# Access the output embeddings
last_hidden_states = outputs.last_hidden_state

print(last_hidden_states.shape)

torch.Size([1, 8, 768])


In [5]:
inputs

{'input_ids': tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [12]:
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [11]:
model.embeddings.word_embeddings

Embedding(30522, 768, padding_idx=0)

In [7]:
import torch
from transformers import DistilBertModel, DistilBertConfig

class CustomDistilBertModel(DistilBertModel):
    def __init__(self, config):
        super().__init__(config)
        # Freeze the word_embeddings layer (optional if you're bypassing it)
        self.embeddings.word_embeddings = None

    def forward(self, custom_input, position_ids=None, attention_mask=None, **kwargs):
        """
        Forward pass with custom input.

        Args:
            custom_input (torch.Tensor): Tensor of shape (batch_size, sequence_length, hidden_size)
            position_ids (torch.Tensor): Tensor of shape (batch_size, sequence_length)
            attention_mask (torch.Tensor): Mask for padding tokens.
        """
        # Use position embeddings directly
        if position_ids is None:
            seq_length = custom_input.size(1)
            position_ids = torch.arange(seq_length, dtype=torch.long, device=custom_input.device).unsqueeze(0)

        position_embeddings = self.embeddings.position_embeddings(position_ids)

        # Add position embeddings to the custom input
        embeddings = custom_input + position_embeddings

        # Apply layer normalization and dropout
        embeddings = self.embeddings.LayerNorm(embeddings)
        embeddings = self.embeddings.dropout(embeddings)

        # Pass embeddings through the rest of the model
        return super().forward(inputs_embeds=embeddings, attention_mask=attention_mask, **kwargs)

In [9]:
# Load the configuration for DistilBERT
config = DistilBertConfig.from_pretrained("distilbert-base-uncased")

# Initialize the custom model
model = CustomDistilBertModel(config)

# Example input tensor of shape (batch_size=1, sequence_length=512, hidden_size=768)
custom_input = torch.randn(2, 512, 768)

# Perform a forward pass
outputs = model(custom_input)
print(outputs.last_hidden_state.shape)  # Expected shape: (1, 512, 768)

torch.Size([2, 512, 768])


In [25]:
import torch
import torch.nn as nn

class SequenceReducer(nn.Module):
    def __init__(self, vocab_size=30522, embed_dim=768, max_seq_len=512, padding_idx=0):
        super(SequenceReducer, self).__init__()
        self.max_seq_len = max_seq_len

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)

        # Linear layer to reduce sequence length dynamically
        self.linear_reduce = nn.Linear(embed_dim, embed_dim)

        # Attention mechanism to focus on important sequences
        self.attention = nn.Linear(embed_dim, 1)

        # Pooling to ensure the output sequence is exactly max_seq_len
        self.pooling = nn.AdaptiveAvgPool2d((max_seq_len, embed_dim))

    def forward(self, input_ids):
        """
        Args:
            input_ids (torch.Tensor): Tensor of shape (batch_size, sequence_length).
        Returns:
            torch.Tensor: Tensor of shape (batch_size, max_seq_len, embed_dim).
        """
        # Embedding lookup
        embeddings = self.embedding(input_ids)  # Shape: (batch_size, seq_len, embed_dim)
        print("Embedding: ",embeddings.shape)

        # Compute attention scores
        attention_scores = self.attention(embeddings)  # Shape: (batch_size, seq_len, 1)
        attention_weights = torch.softmax(attention_scores, dim=1)  # Normalize scores
        print(attention_weights.shape)

        # Apply attention weights
        attended_embeddings = embeddings * attention_weights  # Shape: (batch_size, seq_len, embed_dim)
        print(attended_embeddings.shape)

        # Reduce dimensionality and focus on important parts
        reduced_embeddings = self.linear_reduce(attended_embeddings)  # Shape: (batch_size, seq_len, embed_dim)
        print(reduced_embeddings.shape)

        # Pooling to ensure the output sequence is exactly max_seq_len
        output = self.pooling(reduced_embeddings.transpose(1, 2))  # Shape: (batch_size, embed_dim, max_seq_len)
        print(output.shape)
        output = output.transpose(1, 2)  # Shape: (batch_size, max_seq_len, embed_dim)

        return output


In [27]:
# Example usage
batch_size = 2
seq_len = 3000
vocab_size = 30522
embed_dim = 768
max_seq_len = 512

# Instantiate the model
model = SequenceReducer(vocab_size=vocab_size, embed_dim=embed_dim, max_seq_len=max_seq_len)

# Dummy input
input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))  # Random token IDs

print(input_ids.shape)
# Forward pass
output = model(input_ids)

print(output.shape)  # Expected: (batch_size, max_seq_len, embed_dim)


torch.Size([2, 3000])
Embedding:  torch.Size([2, 3000, 768])
torch.Size([2, 3000, 1])
torch.Size([2, 3000, 768])
torch.Size([2, 3000, 768])
torch.Size([2, 512, 768])
torch.Size([2, 768, 512])


In [32]:
embd = nn.Embedding(30522, 768, padding_idx=0)

In [35]:
embd(torch.tensor([0,2,1])).shape

torch.Size([3, 768])

In [64]:
import torch
import torch.nn as nn
from transformers import DistilBertModel, DistilBertConfig


class MergedDistilBertModel(DistilBertModel):
    def __init__(self, config, vocab_size=30522, embed_dim=768, max_seq_len=512, padding_idx=0):
        super().__init__(config)

        self.max_seq_len = max_seq_len

        # Layers from SequenceReducer
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.linear_reduce = nn.Linear(embed_dim, embed_dim)
        self.attention = nn.Linear(embed_dim, 1)
        self.pooling = nn.AdaptiveAvgPool2d((max_seq_len, embed_dim))

        # Freeze all DistilBERT parameters
        for param in self.transformer.parameters():  # Only freeze transformer weights
            param.requires_grad = False

    def forward(self, input_ids, attention_mask=None, **kwargs):
        """
        Forward pass combining SequenceReducer and CustomDistilBertModel.

        Args:
            input_ids (torch.Tensor): Token IDs of shape (batch_size, seq_len).
            attention_mask (torch.Tensor): Optional attention mask of shape (batch_size, seq_len).

        Returns:
            torch.Tensor: Output embeddings of shape (batch_size, max_seq_len, embed_dim).
        """
        # SequenceReducer functionality
        embeddings = self.embedding(input_ids)  # Shape: (batch_size, seq_len, embed_dim)
        attention_scores = self.attention(embeddings)  # Shape: (batch_size, seq_len, 1)
        attention_weights = torch.softmax(attention_scores, dim=1)  # Normalize scores
        attended_embeddings = embeddings * attention_weights  # Shape: (batch_size, seq_len, embed_dim)
        reduced_embeddings = self.linear_reduce(attended_embeddings)  # Shape: (batch_size, seq_len, embed_dim)
        pooled_embeddings = self.pooling(reduced_embeddings.transpose(1, 2))  # Shape: (batch_size, max_seq_len, embed_dim)

        # Pass reduced embeddings through DistilBERT (CustomDistilBertModel functionality)
        position_ids = torch.arange(pooled_embeddings.size(1), dtype=torch.long, device=pooled_embeddings.device).unsqueeze(0)
        position_embeddings = self.embeddings.position_embeddings(position_ids)

        # Apply position embeddings and normalization/dropout layers
        embeddings = pooled_embeddings + position_embeddings
        embeddings = self.embeddings.LayerNorm(embeddings)
        embeddings = self.embeddings.dropout(embeddings)

        # Return through DistilBERT forward
        return super().forward(inputs_embeds=embeddings, attention_mask=attention_mask, **kwargs)


In [65]:
# Load the configuration for DistilBERT
config = DistilBertConfig.from_pretrained("distilbert-base-uncased")

# Instantiate the model
model = MergedDistilBertModel(config)


# Example input
batch_size = 2
seq_len = 700  # Input can be longer than max_seq_len (512)
vocab_size = 30522

# Generate valid input IDs
input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))

# Forward pass
outputs = model(input_ids)
print(outputs.last_hidden_state.shape)  # Expected: (batch_size, 512, 768)


torch.Size([2, 512, 768])


In [66]:
print("Trainable Parameters and Layers:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Layer: {name} | Size: {param.size()} | Requires Grad: {param.requires_grad}")

Trainable Parameters and Layers:
Layer: embeddings.word_embeddings.weight | Size: torch.Size([30522, 768]) | Requires Grad: True
Layer: embeddings.position_embeddings.weight | Size: torch.Size([512, 768]) | Requires Grad: True
Layer: embeddings.LayerNorm.weight | Size: torch.Size([768]) | Requires Grad: True
Layer: embeddings.LayerNorm.bias | Size: torch.Size([768]) | Requires Grad: True
Layer: embedding.weight | Size: torch.Size([30522, 768]) | Requires Grad: True
Layer: linear_reduce.weight | Size: torch.Size([768, 768]) | Requires Grad: True
Layer: linear_reduce.bias | Size: torch.Size([768]) | Requires Grad: True
Layer: attention.weight | Size: torch.Size([1, 768]) | Requires Grad: True
Layer: attention.bias | Size: torch.Size([1]) | Requires Grad: True
