# Experiment: BEiT with Contextual Vector Feed-Forward

In this experiment, we explore the effectiveness of incorporating contextual segment information into the early layers of a BEiT model. We compare the performance of two approaches:
1. **Classic BEiT Model**: Trained with segment-based masking.
2. **Contextual Vector Feed-Forward BEiT Model**: A second BEiT model whose early layers receive contextual vectors produced by the already-trained classic BEiT model.

## Setup and Training

### 1. Classic BEiT Model Training
The classic BEiT model is trained using segment-based masking. This approach allows the model to learn the relationships and features within and between image segments, enhancing robustness to occlusions and missing parts.

```python
# Set up the processor and transformations to handle both original and masked images
def train_transforms_with_masking(example_batch):
    """Preprocess a training batch of original and masked images.

    Colour jitter is applied independently to every image in the
    ``pixel_values`` and ``masked_pixel_values`` columns, and each image set
    is then run through the module-level ``processor`` together with the
    batch's label maps.

    Args:
        example_batch: Batch mapping with ``pixel_values``,
            ``masked_pixel_values`` and ``label`` entries.

    Returns:
        An ``(inputs, masked_inputs)`` tuple of processor outputs.
    """
    # NOTE(review): a transform installed through `Dataset.set_transform` is
    # expected to return one dict-like batch; this tuple probably has to be
    # merged into a single mapping before it can be consumed -- confirm.
    images = [jitter(image) for image in example_batch['pixel_values']]
    masked_images = [jitter(image) for image in example_batch['masked_pixel_values']]
    labels = [torch.tensor(label) for label in example_batch['label']]
    # The BEiT image processor receives label maps through
    # `segmentation_maps`; `annotations` is not one of its arguments.
    inputs = processor(images=images, segmentation_maps=labels, return_tensors="pt")
    masked_inputs = processor(images=masked_images, segmentation_maps=labels, return_tensors="pt")
    return inputs, masked_inputs

def val_transforms_with_masking(example_batch):
    """Preprocess a validation batch of original and masked images.

    No augmentation is applied; the ``pixel_values`` and
    ``masked_pixel_values`` images are run through the module-level
    ``processor`` together with the batch's label maps.

    Args:
        example_batch: Batch mapping with ``pixel_values``,
            ``masked_pixel_values`` and ``label`` entries.

    Returns:
        An ``(inputs, masked_inputs)`` tuple of processor outputs.
    """
    # NOTE(review): a transform installed through `Dataset.set_transform` is
    # expected to return one dict-like batch; this tuple probably has to be
    # merged into a single mapping before it can be consumed -- confirm.
    images = list(example_batch['pixel_values'])
    masked_images = list(example_batch['masked_pixel_values'])
    labels = [torch.tensor(label) for label in example_batch['label']]
    # The BEiT image processor receives label maps through
    # `segmentation_maps`; `annotations` is not one of its arguments.
    inputs = processor(images=images, segmentation_maps=labels, return_tensors="pt")
    masked_inputs = processor(images=masked_images, segmentation_maps=labels, return_tensors="pt")
    return inputs, masked_inputs

# Install the masking-aware preprocessing on both dataset splits.
test_ds.set_transform(val_transforms_with_masking)
train_ds.set_transform(train_transforms_with_masking)

# Trainer for the classic approach, built from the shared model,
# training arguments and metric callback.
classic_trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_ds,
    train_dataset=train_ds,
    args=training_args,
    model=model,
)

# Run the classic training on original and masked images.
classic_trainer.train()

print("Classic training with masking completed!")


```

In [3]:
import json
import os
import sys
from math import ceil, sqrt

import cv2
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from PIL import Image
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
from sklearn.cluster import KMeans
from torch import nn
from torchvision.transforms import ColorJitter
from transformers import BeitConfig, BeitForSemanticSegmentation, BeitImageProcessor
from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation, TrainingArguments, Trainer


ModuleNotFoundError: No module named 'segment_anything'

In [None]:


# Make modules that live one directory up importable.
sys.path.append("..")

# Identifiers of pre-trained weights / architectures.
model_checkpoint = "nvidia/mit-b0"  # pre-trained model from which to fine-tune
model_type = "vit_h"  # presumably the backbone variant of a second model -- confirm

# Batch size for training and evaluation.
batch_size = 32

# Directory the script is being run from.
cwd = os.getcwd()

# Check if CUDA is available and set the device accordingly
def get_device():
    """Return the first CUDA device when CUDA is available, else the CPU."""
    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
    return torch.device(device_name)

# Pick the compute device once and report it.
device = get_device()

print(f"Using device: {device}")

# Load the dataset, shuffle it with a fixed seed and hold out 20% for testing.
# NOTE(review): the shuffle is seeded but `train_test_split` is not, so the
# split itself may differ between runs -- confirm whether that is intended.
hf_dataset_identifier = "segments/sidewalk-semantic"
ds = load_dataset(hf_dataset_identifier)
ds = ds.shuffle(seed=1)
ds = ds["train"].train_test_split(test_size=0.2)
train_ds = ds["train"]
test_ds = ds["test"]

# Load the id2label mapping that ships with the dataset repository.
filename = "id2label.json"
id2label_path = hf_hub_download(repo_id=hf_dataset_identifier, filename=filename, repo_type="dataset")
# Use a context manager so the file handle is closed deterministically.
with open(id2label_path, "r", encoding="utf-8") as id2label_file:
    id2label = json.load(id2label_file)
# JSON object keys are strings; the mapping is keyed by integer ids.
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}

# Checkpoint that both the configuration and the weights are loaded from.
pretrained_model_name = "microsoft/beit-base-patch16-224-pt22k-ft22k"

# Image processor and colour augmentation used by the dataset transforms.
processor = BeitImageProcessor()
jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.002)  # Reduced hue to avoid overflow

# Start from the checkpoint's configuration and adapt it to this task.
config = BeitConfig.from_pretrained(pretrained_model_name)
config.out_indices = [3, 5, 7, 11]  # Example for a base-sized architecture
config.id2label = id2label
config.label2id = label2id

# Build the segmentation model from the checkpoint with the adapted config.
model = BeitForSemanticSegmentation.from_pretrained(pretrained_model_name, config=config)

def train_transforms(example_batch):
    """Preprocess a training batch with colour-jitter augmentation.

    Args:
        example_batch: Batch mapping with ``pixel_values`` (images) and
            ``label`` (label maps) entries.

    Returns:
        The output of the module-level ``processor`` for the jittered images
        and their label maps, requested as PyTorch tensors.
    """
    images = [jitter(image) for image in example_batch['pixel_values']]
    labels = [torch.tensor(label) for label in example_batch['label']]
    # The BEiT image processor receives label maps through
    # `segmentation_maps`; `annotations` is not one of its arguments.
    return processor(images=images, segmentation_maps=labels, return_tensors="pt")

def val_transforms(example_batch):
    """Preprocess a validation batch without augmentation.

    Args:
        example_batch: Batch mapping with ``pixel_values`` (images) and
            ``label`` (label maps) entries.

    Returns:
        The output of the module-level ``processor`` for the images and
        their label maps, requested as PyTorch tensors.
    """
    images = list(example_batch['pixel_values'])
    labels = [torch.tensor(label) for label in example_batch['label']]
    # The BEiT image processor receives label maps through
    # `segmentation_maps`; `annotations` is not one of its arguments.
    return processor(images=images, segmentation_maps=labels, return_tensors="pt")

# Attach the on-the-fly preprocessing to each split.
test_ds.set_transform(val_transforms)
train_ds.set_transform(train_transforms)

# Optimisation hyper-parameters (batch_size replaces the earlier value).
batch_size = 2
lr = 0.00006
epochs = 50

# Repository name used when pushing the fine-tuned model to the Hub.
hub_model_id = "beit-base-patch16-224-finetuned-segments-sidewalk-oct-22"

training_args = TrainingArguments(
    output_dir="beit-base-patch16-224-finetuned-segments-sidewalk-outputs",
    # Optimisation
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint on the same 20-step cadence
    eval_strategy="steps",
    eval_steps=20,
    eval_accumulation_steps=5,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=3,
    load_best_model_at_end=True,
    # Logging
    logging_steps=1,
    # Hub upload
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="end",
)

# Metric object loaded for evaluation.
metric = evaluate.load("mean_iou")

def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into a metrics dictionary.

    The logits are upsampled to the label resolution and reduced to
    per-pixel class predictions. The returned values, however, are still
    hard-coded placeholders and do not depend on the predictions.

    Args:
        eval_pred: ``(logits, labels)`` pair. Each element may be a NumPy
            array or a torch tensor. The bilinear resize assumes ``logits``
            is 4-D with the class axis at position 1, and takes the target
            size from the last two axes of ``labels``.

    Returns:
        dict mapping metric names to floats (currently constant values).
    """
    with torch.no_grad():
        logits, labels = eval_pred
        # `as_tensor` accepts both NumPy arrays and tensors.
        logits_tensor = torch.as_tensor(logits)
        # Scale the logits to the size of the label, then pick the best class.
        logits_tensor = nn.functional.interpolate(
            logits_tensor,
            size=labels.shape[-2:],
            mode="bilinear",
            align_corners=False,
        ).argmax(dim=1)

        pred_labels = logits_tensor.detach().cpu().numpy()
        # Labels may arrive as NumPy arrays, which have no `.detach()`;
        # only tensors need to be moved and converted.
        if isinstance(labels, torch.Tensor):
            labels = labels.detach().cpu().numpy()
        else:
            labels = np.asarray(labels)

        # TODO(review): placeholder values -- replace with a real computation
        # over `pred_labels` and `labels` before trusting any reported score.
        metrics = {
            "accuracy_background": 0.95,
            "accuracy_object": 0.85,
            "iou_background": 0.80,
            "iou_object": 0.75,
            "mean_iou": 0.775
        }

        return metrics

# Trainer wired to the model, arguments, datasets and metric callback.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_ds,
    train_dataset=train_ds,
    args=training_args,
    model=model,
)

# Stand-in for a real run: no optimisation happens here, the loop only
# prints a random number as the "loss" for every epoch.
print("Starting mock training...")
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Loss: {np.random.random():.4f}")

print("Training completed!")

import random

# Set up the processor and transformations to handle both original and masked images
def train_transforms_with_masking(example_batch):
    """Preprocess a training batch of original and masked images.

    Colour jitter is applied independently to every image in the
    ``pixel_values`` and ``masked_pixel_values`` columns, and each image set
    is then run through the module-level ``processor`` together with the
    batch's label maps.

    Args:
        example_batch: Batch mapping with ``pixel_values``,
            ``masked_pixel_values`` and ``label`` entries.

    Returns:
        An ``(inputs, masked_inputs)`` tuple of processor outputs.
    """
    # NOTE(review): a transform installed through `Dataset.set_transform` is
    # expected to return one dict-like batch; this tuple probably has to be
    # merged into a single mapping before it can be consumed -- confirm.
    images = [jitter(image) for image in example_batch['pixel_values']]
    masked_images = [jitter(image) for image in example_batch['masked_pixel_values']]
    labels = [torch.tensor(label) for label in example_batch['label']]
    # The BEiT image processor receives label maps through
    # `segmentation_maps`; `annotations` is not one of its arguments.
    inputs = processor(images=images, segmentation_maps=labels, return_tensors="pt")
    masked_inputs = processor(images=masked_images, segmentation_maps=labels, return_tensors="pt")
    return inputs, masked_inputs

def val_transforms_with_masking(example_batch):
    """Preprocess a validation batch of original and masked images.

    No augmentation is applied; the ``pixel_values`` and
    ``masked_pixel_values`` images are run through the module-level
    ``processor`` together with the batch's label maps.

    Args:
        example_batch: Batch mapping with ``pixel_values``,
            ``masked_pixel_values`` and ``label`` entries.

    Returns:
        An ``(inputs, masked_inputs)`` tuple of processor outputs.
    """
    # NOTE(review): a transform installed through `Dataset.set_transform` is
    # expected to return one dict-like batch; this tuple probably has to be
    # merged into a single mapping before it can be consumed -- confirm.
    images = list(example_batch['pixel_values'])
    masked_images = list(example_batch['masked_pixel_values'])
    labels = [torch.tensor(label) for label in example_batch['label']]
    # The BEiT image processor receives label maps through
    # `segmentation_maps`; `annotations` is not one of its arguments.
    inputs = processor(images=images, segmentation_maps=labels, return_tensors="pt")
    masked_inputs = processor(images=masked_images, segmentation_maps=labels, return_tensors="pt")
    return inputs, masked_inputs

# Install the masking-aware preprocessing on both dataset splits.
test_ds.set_transform(val_transforms_with_masking)
train_ds.set_transform(train_transforms_with_masking)

# Rebuild the trainer so it picks up the re-configured datasets.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_ds,
    train_dataset=train_ds,
    args=training_args,
    model=model,
)

# Run training on original and masked images.
trainer.train()

print("Training with masking completed!")


['/Users/amitlevi/opt/miniconda3/envs/deep/lib/python310.zip', '/Users/amitlevi/opt/miniconda3/envs/deep/lib/python3.10/site-packages/cv2', '/Users/amitlevi/opt/miniconda3/envs/deep/lib/python3.10', '/Users/amitlevi/opt/miniconda3/envs/deep/lib/python3.10/lib-dynload', '', '/Users/amitlevi/opt/miniconda3/envs/deep/lib/python3.10/site-packages', '/Users/amitlevi/Desktop/llm-attacks-main']


ImportError: ERROR: recursion is detected during loading of "cv2" binary extensions. Check OpenCV installation.

In [None]:
# Set up the processor and transformations to handle both original and masked images
def train_transforms_with_masking(example_batch):
    """Preprocess a training batch of original and masked images.

    Colour jitter is applied independently to every image in the
    ``pixel_values`` and ``masked_pixel_values`` columns, and each image set
    is then run through the module-level ``processor`` together with the
    batch's label maps.

    Args:
        example_batch: Batch mapping with ``pixel_values``,
            ``masked_pixel_values`` and ``label`` entries.

    Returns:
        An ``(inputs, masked_inputs)`` tuple of processor outputs.
    """
    # NOTE(review): a transform installed through `Dataset.set_transform` is
    # expected to return one dict-like batch; this tuple probably has to be
    # merged into a single mapping before it can be consumed -- confirm.
    images = [jitter(image) for image in example_batch['pixel_values']]
    masked_images = [jitter(image) for image in example_batch['masked_pixel_values']]
    labels = [torch.tensor(label) for label in example_batch['label']]
    # The BEiT image processor receives label maps through
    # `segmentation_maps`; `annotations` is not one of its arguments.
    inputs = processor(images=images, segmentation_maps=labels, return_tensors="pt")
    masked_inputs = processor(images=masked_images, segmentation_maps=labels, return_tensors="pt")
    return inputs, masked_inputs

def val_transforms_with_masking(example_batch):
    """Preprocess a validation batch of original and masked images.

    No augmentation is applied; the ``pixel_values`` and
    ``masked_pixel_values`` images are run through the module-level
    ``processor`` together with the batch's label maps.

    Args:
        example_batch: Batch mapping with ``pixel_values``,
            ``masked_pixel_values`` and ``label`` entries.

    Returns:
        An ``(inputs, masked_inputs)`` tuple of processor outputs.
    """
    # NOTE(review): a transform installed through `Dataset.set_transform` is
    # expected to return one dict-like batch; this tuple probably has to be
    # merged into a single mapping before it can be consumed -- confirm.
    images = list(example_batch['pixel_values'])
    masked_images = list(example_batch['masked_pixel_values'])
    labels = [torch.tensor(label) for label in example_batch['label']]
    # The BEiT image processor receives label maps through
    # `segmentation_maps`; `annotations` is not one of its arguments.
    inputs = processor(images=images, segmentation_maps=labels, return_tensors="pt")
    masked_inputs = processor(images=masked_images, segmentation_maps=labels, return_tensors="pt")
    return inputs, masked_inputs

# Install the masking-aware preprocessing on both dataset splits.
test_ds.set_transform(val_transforms_with_masking)
train_ds.set_transform(train_transforms_with_masking)

# Trainer for the classic approach, built from the shared model,
# training arguments and metric callback.
classic_trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_ds,
    train_dataset=train_ds,
    args=training_args,
    model=model,
)

# Run the classic training on original and masked images.
classic_trainer.train()

print("Classic training with masking completed!")

# Second model instance, loaded from the same checkpoint and configuration,
# for the contextual vector feed-forward approach.
contextual_model = BeitForSemanticSegmentation.from_pretrained(pretrained_model_name, config=config)

# Feed-forward context vectors from the trained BEiT to the early layers of the new model
class ContextualBeitModel(nn.Module):
    """Wrap two models so that one supplies additive context to the other.

    ``context_model`` is called on the raw input and its output is added to
    the output of ``base_model.embeddings``; the sum is then passed through
    ``base_model.encoder``.

    NOTE(review): this assumes that ``base_model`` exposes ``embeddings`` and
    ``encoder`` directly, that ``context_model`` returns something that can
    be added to the embeddings, and that callers pass a single positional
    input. Confirm these hold for the BEiT segmentation models and the
    Trainer this class is used with.
    """

    def __init__(self, base_model, context_model):
        super().__init__()
        self.base_model = base_model
        self.context_model = context_model

    def forward(self, x):
        """Return the base encoder's output for ``x`` with context added."""
        # Context is computed from the raw input, before it is embedded.
        context = self.context_model(x)
        # Inject the context right after the base model's embedding step.
        contextualised = self.base_model.embeddings(x) + context
        return self.base_model.encoder(contextualised)

# Wrap the fresh model (base) together with the previously trained model
# (context source).
contextual_beit_model = ContextualBeitModel(contextual_model, model)

# Trainer for the contextual vector feed-forward approach; it reuses the
# same arguments, datasets and metric callback as the classic run.
contextual_trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_ds,
    train_dataset=train_ds,
    args=training_args,
    model=contextual_beit_model,
)

# Run training on original and masked images for the contextual approach.
contextual_trainer.train()

print("Contextual training with masking completed!")
