In [2]:
import torch

def swap_top_two_softmax(tensor, dim=1):
    """Return a copy of ``tensor`` with its two largest entries along ``dim`` exchanged.

    For every slice taken along ``dim`` the largest value is written to the
    position of the second-largest value and vice versa; all other entries are
    left as they are.

    Args:
        tensor: Tensor of any rank with at least two entries along ``dim``
            (for example softmax probabilities shaped ``(batch, classes)``).
        dim: Dimension along which the top two values are searched and
            swapped. Defaults to 1, which matches the ``(batch, classes)``
            layout.

    Returns:
        A new tensor with the same shape as ``tensor``. The input tensor is
        not modified.

    Note:
        Errors raised by ``torch.topk`` (for instance when ``dim`` holds fewer
        than two entries) are propagated unchanged. When several entries tie,
        which of them ``torch.topk`` selects decides which positions are
        swapped.
    """
    # topk returns the values in descending order: slot 0 is the maximum,
    # slot 1 the runner-up.
    top2_values, top2_indices = torch.topk(tensor, 2, dim=dim)

    # Writing the reversed pair back at the same two positions performs the
    # swap for every slice at once. The out-of-place scatter returns a fresh
    # tensor and needs no explicit batch index, so it works for any rank.
    return tensor.scatter(dim, top2_indices, top2_values.flip(dim))

# Example usage: swap the two largest entries in each row of a 2 x 4 batch.
# Row 0: the top two are 0.4 (index 2) and 0.3 (index 1).
# Row 1: the maximum is 0.5 (index 0); the runner-up value 0.2 occurs at
# indices 1 and 3, so which of them gets swapped depends on how torch.topk
# breaks the tie.
tensor = torch.tensor([[0.1, 0.3, 0.4, 0.2],
                       [0.5, 0.2, 0.1, 0.2]])
swapped_tensor = swap_top_two_softmax(tensor)
print(swapped_tensor)

tensor([[0.1000, 0.4000, 0.3000, 0.2000],
        [0.2000, 0.5000, 0.1000, 0.2000]])


In [23]:
import torch
import torch.nn.functional as F
import math

def scaled_dot_product_attention(q, k, v):
    """
    Projection-free scaled dot-product attention over row vectors.

    Each row of ``q`` is dotted with the matching row of ``k`` (element-wise
    product summed over the last axis) and divided by ``sqrt(d_k)``, giving
    one score per row. The scores are softmax-normalised over their last
    axis; for 2-D inputs that is the batch axis, so the weights sum to 1
    across the batch rather than per sample. Each row of ``v`` is then scaled
    by its weight.

    Args:
    q, k, v: Query, Key, and Value tensors, each with shape (batch_size, activation_dim)

    Returns:
    output: Tensor with shape (batch_size, activation_dim)
    """
    # Scale by the square root of the key width (size of the last axis of k)
    scale = math.sqrt(k.size(-1))

    # Row-wise dot product: (batch_size, activation_dim) -> (batch_size,)
    row_scores = (q * k).sum(dim=-1) / scale

    # One weight per row, normalised across all rows
    row_weights = row_scores.softmax(dim=-1)

    # Broadcast each row's weight over that row of v
    return row_weights[..., None] * v

# Example usage on random inputs. torch.randn draws from the global RNG and no
# seed is set in this cell, so the printed values differ from run to run.
batch_size, activation_dim = 2, 64
# Drawn in the order q, k, v
q, k, v = (torch.randn(batch_size, activation_dim) for _ in range(3))

output = scaled_dot_product_attention(q, k, v)
print("Input shapes:", q.shape, k.shape, v.shape)
print("Output shape:", output.shape)
print("Output:")
print(output)

# Sanity check: the last dimension of the output equals that of the inputs
print("\nVerifying output dimensions:")
print(f"Input activation dimension: {activation_dim}")
print(f"Output activation dimension: {output.size(-1)}")
assert output.size(-1) == activation_dim, "Output activation dimension should match input"
print("Dimensions match as expected.")

Input shapes: torch.Size([2, 64]) torch.Size([2, 64]) torch.Size([2, 64])
Output shape: torch.Size([2, 64])
Output:
tensor([[-0.3741,  0.6878,  1.1753,  0.8591, -0.0773, -0.0165,  1.0235, -1.3860,
          1.1682, -0.6247, -1.5720, -0.3600,  0.9425, -0.8259,  0.4667,  0.3603,
         -0.3464, -0.0323, -0.1608,  1.2571,  0.2163, -0.2197,  1.0912, -0.0541,
         -0.3632, -0.3523,  0.1803,  0.3945, -0.3542, -0.0615,  0.7323,  0.3620,
         -0.3932,  0.3974, -0.3008, -0.2432,  0.1163,  0.3527, -0.5179, -0.0759,
         -0.2745,  0.1467,  0.8392,  1.6388, -0.1126,  0.8270, -0.3225,  0.1135,
          0.2199, -0.0473, -0.0572, -0.5722,  0.0138, -0.5974,  0.5761, -0.9230,
         -0.5259,  0.0579, -0.0464,  1.2410, -1.0072, -0.0061,  0.0612, -0.7069],
        [ 0.2583,  0.2331, -0.3502,  0.1184, -0.0344,  0.3198, -0.6052, -0.5714,
         -0.2127, -0.8012, -0.5001, -0.2902, -0.4621, -0.6303,  0.4309, -0.2691,
          0.0876,  0.2833,  0.1570,  0.4544, -0.1433,  0.7034, -0.0511, -

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention with learned query/key/value projections.

    ``query``, ``key`` and ``value`` are each passed through their own linear
    layer. The projected queries are matched against the projected keys, the
    scores are scaled by ``1 / sqrt(dim_k)`` and softmax-normalised over the
    last axis, and the result weights the projected values.

    Args:
        dim_in: Size of the last dimension of the inputs.
        dim_k: Output size of the query and key projections.
        dim_v: Output size of the value projection (and of the result).
    """

    def __init__(self, dim_in, dim_k, dim_v):
        super().__init__()
        # Projections are created in the order q, k, v.
        self.q = nn.Linear(dim_in, dim_k)
        self.k = nn.Linear(dim_in, dim_k)
        self.v = nn.Linear(dim_in, dim_v)
        self.scale = 1 / math.sqrt(dim_k)

    def forward(self, query, key, value):
        """Return the attention-weighted values; the last dimension is ``dim_v``."""
        projected_q, projected_k, projected_v = self.q(query), self.k(key), self.v(value)

        # Similarity of every query row with every key row, then scaling
        similarity = (projected_q @ projected_k.transpose(-2, -1)) * self.scale

        # Normalise over the key axis and mix the values accordingly
        weights = similarity.softmax(dim=-1)
        return weights @ projected_v

class LocalContrastiveAttentionLayer(nn.Module):
    """Fuse forward, lateral and backward activations through three attention blocks.

    The lateral activation is the query for all three blocks: it attends to
    itself (self-attention) and to the forward and backward activations
    (cross-attention, with each of them used as both key and value). The three
    results are concatenated along the last axis in the order forward,
    lateral, backward, projected back to ``feature_dim`` and passed through a
    leaky ReLU.

    Args:
        feature_dim: Size of the last dimension of the activations and of the output.
        key_dim: Query/key projection size handed to each attention block.
        value_dim: Value projection size handed to each attention block.
    """

    def __init__(self, feature_dim, key_dim, value_dim):
        super().__init__()
        self.feature_dim, self.key_dim, self.value_dim = feature_dim, key_dim, value_dim

        # All three attention blocks share the same dimensions; they are
        # created in the order lateral, forward, backward.
        attention_dims = (feature_dim, key_dim, value_dim)
        self.lateral_attention = ScaledDotProductAttention(*attention_dims)
        self.forward_attention = ScaledDotProductAttention(*attention_dims)
        self.backward_attention = ScaledDotProductAttention(*attention_dims)

        # Maps the three concatenated value_dim-wide results back to feature_dim
        self.output_projection = nn.Linear(3 * value_dim, feature_dim)

    def forward(self, forward_act, backward_act, lateral_act):
        """Return the fused activation; the last dimension is ``feature_dim``."""
        # Listed in concatenation order: forward, lateral, backward
        attended = (
            self.forward_attention(lateral_act, forward_act, forward_act),
            self.lateral_attention(lateral_act, lateral_act, lateral_act),
            self.backward_attention(lateral_act, backward_act, backward_act),
        )
        fused = torch.cat(attended, dim=-1)
        return F.leaky_relu(self.output_projection(fused))

# Example usage on random activations (no seed is set in this cell, so the
# values change between runs; only the shapes are printed).
batch_size, feature_dim = 32, 64
key_dim, value_dim = 32, 32  # You can adjust these dimensions

# Drawn in the order forward, backward, lateral
forward_act, backward_act, lateral_act = (
    torch.randn(batch_size, feature_dim) for _ in range(3)
)

layer = LocalContrastiveAttentionLayer(feature_dim, key_dim, value_dim)
output = layer(forward_act, backward_act, lateral_act)

print("Input shapes:", forward_act.shape, backward_act.shape, lateral_act.shape)
print("Output shape:", output.shape)

Input shapes: torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 64])
Output shape: torch.Size([32, 64])


In [5]:
# Updated function without prints and assuming label data is passed directly
def is_confident(softmax_output, correct_labels, confidence_threshold=0.90):
    """Report the mean probability given to the correct class and whether it clears a threshold.

    Args:
        softmax_output: Tensor of shape (batch_size, num_classes) holding
            class probabilities.
        correct_labels: One-hot labels of shape (batch_size, num_classes);
            the correct class of each row is taken as the argmax of that row.
        confidence_threshold: Value the mean confidence must strictly exceed.
            Defaults to 0.90.

    Returns:
        A tuple ``(avg_confidence, is_above)`` of two 0-dim tensors: the mean
        probability assigned to the correct class over the batch, and a
        boolean tensor that is True when that mean is greater than
        ``confidence_threshold``.
    """
    # Convert one-hot encoded labels to class indices: [batch_size]
    correct_indices = torch.argmax(correct_labels, dim=1)

    # Pick the probability of the correct class in every row. Only the
    # gathered column is squeezed so the result is always [batch_size].
    correct_class_probs = softmax_output.gather(1, correct_indices.unsqueeze(1)).squeeze(1)

    avg_confidence = torch.mean(correct_class_probs)

    return avg_confidence, avg_confidence > confidence_threshold

# Sample scenario: three examples over four classes. The probabilities of the
# labelled classes are 0.6, 0.8 and 0.7, so their mean is 0.7.

# Simulated softmax output (batch_size = 3, softmax_dim = 4)
softmax_output = torch.tensor([
    [0.1, 0.2, 0.6, 0.1],  # Example 1
    [0.05, 0.8, 0.1, 0.05],  # Example 2
    [0.7, 0.15, 0.1, 0.05]   # Example 3
])

# Simulated label data (batch_size = 3, one_hot_dim = 4)
correct_labels = torch.tensor([
    [0, 0, 1, 0],  # Correct class is index 2
    [0, 1, 0, 0],  # Correct class is index 1
    [1, 0, 0, 0]   # Correct class is index 0
])

# Test the function (last expression of the cell, so its value is displayed)
is_confident(softmax_output, correct_labels)


(tensor(0.7000), tensor(False))

In [10]:
import torch
import torch.nn.functional as F

def zero_correct_class_softmax(logits, correct_classes):
    """Softmax over ``logits`` with the correct-class entries suppressed.

    Every position where ``correct_classes`` equals 1 has its logit replaced
    by -1e9 before a softmax over dimension 1, so the probability mass is
    spread over the remaining classes.

    Args:
        logits: Tensor of raw scores; the softmax is taken over dimension 1.
        correct_classes: One-hot mask used to index ``logits``; entries equal
            to 1 mark the positions to suppress.

    Returns:
        The softmax of the masked logits. ``logits`` itself is left untouched.
    """
    suppress = correct_classes == 1

    # Work on a copy so the caller's logits are not modified in place
    masked_logits = logits.clone()
    masked_logits[suppress] = -1e9  # Very large negative value

    return masked_logits.softmax(dim=1)

# Example usage: two samples over three classes. The one-hot rows mark class 1
# as correct for the first sample and class 2 for the second.
logits = torch.tensor([[2.0, 1.5, 0.5], [0.5, 2.0, 1.0]], dtype=torch.float32)
correct_classes = torch.tensor([[0, 1, 0], [0, 0, 1]], dtype=torch.float32)  # One-hot encoded correct classes

result = zero_correct_class_softmax(logits, correct_classes)
# Print the plain softmax first, then the version with the correct class suppressed
print(torch.softmax(logits, dim=1))
print(result)


tensor([[0.5465, 0.3315, 0.1220],
        [0.1402, 0.6285, 0.2312]])
tensor([[0.8176, 0.0000, 0.1824],
        [0.1824, 0.8176, 0.0000]])


In [20]:
import torch

# Updated function without prints and assuming label data is passed directly
def is_confident(softmax_output, correct_labels, confidence_threshold):
    """Summarise how confidently the correct class is predicted across a batch.

    Args:
        softmax_output: Tensor of shape (batch_size, num_classes) holding
            class probabilities.
        correct_labels: One-hot labels of shape (batch_size, num_classes);
            the correct class of each row is taken as the argmax of that row.
        confidence_threshold: Value every correct-class probability must
            strictly exceed.

    Returns:
        A tuple ``(avg_confidence, all_confident)`` of Python scalars: the
        mean probability assigned to the correct class, and whether every
        correct-class probability is greater than ``confidence_threshold``.
    """
    # Column index of the correct class per row, shaped [batch_size, 1] for gather
    target_columns = correct_labels.argmax(dim=1).unsqueeze(1)

    # Probability assigned to the correct class of each example
    target_probs = softmax_output.gather(1, target_columns).squeeze()

    mean_confidence = target_probs.mean().item()
    every_above = (target_probs > confidence_threshold).all().item()
    return mean_confidence, every_above

# Sample scenario: three examples over four classes. The probabilities of the
# labelled classes are 0.6, 0.8 and 0.7 (mean 0.7).

# Simulated softmax output (batch_size = 3, softmax_dim = 4)
softmax_output = torch.tensor([
    [0.1, 0.2, 0.6, 0.1],  # Example 1
    [0.05, 0.8, 0.1, 0.05],  # Example 2
    [0.7, 0.15, 0.1, 0.05]   # Example 3
])

# Simulated label data (batch_size = 3, one_hot_dim = 4)
correct_labels = torch.tensor([
    [0, 0, 1, 0],  # Correct class is index 2
    [0, 1, 0, 0],  # Correct class is index 1
    [1, 0, 0, 0]   # Correct class is index 0
])

# Test the function. With a threshold of 0.8, examples 1 and 3 (0.6 and 0.7)
# fall below it, so not all examples are confident.
confidence_threshold = 0.8
# NOTE: despite its name, conf_probs receives the first returned value, which
# is the average confidence rather than the per-example probabilities.
conf_probs, all_above_threshold = is_confident(softmax_output, correct_labels, confidence_threshold)
conf_probs, all_above_threshold

(0.7000000476837158, False)

In [16]:
import torch

# Function to check the percentage of correct softmax values above a given threshold
def percent_above_threshold(softmax_output, correct_labels, confidence_threshold):
    """Percentage of examples whose correct-class probability exceeds a threshold.

    Args:
        softmax_output: Tensor of shape (batch_size, num_classes) holding
            class probabilities.
        correct_labels: One-hot labels of shape (batch_size, num_classes);
            the correct class of each row is taken as the argmax of that row.
        confidence_threshold: Value a correct-class probability must strictly
            exceed to be counted.

    Returns:
        A Python float in [0, 100]: the share of examples in the batch whose
        correct-class probability is greater than ``confidence_threshold``.
    """
    # Convert one-hot encoded labels to class indices: [batch_size]
    correct_indices = torch.argmax(correct_labels, dim=1)

    # Pick the probability of the correct class in every row. Squeeze only the
    # gathered column: a bare squeeze() would also drop the batch axis for a
    # batch of one, and the size(0) lookup below would then fail.
    correct_class_probs = softmax_output.gather(1, correct_indices.unsqueeze(1)).squeeze(1)

    # Count the examples whose correct-class probability is above the threshold
    num_above_threshold = torch.sum(correct_class_probs > confidence_threshold).item()

    # Express the count as a percentage of the batch size
    return (num_above_threshold / correct_class_probs.size(0)) * 100

# Sample scenario: three examples over four classes. The probabilities of the
# labelled classes are 0.6, 0.8 and 0.7.

# Simulated softmax output (batch_size = 3, softmax_dim = 4)
softmax_output = torch.tensor([
    [0.1, 0.2, 0.6, 0.1],  # Example 1
    [0.05, 0.8, 0.1, 0.05],  # Example 2
    [0.7, 0.15, 0.1, 0.05]   # Example 3
])

# Simulated label data (batch_size = 3, one_hot_dim = 4)
correct_labels = torch.tensor([
    [0, 0, 1, 0],  # Correct class is index 2
    [0, 1, 0, 0],  # Correct class is index 1
    [1, 0, 0, 0]   # Correct class is index 0
])

# Test the function. With a threshold of 0.69, examples 2 and 3 (0.8 and 0.7)
# are above it and example 1 (0.6) is not, i.e. two out of three.
confidence_threshold = 0.69
percent_above = percent_above_threshold(softmax_output, correct_labels, confidence_threshold)
percent_above

66.66666666666666