### Import packages

In [2]:
from transformers import AutoImageProcessor, AutoModelForImageClassification
import torch
import numpy as np
import os
from tqdm import tqdm
from torch.utils.data import DataLoader
from tqdm import tqdm
from torchvision import transforms
from datasets import load_dataset,DatasetDict,load_from_disk
from torch.utils.data import DataLoader
import torch.nn.functional as F
from itertools import islice
import copy
from torchmetrics.retrieval import RetrievalMAP, RetrievalPrecision
import os
import random
import torch
import torch.nn.functional as F
import torch.nn as nn

### Set seed

In [3]:
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x1136a5090>

### Set device

In [4]:
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

### Define main path

In [5]:
main_path = '/Users/aakashagarwal/Downloads/ir_assignment2/'

### Dataset

In [6]:
mydataset = "evanarlian/imagenet_1k_resized_256"

#### Load dataset

In [7]:
ds = load_dataset(mydataset)
ds = ds.rename_column("image", "img")

ds = DatasetDict({
    'train': ds['train'],
    'test': ds['val']
})

Resolving data files:   0%|          | 0/52 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/52 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/46 [00:00<?, ?it/s]

#### Test on small dataset

In [8]:
from datasets import Dataset, concatenate_datasets

def add_index(example, idx):
    example['index'] = idx
    return example

# Define the number of shards
num_shards = 10  # Adjust based on memory constraints and dataset size
batch_size = 1000  # Adjust this based on memory limits

# List to hold processed shard datasets
processed_shards = {split: [] for split in ['train', 'test']}

# Loop through each split and each shard
for split in ['train', 'test']:
    # Calculate shard size
    shard_size = len(ds[split]) // num_shards

    for shard in range(num_shards):
        # Slice the dataset for the current shard
        shard_start = shard * shard_size
        shard_end = shard_start + shard_size

        if shard == num_shards - 1:  # Ensure the last shard includes any remaining examples
            shard_end = len(ds[split])

        # Apply the map function to the current shard
        shard_ds = ds[split].select(range(shard_start, shard_end))
        shard_ds = shard_ds.map(add_index, with_indices=True, batched=True, batch_size=batch_size)

        # Append the processed shard to the list
        processed_shards[split].append(shard_ds)

    # Concatenate all processed shards for the current split
    ds[split] = concatenate_datasets(processed_shards[split])

    # Optionally, save the concatenated split to disk if needed
    # ds_feature[split].save_to_disk(f"{split}_processed")

# After the loop, ds_feature will contain the fully processed 'train' and 'test' splits


In [9]:
ds['train'][100000]

{'img': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=341x256>,
 'label': 77,
 'index': 100000}

### Model load

In [10]:
processor = AutoImageProcessor.from_pretrained("google/vit-large-patch32-384",use_fast=True)
model = AutoModelForImageClassification.from_pretrained("google/vit-large-patch32-384")

### Extract features of training data

#### Feature function

In [11]:
# Set model to evaluation mode
model.eval()

transform = transforms.Compose([
        transforms.Resize((384, 384)),  # Resize images to a consistent size
        transforms.ToTensor(),           # Convert PIL images to tensors
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

# Define a function to process and extract features from each image
def extract_features(batch):
    imgs = batch['img']  # This will now be a list of images in a batch

    # Preprocess each image in the batch
    img_tensors = [transform(img).unsqueeze(0).to(device) for img in imgs]  # Add batch dimension and move to device
    img_tensors = torch.cat(img_tensors, dim=0)  # Concatenate to create a batch tensor

    inputs = {'pixel_values': img_tensors}  # Adjust for model input (assuming using Hugging Face models)

    # Forward pass through the model to get features
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    hidden_states = outputs.hidden_states  # List of all hidden states

    # Extract the final hidden state
    final_hidden_state = hidden_states[-1]  # Shape: [batch_size, num_patches, hidden_size]
    
    # Optionally, pool the features
    pooled_features = final_hidden_state.mean(dim=1)  # Shape: [batch_size, hidden_size]
    
    # Return the features as a new column for the batch
    return {"features": pooled_features.cpu().numpy()}  # 


#### Create features of train data or load from saved file.

In [12]:

output_dir = main_path+ f"features/imagenet1k/"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "vit_large/")  # Saving in Arrow format

# Check if features already exist
if os.path.exists(output_file):
    print(f"Features file already exists at {output_file}. Loading features...")
    # Load existing features
    ds_feature = load_from_disk(output_file)
else:
    print("Features file does not exist. Extracting features...")

    # Use `map` to apply the feature extraction across the entire dataset
    ds_feature = ds.map(extract_features, batched=True, batch_size=64)

    # Save the dataset with the new features to disk
    ds_feature.save_to_disk(output_file)  # Saving in the Arrow format
    print("Features saved successfully!")

Features file already exists at /Users/aakashagarwal/Downloads/ir_assignment2/features/imagenet1k/vit_large/. Loading features...


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

In [13]:
ds_feature = ds_feature.rename_column("labels", "label")
ds_feature

DatasetDict({
    train: Dataset({
        features: ['features', 'label', 'index'],
        num_rows: 1281167
    })
    test: Dataset({
        features: ['features', 'label', 'index'],
        num_rows: 50000
    })
})

#### Collate function

In [14]:
def custom_collate(batch):
    # Define the transformations you want to apply to the images
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # Resize images to a consistent size
        transforms.ToTensor(),           # Convert PIL images to tensors
    ])

    # Extracting each field from the batch
    index = [item['index'] for item in batch]
    # images = [transform(item['img']) for item in batch]  # Load and transform images
    labels = torch.tensor([item['label'] for item in batch])          # Convert labels to a tensor
    features = [torch.tensor(item['features']) for item in batch]     # Convert features to tensors
    
    # Return a dictionary with batched data
    return {
        'index': index,
        # 'img': torch.stack(images),    # Stack image tensors into a single tensor
        'label': labels,
        'features': torch.stack(features),  # Stack feature tensors into a single tensor
    }


#### Dataloader

In [62]:
from torch.utils.data import Subset, DataLoader

# Define the subset sizes
# train_subset_size = 50000
# test_subset_size = 10000

# # Create indices for the subsets
# train_indices = list(range(train_subset_size))
# test_indices = list(range(test_subset_size))

# # Create subsets using the indices
# train_subset = Subset(ds_feature['train'], train_indices)
# test_subset = Subset(ds_feature['test'], test_indices)

# Create dataloaders for the subsets
train_dataloader = DataLoader(ds_feature['train'], batch_size=1000, shuffle=False, collate_fn=custom_collate)
test_dataloader = DataLoader(ds_feature['test'], batch_size=64, shuffle=False, collate_fn=custom_collate)


In [63]:
torch.tensor(ds_feature['train'][1:3]['features']).shape

torch.Size([2, 1024])

In [64]:
import torch
from tqdm import tqdm

# Prepare a list to store cosine similarities
cosine_similarities = []

# Function to compute cosine similarities for a single test example
def add_cosine_similarities(example):
    test_features = example['features']  # Assuming 'features' key holds the features
    # Convert to tensor
    test_tensor = torch.tensor(test_features).detach().clone()
    
    # Iterate through the training batches
    for train_batch in tqdm(train_dataloader):
        train_features = train_batch['features']  # Assuming 'features' key holds the features
        
        # Convert to tensor
        train_tensor = torch.tensor(train_features).detach().clone()
        
        # Compute cosine similarities between the current test batch and current train batch
        cosine = torch.matmul(test_tensor, train_tensor.T)
        example['cosine_similarities'] = cosine
    
    return example  # Return the modified example



In [None]:
# Apply the function to each test sample, using with_indices to track batch position
ds_feature['test'] = ds_feature['test'].map(add_cosine_similarities, batched=True, batch_size=25000)

### Random Hyperlane Definition

#### Hash hyperparameters

In [40]:
hash_length = 14
num_hash_table = 5
feature_length = len(ds_feature['train'][0]['features'])

#### Random hyperplanes function

In [41]:
def random_hyperplane(num_hash_table,hash_length,feature_length):
    hyperplanes = []
    
    for i in range(num_hash_table):
        hyperplane = (torch.randn(feature_length,hash_length) - 0.5).to(device)
        hyperplanes.append(hyperplane)
    
    return hyperplanes
        

#### Create random hyperplanes

In [42]:
hyperplanes = random_hyperplane(num_hash_table,hash_length,feature_length)
hyperplanes[0].shape

torch.Size([1024, 14])

### Create Hash Tables for training data

In [43]:
hash_dict_list = []
# Iterate through the dataset
for i in range(len(hyperplanes)):
    hash_dict = dict()
    for batch_idx, data in enumerate(tqdm(train_dataloader, desc="Processing Batches")):
        
        # Extract features and compute hash code
        feature = data['features'].to(device)

        hash_value = (feature @ hyperplanes[i])
        hash_code = torch.where(hash_value > 0, 1, 0)
        
        # Convert the hash code tensor to a list of tuples for dictionary key compatibility
        hash_code_list = hash_code.tolist()

        # Iterate through each image and label in the batch
        for j in range(len(hash_code_list)):
            # Create a unique key from the hash code of each image
            hash_code_key = tuple(hash_code_list[j])
            
            # Initialize the list if this hash code key is not yet in the dictionary
            if hash_code_key not in hash_dict:
                hash_dict[hash_code_key] = []
            
            # Append the tuple (batch index, image index, label) to the list of this hash code key
            hash_dict[hash_code_key].append([data['index'][j], data['label'][j].item()])
        
    hash_dict_list.append(hash_dict)
    print(len(hash_dict.keys()))
    # if batch_idx == 10:
    #     break

        

Processing Batches: 100%|██████████| 20019/20019 [10:15<00:00, 32.52it/s]


7074


Processing Batches: 100%|██████████| 20019/20019 [09:44<00:00, 34.27it/s]


11028


Processing Batches: 100%|██████████| 20019/20019 [10:13<00:00, 32.66it/s]


14356


Processing Batches: 100%|██████████| 20019/20019 [10:24<00:00, 32.07it/s]


8898


Processing Batches: 100%|██████████| 20019/20019 [09:23<00:00, 35.56it/s]

11214





#### Number of buckets in each hash table

In [44]:
for i in range(len(hash_dict_list)):
    # Count the number of keys in the hash_dict
    hash_dict_key_count = len(hash_dict_list[i].keys())
    # Print the count
    print(f"The number of keys in hash_dict is: {hash_dict_key_count}")


The number of keys in hash_dict is: 7074
The number of keys in hash_dict is: 11028
The number of keys in hash_dict is: 14356
The number of keys in hash_dict is: 8898
The number of keys in hash_dict is: 11214


### Hash codes for test data

In [45]:
# Initialize a list to store similar images for each hyperplane
similar_images_list = []

# Iterate through each hyperplane
for i in range(len(hyperplanes)):
    # Create a dictionary to store similar images for this hyperplane
    similar_images = {}

    # Process test data
    for batch_idx, data in enumerate(tqdm(test_dataloader, desc="Processing Test Batches")):
        feature = data['features'].to(device)

        # Compute hash values for the test data
        hash_value = (feature @ hyperplanes[i])
        test_hash_code = torch.where(hash_value > 0, 1, 0)

        # Convert the hash code tensor to a list of tuples for dictionary key compatibility
        test_hash_code_list = test_hash_code.tolist()

        # Iterate through each test image
        for j in range(len(test_hash_code_list)):
            # Create a unique key from (batch_idx, i, label)
            test_key = (data['index'][j], data['label'][j].item())  # Convert tensor to tuple

            # Create a unique key from the test hash code
            test_hash_code_key = tuple(test_hash_code_list[j])

            # Check if this hash code exists in the training hash dictionary
            if test_hash_code_key in hash_dict_list[i]:
                # Retrieve the corresponding images and their labels from the training hash dictionary
                similar_images[test_key] = hash_dict_list[i][test_hash_code_key]
            else:
                similar_images[test_key] = []

    # Store the similar images found for this hyperplane
    similar_images_list.append(similar_images)

# Now similar_images_dict contains keys as (batch_idx, i, label) and values as similar images


Processing Test Batches: 100%|██████████| 782/782 [00:15<00:00, 50.79it/s]
Processing Test Batches: 100%|██████████| 782/782 [00:13<00:00, 56.68it/s]
Processing Test Batches: 100%|██████████| 782/782 [00:14<00:00, 55.82it/s]
Processing Test Batches: 100%|██████████| 782/782 [00:14<00:00, 55.10it/s]
Processing Test Batches: 100%|██████████| 782/782 [00:13<00:00, 58.97it/s]


In [47]:
len(similar_images_list[0])

50000

### Create union of all hash list

In [48]:
# Initialize the dictionary to store similar images
similar_images_dict = {}

# Iterate through the similar_images_dict to populate similar_images_list
for i in similar_images_list:
    for j in tqdm(i.keys()):
        # Check if the key already exists in similar_images_list
        if j not in similar_images_list:
            similar_images_dict[j] = []  # Initialize the list if it doesn't exist

        similar_images_dict[j].extend(i[j])  # Deep copy of the elements

100%|██████████| 50000/50000 [00:18<00:00, 2752.83it/s]
100%|██████████| 50000/50000 [00:05<00:00, 9953.23it/s] 
100%|██████████| 50000/50000 [00:01<00:00, 25852.06it/s]
100%|██████████| 50000/50000 [00:02<00:00, 19528.40it/s]
100%|██████████| 50000/50000 [00:03<00:00, 15770.88it/s]


In [49]:
from tqdm import tqdm

# Initialize the dictionary to store similar images
unique_images_dict = {}

# Iterate through the similar_images_dict to populate unique_images_dict
for i, j in tqdm(similar_images_dict.items()):
    if i not in unique_images_dict:
        unique_images_dict[i] = []  # Initialize the list only if the key doesn't exist

    for k in j:
        if k not in unique_images_dict[i]:  # Check if 'k' is not already in the list
            unique_images_dict[i].append(k)  # Append 'k' to the list


100%|██████████| 50000/50000 [1:36:55<00:00,  8.60it/s]  


In [53]:
# Process each key in similar_images_dict with tqdm
for key, similar_images in tqdm(unique_images_dict.items(), desc="Processing keys"):
    
    test_feature = torch.tensor(ds_feature['test'][key[0]]['features'])
    # Process each similar image for the current key with tqdm
    for i, image in tqdm(enumerate(similar_images[:100]), desc=f"Processing similar images for key {key[0]}", leave=False):
        # Assuming image is a list/tuple with at least four elements
        train_feature = torch.tensor(ds_feature['train'][image[0]]['features'])
        
        # Calculate cosine similarity and store it
        similarity_score = F.cosine_similarity(test_feature, train_feature, dim=0)
        image = list(image[:2])  # Make a copy to modify
        # Append the similarity score to the copied list
        image.append(similarity_score.item())
        
        # Update the original list in similar_images
        similar_images[i] = image[:3]
        

Processing keys:  19%|█▉        | 9480/50000 [07:21<31:25, 21.49it/s]  


KeyboardInterrupt: 

In [50]:
len(unique_images_dict.keys())

50000

### Evaluate

In [None]:
# Define functions to calculate Average Precision (AP) and Precision@K
def average_precision(predictions, targets):
    relevant_indices = (targets == 1).nonzero(as_tuple=True)[0]
    if len(relevant_indices) == 0:
        return 0.0

    precisions = []
    for i, idx in enumerate(relevant_indices, start=1):
        precision_at_i = (targets[:idx + 1].sum() / (idx + 1)).item()
        precisions.append(precision_at_i)

    return sum(precisions) / len(precisions)

def precision_at_k(predictions, targets, k):
    top_k_indices = torch.argsort(predictions, descending=True)[:k]
    top_k_relevant = targets[top_k_indices].sum().item()
    return top_k_relevant / k

# Prepare lists to store metric scores
map_scores = []
precision_10_scores = []
precision_50_scores = []

# Example: Iterate over your dataset to calculate metrics
for key, value in tqdm(unique_images_dict.items(), desc="Calculating metrics", total=len(similar_images_list)):
    if len(value) > 0:
        # Sort the values by similarity score (assuming v[4] is the similarity score)
        sorted_value = sorted(value, key=lambda x: x[-1], reverse=True)[:50]
        predictions = torch.tensor([v[2] for v in sorted_value])  # Similarity scores
        targets = torch.tensor([v[1] == key[1] for v in sorted_value], dtype=torch.float32)  # Relevance labels
        
        # Calculate Mean Average Precision for the current sample
        ap = average_precision(predictions, targets)
        map_scores.append(ap)
        
        # Calculate Precision@10 and Precision@50
        p10 = precision_at_k(predictions, targets, k=10)
        p50 = precision_at_k(predictions, targets, k=50)
        precision_10_scores.append(p10)
        precision_50_scores.append(p50)

# Calculate final average metrics
mean_avg_precision = sum(map_scores) / len(map_scores) if map_scores else 0.0
precision_10 = sum(precision_10_scores) / len(precision_10_scores) if precision_10_scores else 0.0
precision_50 = sum(precision_50_scores) / len(precision_50_scores) if precision_50_scores else 0.0

# Print results
print("Mean Average Precision (mAP):", mean_avg_precision)
print("Precision@10:", precision_10)
print("Precision@50:", precision_50)


Calculating metrics: 10000it [00:01, 5377.86it/s]         

Mean Average Precision (mAP): 0.5984044666674743
Precision@10: 0.45985
Precision@50: 0.2562



