In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import matplotlib.image as mpimg
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
%matplotlib inline
from tensorflow.keras.preprocessing import image
import random
import pandas as pd

In [None]:
import pickle
# Pickle holding the labelled image records; later cells iterate over it and
# read the 'index', 'img' and 'label' keys of every record.
pickle_file_path = 'labeled_data.pkl'

# Load the pickle file
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open files that come from a trusted source.
with open(pickle_file_path, 'rb') as f:
    labeled_data = pickle.load(f)

In [None]:
import pickle
# Pickle holding the product metadata table (one row per image) that the split
# and the label vocabulary are derived from in later cells.
pickle_file_path = 'filtered_df.pkl'

# Load the pickle file
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open files that come from a trusted source.
with open(pickle_file_path, 'rb') as f:
    filtered_df = pickle.load(f)

In [None]:
# Preview the first rows of the metadata table
filtered_df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,image
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011,Casual,Turtle Check Men Navy Blue Shirt,15970.jpg
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016,Casual,Titan Women Silver Watch,59263.jpg
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012,Casual,Puma Men Grey T-shirt,53759.jpg
5,1855,Men,Apparel,Topwear,Tshirts,Grey,Summer,2011,Casual,Inkfruit Mens Chain Reaction T-shirt,1855.jpg
6,30805,Men,Apparel,Topwear,Shirts,Green,Summer,2012,Ethnic,Fabindia Men Striped Green Shirt,30805.jpg


In [None]:
import random

# Fixed seed so the train/val/test partition is identical on every
# "Restart & Run All"; without it each run trains and evaluates on a
# different split and the recorded metrics cannot be reproduced.
SPLIT_SEED = 42

ind = filtered_df.index.tolist()
# A dedicated Random instance keeps the global `random` state untouched.
random.Random(SPLIT_SEED).shuffle(ind)

In [None]:
# 60% / 20% / 20% split of the shuffled index list: the first two parts are
# sized by truncation, the test part receives whatever is left.
n = len(filtered_df)
p_train, p_val = 0.6, 0.2
n_train, n_val = int(p_train * n), int(p_val * n)

train_ind = ind[:n_train]
val_ind = ind[n_train:n_train + n_val]
test_ind = ind[n_train + n_val:]

In [None]:
# Hash-based lookups, built once: testing membership against the raw index
# lists would rescan a list for every record in `labeled_data`.
train_lookup = set(train_ind)
val_lookup = set(val_ind)
test_lookup = set(test_ind)

train_img, val_img, test_img = [], [], []
train_label, val_label, test_label = [], [], []
# Index of every test record, in the same order as `test_img` / `test_label`.
test_ids = []

# Route each labelled record to the split that owns its index; records whose
# index is in none of the three splits are skipped.
for img in labeled_data:
    sample_index = img['index']
    if sample_index in train_lookup:
        train_img.append(img['img'])
        train_label.append(img['label'])
    elif sample_index in val_lookup:
        val_img.append(img['img'])
        val_label.append(img['label'])
    elif sample_index in test_lookup:
        test_img.append(img['img'])
        test_label.append(img['label'])
        test_ids.append(sample_index)

In [None]:
from datasets import Dataset

# Wrap each split in a `datasets.Dataset` with an 'img' and a 'label' column.
train_ds = Dataset.from_dict(dict(img=train_img, label=train_label))
val_ds = Dataset.from_dict(dict(img=val_img, label=val_label))
test_ds = Dataset.from_dict(dict(img=test_img, label=test_label))

In [None]:
# ViTFeatureExtractor stays imported so existing references to the name keep
# working; new code should use ViTImageProcessor.
from transformers import ViTFeatureExtractor, ViTImageProcessor

# ViTImageProcessor replaces the deprecated ViTFeatureExtractor alias and loads
# the same preprocessing configuration from the checkpoint. The variable keeps
# its original name because later cells call `feature_extractor(...)` and
# `feature_extractor.save_pretrained(...)`.
feature_extractor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')



In [None]:
def preprocess_images(examples):
    """
    Add model-ready `pixel_values` to a batch of examples.

    Args:
        examples: Batch mapping whose 'img' entry holds one image per example.
            Assumes every image is stored channels-last so that moving the
            last axis to the front yields (C, H, W) — TODO confirm against the
            contents of the pickled data.

    Returns:
        The same `examples` mapping, extended in place with a 'pixel_values'
        entry produced by the module-level `feature_extractor`.
    """
    # Cast each image to uint8 and move its last axis to the front.
    channel_first = [
        np.moveaxis(np.array(raw_image, dtype=np.uint8), -1, 0)
        for raw_image in examples['img']
    ]
    # Preprocessing itself is delegated to the feature extractor.
    examples['pixel_values'] = feature_extractor(images=channel_first)['pixel_values']
    return examples

In [None]:
# Number of article types kept as classes. It has to agree with the classifier
# head, which is built with num_labels=10; the previous slice of 11 would have
# produced an 11-name label vocabulary whenever the data holds more than 10
# article types.
NUM_TOP_LABELS = 10

# Article types ordered from most to least frequent.
article_type_counts = (
    filtered_df.groupby('articleType')
    .size()
    .reset_index()
    .sort_values(0, ascending=False)
)
top_labels = pd.DataFrame(article_type_counts[:NUM_TOP_LABELS]['articleType'])
# Class ids follow the alphabetical order of the retained article types.
top_labels_list = sorted(list(top_labels['articleType']))
top_labels['label_num'] = top_labels['articleType'].apply(lambda x: top_labels_list.index(x))
top_labels

Unnamed: 0,articleType,label_num
7,Tshirts,7
4,Shirts,4
0,Casual Shoes,0
9,Watches,9
5,Sports Shoes,5
3,Kurtas,3
6,Tops,6
1,Handbags,1
2,Heels,2
8,Wallets,8


In [None]:
from datasets import Features, ClassLabel, Array3D

# The output schema is declared by hand because both `img` and `pixel_values`
# are 3D arrays.
features = Features(dict(
    label=ClassLabel(names=top_labels_list),
    img=Array3D(dtype="int64", shape=(3, 32, 32)),
    pixel_values=Array3D(dtype="float32", shape=(3, 224, 224)),
))

# Every split goes through the same batched preprocessing with that schema.
map_options = dict(batched=True, features=features)
preprocessed_train_ds = train_ds.map(preprocess_images, **map_options)
preprocessed_val_ds = val_ds.map(preprocess_images, **map_options)
preprocessed_test_ds = test_ds.map(preprocess_images, **map_options)

Map:   0%|          | 0/1610 [00:00<?, ? examples/s]

Map:   0%|          | 0/535 [00:00<?, ? examples/s]

Map:   0%|          | 0/531 [00:00<?, ? examples/s]

In [None]:
# List the columns available on one preprocessed training example
preprocessed_train_ds[0].keys()

dict_keys(['label', 'img', 'pixel_values'])

In [None]:
from transformers import ViTModel
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn as nn

class ViTForImageClassification(nn.Module):
    """
    Pretrained ViT backbone followed by dropout and a linear classification head.

    Args:
        num_labels: Number of output classes of the head.
        dropout_rate: Dropout probability applied to the CLS embedding.
    """

    def __init__(self, num_labels=10, dropout_rate=0.1):
        super().__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.dropout = nn.Dropout(dropout_rate)
        self.last_layer = nn.Linear(self.vit.config.hidden_size, num_labels)
        self.num_labels = num_labels

    def forward(self, pixel_values, labels=None):
        """
        Classify a batch of images.

        Args:
            pixel_values: Preprocessed images, passed to the backbone unchanged.
            labels: Optional class ids; when provided a cross-entropy loss is
                computed against the logits.

        Returns:
            SequenceClassifierOutput carrying `loss` (None without labels),
            `logits`, and the backbone's `hidden_states` / `attentions`.
        """
        backbone_out = self.vit(pixel_values=pixel_values)

        # The first token of the last hidden state is the CLS embedding.
        cls_token = backbone_out.last_hidden_state[:, 0]
        logits = self.last_layer(self.dropout(cls_token))

        if labels is None:
            loss = None
        else:
            criterion = nn.CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=backbone_out.hidden_states,
            attentions=backbone_out.attentions,
        )


In [None]:
from evaluate import load

# Accuracy metric object; compute_metrics below calls its .compute() method
metric = load("accuracy")

def compute_metrics(eval_pred):
    """
    Score a batch of predictions with the module-level `metric`.

    Args:
        eval_pred: Pair of (per-class scores, reference labels).

    Returns:
        Whatever `metric.compute` returns for the arg-max class of every row.
    """
    scores, references = eval_pred
    # The predicted class is the position of the largest score in each row.
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
import torch

# Training hyperparameters
num_epochs, batch_size = 6, 4
learning_rate = 2e-5

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
from torch.utils.data import DataLoader

# Options shared by both loaders; only the training loader shuffles.
# NOTE(review): the Trainer cell in this notebook is given the datasets
# directly, so these loaders do not appear to be used by it — confirm before
# relying on them.
loader_options = dict(batch_size=batch_size, num_workers=4)
train_loader = DataLoader(preprocessed_train_ds, shuffle=True, **loader_options)
val_loader = DataLoader(preprocessed_val_ds, shuffle=False, **loader_options)

In [None]:
from transformers import TrainingArguments, Trainer

# Metric used to pick the best checkpoint at the end of training.
metric_name = "accuracy"

# Evaluate and checkpoint once per epoch, then reload the checkpoint with the
# best validation accuracy. The learning rate and epoch count come from the
# hyperparameter cell (same values as before) so they are defined in one place.
args = TrainingArguments(
    "test-clothing",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=learning_rate,
    # NOTE(review): the training batch size (10) differs from the `batch_size`
    # constant (4) defined with the other hyperparameters; left as-is so the
    # training run is unchanged — confirm which value is intended.
    per_device_train_batch_size=10,
    per_device_eval_batch_size=4,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_dir='logs',
)






In [None]:
from transformers import default_data_collator

# Default collator: stacks the per-example fields into batch tensors.
data_collator = default_data_collator

# Size the classification head from the label vocabulary that also defines the
# dataset's ClassLabel feature, instead of relying on the constructor default.
# NOTE(review): the checkpoint-loading cell rebuilds the model with
# num_labels=10, which matches as long as the vocabulary holds 10 classes.
model = ViTForImageClassification(num_labels=len(top_labels_list))

In [None]:
# Bundle model, arguments, datasets, collator and metric function for training.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=preprocessed_train_ds,
    eval_dataset=preprocessed_val_ds,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model on the training split (long-running cell)
trainer.train()

  0%|          | 0/978 [00:00<?, ?it/s]

  0%|          | 0/133 [00:00<?, ?it/s]

{'eval_loss': 0.9862049221992493, 'eval_accuracy': 0.7716981132075472, 'eval_runtime': 60.5878, 'eval_samples_per_second': 8.748, 'eval_steps_per_second': 2.195, 'epoch': 1.0}


  0%|          | 0/133 [00:00<?, ?it/s]

{'eval_loss': 0.7298076152801514, 'eval_accuracy': 0.8509433962264151, 'eval_runtime': 61.119, 'eval_samples_per_second': 8.672, 'eval_steps_per_second': 2.176, 'epoch': 2.0}


  0%|          | 0/133 [00:00<?, ?it/s]

{'eval_loss': 0.6045047640800476, 'eval_accuracy': 0.879245283018868, 'eval_runtime': 209.3054, 'eval_samples_per_second': 2.532, 'eval_steps_per_second': 0.635, 'epoch': 3.0}
{'loss': 0.9145, 'grad_norm': 2.859696388244629, 'learning_rate': 9.775051124744377e-06, 'epoch': 3.07}


  0%|          | 0/133 [00:00<?, ?it/s]

{'eval_loss': 0.5406275391578674, 'eval_accuracy': 0.8773584905660378, 'eval_runtime': 211.9499, 'eval_samples_per_second': 2.501, 'eval_steps_per_second': 0.628, 'epoch': 4.0}


  0%|          | 0/133 [00:00<?, ?it/s]

{'eval_loss': 0.5030139684677124, 'eval_accuracy': 0.8924528301886793, 'eval_runtime': 48.5654, 'eval_samples_per_second': 10.913, 'eval_steps_per_second': 2.739, 'epoch': 5.0}


  0%|          | 0/133 [00:00<?, ?it/s]

{'eval_loss': 0.4944866895675659, 'eval_accuracy': 0.8867924528301887, 'eval_runtime': 66.5269, 'eval_samples_per_second': 7.967, 'eval_steps_per_second': 1.999, 'epoch': 6.0}
{'train_runtime': 4956.8688, 'train_samples_per_second': 1.968, 'train_steps_per_second': 0.197, 'train_loss': 0.6203218393774365, 'epoch': 6.0}


TrainOutput(global_step=978, training_loss=0.6203218393774365, metrics={'train_runtime': 4956.8688, 'train_samples_per_second': 1.968, 'train_steps_per_second': 0.197, 'total_flos': 0.0, 'train_loss': 0.6203218393774365, 'epoch': 6.0})

In [None]:
# Run the trained model on the test split; later cells read `outputs.metrics`
# and `outputs.label_ids`.
outputs = trainer.predict(preprocessed_test_ds)

  0%|          | 0/130 [00:00<?, ?it/s]

In [None]:
# Show the metrics computed on the test split
print(outputs.metrics)

{'test_loss': 0.42425525188446045, 'test_accuracy': 0.8932584269662921, 'test_runtime': 47.5455, 'test_samples_per_second': 11.231, 'test_steps_per_second': 2.818}


In [None]:
# Directory that receives both the fine-tuned weights and the preprocessor config.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable location shared with the checkpoint-loading cell.
model_dir = r"C:\Users\ASUS\Desktop\shoppin-assignment\vit-model-trained"

# torch.save does not create missing parent directories, so make sure the
# target exists before writing the weights.
os.makedirs(model_dir, exist_ok=True)

# Save the model's state dictionary
torch.save(model.state_dict(), os.path.join(model_dir, "model.pth"))

# Save the feature extractor
feature_extractor.save_pretrained(model_dir)

['C:\\Users\\ASUS\\Desktop\\shoppin-assignment\\vit-model-trained\\preprocessor_config.json']

### Load model checkpoint

In [None]:
# Initialize the model first; num_labels must match the head that was trained.
model = ViTForImageClassification(num_labels=10)  # or however many labels you have

# Load the saved state dictionary.
# - map_location="cpu" lets a checkpoint written on a GPU machine load on a
#   CPU-only one; the freshly built model lives on the CPU anyway.
# - weights_only=True restricts unpickling to tensors and plain containers,
#   which is all a state dict holds, and avoids executing arbitrary pickled
#   code from the checkpoint file.
state_dict = torch.load(
    r"C:\Users\ASUS\Desktop\shoppin-assignment\vit-model-trained\model.pth",
    map_location="cpu",
    weights_only=True,
)
model.load_state_dict(state_dict)
model.eval()  # Set to evaluation mode

  model.load_state_dict(torch.load(r"C:\Users\ASUS\Desktop\shoppin-assignment\vit-model-trained\model.pth"))


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [None]:
def modified_forward(self, pixel_values):
    """
    Replacement forward pass that returns an embedding instead of class logits.

    Args:
        pixel_values: Preprocessed image batch, handed to `self.vit` unchanged.

    Returns:
        The element at index 1 of the backbone output, i.e. its pooled output
        (`pooler_output`).
    """
    # NOTE(review): the classification head in this notebook is trained on
    # last_hidden_state[:, 0], not on this pooled output — confirm that the
    # pooled output is the embedding intended for retrieval.
    return self.vit(pixel_values)[1]

# Bind the embedding-only forward to this model instance, so that calling
# `model(...)` now returns embeddings rather than classification outputs.
model.forward = modified_forward.__get__(model)

# Embed the test split one example at a time, with gradients disabled.
embeddings = []
with torch.no_grad():
    for batch in preprocessed_test_ds:
        # The stored pixel values are already preprocessed; convert them to a
        # tensor and add the leading batch dimension the model expects.
        inputs = torch.tensor(batch['pixel_values']).unsqueeze(0)
        embedding = model(pixel_values=inputs)
        embeddings.append(embedding)

# Map every test image id to its embedding as nested Python lists.
# NOTE(review): ids that occur more than once in `test_ids` collapse into a
# single entry here — verify that the ids are unique.
vectors = {}
for position, image_id in enumerate(test_ids):
    vectors[image_id] = embeddings[position].tolist()

In [None]:
# Number of distinct image ids that received an embedding
len(vectors)

520

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def compute_similarity_matrix(embeddings):
    """
    Return the pairwise cosine-similarity matrix of a set of embeddings.

    Args:
        embeddings: Array-like holding one embedding per row.

    Returns:
        The result of `cosine_similarity` applied to the embeddings as a
        NumPy array.
    """
    return cosine_similarity(np.array(embeddings))

def evaluate_similarity(vectors, labels, top_k=5):
    """
    Evaluate embedding-based image retrieval with a top-k label hit rate.

    Every image is used once as a query. Its `top_k` most similar images by
    cosine similarity (the query itself excluded) are retrieved, and the query
    counts as correct when at least one of them carries the query's label.
    The value reported under "precision@K" is therefore a hit rate
    (correct queries / total queries), not the share of relevant items among
    the retrieved neighbours.

    Args:
        vectors (dict): Maps image ID -> embedding. An embedding may be a flat
            vector or carry one extra singleton axis (shape (1, dim)), which
            is squeezed away.
        labels (sequence): Ground-truth labels; labels[i] must belong to the
            i-th key of `vectors` in insertion order.
        top_k (int): Number of neighbours inspected per query. Values larger
            than the number of other images are capped to that number.

    Returns:
        dict: "precision@K" (float), "total_queries" (int) and
        "correct_top_k" (int).

    Raises:
        ValueError: If `top_k` is smaller than 1, or if the number of labels
            differs from the number of vectors.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    # Dict insertion order defines which label belongs to which embedding.
    ids = list(vectors.keys())
    labels = np.array(labels)
    if len(labels) != len(ids):
        # A silent mismatch would pair embeddings with the wrong labels.
        raise ValueError(
            f"got {len(labels)} labels for {len(ids)} vectors; they must align one-to-one"
        )

    total_queries = len(ids)
    if total_queries == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return {"precision@K": 0.0, "total_queries": 0, "correct_top_k": 0}

    embeddings = np.array([vectors[key] for key in ids])
    if len(embeddings.shape) == 3:
        embeddings = embeddings.squeeze(1)  # Remove extra dimension if present

    similarity_matrix = compute_similarity_matrix(embeddings)

    # Never ask for more neighbours than there are other images; otherwise the
    # excluded self-match would slip back into the top-k and count as a hit.
    effective_k = min(top_k, total_queries - 1)

    correct_top_k = 0
    for idx in tqdm(range(total_queries), desc="Evaluating Precision@K"):
        if effective_k == 0:
            continue  # a lone image has no neighbours to compare against

        # Work on a copy so the shared similarity matrix stays untouched.
        similarities = similarity_matrix[idx].copy()
        similarities[idx] = -np.inf  # Exclude self-match

        # argsort is ascending, so the last entries are the most similar.
        top_k_indices = np.argsort(similarities)[-effective_k:]

        if labels[idx] in labels[top_k_indices]:
            correct_top_k += 1

    precision_at_k = correct_top_k / total_queries

    return {
        "precision@K": precision_at_k,
        "total_queries": total_queries,
        "correct_top_k": correct_top_k
    }


In [None]:
# Run evaluation for several neighbourhood sizes
k_values = (1, 5, 10)
for k in k_values:
    results = evaluate_similarity(vectors, outputs.label_ids, top_k=k)
    print(f"Topk={k} Evaluation Results:", results)

    # Separator between consecutive reports; none after the last one.
    if k != k_values[-1]:
        print('---------------------------------------------------------')

Evaluating Precision@K: 100%|██████████| 520/520 [00:00<00:00, 17299.39it/s]


Topk=1 Evaluation Results: {'precision@K': 0.8769230769230769, 'total_queries': 520, 'correct_top_k': 456}
---------------------------------------------------------


Evaluating Precision@K: 100%|██████████| 520/520 [00:00<00:00, 20505.99it/s]


Topk=5 Evaluation Results: {'precision@K': 0.9634615384615385, 'total_queries': 520, 'correct_top_k': 501}
---------------------------------------------------------


Evaluating Precision@K: 100%|██████████| 520/520 [00:00<00:00, 21393.84it/s]

Topk=10 Evaluation Results: {'precision@K': 0.9769230769230769, 'total_queries': 520, 'correct_top_k': 508}



