<a href="https://colab.research.google.com/github/YakshRathod/NULLCLASS-Internship/blob/Task-5/Task_5_Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import os
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Folder that holds the shape images (a Google Drive path under /content/drive).
img_dir = "/content/drive/MyDrive/Nullclass internship/Task 1/Shapes Dataset"

# Keep only the PNG files; the extension check is case-insensitive.
img_files = []
for entry in os.listdir(img_dir):
    if entry.lower().endswith('.png'):
        img_files.append(entry)

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Build one text embedding per image file, using only the shape name that is
# encoded in the file name (expected pattern: <shape>_<color>_<index>.png).
#
# `embeddings[i]` and `labels[i]` always describe the same file: both lists are
# appended to only after the embedding has been obtained successfully, so a
# failure for one file can never leave the two lists misaligned.
embeddings = []
labels = []

# The text fed to the model is just the shape name, so every file with the same
# shape yields the same vector. Cache it instead of re-running the model.
shape_cache = {}  # shape name -> mean-pooled embedding (numpy array)

for fname in img_files:
    try:
        # Drop the 4-character extension, then split into exactly three parts.
        # A file name with a different number of '_' separators raises
        # ValueError here and is reported and skipped below.
        shape, _color, _idx = fname[:-4].split('_')   # <-- Only use shape!

        emb = shape_cache.get(shape)
        if emb is None:
            inputs = tokenizer(shape, return_tensors="pt")
            with torch.no_grad():
                outputs = model(**inputs)
            # Average the token vectors into a single vector for the text.
            emb = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            shape_cache[shape] = emb
    except Exception as e:
        # Best-effort: report the problematic file and move on to the next.
        print("Error with file:", fname, e)
        continue

    labels.append(shape)
    embeddings.append(emb)

# Stack the per-file vectors into one array and persist it together with the
# matching labels (one label per line, same order as the array rows).
embeddings = np.array(embeddings)
# Original note: [num_images, 768]. With no processed files the array is empty.
print("Embeddings shape:", embeddings.shape)

embeddings_path = '/content/drive/MyDrive/Nullclass internship/Task 5/shape_label_embeddings.npy'
labels_path = '/content/drive/MyDrive/Nullclass internship/Task 5/shape_labels.txt'

# Create the output folder up front so the save cannot fail on a missing
# directory after all embeddings have already been computed.
os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)

np.save(embeddings_path, embeddings)
print("Saved embeddings to shape_label_embeddings.npy")

# Explicit encoding keeps the label file independent of the platform default.
with open(labels_path, 'w', encoding='utf-8') as f:
    f.writelines(label + "\n" for label in labels)

# --- PCA of unique shape embeddings only ---
# Record the index of the first occurrence of every label in a single pass
# (setdefault keeps the earliest index), instead of scanning `labels` once per
# unique shape.
first_index = {}
for i, label in enumerate(labels):
    first_index.setdefault(label, i)

unique_shapes = sorted(first_index)
# One representative embedding per shape: the first one seen for that shape.
unique_embeddings = np.array([embeddings[first_index[s]] for s in unique_shapes])

if len(unique_shapes) < 2:
    # A 2-component PCA needs at least two samples; skip the plot rather than
    # letting PCA raise on too few points.
    print("Skipping PCA plot: need at least 2 unique shapes, found", len(unique_shapes))
else:
    pca = PCA(n_components=2)
    proj = pca.fit_transform(unique_embeddings)

    plt.figure(figsize=(10,8))
    plt.scatter(proj[:, 0], proj[:, 1], color='royalblue')
    # Label every projected point with the shape name it represents.
    for shape, (pc1, pc2) in zip(unique_shapes, proj):
        plt.annotate(shape, (pc1, pc2), fontsize=12)
    plt.title("PCA of Unique Shape Label Embeddings")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

