In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader
import itertools

# Define the model
class ColorEmbeddingNN(nn.Module):
    def __init__(self, minilm_model='microsoft/MiniLM-L12-H384-uncased', hidden_dim=4096):
        super(ColorEmbeddingNN, self).__init__()
        self.minilm = AutoModel.from_pretrained(minilm_model)
        self.project = nn.Linear(self.minilm.config.hidden_size, hidden_dim)
        self.final_layer = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():  # Freeze MiniLM during training
            minilm_output = self.minilm(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        projected = self.project(minilm_output)
        output = self.final_layer(projected)
        return output

# Contrastive Loss
def contrastive_loss(embedding1, embedding2, label, margin=1.0):
    distance = torch.norm(embedding1 - embedding2, p=2, dim=1)
    loss = (1 - label) * torch.pow(distance, 2) + label * torch.pow(torch.clamp(margin - distance, min=0.0), 2)
    return loss.mean()

# Training function
def train(model, embeddings, tokenizer, optimizer, device, epochs=10, batch_size=4, margin=1.0):
    model.to(device)
    model.train()
    color_texts = list(embeddings.keys())

    for epoch in range(epochs):
        total_loss = 0
        for i in range(0, len(color_texts), batch_size):
            batch_colors = color_texts[i:i+batch_size]
            encoded = tokenizer(batch_colors, padding=True, truncation=True, return_tensors='pt')
            input_ids, attention_mask = encoded['input_ids'].to(device), encoded['attention_mask'].to(device)
            batch_embeddings = torch.stack([embeddings[color] for color in batch_colors]).to(device)

            optimizer.zero_grad()
            output = model(input_ids, attention_mask)

            # Generate positive and negative pairs
            pairs = list(itertools.combinations(range(len(batch_colors)), 2))
            loss = 0
            for idx1, idx2 in pairs:
                label = torch.tensor(1.0 if batch_colors[idx1][0] == batch_colors[idx2][0] else 0.0, device=device)
                loss += contrastive_loss(output[idx1].unsqueeze(0), output[idx2].unsqueeze(0), label, margin)

            loss /= len(pairs)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / (len(color_texts) // batch_size + 1):.4f}")

In [2]:
import torch
import os

# Path to the directory containing the .pt files
directory = "/content/"

# Dictionary to store the embeddings
embeddings = {}

# Load each .pt file and store it in the dictionary
for file_name in os.listdir(directory):
    if file_name.endswith(".pt"):
        # Remove the .pt extension for the key
        key = os.path.splitext(file_name)[0]
        # Load the tensor and add it to the dictionary
        embeddings[key] = torch.load(os.path.join(directory, file_name)).to(torch.float32)

# Print the dictionary keys to verify
print("Loaded embeddings:", list(embeddings.keys()))


  embeddings[key] = torch.load(os.path.join(directory, file_name)).to(torch.float32)


Loaded embeddings: ['blue', 'orange', 'green', 'lavender', 'grey', 'pink', 'red', 'brown', 'lime', 'light grey']


In [3]:
# Initialize components
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained('microsoft/MiniLM-L12-H384-uncased')

# Model, loss function, optimizer
model = ColorEmbeddingNN()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train the model
train(model, embeddings, tokenizer, optimizer, device, epochs=10)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/133M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1, Loss: 0.4433
Epoch 2, Loss: 0.0625
Epoch 3, Loss: 0.0553
Epoch 4, Loss: 0.0333
Epoch 5, Loss: 0.0293
Epoch 6, Loss: 0.0443
Epoch 7, Loss: 1.8337
Epoch 8, Loss: 0.0332
Epoch 9, Loss: 0.0421
Epoch 10, Loss: 0.0477
Epoch 11, Loss: 0.0361
Epoch 12, Loss: 0.0236
Epoch 13, Loss: 0.0239
Epoch 14, Loss: 0.0195
Epoch 15, Loss: 0.0184
Epoch 16, Loss: 0.0204
Epoch 17, Loss: 0.0184
Epoch 18, Loss: 0.0192
Epoch 19, Loss: 0.0215
Epoch 20, Loss: 0.0182
Epoch 21, Loss: 0.0168
Epoch 22, Loss: 0.0169
Epoch 23, Loss: 0.0175
Epoch 24, Loss: 0.0164
Epoch 25, Loss: 0.0161
Epoch 26, Loss: 0.0160
Epoch 27, Loss: 0.0160
Epoch 28, Loss: 0.0160
Epoch 29, Loss: 0.0161
Epoch 30, Loss: 0.0164
Epoch 31, Loss: 0.0160
Epoch 32, Loss: 0.0158
Epoch 33, Loss: 0.0158
Epoch 34, Loss: 0.0157
Epoch 35, Loss: 0.0170
Epoch 36, Loss: 0.0161
Epoch 37, Loss: 0.0160
Epoch 38, Loss: 0.0162
Epoch 39, Loss: 0.0159
Epoch 40, Loss: 0.0159
Epoch 41, Loss: 0.0157
Epoch 42, Loss: 0.0159
Epoch 43, Loss: 0.0156
Epoch 44, Loss: 0.01

In [4]:
def get_embedding(model, tokenizer, color, device):
    model.to(device)
    model.eval()

    with torch.no_grad():
        encoded = tokenizer([color], padding=True, truncation=True, return_tensors='pt')
        input_ids, attention_mask = encoded['input_ids'].to(device), encoded['attention_mask'].to(device)
        embedding = model(input_ids, attention_mask)

    return embedding.cpu()


In [5]:
for color in embeddings.keys():
  torch.save(get_embedding(model, tokenizer, color, device).reshape(4096), f"t_{color}.pt")

In [6]:
import shutil

# Create a zip file containing all .pt files that match the pattern
shutil.make_archive("text_embedding", 'zip', root_dir=".", base_dir=".", verbose=True)

# Move the zip file to the main directory if needed
!mv text_embedding.zip /content/


mv: 'text_embedding.zip' and '/content/text_embedding.zip' are the same file


In [7]:
import zipfile
import glob

# Define the zip file name
zip_filename = "text_embedding.zip"

# Create a zip file and add matching files
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for file in glob.glob("t_*.pt"):  # Adjust pattern as needed
        zipf.write(file)

# Confirm the file is created
print(f"Created {zip_filename}")


Created text_embedding.zip
