# Imports and Dependencies

In [None]:
!pip install git+https://github.com/openai/CLIP.git

In [None]:
import gzip
import json
import os
import pickle
import re
import string
from collections import Counter
from collections import defaultdict
from multiprocessing import Pool

import clip
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Setup

Please replace `your_path` in the cell below with the path to the folder that contains all the data and the annotations.

For the COCO Dataset, you can access this [here](https://drive.google.com/drive/folders/1UNxO77KFFxgAG8G4YX_GTFVHzl96-s2T?usp=sharing).

For the Viz-Wiz Captions Dataset, you can access this [here](https://drive.google.com/drive/folders/1gZM43wBNRhKsUwJgCJTdR95nzXccJLbn?usp=sharing).

Additionally, you may change `output_path` to match the path where you would like to store the embeddings. For the COCO dataset, we recommend you match the labels with the images first before proceeding, as only a subset of the original training dataset is present in the folder (see commented code block below).

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Root folder (on the mounted Drive) that contains the shared project folder.
your_path = "path/to/folder"
# Name of the shared project folder. Defined here so that any cell that refers
# to `project_folder` works after Restart & Run All.
project_folder = "6.8611 Final Project"

# Viz-Wiz annotation file, image directory and the .npz file that will receive
# the CLIP embeddings.
metadata_path = os.path.join(your_path, f"{project_folder}/Viz-Wiz Captions/val.json")
Viz_Wiz_images_dir = os.path.join(your_path, f"{project_folder}/Viz-Wiz Captions/val")
output_path = os.path.join(your_path, f"{project_folder}/Viz-Wiz Captions/viz_wiz_embeddings.npz")

In [None]:
# Run CLIP on the GPU whenever PyTorch can see one; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Read the annotation file whose path was chosen in the setup cell.
with open(metadata_path, 'r') as f:
    viz_wiz_data = json.load(f)

# Index the image records by id; keep the annotation list as stored in the file.
images = dict((img['id'], img) for img in viz_wiz_data['images'])
annotations = viz_wiz_data['annotations']
image_count, annotation_count = len(images), len(annotations)
print(f"Loaded {image_count} images and {annotation_count} annotations.")

In [None]:
# existing_image_ids = set()
# for file in os.listdir(COCO_images_dir):
#     if os.path.isfile(os.path.join(COCO_images_dir, file)):
#         match = re.match(r"^(\d+)", file) #regex match
#         if match:
#             image_id = int(match.group(1))
#             existing_image_ids.add(image_id)

# print(f"Found {len(existing_image_ids)} valid image IDs in {COCO_images_dir}.")

# filtered_annotations = [ann for ann in annotations if ann['image_id'] in existing_image_ids]
# print(f"Filtered {len(filtered_annotations)} annotations matching existing images.")

# CLIP Embedding Generation

Below are the steps to generate the CLIP embeddings for both the images and their associated captions. Adjust `batch_size` (the number of entries accumulated between saves) to fit the memory you have available.

In [None]:
def save_incremental(image_features, text_features, image_ids, text_captions, output_path):
    """Append one batch of embeddings to a compressed ``.npz`` archive.

    If the archive already exists, its four arrays are loaded, the new batch
    is concatenated after them along axis 0, and the archive is rewritten.
    Otherwise a new archive containing only this batch is created.

    Args:
        image_features: Array of image embeddings for this batch.
        text_features: Array of text embeddings for this batch.
        image_ids: Array of image ids, one per entry.
        text_captions: Array of caption strings, one per entry.
        output_path: Destination path. ``.npz`` is appended when missing,
            which is the file name ``np.savez_compressed`` would write anyway.
    """
    # np.savez_compressed appends ".npz" to names that lack it. Normalise the
    # path first so the existence check looks at the file that is actually
    # written; otherwise every call would silently overwrite earlier batches.
    output_path = os.fspath(output_path)
    if not output_path.endswith(".npz"):
        output_path += ".npz"

    if os.path.exists(output_path):
        # Close the archive before it is overwritten below (np.load keeps the
        # file handle open until the NpzFile is closed).
        with np.load(output_path) as existing_data:
            image_features = np.concatenate([existing_data['image_features'], image_features], axis=0)
            text_features = np.concatenate([existing_data['text_features'], text_features], axis=0)
            image_ids = np.concatenate([existing_data['image_ids'], image_ids], axis=0)
            text_captions = np.concatenate([existing_data['text_captions'], text_captions], axis=0)

    # Save updated data
    np.savez_compressed(
        output_path,
        image_features=image_features,
        text_features=text_features,
        image_ids=image_ids,
        text_captions=text_captions
    )
    # Note: the count printed here is the total number of entries now stored.
    print(f"Incrementally saved {len(image_features)} entries to {output_path}.")

In [None]:
batch_size = 1000
image_features, text_features, image_ids, text_captions = [], [], [], []

accumulated_image_features, accumulated_text_features, accumulated_image_ids, accumulated_text_captions = [], [], [], []

# The optional COCO filtering cell defines `filtered_annotations`. When it has
# not been run (e.g. for Viz-Wiz), fall back to every annotation in the file.
annotations_to_process = globals().get("filtered_annotations", annotations)
batches_saved = 0


def _save_and_clear_accumulated(batch_number):
    """Append the buffered embeddings to `output_path` and empty the buffers."""
    save_incremental(
        np.array(accumulated_image_features),
        np.array(accumulated_text_features),
        np.array(accumulated_image_ids),
        np.array(accumulated_text_captions),
        output_path
    )
    print(f"Batch {batch_number} saved.")
    # np.array() copied the data above, so the buffers can be reused in place.
    for buffer in (accumulated_image_features, accumulated_text_features,
                   accumulated_image_ids, accumulated_text_captions):
        buffer.clear()


# NOTE: save_incremental appends to an existing file, so remove `output_path`
# before re-running this cell from scratch to avoid duplicated entries.
# Inference only: no_grad avoids building autograd graphs for every sample.
with torch.no_grad():
    # Process annotations
    for annotation in tqdm(annotations_to_process, desc="Extracting Features"):
        image_id = annotation['image_id']
        caption = annotation['caption']
        image_path = os.path.join(Viz_Wiz_images_dir, images[image_id]["file_name"])

        # Image
        try:
            # The context manager closes the file handle once the pixels are read.
            with Image.open(image_path) as raw_image:
                image = preprocess(raw_image.convert("RGB")).unsqueeze(0).to(device)
            image_feature = clip_model.encode_image(image)
            image_feature = image_feature / image_feature.norm(dim=-1, keepdim=True)  # Normalize
        except Exception as e:
            print(f"Error processing image {image_path}: {e}")
            continue

        # Text
        tokenized_text = clip.tokenize([caption]).to(device)
        text_feature = clip_model.encode_text(tokenized_text)
        text_feature = text_feature / text_feature.norm(dim=-1, keepdim=True)  # Normalize

        # Store embeddings in accumulated; .cpu() is required before .numpy()
        # when the model runs on CUDA.
        accumulated_image_features.append(image_feature.cpu().numpy())
        accumulated_text_features.append(text_feature.cpu().numpy())
        accumulated_image_ids.append(image_id)
        accumulated_text_captions.append(caption)

        # Flush on the number of buffered entries rather than the loop index,
        # so skipped images cannot make a save point get missed.
        if len(accumulated_image_ids) >= batch_size:
            batches_saved += 1
            _save_and_clear_accumulated(batches_saved)

# Save whatever is left over after the last full batch.
if accumulated_image_ids:
    batches_saved += 1
    _save_and_clear_accumulated(batches_saved)

print("All batches processed and saved.")

# KG Embedding Generation

## Vocabulary and IDF

### Vocabulary

In [None]:
# COCO Captions
coco_embeddings_path = os.path.join(your_path, "6.8611 Final Project/COCO/coco_embeddings.npz")
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on archives you produced yourself.
coco_data = np.load(coco_embeddings_path, allow_pickle=True)
coco_captions = coco_data["text_captions"]

# Viz-Wiz Captions
viz_wiz_annotations_path = "6.8611 Final Project/Viz-Wiz Captions/val.json"
total_wiz_annotations_path = os.path.join(your_path, viz_wiz_annotations_path)
with open(total_wiz_annotations_path, 'r') as f:
    viz_wiz_data = json.load(f)
viz_wiz_captions = [ann['caption'] for ann in viz_wiz_data['annotations']]

In [None]:
def normalize_token(token):
    """Strip leading/trailing punctuation from `token` and lowercase it.

    Same rule as `KGEmbedding.normalize_token`, so that vocabulary and IDF
    keys match the tokens looked up when embedding captions.
    """
    return token.strip(string.punctuation).lower()


# Collect every normalized, whitespace-separated token from both caption sets.
tokens = []
total_captions = coco_captions.tolist() + viz_wiz_captions
for caption in total_captions:
    tokens.extend(normalize_token(word) for word in caption.split())
vocabulary = set(tokens)
# Sorting makes the word -> index assignment deterministic across runs.
word_to_idx = {word: idx for idx, word in enumerate(sorted(vocabulary))}

In [None]:
# Pickle the word -> index mapping into the COCO folder of the project.
vocabulary_path = os.path.join(your_path, "6.8611 Final Project/COCO/our_vocabulary.pkl")
with open(vocabulary_path, 'wb') as f:
    pickle.Pickler(f).dump(word_to_idx)
print("Vocabulary saved!")

### IDF

Make sure you save the IDF values for both COCO and Viz-Wiz Captions.

In [None]:
def calculate_idf(captions, output_path):
    """Compute inverse document frequencies over `captions` and save them as CSV.

    Every caption counts as one document. Words are obtained by splitting on
    whitespace, stripping leading/trailing punctuation and lowercasing. The
    value stored for a word is ``ln(N / (1 + df))``, where ``N`` is the number
    of captions and ``df`` the number of captions containing the word; it is
    zero or negative for words that appear in (almost) every caption.

    Args:
        captions: Sequence of caption strings.
        output_path: Path of the CSV file to write (columns ``word``, ``idf``).

    Returns:
        dict mapping each normalized word to its IDF value.
    """
    num_documents = len(captions)
    word_document_counts = Counter()

    for caption in captions:
        # Same normalization as KGEmbedding.normalize_token, inlined so this
        # function does not depend on a module-level helper being defined.
        # A set is used so each word counts at most once per caption.
        tokens = {word.strip(string.punctuation).lower() for word in caption.split()}
        word_document_counts.update(tokens)

    idf = {word: np.log(num_documents / (1 + count)) for word, count in word_document_counts.items()}
    # Materialize the dict views as lists before handing them to pandas.
    idf_df = pd.DataFrame({'word': list(idf.keys()), 'idf': list(idf.values())})
    idf_df.to_csv(output_path, index=False)
    print(f"IDF values saved to {output_path}")
    return idf

idf_values_path = os.path.join(your_path, "6.8611 Final Project/Viz-Wiz Captions/idf_values.csv")
# Bind the result so the notebook does not echo the whole word -> IDF dict as
# the cell output.
viz_wiz_idf = calculate_idf(viz_wiz_captions, idf_values_path)

## ConceptNet Numberbatch

In [None]:
# Location of the gzipped Numberbatch text file inside the project folder.
embeddings_path = os.path.join(your_path, '6.8611 Final Project/ConceptNet_Data_Container/numberbatch-en-19.08.txt.gz')

def load_numberbatch_embeddings(file_path):
    """Load a gzipped, whitespace-separated embedding text file into a DataFrame.

    The first line is skipped (treated as a header). Every following non-blank
    line is read as ``term v1 v2 ... vn``.

    Args:
        file_path: Path to the gzip-compressed UTF-8 text file.

    Returns:
        pandas.DataFrame with one row per term (terms as the index) and one
        float64 column per vector component. An empty DataFrame is returned
        when the file contains no vectors.

    Raises:
        ValueError: If a component is not a valid float, or if rows have
            differing numbers of components.
    """
    terms = []
    embeddings = []

    with gzip.open(file_path, 'rt', encoding='utf8') as f:
        next(f, None)  # skip the header line; tolerate a completely empty file
        for line in f:
            elements = line.split()
            if not elements:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            terms.append(elements[0])
            # Compact float64 rows use far less memory than lists of Python floats.
            embeddings.append(np.fromiter(map(float, elements[1:]), dtype=np.float64))

    if not embeddings:
        return pd.DataFrame(index=terms)
    return pd.DataFrame(np.vstack(embeddings), index=terms)

# Parse the Numberbatch file into a DataFrame indexed by term.
numberbatch_df = load_numberbatch_embeddings(embeddings_path)

## TF-IDF

In [None]:
class KGEmbedding:
    """Builds caption embeddings as TF-IDF-weighted averages of word vectors.

    Word vectors are read from a Numberbatch-style text file and IDF values
    from a CSV with ``word`` and ``idf`` columns.
    """

    def __init__(self, numberbatch_path, idf_path, device="cuda"):
        """Load the word vectors and the IDF table.

        Args:
            numberbatch_path: Path to the space-separated embedding file.
            idf_path: Path to the IDF CSV (columns ``word`` and ``idf``).
            device: Stored on the instance; not otherwise used by this class.
        """
        self.device = device
        self.numberbatch_embeddings = self.load_numberbatch(numberbatch_path)
        self.idf_dict = self.load_idf(idf_path)

    def load_numberbatch(self, numberbatch_path):
        """Read the embedding file into a DataFrame indexed by term."""
        return pd.read_csv(
            numberbatch_path,
            sep=' ',
            header=None,
            index_col=0,
            skiprows=1,
            # Keep terms such as "null" or "nan" as strings instead of letting
            # pandas turn them into missing values in the index.
            keep_default_na=False
        )

    def normalize_token(self, token):
        """Strip leading/trailing punctuation from `token` and lowercase it."""
        return token.strip(string.punctuation).lower()

    def calculate_tf(self, caption):
        """Return {word: count / total token count} for the normalized tokens."""
        tokens = [self.normalize_token(word) for word in caption.split()]
        total_tokens = len(tokens)
        token_counts = Counter(tokens)
        tf = {word: count / total_tokens for word, count in token_counts.items()}
        return tf

    def calculate_tf_idf(self, caption, idf):
        """Return {word: tf * idf}; words missing from `idf` get weight 0."""
        tf = self.calculate_tf(caption)
        tf_idf = {word: float(tf_val) * idf.get(word, 0) for word, tf_val in tf.items()}
        return tf_idf

    def load_idf(self, idf_path):
        """Read the IDF CSV and return it as a {word: idf} dict."""
        # keep_default_na=False keeps words such as "null"/"nan" (and the empty
        # token) as strings rather than converting them to NaN keys.
        idf_df = pd.read_csv(idf_path, keep_default_na=False)
        idf_dict = dict(zip(idf_df['word'], idf_df['idf']))
        return idf_dict

    def get_embeddings_for_caption(self, text_caption):
        """Return the TF-IDF-weighted average word vector of a caption.

        Each occurrence of a word found in the embedding table contributes its
        vector multiplied by the word's TF-IDF weight; the sum is divided by
        the sum of the weights used.

        Returns:
            1-D numpy array with one entry per embedding dimension. A zero
            vector is returned when no word of the caption is in the embedding
            table or when the weights sum to exactly zero.
        """
        weight_dict = self.calculate_tf_idf(text_caption, self.idf_dict)
        # Look words up in this instance's own table rather than a notebook
        # global, so the object works regardless of which cells were run.
        known_terms = self.numberbatch_embeddings.index
        embeddings = []
        weights = []
        for word in text_caption.split():
            word = self.normalize_token(word)
            if word in known_terms:
                vanilla_embedding = self.numberbatch_embeddings.loc[word].values
                weight = weight_dict.get(word, 1.0)
                embeddings.append(vanilla_embedding * weight)
                weights.append(weight)

        zero_vector = np.zeros(self.numberbatch_embeddings.shape[1])
        if not embeddings:
            return zero_vector

        total_weight = np.sum(np.array(weights))
        if total_weight == 0:
            # A weighted average is undefined here; dividing would yield NaN/inf.
            return zero_vector

        weighted_embedding = np.sum(np.array(embeddings), axis=0) / total_weight
        return weighted_embedding


Run this for both COCO and Viz-Wiz Captions, updating the paths accordingly to store the embeddings. Our resulting embeddings are in the folders that we linked earlier in the notebook.

In [None]:
# Set to True to recompute the KG embeddings; otherwise this cell does nothing.
regenerate_kg = False
if regenerate_kg:
    viz_wiz_images_dir = "6.8611 Final Project/Viz-Wiz Captions/val"
    viz_wiz_annotations_path = "6.8611 Final Project/Viz-Wiz Captions/val.json"

    # All paths are resolved against `your_path` from the setup cell.
    total_wiz_images_dir = os.path.join(your_path, viz_wiz_images_dir)
    total_wiz_annotations_path = os.path.join(your_path, viz_wiz_annotations_path)

    with open(total_wiz_annotations_path, 'r') as f:
        viz_wiz_data = json.load(f)

    images = {img['id']: img for img in viz_wiz_data['images']}
    annotations = viz_wiz_data['annotations']
    print(f"Loaded {len(images)} images and {len(annotations)} annotations.")
    viz_wiz_captions = [ann['caption'] for ann in annotations]
    print(f"Loaded {len(viz_wiz_captions)} captions.")
    # set up KG embedding obj
    idf_values_path = os.path.join(your_path, "6.8611 Final Project/Viz-Wiz Captions/idf_values.csv")
    numberbatch_embeddings_path = os.path.join(your_path, '6.8611 Final Project/ConceptNet_Data_Container/numberbatch-en-19.08.txt.gz')
    kg_embedding_obj = KGEmbedding(numberbatch_embeddings_path, idf_values_path)
    # Keep one vector per caption, in caption order. A dict keyed by caption
    # would merge duplicate captions and cannot be converted to a tensor.
    caption_vectors = []
    for ix, caption in enumerate(viz_wiz_captions):
        caption_vectors.append(kg_embedding_obj.get_embeddings_for_caption(caption))
        if ix % 1000 == 0:
            print(f"Processed {ix} captions.")
    kg_embeddings = torch.tensor(np.stack(caption_vectors)).float().to(device)
    # Both counts should match: row i of kg_embeddings belongs to caption i.
    print(len(viz_wiz_captions))
    print(len(kg_embeddings))

    # Variable name kept as-is; the file it points to holds the Viz-Wiz embeddings.
    kg_coco_embedding_path = os.path.join(your_path, "6.8611 Final Project/Viz-Wiz Captions/kg_viz_wiz_embeddings.npz")
    np.savez_compressed(kg_coco_embedding_path, captions=viz_wiz_captions, kg_embeddings=kg_embeddings.cpu().numpy())