### Libraries 

In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import umap
import hdbscan
from tqdm.notebook import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler


def load_data(file):
    """Load a pickled object from disk.

    Parameters
    ----------
    file : str or os.PathLike
        Location of the pickle file to read.

    Returns
    -------
    object
        The object that was stored in the pickle.

    Notes
    -----
    Unpickling can execute arbitrary code; only call this on files that
    come from a trusted source.
    """
    # f-string instead of '+' so pathlib.Path arguments work as well as str.
    print(f'loading file: {file}')
    with open(file, 'rb') as f:
        return pickle.load(f)

def dump_data(data, filename):
    """Pickle ``data`` to ``filename`` using the highest pickle protocol.

    Parameters
    ----------
    data : object
        Any picklable object.
    filename : str or os.PathLike
        Destination file; it is created or overwritten.
    """
    # f-string instead of '+' so pathlib.Path arguments work as well as str.
    print(f'writing file: {filename}')
    with open(filename, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

### Load embeddings

In [3]:
# --- Configuration -----------------------------------------------------
project_dir = "/projects/crunchie/boyanova/EEG_Things/Grouping-Embeddings"
main_path_stim = "/projects/crunchie/boyanova/EEG_Things/data_set/Images"
# Stimulus class to keep later on; "inanimate" makes the clustering cell
# invert the animate mask.
class_type = "inanimate"

# Every input file sits in the project's "files" sub-folder.
files_dir = os.path.join(project_dir, "files")

# --- Embedding dictionaries and auxiliary arrays -----------------------
c_vis = load_data(os.path.join(files_dir, "CLIP_vis_fmri.pickle"))
c_txt = load_data(os.path.join(files_dir, "CLIP_txt_fmri_blip.pickle"))
fmri_stim = np.load(os.path.join(files_dir, "fmri_train_stim.npy"), allow_pickle=True)
animal_mask = np.load(os.path.join(files_dir, "animate_mask.npy"))

# Category label = stimulus name with its last 8 characters removed.
# The same array object is stored in both dictionaries.
stimulus_categories = np.array([name[:-8] for name in c_vis["stimuli"]])
c_txt["category"] = stimulus_categories
c_vis["category"] = stimulus_categories

# Image path = <image root>/<category folder>/<file name>, where the folder
# is the file stem (text before the first ".") minus its last 4 characters.
c_vis["stimuli_paths"] = np.array([
    os.path.join(main_path_stim, image_name.split(".")[0][:-4], image_name)
    for image_name in tqdm(fmri_stim)
])

loading file: /projects/crunchie/boyanova/EEG_Things/Grouping-Embeddings/files/CLIP_vis_fmri.pickle
loading file: /projects/crunchie/boyanova/EEG_Things/Grouping-Embeddings/files/CLIP_txt_fmri_blip.pickle


  0%|          | 0/8640 [00:00<?, ?it/s]

### Clusters CLIP-vis

In [5]:
# UMAP projection to 5 components; the fixed random_state pins the seed
# (and, per the warning UMAP emits, forces n_jobs to 1).
umap_settings = {"n_neighbors": 30, "n_components": 5, "random_state": 42}
reducer = umap.UMAP(**umap_settings)

# HDBSCAN on the reduced space, Euclidean metric, min_cluster_size of 100.
hdbscan_settings = {"min_cluster_size": 100, "metric": "euclidean"}
clusterer = hdbscan.HDBSCAN(**hdbscan_settings)

In [None]:
### Keep only the stimuli of the requested class
# NOTE(review): the mask is inverted in place, so running this cell a second
# time flips the selection back. The CLIP-txt cell further down reuses
# `animal_mask` exactly as it is left here.
if class_type == "inanimate":
    animal_mask = np.invert(animal_mask)

selected_embeddings = c_vis["embeddings"][animal_mask, :]
stimuli, stimuli_paths = (
    c_vis[field][animal_mask] for field in ("stimuli", "stimuli_paths")
)

# Standardise every embedding dimension, project with UMAP, then cluster the
# low-dimensional representation with HDBSCAN.
scaler = StandardScaler()
embeddings = scaler.fit_transform(selected_embeddings)
reduced_embeddings = reducer.fit_transform(embeddings)
labels = cluster = clusterer.fit_predict(reduced_embeddings)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [7]:
# Bundle the filtered, standardised visual embeddings with their metadata and
# cluster labels. NOTE: this rebinds `c_vis`; the unfiltered dictionary loaded
# earlier is no longer reachable under that name afterwards.
c_vis = dict(
    embeddings=embeddings,
    stimuli=stimuli,
    stimuli_paths=stimuli_paths,
    category=np.array([name[:-8] for name in stimuli]),
    cluster=labels,
)

### Compute centroid
# Mean embedding per distinct cluster label, one row per label.
# NOTE(review): HDBSCAN marks noise points with label -1; if any are present
# they get their own "centroid" row — confirm downstream code expects that.
embeddings_df = pd.DataFrame(c_vis["embeddings"].tolist()).assign(
    cluster=c_vis["cluster"]
)
centroids = embeddings_df.groupby("cluster").mean().to_numpy()
c_vis["cluster_centroids"] = centroids

# Persist next to the input files, tagged with the selected class.
file_name = "CLIP_vis_fmri_{}.pickle".format(class_type)
dump_data(c_vis, os.path.join(project_dir, "files", file_name))

### Clusters CLIP-txt

In [156]:
# Same pipeline as for CLIP-vis, applied to the text embeddings.
# `animal_mask` is used as left by the CLIP-vis cell (already inverted there
# when class_type is "inanimate"); the reducer and clusterer are refitted.
scaler = StandardScaler()
embeddings = scaler.fit_transform(c_txt["embeddings"][animal_mask, :])
reduced_embeddings = reducer.fit_transform(embeddings)
labels = cluster = clusterer.fit_predict(reduced_embeddings)

In [165]:
# Bundle the filtered, standardised text embeddings with the stimulus metadata
# selected in the CLIP-vis section and the text-based cluster labels.
# NOTE: this rebinds `c_txt`, replacing the dictionary loaded from disk.
c_txt = dict(
    embeddings=embeddings,
    stimuli=stimuli,
    stimuli_paths=stimuli_paths,
    category=np.array([name[:-8] for name in stimuli]),
    cluster=labels,
)

In [170]:
### Compute centroid
# Mean text embedding per distinct cluster label, one row per label.
# NOTE(review): HDBSCAN marks noise points with label -1; if any are present
# they get their own "centroid" row — confirm downstream code expects that.
embeddings_df = pd.DataFrame(c_txt["embeddings"].tolist()).assign(
    cluster=c_txt["cluster"]
)
centroids = embeddings_df.groupby("cluster").mean().to_numpy()

In [171]:
# Attach the text-embedding centroids and persist the text clustering.
c_txt["cluster_centroids"] = centroids
file_name = f"CLIP_txt_fmri_blip_{class_type}.pickle"
# Write c_txt here: the original call passed c_vis, which stored the visual
# results under the text file name and never saved the text results.
dump_data(c_txt, os.path.join(project_dir, "files", file_name))