### Libraries 

In [43]:
import numpy as np
import pandas as pd
import pickle
import os
import umap
import hdbscan
from tqdm.notebook import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler


def load_data(file):
    """Unpickle and return the object stored at ``file``.

    Prints a one-line progress message before reading.

    Parameters
    ----------
    file : str
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever object was pickled into ``file``.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while deserialising, so this
    must only be pointed at trusted files.
    """
    print('loading file: ' + file)
    with open(file, 'rb') as handle:
        return pickle.load(handle)

def dump_data(data, filename):
    """Pickle ``data`` into ``filename`` with the highest pickle protocol.

    Prints a one-line progress message before writing. An existing file at
    ``filename`` is overwritten (the file is opened in ``'wb'`` mode).

    Parameters
    ----------
    data : object
        Any picklable object.
    filename : str
        Destination path of the pickle file.
    """
    print('writing file: ' + filename)
    with open(filename, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Load embeddings

In [44]:
# Load the pickled CLIP visual / text embedding dictionaries.
c_vis = load_data("CLIP_vis_fmri_768.pickle")
c_txt = load_data("CLIP_txt_fmri_768.pickle")

# Category label per stimulus = stimulus name minus its last 8 characters.
# NOTE(review): presumably this strips a suffix such as "_01b.jpg" — confirm
# against the stimulus naming scheme.
c_txt["category"] = np.array([name[:-8] for name in c_vis["stimuli"]])

fmri_stim = np.load("/projects/crunchie/boyanova/EEG_Things/eeg_prep/scripts/fmri_train_stim.npy", allow_pickle=True)
main_path_stim = "/projects/crunchie/boyanova/EEG_Things/data_set/Images"

# Image path = <image root>/<folder>/<file name>, where the folder is the part
# of the file name before the first "." with its last 4 characters removed.
# NOTE(review): assumes fmri_stim is in the same order as c_vis["stimuli"];
# nothing here verifies that — confirm.
c_vis["stimuli_paths"] = np.array([
    os.path.join(main_path_stim, im.split(".")[0][:-4], im)
    for im in tqdm(fmri_stim)
])
c_vis["category"] = c_txt["category"]

loading file: CLIP_vis_fmri_768.pickle
loading file: CLIP_txt_fmri_768.pickle


  0%|          | 0/8640 [00:00<?, ?it/s]

In [131]:
# Reduce every "bigger_concept" entry to a single label so the result can be
# used as a flat, one-label-per-stimulus list.
bigger_concept = load_data("CLIP_txt_fmri_512.pickle")["bigger_concept"]
flattened_list = []
array_data = bigger_concept
for sublist in array_data:
    if np.ndim(sublist) == 0:
        # Scalar entry. NumPy scalars and 0-D arrays are unwrapped with
        # .item(); plain Python scalars (e.g. str) also report ndim == 0 but
        # have no .item(), so they are kept as they are instead of raising
        # AttributeError.
        flattened_list.append(sublist.item() if hasattr(sublist, "item") else sublist)
    else:
        # Sequence entry: keep only its first element.
        flattened_list.append(sublist[0])

### Clusters CLIP-vis

In [124]:
# UMAP reducer: 5 output components, 30 neighbours, fixed random_state.
reducer = umap.UMAP(
    n_neighbors=30,
    n_components=5,
    random_state=42,
)
# HDBSCAN clusterer run on the reduced embeddings with a Euclidean metric and
# a minimum cluster size of 100.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=100,
    metric='euclidean',
)

In [143]:
### Get rid of all animate objects
# True for every stimulus whose flattened concept label is NOT "animal".
animal_mask = np.isin(np.array(flattened_list), "animal", invert=True)
stimuli = c_vis["stimuli"][animal_mask]
stimuli_paths = c_vis["stimuli_paths"][animal_mask]

# Standardise the retained embeddings, reduce them with UMAP, then cluster the
# reduced representation with HDBSCAN.
# NOTE: `embeddings` holds the *standardised* values from here on.
scaler = StandardScaler()
embeddings = scaler.fit_transform(c_vis["embeddings"][animal_mask, :])
reduced_embeddings = reducer.fit_transform(embeddings)
cluster = labels = clusterer.fit_predict(reduced_embeddings)

In [151]:
# Rebuild c_vis from the animal-filtered data. "embeddings" are the
# standardised embeddings produced by the clustering cell, not the raw ones.
# Fixed: the "bigger_concept" entry used `=` instead of `:` and lacked its
# trailing comma, which made this cell a SyntaxError.
c_vis = {"embeddings": embeddings, 
        "stimuli": stimuli,
        "stimuli_paths": stimuli_paths,
        "category": np.array([x[0:-8] for x in stimuli]),
        "bigger_concept": np.array(flattened_list)[animal_mask],
        "cluster": labels}

In [152]:
### Compute centroid
# One row per stimulus, one column per embedding dimension, plus the cluster
# label as an extra column.
embeddings_df = pd.DataFrame(c_vis["embeddings"].tolist()).assign(cluster=c_vis["cluster"])

# Mean embedding per cluster label, rows ordered by ascending label.
# NOTE(review): HDBSCAN documents -1 as its noise label; if any points are
# labelled -1, row 0 is the "noise centroid" and row i corresponds to label
# i - 1 rather than label i — confirm downstream indexing.
centroids = embeddings_df.groupby('cluster').mean().to_numpy()
c_vis["cluster_centroids"] = centroids

In [168]:
# Persist the filtered / clustered CLIP-vis dictionary.
# NOTE(review): the file name matches the pickle loaded at the top of the
# notebook; if the working directory is this scripts folder, the input file is
# overwritten — confirm that is intended.
vis_output_path = "/projects/crunchie/boyanova/EEG_Things/eeg_prep/scripts/CLIP_vis_fmri_768.pickle"
dump_data(c_vis, vis_output_path)

writing file: /projects/crunchie/boyanova/EEG_Things/eeg_prep/scripts/CLIP_vis_fmri_768.pickle


### Clusters CLIP-txt

In [156]:
# Same pipeline as for CLIP-vis, applied to the text embeddings: keep the
# non-animal rows, standardise, reduce with UMAP, cluster with HDBSCAN.
# The reducer and clusterer objects are re-fitted here on the text data.
scaler = StandardScaler()
embeddings = scaler.fit_transform(c_txt["embeddings"][animal_mask, :])
reduced_embeddings = reducer.fit_transform(embeddings)
cluster = labels = clusterer.fit_predict(reduced_embeddings)

In [165]:
# Rebuild c_txt from the animal-filtered data. "embeddings" are the
# standardised text embeddings; stimuli and paths are the filtered arrays
# already computed for CLIP-vis.
c_txt = dict(
    embeddings=embeddings,
    stimuli=stimuli,
    stimuli_paths=stimuli_paths,
    category=np.array([name[:-8] for name in stimuli]),
    bigger_concept=np.array(flattened_list)[animal_mask],
    cluster=labels,
)

In [170]:
### Compute centroid
# One row per stimulus, one column per embedding dimension, plus the cluster
# label as an extra column.
embeddings_df = pd.DataFrame(c_txt["embeddings"].tolist()).assign(cluster=c_txt["cluster"])

# Mean embedding per cluster label, rows ordered by ascending label.
# NOTE(review): HDBSCAN documents -1 as its noise label; if any points are
# labelled -1, row 0 is the "noise centroid" — confirm downstream indexing.
centroids = embeddings_df.groupby('cluster').mean().to_numpy()

In [171]:
# Attach the per-cluster mean embeddings to the text dictionary.
c_txt.update(cluster_centroids=centroids)

In [172]:
# Persist the filtered / clustered CLIP-txt dictionary.
# NOTE(review): the file name matches the pickle loaded at the top of the
# notebook; if the working directory is this scripts folder, the input file is
# overwritten — confirm that is intended.
txt_output_path = "/projects/crunchie/boyanova/EEG_Things/eeg_prep/scripts/CLIP_txt_fmri_768.pickle"
dump_data(c_txt, txt_output_path)

writing file: /projects/crunchie/boyanova/EEG_Things/eeg_prep/scripts/CLIP_txt_fmri_768.pickle
