## MERGED ASSEMBLY CREATION

In [None]:
# import libraries

# general
import numpy as np
import pandas as pd
import xarray as xr
import tqdm as tqdm


# brain-score specific
import brainscore
import brainio 
from brainscore.benchmarks._neural_common import average_repetition
from brainio.assemblies import NeuroidAssembly
from brainio.packaging import write_netcdf # use this function to save it locally
from brainio.packaging import package_data_assembly # use this function to push to S3
from brainio.stimuli import StimulusSet
from brainio.packaging import package_stimulus_set

## 1. Load Oleo and Pico data

In [2]:
imgs_dir_path = '../images'
dependencies_dir_path = '../dependencies'

### 1.1 Load Oleo's DATA

In [3]:
# oleo is already on S3
assembly_oleo = brainscore.get_assembly('dicarlo.Sanghavi2021.domain_transfer')

Loading catalog from entrypoints
Loading lookup from /home/bocini/miniconda3/envs/brainscore/lib/python3.6/site-packages/brainscore/lookup.csv


brainio.dicarlo/assy_dicarlo_Sanghavi2021_domain_transfer.nc: 100%|██████████| 1.29G/1.29G [00:10<00:00, 118MB/s] 
brainio.dicarlo/image_dicarlo_domain_transfer.csv: 100%|██████████| 517k/517k [00:00<00:00, 661kB/s]
brainio.dicarlo/image_dicarlo_domain_transfer.zip: 100%|██████████| 198M/198M [00:13<00:00, 15.2MB/s] 


### 1.2 Load Pico's DATA

In [4]:
# pico is not on S3 yet, we load it from local 
file_path = dependencies_dir_path + '/data_pico/assy_dicarlo_pico_domain_transfer.nc'
assembly_pico = brainio.assemblies.DataAssembly.from_files(file_path)

### 1.3 Common Shared Stimulus Set (with background ids)


In [None]:
!python create_stimulus_set_with_background-id.py

In [6]:
# stimulus set with background id
csv_path = './merged_stimulus_set.csv'
merged_stimulus_set = pd.read_csv(csv_path)

## MERGING DATA ASSEMBLIES

In [11]:
# First of all, we need to make sure that the order of presentation of the images is the same. To do so we average out the repetitions and then compare them:
oleo_avg = average_repetition(assembly_oleo)
pico_avg = average_repetition(assembly_pico)
print('Is the order of presented stimuli the same for both the assemblies? ', np.all(oleo_avg.stimulus_id.values == pico_avg.stimulus_id.values))

Is the order of presented stimuli the same for both the assemblies?  True


In [18]:
# now check if whithin oleo assembly images have always the same number of repetitions
from collections import Counter

repetition_values = assembly_oleo.repetition.values  # Replace with your actual array
counts = Counter(repetition_values)
first_count = counts[repetition_values[0]]
are_all_same = all(count == first_count for count in counts.values())
if are_all_same:
    print("Each number is repeated the same number of times.")
else:
    print("Numbers are not repeated the same number of times.")

Each number is repeated the same number of times.


In [19]:
repetition_values = assembly_pico.repetition.values  # Replace with your actual array
counts = Counter(repetition_values)
first_count = counts[repetition_values[0]]
are_all_same = all(count == first_count for count in counts.values())
if are_all_same:
    print("Each number is repeated the same number of times.")
else:
    print("Numbers are not repeated the same number of times.")

Each number is repeated the same number of times.


In [24]:
merged_stimulus_set

Unnamed: 0,filepath,object_label,image_file_name,object_style,stimulus_source,filename,image_id,stimulus_id,background_id
0,from_ko/04-art-4/im60.png,apple,im0.png,cartoon,Art,b0f59906243e042456b315475f987291aa665774.png,b0f59906243e042456b315475f987291aa665774,b0f59906243e042456b315475f987291aa665774,0
1,from_ko/04-art-0/im61.png,apple,im1.png,cartoon,Art,7ab65cd6d6b0b8399d070a541a0234c8836e1e8b.png,7ab65cd6d6b0b8399d070a541a0234c8836e1e8b,7ab65cd6d6b0b8399d070a541a0234c8836e1e8b,0
2,from_ko/04-art-3/im50.png,apple,im2.png,cartoon,Art,9ac7accbe9bdb749efd9604c16cfe52015c976d6.png,9ac7accbe9bdb749efd9604c16cfe52015c976d6,9ac7accbe9bdb749efd9604c16cfe52015c976d6,0
3,from_ko/04-art-4/im61.png,apple,im3.png,cartoon,Art,247ceb4c3e17f3bb6b661abf2b9497004116efb3.png,247ceb4c3e17f3bb6b661abf2b9497004116efb3,247ceb4c3e17f3bb6b661abf2b9497004116efb3,0
4,from_ko/04-art-0/im62.png,apple,im4.png,cartoon,Art,e7e9e6ad6c6935818e1910f1f45bc8a2be606f24.png,e7e9e6ad6c6935818e1910f1f45bc8a2be606f24,e7e9e6ad6c6935818e1910f1f45bc8a2be606f24,0
...,...,...,...,...,...,...,...,...,...
3133,from_ko/03-sils-0/im107.png,zebra,im3133.png,skeleton,Silhouette,3925265837668ccf0655b9b669c6640a1ab357fe.png,3925265837668ccf0655b9b669c6640a1ab357fe,3925265837668ccf0655b9b669c6640a1ab357fe,116
3134,from_ko/03-sils-1/im106.png,zebra,im3134.png,skeleton,Silhouette,1c4b44fa576b23a2fc37fa49972343617efb3f7c.png,1c4b44fa576b23a2fc37fa49972343617efb3f7c,1c4b44fa576b23a2fc37fa49972343617efb3f7c,117
3135,from_ko/03-sils-2/im107.png,zebra,im3135.png,skeleton,Silhouette,877addd4d9ff7004abd7689b257e0fb74a0fd300.png,877addd4d9ff7004abd7689b257e0fb74a0fd300,877addd4d9ff7004abd7689b257e0fb74a0fd300,120
3136,from_ko/03-sils-4/im105.png,zebra,im3136.png,skeleton,Silhouette,8ca46d5b61975cc6ba6eaf9a542867ac22672cd5.png,8ca46d5b61975cc6ba6eaf9a542867ac22672cd5,8ca46d5b61975cc6ba6eaf9a542867ac22672cd5,112


In [7]:
# need to add background id to the presentation dimension of oleo assembly
assembly_oleo['background_id'] = ('presentation', 
                                  np.array([merged_stimulus_set.background_id[merged_stimulus_set.stimulus_id == stim_id].values[0] 
                                        for stim_id in assembly_oleo['stimulus_id'].values]))

In [8]:
# small renaming and reordering of presentation and neuroid for assembly pico, to make it compatible with oleo
# we also add the backgroud_id to the presentation
data = assembly_pico.values
coords = {
    # selection of presentaiton
    'object_label': ('presentation', assembly_pico['object_label'].values),
    'object_style': ('presentation', assembly_pico['object_style'].values),
    'filepath': ('presentation', assembly_pico['filepath'].values),
    'stimulus_source': ('presentation', assembly_pico['identifier'].values),
    'image_file_name': ('presentation', assembly_pico['image_file_name'].values),
    'image_current_local_file_path': ('presentation', assembly_pico['image_current_local_file_path'].values),
    'image_id': ('presentation', assembly_pico['stimulus_id'].values),
    'repetition': ('presentation', assembly_pico['repetition'].values),
    'stimulus_id': ('presentation', assembly_pico['stimulus_id'].values),
    'filename': ('presentation', [assembly_pico['stimulus_id'].values[i] + '.png' for i in range(len(assembly_pico['stimulus_id'].values))]),
    'background_id' : ('presentation',
                    np.array([merged_stimulus_set.background_id[merged_stimulus_set.stimulus_id == stim_id].values[0] 
                              for stim_id in assembly_pico['stimulus_id'].values])),
    # selection of neuroid
    'col' : ('neuroid', assembly_pico['col'].values),
    'row' : ('neuroid', assembly_pico['row'].values),
    'bank' : ('neuroid', assembly_pico['bank'].values),
    'elec' : ('neuroid', assembly_pico['elec'].values),
    'label' : ('neuroid', assembly_pico['label'].values),
    'arr' : ('neuroid', assembly_pico['arr'].values),
    'hemisphere' : ('neuroid', assembly_pico['hemisphere'].values),
    'subregion' : ('neuroid', assembly_pico['subregion'].values),
    'region' : ('neuroid', assembly_pico['region'].values),
    'animal' : ('neuroid', assembly_pico['animal'].values),
    'neuroid_id' : ('neuroid', assembly_pico['neuroid_id'].values),
    # entire time_bin
    'time_bin' : assembly_pico.time_bin
}
assembly_pico = xr.DataArray(data, dims=['presentation', 'neuroid', 'time_bin'],
                        coords = coords)

assembly_pico = NeuroidAssembly(assembly_pico)

In [9]:
# some other operations for compatibility: we pad nans in Pico presentation coordinate to make sure that the number of repetitions is the same.:
new_data = np.empty((197694, 75, 7))
new_data[:] = np.nan

new_assembly_pico = xr.DataArray(new_data, dims=['presentation', 'neuroid', 'time_bin'],
                        coords = {
                            'presentation': assembly_oleo.presentation,
                            'neuroid' : assembly_pico.neuroid,
                            'time_bin' : assembly_pico.time_bin
                            })

for i in tqdm.tqdm(range(len(assembly_pico)), desc='merging'): # 

    r = i//35*28 +i
    assert assembly_pico.stimulus_id[i] == new_assembly_pico.stimulus_id[r]
    assert assembly_pico.repetition[i] == new_assembly_pico.repetition[r]
    new_assembly_pico[r] = assembly_pico[i]

merging: 100%|██████████| 109830/109830 [17:48<00:00, 102.82it/s]


In [10]:
# now we can finally merge the data:
merged_data = np.concatenate((assembly_oleo.values, new_assembly_pico.values), axis=1)

coords = {
    # entire presentation
    'presentation' : new_assembly_pico.presentation,
    # selection of neuroid
    'col' : ('neuroid', list(assembly_oleo['col'].values) + list(assembly_pico['col'].values)),
    'row' : ('neuroid', list(assembly_oleo['row'].values) + list(assembly_pico['row'].values)),
    'bank' : ('neuroid', list(assembly_oleo['bank'].values) + list(assembly_pico['bank'].values)),
    'elec' : ('neuroid', list(assembly_oleo['elec'].values)+ list(assembly_pico['elec'].values)),
    'label' : ('neuroid', list(assembly_oleo['label'].values) + list(assembly_pico['label'].values)),
    'arr' : ('neuroid', list(assembly_oleo['arr'].values) + list(assembly_pico['arr'].values)),
    'hemisphere' : ('neuroid', list(assembly_oleo['hemisphere'].values) + list(assembly_pico['hemisphere'].values)),
    'subregion' : ('neuroid', list(assembly_oleo['subregion'].values) + list(assembly_pico['subregion'].values)),
    'region' : ('neuroid', list(assembly_oleo['region'].values) + list(assembly_pico['region'].values)),
    'animal' : ('neuroid', list(assembly_oleo['animal'].values) + list(assembly_pico['animal'].values)),
    'electrode_id' : ('neuroid', list(assembly_oleo['neuroid_id'].values )+ list(assembly_pico['neuroid_id'].values)),
    'neuroid_id' : ('neuroid', list(assembly_oleo['animal'].values + '_' + assembly_oleo['neuroid_id'].values) + list(assembly_pico['animal'].values + '_' + assembly_pico['neuroid_id'].values)),
    
    # entire time_bin
    'time_bin' : assembly_oleo.time_bin
}

merged_assembly = xr.DataArray(merged_data,dims=['presentation', 'neuroid', 'time_bin'],
                            coords=coords)
   

In [12]:
merged_assembly = NeuroidAssembly(merged_assembly)
merged_assembly = merged_assembly.transpose('presentation', 'neuroid', 'time_bin')

merged_assembly.attrs = assembly_oleo.attrs

merged_assembly.attrs['identifier']='Igustibagus2024'
merged_assembly.attrs['stimulus_set']=merged_stimulus_set

merged_assembly.name = 'Igustibagus2024'

In [None]:
# package stimuli
stimuli = StimulusSet(merged_stimulus_set)
stimuli.stimulus_paths = {row['stimulus_id']: imgs_dir_path + '/' +  row['filename'] for _, row in stimuli.iterrows()}

stimuli.drop('filename', axis=1, inplace=True)

stimuli.name = 'Igustibagus2024' 

assert len(stimuli) == 3138 

package_stimulus_set(catalog_name='brainio_brainscore', proto_stimulus_set=stimuli, stimulus_set_identifier=stimuli.name,
                     bucket_name="brainio-brainscore") # upload to S3

In [None]:
# upload to S3
package_data_assembly('brainio_brainscore', merged_assembly, assembly_identifier=merged_assembly.name,
        stimulus_set_identifier=stimuli.name, assembly_class_name="NeuronRecordingAssembly", 
        bucket_name="brainio-brainscore")