In [1]:
import os

In [2]:
import re

In [3]:
import zipfile

In [4]:
import glob

In [5]:
import itertools

In [6]:
import numpy as np

In [7]:
import pandas as pd

In [8]:
import xarray as xr

In [9]:
import matplotlib.pyplot as plt

In [10]:
import mkgu

In [11]:
from mkgu.knownfile import KnownFile as kf

In [12]:
from mkgu.knownfile import FileRecord, Sighting

In [13]:
from mkgu.lookup import pwdb

In [14]:
from mkgu.assemblies import AssemblyModel, AssemblyStoreMap, AssemblyStoreModel

In [15]:
from mkgu.stimuli import ImageModel, AttributeModel, ImageMetaModel, StimulusSetModel, ImageStoreModel, \
    StimulusSetImageMap, ImageStoreMap

## Process .nc files

In [16]:
# Base directory that is searched below for the per-session data.nc files.
v2_base_path = "/braintree/data2/active/users/jjpr/mkgu_packaging/crcns/v2-1"

In [17]:
# Collect every .nc file located exactly two directory levels below the base path.
# The pattern contains no "**", so glob's `recursive` flag would have no effect and is omitted.
# The list itself is sorted (not only its display) so that the order in which files are
# processed downstream does not depend on filesystem enumeration order.
nc_files = sorted(glob.glob(os.path.join(v2_base_path, "*/*/*.nc")))
nc_files

['/braintree/data2/active/users/jjpr/mkgu_packaging/crcns/v2-1/V2Data1/DGrat/data.nc',
 '/braintree/data2/active/users/jjpr/mkgu_packaging/crcns/v2-1/V2Data10/NatRev/data.nc',
 '/braintree/data2/active/users/jjpr/mkgu_packaging/crcns/v2-1/V2Data11/NatRev/data.nc',
 '/braintree/data2/active/users/jjpr/mkgu_packaging/crcns/v2-1/V2Data12/NatRev/data.nc',
 '/braintree/data2/active/users/jjpr/mkgu_packaging/crcns/v2-1/V2Data13/NatRev/data.nc',
 '/braintree/data2/active/users/jjpr/mkgu_packaging/crcns/v2-1/V2Data14/NatRev/data.nc',
 '/braintree/data2/active/users/jjpr/mkgu_packaging/crcns/v2-1/V2Data15/NatRev/data.nc',
 '/braintree/data2/active/users/jjpr/mkgu_packaging/crcns/v2-1/V2Data16/NatRev/data.nc',
 '/braintree/data2/active/users/jjpr/mkgu_packaging/crcns/v2-1/V2Data17/NatRev/data.nc',
 '/braintree/data2/active/users/jjpr/mkgu_packaging/crcns/v2-1/V2Data18/NatRev/data.nc',
 '/braintree/data2/active/users/jjpr/mkgu_packaging/crcns/v2-1/V2Data19/NatRev/data.nc',
 '/braintree/data2/acti

In [None]:
# Open each netCDF file as a DataArray, keyed by the path it was read from.
gd_arrays = {nc_path: xr.open_dataarray(nc_path) for nc_path in nc_files}
gd_arrays

In [None]:
# Transpose every array, rename its "image_file_name" dimension to "presentation",
# and attach id coordinates along both dimensions. The dict entry is replaced with
# the reshaped array.
# NOTE(review): this cell is not idempotent -- a second run would transpose again
# and presumably fail on the rename; re-run the loading cell first.
for nc_path, raw_array in list(gd_arrays.items()):
    reshaped = raw_array.T.rename({"image_file_name": "presentation"})
    presentation_count = reshaped.shape[1]  # size of axis 1, the one labelled "presentation" below
    reshaped.coords["presentation_id"] = ("presentation", range(presentation_count))
    reshaped.coords["neuroid_id"] = ("neuroid", reshaped["neuroid"].values)
    gd_arrays[nc_path] = reshaped
gd_arrays

In [None]:
def massage_file_name(file_name):
    """Derive per-image metadata from a stimulus file path.

    The path may use backslashes, forward slashes, or a mix of both as separators;
    empty components (e.g. from a leading or doubled separator) are discarded.

    Parameters
    ----------
    file_name : str
        Path of the image file as recorded in the source data.

    Returns
    -------
    dict
        ``"image_file_path_original"``: the last (up to) five path components
        joined with ``os.path.join``.
        ``"image_id"``: the ``sha1`` attribute of ``KnownFile`` built from the path
        re-rooted at ``"/"`` (presumably the SHA-1 of the file contents -- confirm
        against ``mkgu.knownfile``).

    Raises
    ------
    TypeError
        If ``file_name`` contains no non-empty components (``os.path.join`` is then
        called without arguments).
    """
    # The regex matches a single backslash or a forward slash.
    parts = [part for part in re.split("\\\\|/", file_name) if part]
    full_path = os.path.join("/", *parts)
    return {
        "image_file_path_original": os.path.join(*parts[-5:]),
        "image_id": kf(full_path).sha1,
    }

In [None]:
# For every array: compute per-presentation image metadata, attach it as coordinates
# on the "presentation" dimension, then rebuild the indexes.
# NOTE(review): the reset_index calls use inplace=True and their return values are
# discarded, so this cell relies on the arrays being mutated in place; it presumably
# also relies on gather_indexes mutating its argument -- confirm against mkgu.assemblies.
for gd_array_key in gd_arrays:
    print(gd_array_key)
    gd_array = gd_arrays[gd_array_key]
    # One metadata dict per presentation value, turned into one column per key.
    df_massage = pd.DataFrame(list(map(massage_file_name, gd_array["presentation"].values)))
    for column in df_massage.columns:
        gd_array.coords[column] = ("presentation", df_massage[column])
    # Drop the existing "neuroid" and "presentation" indexes.
    gd_array.reset_index(["neuroid", "presentation"], drop=True, inplace=True)
    mkgu.assemblies.gather_indexes(gd_array)
    # Drop "category_name" after the indexes have been gathered.
    gd_array.reset_index(["category_name"], drop=True, inplace=True)
gd_arrays

#### Combine arrays

In [None]:
# Outer-join all arrays onto a common set of index labels; the result holds one
# aligned array per input array.
aligned = xr.align(*gd_arrays.values(), join="outer")
aligned

In [None]:
# Shape of the first aligned array.
aligned[0].shape

In [None]:
# For each aligned array, whether it contains at least one non-NaN value.
[(~np.isnan(da)).any() for da in aligned]

In [None]:
# Flat (raveled) positions of the non-NaN values in each aligned array.
non_nan_indices = [
    np.flatnonzero(~np.isnan(aligned_array.values))
    for aligned_array in aligned
]
non_nan_indices

In [None]:
# Sanity check: for every pair of arrays, print whether any non-NaN position is shared.
# Every printed value should be False.
for left_indices, right_indices in itertools.combinations(non_nan_indices, 2):
    has_overlap = np.in1d(left_indices, right_indices).any()
    print(has_overlap)

In [None]:
# NaN-filled array modelled on the first aligned array; used as the starting
# values for the combined result.
blank = np.full_like(aligned[0], np.nan)
blank

In [None]:
# Empty (all-NaN) result array carrying the coordinates and dimensions of the
# first aligned array.
da_result = xr.DataArray(blank, coords=aligned[0].coords, dims=aligned[0].dims)
da_result

In [None]:
# Fold every aligned array into the result; combine_first keeps values already
# present in the result and fills the remaining gaps from the next array.
for aligned_array in aligned:
    da_result = da_result.combine_first(aligned_array)
da_result

In [None]:
def levels_for_index(xr_data, index):
    """Return the ``names`` of the index stored under ``index`` in ``xr_data.indexes``."""
    target_index = xr_data.indexes[index]
    return target_index.names


def all_index_levels(xr_data):
    """Return a flat list with the level names of every index in ``xr_data.indexes``.

    Names appear in the iteration order of ``xr_data.indexes``; within one index
    they keep the order reported by ``levels_for_index``.
    """
    flattened = []
    for index_name in xr_data.indexes:
        flattened.extend(levels_for_index(xr_data, index_name))
    return flattened

In [None]:
# Reset every index level of the result in place, before the array is written to netCDF below.
# NOTE(review): relies on inplace=True; the return value is discarded.
da_result.reset_index(all_index_levels(da_result), inplace=True)

In [18]:
# Path of the combined netCDF file that is written and then reloaded below.
target_netcdf_file = "/braintree/data2/active/users/jjpr/mkgu_packaging/crcns/v2-1/crcns_v2-1_neuronal.nc"

In [None]:
# Write the combined array to the target netCDF file.
da_result.to_netcdf(target_netcdf_file)

In [19]:
# Reload the combined array from disk; the later cells work from this copy.
da_loaded = xr.open_dataarray(target_netcdf_file)

In [20]:
# Show the summary of the reloaded array.
da_loaded

<xarray.DataArray (neuroid: 135, presentation: 3494103)>
[471703905 values with dtype=float64]
Coordinates:
    region                    (neuroid) object ...
    animal                    (neuroid) object ...
    neuroid_id                (neuroid) object ...
    stimulusRepeats           (presentation) int64 ...
    presentation_id           (presentation) int64 ...
    image_file_path_original  (presentation) object ...
    image_id                  (presentation) object ...
Dimensions without coordinates: neuroid, presentation

In [43]:
# Locate the non-NaN entries of the reloaded array.
# NOTE(review): np.nonzero is applied to the DataArray itself rather than to its
# `.values`; the recorded output is a 2-row DataArray that still carries the
# neuroid/presentation coordinate names -- confirm whether `.values` was intended.
non_nan = np.nonzero(~np.isnan(da_loaded))
non_nan

<xarray.DataArray (neuroid: 2, presentation: 74740)>
array([[      0,       0,       0, ...,     134,     134,     134],
       [     16,      35,      54, ..., 3481545, 3481546, 3481547]])
Coordinates:
    region                    (neuroid) object ...
    animal                    (neuroid) object ...
    neuroid_id                (neuroid) object ...
    stimulusRepeats           (presentation) int64 ...
    presentation_id           (presentation) int64 ...
    image_file_path_original  (presentation) object ...
    image_id                  (presentation) object '437fcb15ba2caa46c4a2aea77796c8316b8a98cf' ...
Dimensions without coordinates: neuroid, presentation

## Make the image zip file

In [21]:
# One row per distinct image_id found in the reloaded array (np.unique also sorts them).
df_image_meta = pd.DataFrame({"image_id": np.unique(da_loaded["image_id"].values)})

In [22]:
# Show the image metadata table.
df_image_meta

Unnamed: 0,image_id
0,00000a2518d148e9b455537488e325a7f42f39d8
1,00001776837935437e2784af953a4db24cc19536
2,00003bb7e9ca9265dc521763f73514f9c4d48f8a
3,00005ad0cbb2bafc56814919fda60268a1077ba3
4,00005e46dace8673b631b75f1df3a6d1a8b2c1c2
5,00006b3962b738bb25ab44265092ac6ef71943ed
6,00007b43039dc9f26f12d6e222460a70a1400d1f
7,00008a40da04647d4350b9acb0b6657326c1ba54
8,00008be32ed56378291bc536589d97bc39e35557
9,00008eead95d140fb4591430b11193525dfc379a


In [23]:
def first_dupe(sha1):
    """Return the location of the first listed sighting of the file with this SHA-1.

    Fetches the ``FileRecord`` whose ``sha1`` matches and reads ``location`` from
    the first element of its ``sightings``.
    """
    record = FileRecord.get(sha1=sha1)
    first_listed = record.sightings[0]
    return first_listed.location
# The order of sightings is not guaranteed, so on subsequent runs verify that the
# same locations were selected (see below).

In [24]:
# Resolve each image_id to the location of one file with that hash.
df_image_meta["first_dupe"] = [first_dupe(sha1) for sha1 in df_image_meta["image_id"]]
df_image_meta

Unnamed: 0,image_id,first_dupe
0,00000a2518d148e9b455537488e325a7f42f39d8,/braintree/data2/active/users/jjpr/mkgu_packag...
1,00001776837935437e2784af953a4db24cc19536,/braintree/data2/active/users/jjpr/mkgu_packag...
2,00003bb7e9ca9265dc521763f73514f9c4d48f8a,/braintree/data2/active/users/jjpr/mkgu_packag...
3,00005ad0cbb2bafc56814919fda60268a1077ba3,/braintree/data2/active/users/jjpr/mkgu_packag...
4,00005e46dace8673b631b75f1df3a6d1a8b2c1c2,/braintree/data2/active/users/jjpr/mkgu_packag...
5,00006b3962b738bb25ab44265092ac6ef71943ed,/braintree/data2/active/users/jjpr/mkgu_packag...
6,00007b43039dc9f26f12d6e222460a70a1400d1f,/braintree/data2/active/users/jjpr/mkgu_packag...
7,00008a40da04647d4350b9acb0b6657326c1ba54,/braintree/data2/active/users/jjpr/mkgu_packag...
8,00008be32ed56378291bc536589d97bc39e35557,/braintree/data2/active/users/jjpr/mkgu_packag...
9,00008eead95d140fb4591430b11193525dfc379a,/braintree/data2/active/users/jjpr/mkgu_packag...


In [25]:
def get_relative(path, base):
    """Return ``path`` with the leading components covered by ``base`` removed.

    Both arguments are "/"-separated paths. The function drops as many leading
    components from ``path`` as ``base`` has; it does not check that ``path``
    actually starts with ``base``.

    Trailing slashes on ``base`` are ignored, so ``"/a/b"`` and ``"/a/b/"`` give
    the same result (previously a trailing slash caused one component too many
    to be dropped).

    Parameters
    ----------
    path : str
        Path to shorten.
    base : str
        Prefix directory whose components are stripped from ``path``.

    Returns
    -------
    str
        The remaining components of ``path`` joined with "/"; empty if nothing remains.
    """
    # Strip trailing separators first: "a/b/".split("/") would otherwise yield an
    # extra empty component and inflate the number of components removed.
    base_parts = base.rstrip("/").split("/")
    path_parts = path.split("/")
    return "/".join(path_parts[len(base_parts):])

In [28]:
# Prefix that get_relative strips from the source image paths below.
file_base = "/braintree/data2/active/users/jjpr/mkgu_packaging"

In [29]:
# Path of each selected image relative to file_base; used as its name inside the zip.
df_image_meta["relative_path"] = [
    get_relative(location, file_base) for location in df_image_meta["first_dupe"]
]
df_image_meta

Unnamed: 0,image_id,first_dupe,relative_path
0,00000a2518d148e9b455537488e325a7f42f39d8,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data8/NatRev/stimuli/41590fd57c0b...
1,00001776837935437e2784af953a4db24cc19536,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data14/NatRev/stimuli/2f03d6fc365...
2,00003bb7e9ca9265dc521763f73514f9c4d48f8a,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data4/DGrat/stimuli/6954025a16641...
3,00005ad0cbb2bafc56814919fda60268a1077ba3,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data14/NatRev/stimuli/beee725f0d2...
4,00005e46dace8673b631b75f1df3a6d1a8b2c1c2,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data1/DGrat/stimuli/616577d68f558...
5,00006b3962b738bb25ab44265092ac6ef71943ed,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data18/NatRev/stimuli/b1178068ddf...
6,00007b43039dc9f26f12d6e222460a70a1400d1f,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data18/NatRev/stimuli/f803727e2e6...
7,00008a40da04647d4350b9acb0b6657326c1ba54,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data15/NatRev/stimuli/aa2ec48ff96...
8,00008be32ed56378291bc536589d97bc39e35557,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data17/NatRev/stimuli/1f67ce6b1df...
9,00008eead95d140fb4591430b11193525dfc379a,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data17/NatRev/stimuli/727a677da46...


In [34]:
# Location of the zip archive that will hold the stimulus images.
target_zip_path = "/braintree/home/jjpr/.mkgu/data/gallant.Willmore2010/gallant_crcns_v2-1_stimuli.zip"

In [36]:
# Create the directory that will contain the zip file (no error if it already exists).
# Derived from target_zip_path so the directory cannot drift out of sync with the
# archive location, and done in Python rather than through a shell command.
os.makedirs(os.path.dirname(target_zip_path), exist_ok=True)

In [37]:
# Add every selected image to a fresh archive, stored under its relative path.
with zipfile.ZipFile(target_zip_path, 'w') as target_zip:
    sources_and_names = zip(df_image_meta["first_dupe"], df_image_meta["relative_path"])
    for source_path, archive_name in sources_and_names:
        target_zip.write(source_path, arcname=archive_name)

In [38]:
# Unpack the archive next to itself so the extracted files can be checked below.
containing_dir = os.path.dirname(target_zip_path)
with zipfile.ZipFile(target_zip_path, 'r') as stimuli_zip:
    stimuli_zip.extractall(path=containing_dir)

In [40]:
def copied(source):
    """Return True if the extracted counterpart of ``source`` exists.

    The counterpart is ``source`` made relative to ``file_base`` and re-rooted
    under ``containing_dir``.
    """
    extracted_path = os.path.join(containing_dir, get_relative(source, file_base))
    return os.path.exists(extracted_path)

In [41]:
# Flag, per image, whether its extracted copy is present on disk.
df_image_meta["copied"] = [copied(source) for source in df_image_meta["first_dupe"]]
df_image_meta

Unnamed: 0,image_id,first_dupe,relative_path,copied
0,00000a2518d148e9b455537488e325a7f42f39d8,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data8/NatRev/stimuli/41590fd57c0b...,True
1,00001776837935437e2784af953a4db24cc19536,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data14/NatRev/stimuli/2f03d6fc365...,True
2,00003bb7e9ca9265dc521763f73514f9c4d48f8a,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data4/DGrat/stimuli/6954025a16641...,True
3,00005ad0cbb2bafc56814919fda60268a1077ba3,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data14/NatRev/stimuli/beee725f0d2...,True
4,00005e46dace8673b631b75f1df3a6d1a8b2c1c2,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data1/DGrat/stimuli/616577d68f558...,True
5,00006b3962b738bb25ab44265092ac6ef71943ed,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data18/NatRev/stimuli/b1178068ddf...,True
6,00007b43039dc9f26f12d6e222460a70a1400d1f,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data18/NatRev/stimuli/f803727e2e6...,True
7,00008a40da04647d4350b9acb0b6657326c1ba54,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data15/NatRev/stimuli/aa2ec48ff96...,True
8,00008be32ed56378291bc536589d97bc39e35557,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data17/NatRev/stimuli/1f67ce6b1df...,True
9,00008eead95d140fb4591430b11193525dfc379a,/braintree/data2/active/users/jjpr/mkgu_packag...,crcns/v2-1/V2Data17/NatRev/stimuli/727a677da46...,True


In [42]:
# True only if every image was found after extraction.
all(df_image_meta["copied"])

True

## Make the StimulusSet lookup meta

In [44]:
# Open the lookup database connection, reusing an already open one if present.
pwdb.connect(reuse_if_open=True)

True

In [45]:
# Create the tables backing the stimulus-set models.
pwdb.create_tables(models=[ImageModel, AttributeModel, ImageMetaModel, StimulusSetModel, ImageStoreModel, StimulusSetImageMap, ImageStoreMap])

In [46]:
# Fetch the stimulus-set record with this name, creating it if it does not exist yet.
gallant_v2_images, created = StimulusSetModel.get_or_create(name="gallant.Willmore2010")

In [49]:
# Fetch or create the image-store record pointing at the stimuli zip on S3.
gallant_v2_image_store, created = ImageStoreModel.get_or_create(location_type="S3", store_type="zip",
                                  location="https://mkgu-gallant-crcns.s3.amazonaws.com/gallant_crcns_v2-1_stimuli.zip")

In [50]:
# Fetch or create the two string attributes recorded for every image below.
# `created` is overwritten by each call and only reflects the last one.
eav_image_file_sha1, created = AttributeModel.get_or_create(name="image_file_sha1", type="str")
eav_image_file_path_unique, created = AttributeModel.get_or_create(name="image_file_path_unique", type="str")

In [52]:
# Register every image: the image row itself, its membership in the stimulus set,
# its path inside the image store, and its two metadata attributes.
# All inserts run inside a single transaction so that a failure part-way through
# rolls back instead of leaving a partially populated stimulus set, and so the rows
# are committed once rather than after each save().
# NOTE(review): assumes pwdb is a peewee Database (it is used with
# connect(reuse_if_open=True) and create_tables(models=...) above) -- confirm.
# NOTE(review): this cell is not idempotent; re-running it inserts the rows again.
with pwdb.atomic():
    for image in df_image_meta.itertuples():
        # Save the image first so the dependent rows reference an already stored record.
        pw_image = ImageModel(image_id=image.image_id)
        pw_image.save()
        pw_stimulus_set_image_map = StimulusSetImageMap(stimulus_set=gallant_v2_images, image=pw_image)
        pw_stimulus_set_image_map.save()
        pw_image_image_store_map = ImageStoreMap(image=pw_image, image_store=gallant_v2_image_store,
                                                 path=image.relative_path)
        pw_image_image_store_map.save()
        ImageMetaModel(image=pw_image, attribute=eav_image_file_sha1, value=str(image.image_id)).save()
        ImageMetaModel(image=pw_image, attribute=eav_image_file_path_unique, value=str(image.relative_path)).save()

In [53]:
# Read the stimulus set back through the public lookup API to verify the records above.
gallant_v2_stimulus_set = mkgu.get_stimulus_set("gallant.Willmore2010")
gallant_v2_stimulus_set

Unnamed: 0,id,image_id,image_file_sha1,image_file_path_unique
0,125778,00000a2518d148e9b455537488e325a7f42f39d8,00000a2518d148e9b455537488e325a7f42f39d8,crcns/v2-1/V2Data8/NatRev/stimuli/41590fd57c0b...
1,125779,00001776837935437e2784af953a4db24cc19536,00001776837935437e2784af953a4db24cc19536,crcns/v2-1/V2Data14/NatRev/stimuli/2f03d6fc365...
2,125780,00003bb7e9ca9265dc521763f73514f9c4d48f8a,00003bb7e9ca9265dc521763f73514f9c4d48f8a,crcns/v2-1/V2Data4/DGrat/stimuli/6954025a16641...
3,125781,00005ad0cbb2bafc56814919fda60268a1077ba3,00005ad0cbb2bafc56814919fda60268a1077ba3,crcns/v2-1/V2Data14/NatRev/stimuli/beee725f0d2...
4,125782,00005e46dace8673b631b75f1df3a6d1a8b2c1c2,00005e46dace8673b631b75f1df3a6d1a8b2c1c2,crcns/v2-1/V2Data1/DGrat/stimuli/616577d68f558...
5,125783,00006b3962b738bb25ab44265092ac6ef71943ed,00006b3962b738bb25ab44265092ac6ef71943ed,crcns/v2-1/V2Data18/NatRev/stimuli/b1178068ddf...
6,125784,00007b43039dc9f26f12d6e222460a70a1400d1f,00007b43039dc9f26f12d6e222460a70a1400d1f,crcns/v2-1/V2Data18/NatRev/stimuli/f803727e2e6...
7,125785,00008a40da04647d4350b9acb0b6657326c1ba54,00008a40da04647d4350b9acb0b6657326c1ba54,crcns/v2-1/V2Data15/NatRev/stimuli/aa2ec48ff96...
8,125786,00008be32ed56378291bc536589d97bc39e35557,00008be32ed56378291bc536589d97bc39e35557,crcns/v2-1/V2Data17/NatRev/stimuli/1f67ce6b1df...
9,125787,00008eead95d140fb4591430b11193525dfc379a,00008eead95d140fb4591430b11193525dfc379a,crcns/v2-1/V2Data17/NatRev/stimuli/727a677da46...


## Make the DataAssembly lookup meta

In [54]:
# Create the tables backing the assembly models.
pwdb.create_tables(models=[AssemblyModel, AssemblyStoreMap, AssemblyStoreModel])

In [55]:
# Create and store the assembly record, linked to the stimulus set fetched above.
# NOTE(review): unlike the StimulusSetModel cell this uses constructor + save() rather
# than get_or_create, so re-running presumably inserts another row -- confirm.
assy = AssemblyModel(name="gallant.Willmore2010", assembly_class="NeuronRecordingAssembly",
                     stimulus_set=gallant_v2_images)
assy.save()

1

In [56]:
# Create and store the record describing where the netCDF assembly file lives on S3.
# NOTE(review): constructor + save(); re-running presumably inserts another row -- confirm.
store = AssemblyStoreModel(assembly_type="netCDF",
                           location_type="S3",
                           location="https://mkgu-gallant-crcns.s3.amazonaws.com/crcns_v2-1_neuronal.nc")
store.save()

1

In [57]:
# Link the assembly record to its store record; the role is set to the assembly name.
assy_store_map = AssemblyStoreMap(assembly_model=assy, assembly_store_model=store, role="gallant.Willmore2010")
assy_store_map.save()

1

In [59]:
# Load the assembly through the public lookup API to verify the records above.
gallant_v2 = mkgu.get_assembly("gallant.Willmore2010")
gallant_v2

<xarray.NeuronRecordingAssembly (neuroid: 135, presentation: 3494103)>
array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])
Coordinates:
  * neuroid                   (neuroid) MultiIndex
  - region                    (neuroid) object 'V2' 'V2' 'V2' 'V2' 'V2' 'V2' ...
  - animal                    (neuroid) object 'e' 'e' 'e' 'e' 'e' 'e' 'e' ...
  - neuroid_id                (neuroid) object 'e0017' 'e0018' 'e0021' ...
  * presentation              (presentation) MultiIndex
  - stimulusRepeats           (presentation) int64 1 1 1 1 1 1 1 1 1 1 1 1 1 ...
  - presentation_id           (presentation) int64 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
  - image_file_path_original  (presentation) object 'v2-1/V2Data1/DGrat/stimuli/1b2a86f3f332f098b9eb2567fe8f04cd.jpg' ...
  - image_id 

In [60]:
# Number of distinct image_file_path_unique values in the loaded assembly.
len(np.unique(gallant_v2["image_file_path_unique"].values))

771017