# Data loading

##  Loading Data

### Amplifiers

This is an example of how to load data.

In [None]:
from pathlib import Path
from spikeinterface.extractors import IntanRecordingExtractor
from dicarlo_lab_to_nwb.conversion.data_locator import locate_intan_file_path

# Session to load: root of the data share plus the identifiers that locate one Intan file.
data_folder = Path("/media/heberto/One Touch/DiCarlo-CN-data-share")
image_set_name, subject = "domain-transfer-2023", "pico"
session_date, session_time = "20230214", "140610"

# Resolve the path of the Intan file for that session.
intan_file_path = locate_intan_file_path(
    data_folder=data_folder, image_set_name=image_set_name, subject=subject,
    session_date=session_date, session_time=session_time,
)

# Strict load of the amplifier stream: integrity checks stay enabled.
# NOTE(review): the notebook text after this cell says this session has timestamp
# discontinuities, so this call presumably fails for this file - confirm.
recording = IntanRecordingExtractor(
    file_path=intan_file_path,
    stream_name="RHD2000 amplifier channel",
    all_annotations=True,
    ignore_integrity_checks=False,
)
recording

This particular example has timestamp discontinuities. To load the data regardless, we set the parameter `ignore_integrity_checks=True`.

In [None]:
# Same amplifier stream as the previous cell, but with the integrity checks ignored so the
# session loads despite the timestamp discontinuities mentioned in the text above.
recording = IntanRecordingExtractor(
    file_path=intan_file_path,
    stream_name="RHD2000 amplifier channel",
    all_annotations=True,
    ignore_integrity_checks=True,
)
# Bare last expression: shown as the cell output.
recording

### Auxiliary input

In [None]:
# Auxiliary input stream of the same Intan file, loaded with the same options
# (all annotations, integrity checks ignored) as the amplifier stream.
recording_auxiliary_input = IntanRecordingExtractor(
    file_path=intan_file_path,
    stream_name="RHD2000 auxiliary input channel",
    all_annotations=True,
    ignore_integrity_checks=True,
)

# Bare last expression: shown as the cell output.
recording_auxiliary_input

### ADC input

In [None]:
# USB board ADC input stream of the same Intan file, loaded with the same options
# (all annotations, integrity checks ignored) as the amplifier stream.
recording_adc_input = IntanRecordingExtractor(
    file_path=intan_file_path,
    stream_name="USB board ADC input channel",
    all_annotations=True,
    ignore_integrity_checks=True,
)

# Bare last expression: shown as the cell output.
recording_adc_input

### Digital channel 
Requires neo version from github https://github.com/NeuralEnsemble/python-neo/

In [None]:
# USB board digital input stream of the same Intan file. The section text notes this
# stream requires the neo version from GitHub.
recording_digital = IntanRecordingExtractor(
    file_path=intan_file_path,
    stream_name="USB board digital input channel",
    all_annotations=True,
    ignore_integrity_checks=True,
)

# Bare last expression: shown as the cell output.
recording_digital

## Loading the probe

In [None]:
from dicarlo_lab_to_nwb.conversion.probe import build_probe_group
from dicarlo_lab_to_nwb.conversion.data_locator import locate_intan_file_path
from spikeinterface.extractors import IntanRecordingExtractor

# Session identifiers (same values as in the data-loading section).
data_folder = Path("/media/heberto/One Touch/DiCarlo-CN-data-share")
image_set_name, subject = "domain-transfer-2023", "pico"
session_date, session_time = "20230214", "140610"

intan_file_path = locate_intan_file_path(
    data_folder=data_folder, image_set_name=image_set_name, subject=subject,
    session_date=session_date, session_time=session_time,
)

# Amplifier stream, loaded with the integrity checks ignored.
stream_name = "RHD2000 amplifier channel"
recording = IntanRecordingExtractor(
    file_path=intan_file_path,
    stream_name=stream_name,
    ignore_integrity_checks=True,
    all_annotations=True,
)

# Build the probe group from the recording.
probe_group = build_probe_group(recording=recording)

from probeinterface.plotting import plot_probe
import matplotlib.pyplot as plt
import numpy as np

# One figure with a single axes for the first probe of the group.
fig, ax = plt.subplots(figsize=(16, 8))

probe = probe_group.probes[0]
channel_ids = recording.get_channel_ids()
# Label each contact with the recording channel id found at its device channel index.
corresponding_channel_ids = [channel_ids[device_index] for device_index in probe.device_channel_indices]
text_on_contact = np.asarray(corresponding_channel_ids)

plot_probe(probe=probe, ax=ax, with_contact_id=True, text_on_contact=text_on_contact)

In [None]:
from probeinterface.plotting import plot_probe_group

# Draw every probe of the group on one shared axes, without contact ids.
fig, ax = plt.subplots(figsize=(16, 8))
plot_probe_group(probe_group, ax=ax, same_axes=True, with_contact_id=False)


# Sorting Pipeline

To run a sorting pipeline we need a recording with a geometry attached.

In [None]:
from spikeinterface.extractors import IntanRecordingExtractor
from spikeinterface.sorters import run_sorter_by_property


from dicarlo_lab_to_nwb.conversion.data_locator import locate_intan_file_path
from dicarlo_lab_to_nwb.conversion.probe import attach_probe_to_recording

# Session identifiers (same values as in the data-loading section).
data_folder = Path("/media/heberto/One Touch/DiCarlo-CN-data-share")
image_set_name, subject = "domain-transfer-2023", "pico"
session_date, session_time = "20230214", "140610"

intan_file_path = locate_intan_file_path(
    data_folder=data_folder, image_set_name=image_set_name, subject=subject,
    session_date=session_date, session_time=session_time,
)

# Amplifier stream, loaded with the integrity checks ignored.
stream_name = "RHD2000 amplifier channel"
recording = IntanRecordingExtractor(
    file_path=intan_file_path,
    stream_name=stream_name,
    ignore_integrity_checks=True,
    all_annotations=True,
)

# Called for its effect on `recording`; the return value is discarded.
# NOTE(review): presumably attaches the probe geometry in place - confirm in the helper.
attach_probe_to_recording(recording=recording)
recording

Most sorters have been designed with high-density probes in mind. They will work with a single-channel probe, but the results may not be as good, as some units might be suppressed by the spatial regularization.

Because of this we performed sorting in two ways so you can compare the results:

1. We do one sorting per probe
2. We do one sorting per channel to avoid interference of the spatial regularization



## Performing a sorting per probe

In [None]:
from spikeinterface.core import load_extractor

# Cache for the per-probe sorting: reuse the saved result unless `overwrite` is set.
sorting_folder = Path("./sorting_done")
overwrite = False

if sorting_folder.exists() and not overwrite:
    # A saved sorting is already on disk: load it instead of re-running the sorter.
    sorting = load_extractor(sorting_folder)
else:
    # One sorter run per group of channels sharing the same "probe" property.
    sorting = run_sorter_by_property(
        sorter_name="kilosort2",
        recording=recording,
        folder="./sorting_folder_probe",
        grouping_property="probe",
        docker_image=True,
    )

    # Forward `overwrite` so that requesting an overwrite while `sorting_folder` already
    # exists replaces the old result instead of failing at save time.
    sorting.save(folder=sorting_folder, overwrite=overwrite)
    

Action item:
* How to save the sorting results to numpy
* Quality metrics:
    * Which channels are visually driven? we repeat the image 20 times, we randomly choose the first 10 images of the set same of itme
and correlate the responses.
    *   

In [None]:
# Display the per-probe sorting computed (or loaded from disk) by the caching cell.
sorting

In [None]:
from spikeinterface.core import create_sorting_analyzer


# Build a sorting analyzer from the current `sorting` and the probe-attached `recording`.
sorting_analyzer = create_sorting_analyzer(sorting=sorting, recording=recording)




## Performing a sorting per channel

In [None]:
# Run kilosort3 separately for each group of channels sharing a "channel_names" value
# (presumably one group per channel, per the section title - confirm).
# This rebinds `sorting`, replacing the per-probe result from the previous section.
# NOTE(review): the cached per-channel cell further down uses the same output folder
# with a different sorter; confirm the two runs are not meant to coexist.
sorting = run_sorter_by_property(
    sorter_name="kilosort3",
    recording=recording,
    folder="./sorting_folder_per_channel",
    grouping_property="channel_names",
    docker_image=True,
)


In [None]:
from spikeinterface.sorters import available_sorters

# Show the sorters that spikeinterface reports as available.
available_sorters()

In [None]:
from spikeinterface.core import load_extractor

# Cache for the per-channel sorting: reuse the saved result unless `overwrite` is set.
sorting_folder = Path("./sorting_done_per_channel")
overwrite = False

if sorting_folder.exists() and not overwrite:
    # A saved sorting is already on disk: load it instead of re-running the sorter.
    sorting = load_extractor(sorting_folder)
else:
    # One sorter run per group of channels sharing the same "channel_names" property.
    sorting = run_sorter_by_property(
        sorter_name="tridesclous",
        recording=recording,
        folder="./sorting_folder_per_channel",
        grouping_property="channel_names",
        docker_image=True,
    )

    # Forward `overwrite` so that requesting an overwrite while `sorting_folder` already
    # exists replaces the old result instead of failing at save time.
    sorting.save(folder=sorting_folder, overwrite=overwrite)
    


# Peak Detection Pipeline

## Artificial data

In [None]:
import spikeinterface.widgets as sw

from spikeinterface.core.generate import generate_ground_truth_recording


# Synthetic recording with a known ground truth: 4 channels, 1 unit, durations=[1]
# (presumably one 1-second segment - confirm), seeded so the data is reproducible.
# This rebinds `recording` and `sorting`, replacing the real-data objects from earlier sections.
recording, sorting = generate_ground_truth_recording(num_channels=4, num_units=1, durations=[1], seed=0)


# Plot the traces and the ground-truth raster over the same 0-1 time range.
w_ts = sw.plot_traces(recording, time_range=(0, 1))
w_rs = sw.plot_rasters(sorting, time_range=(0, 1))

In [None]:
import numpy as np
from dicarlo_lab_to_nwb.conversion.pipeline import di_carlo_peak_detection



# Job settings forwarded to the peak detection: one job, verbose, progress bar, chunk_duration of 1.0.
job_kwargs = {"n_jobs": 1, "verbose": True, "progress_bar": True, "chunk_duration": 1.0}
noise_threshold = 3  # The number of standard deviations for peak detection

# Detect peaks on the synthetic recording generated in the previous cell.
spike_times_per_channel = di_carlo_peak_detection(
    recording=recording,
    noise_threshold=noise_threshold,
    job_kwargs=job_kwargs,
)

In [None]:
# Ground-truth spike train of unit 0, requested as times (return_times=True).
sorting.get_unit_spike_train(0, return_times=True)

In [None]:
# Detected spike times stored under key 0 of the peak-detection result,
# for comparison with the ground-truth spike train.
spike_times_per_channel[0]

## Your data

In [None]:
from pathlib import Path

import spikeinterface.widgets as sw
from dicarlo_lab_to_nwb.conversion.data_locator import locate_intan_file_path

from spikeinterface.extractors import IntanRecordingExtractor


# Session identifiers (same values as in the data-loading section).
data_folder = Path("/media/heberto/One Touch/DiCarlo-CN-data-share")
image_set_name, subject = "domain-transfer-2023", "pico"
session_date, session_time = "20230214", "140610"

intan_file_path = locate_intan_file_path(
    data_folder=data_folder, image_set_name=image_set_name, subject=subject,
    session_date=session_date, session_time=session_time,
)

# Amplifier stream, loaded with the integrity checks ignored.
recording = IntanRecordingExtractor(
    file_path=intan_file_path,
    stream_name="RHD2000 amplifier channel",
    all_annotations=True,
    ignore_integrity_checks=True,
)

# If you want to select only one channel
# (this rebinds `recording` to the one-channel selection for the cells that follow).
channel_ids = recording.get_channel_ids()[:1]
recording = recording.select_channels(channel_ids=channel_ids)
w_ts = sw.plot_traces(recording, time_range=(0, 1), return_scaled=True)


#### Preprocess

In [None]:
from dicarlo_lab_to_nwb.conversion.pipeline import DiCarloBandPass, DiCarloNotch


# Filter settings passed to the two preprocessing steps below.
f_notch = 60  # Hz
bandwidth = 10  # passed to the notch filter; presumably Hz - TODO confirm
f_low = 300.0
f_high = 6000.0

# The same `vectorized` flag is forwarded to both filters.
vectorized = True 
# Notch first, then band-pass on the notched output.
notched_recording = DiCarloNotch(recording, f_notch=f_notch, bandwidth=bandwidth, vectorized=vectorized)
preprocessed_recording = DiCarloBandPass(notched_recording, f_low=f_low, f_high=f_high, vectorized=vectorized)

# For this instance each array has 96 channels, 400 micrometers apart
w_ts = sw.plot_traces(preprocessed_recording, time_range=(0, 1), return_scaled=True)

#### Run the peak detection on a short portion of the data

In [None]:
from dicarlo_lab_to_nwb.conversion.pipeline import di_carlo_peak_detection

noise_threshold = 3  # The number of standard deviations for peak detection

# Keep only the first `seconds_of_data` seconds; the end frame is derived from the sampling frequency.
start_frame = 0
seconds_of_data = 1.0
end_frame = int(preprocessed_recording.sampling_frequency * seconds_of_data)
# NOTE: this rebinds `preprocessed_recording` to the sliced recording, so any later
# use of that name sees only the short portion.
preprocessed_recording = preprocessed_recording.frame_slice(start_frame=start_frame, end_frame=end_frame)

# No job_kwargs are passed here, so the function's own defaults apply.
spike_times_per_channel = di_carlo_peak_detection(
    recording=preprocessed_recording,
    noise_threshold=noise_threshold,
)

# Bare last expression: shown as the cell output.
spike_times_per_channel

#### Everything can be wrapped up in a couple of lines

In [None]:
from spikeinterface.extractors import IntanRecordingExtractor
from dicarlo_lab_to_nwb.conversion.pipeline import thresholding_pipeline
from dicarlo_lab_to_nwb.conversion.data_locator import locate_intan_file_path


# Session identifiers (same values as in the data-loading section).
image_set_name = "domain-transfer-2023"
subject = "pico"
session_date = "20230214"
session_time = "140610"

# Parameters of the pipeline
f_notch = 60  # Hz
bandwidth = 10
f_low = 300.0
f_high = 6000.0
noise_threshold = 3  # The number of standard deviations for peak detection


data_folder = Path("/media/heberto/One Touch/DiCarlo-CN-data-share")

intan_file_path = locate_intan_file_path(
    data_folder=data_folder,
    image_set_name=image_set_name,
    subject=subject,
    session_date=session_date,
    session_time=session_time,
)


# Amplifier stream, loaded with the integrity checks ignored.
stream_name = "RHD2000 amplifier channel"
recording = IntanRecordingExtractor(
    file_path=intan_file_path,
    stream_name=stream_name,
    ignore_integrity_checks=True,
    all_annotations=True,
)

# Notch + band-pass + peak detection in a single call, on the first 1000 frames only.
spike_times_per_channel_vectorized = thresholding_pipeline(
    recording=recording.frame_slice(start_frame=0, end_frame=1000), # Remove frame_slice to run the whole pipeline
    f_notch=f_notch,
    bandwidth=bandwidth,
    f_low=f_low,
    f_high=f_high,
    noise_threshold=noise_threshold,
)

# Display the result this cell just computed. The cell previously displayed
# `spike_times_per_channel`, which is a stale value left over from an earlier cell.
spike_times_per_channel_vectorized

## Speed comparisons

In [None]:
import time

import numpy as np

from spikeinterface.extractors import IntanRecordingExtractor
from dicarlo_lab_to_nwb.conversion.pipeline import thresholding_pipeline

# Session identifiers (same values as in the data-loading section).
image_set_name, subject = "domain-transfer-2023", "pico"
session_date, session_time = "20230214", "140610"

# Parameters of the pipeline
f_notch = 60  # Hz
bandwidth = 10
f_low, f_high = 300.0, 6000.0
noise_threshold = 3  # The number of standard deviations for peak detection

# `locate_intan_file_path` is not imported by this cell; it relies on an earlier cell's import.
data_folder = Path("/media/heberto/One Touch/DiCarlo-CN-data-share")
intan_file_path = locate_intan_file_path(
    data_folder=data_folder, image_set_name=image_set_name, subject=subject,
    session_date=session_date, session_time=session_time,
)

# Full amplifier recording used as the input of every benchmark below.
stream_name = "RHD2000 amplifier channel"
recording = IntanRecordingExtractor(
    file_path=intan_file_path,
    stream_name=stream_name,
    ignore_integrity_checks=True,
    all_annotations=True,
)

recording

#### Pipeline vectorized vs non-vectorized
We do the comparison on only a few seconds of data. Here we use chunks of 1 second and parallelize over those 1-second chunks
to showcase the speedup of the vectorized version.

In [None]:
def _timed_thresholding(recording, vectorized, job_kwargs):
    """Run `thresholding_pipeline` once and measure how long it takes.

    The filter and threshold settings (`f_notch`, `bandwidth`, `f_low`, `f_high`,
    `noise_threshold`) are read from the notebook namespace at call time.

    Parameters
    ----------
    recording
        Recording forwarded to `thresholding_pipeline`.
    vectorized : bool
        Forwarded as the `vectorized` argument of the pipeline.
    job_kwargs : dict
        Forwarded as the `job_kwargs` argument of the pipeline.

    Returns
    -------
    tuple
        `(spike_times_per_channel, elapsed_seconds)`.
    """
    # perf_counter is a monotonic, high-resolution clock, which suits interval timing
    # better than the wall-clock time.time().
    time_start = time.perf_counter()
    spike_times = thresholding_pipeline(
        recording=recording,
        f_notch=f_notch,
        bandwidth=bandwidth,
        f_low=f_low,
        f_high=f_high,
        noise_threshold=noise_threshold,
        vectorized=vectorized,
        job_kwargs=job_kwargs,
    )
    time_taken = time.perf_counter() - time_start
    print(f"Time elapsed: {time_taken} seconds")
    return spike_times, time_taken


# Benchmark on the first `seconds_of_data` seconds of the recording only.
start_frame = 0
seconds_of_data = 16.0
end_frame = int(recording.sampling_frequency * seconds_of_data)
recording_short = recording.frame_slice(start_frame=start_frame, end_frame=end_frame)
recording_to_use = recording_short

# 1) Vectorized pipeline, single job, 1-second chunks.
vectorized = True
job_kwargs = dict(n_jobs=1, progress_bar=True, verbose=True, chunk_duration=1.0)
spike_times_per_channel_vectorized, time_vectorized = _timed_thresholding(
    recording_to_use, vectorized, job_kwargs
)

# 2) Non-vectorized pipeline with the same job settings.
vectorized = False
spike_times_per_channel, time_non_vectorized = _timed_thresholding(
    recording_to_use, vectorized, job_kwargs
)

speedup = time_non_vectorized/time_vectorized
print(f"Speedup: {speedup:.2f} (times faster)" )

# Test that the results are the same
for channel_index, spike_times in spike_times_per_channel.items():
    spike_times_vectorized = spike_times_per_channel_vectorized[channel_index]
    # Compare shapes first: np.allclose alone would broadcast (or raise a broadcasting
    # error) when the two runs detect a different number of spikes.
    same_shape = np.shape(spike_times) == np.shape(spike_times_vectorized)
    assert same_shape and np.allclose(spike_times, spike_times_vectorized), f"Channel {channel_index} spike times do not match"

# 3) Vectorized pipeline again, this time with n_jobs=-1.
recording_to_use = recording_short
vectorized = True
job_kwargs = dict(n_jobs=-1, progress_bar=True, verbose=True, chunk_duration=1.0)
spike_times_per_channel, time_vectorized_multiprocessing = _timed_thresholding(
    recording_to_use, vectorized, job_kwargs
)

speedup = time_non_vectorized/time_vectorized_multiprocessing
print(f"Speedup: {speedup:.2f} (times faster)" )

### Full recording parallelized in multiple cores 

In [None]:
# Benchmark on the full recording, vectorized pipeline only.
recording_to_use = recording
vectorized = True

# Chunks are `chunk_duration` seconds long. The value must be passed as `chunk_duration`:
# `chunk_size` is a number of samples, so passing 1.0 there would not mean one second.
# n_jobs=-1 leaves the number of workers to spikeinterface (all available cores).
chunk_duration = 1.0
job_kwargs = dict(n_jobs=-1, verbose=True, progress_bar=True, chunk_duration=chunk_duration)

time_start = time.time()

spike_times_per_channel = thresholding_pipeline(
    recording=recording_to_use,
    f_notch=f_notch,
    bandwidth=bandwidth,
    f_low=f_low,
    f_high=f_high,
    noise_threshold=noise_threshold,
    vectorized=vectorized,
    job_kwargs=job_kwargs,
)

time_stop = time.time()
time_taken = time_stop - time_start
print(f"Time elapsed: {time_taken} seconds")

time_multiprocessing = time_taken


# Same run restricted to a single job, as the baseline for the multi-core speedup.
job_kwargs = dict(n_jobs=1, verbose=True, progress_bar=True, chunk_duration=chunk_duration)

time_start = time.time()

spike_times_per_channel = thresholding_pipeline(
    recording=recording_to_use,
    f_notch=f_notch,
    bandwidth=bandwidth,
    f_low=f_low,
    f_high=f_high,
    noise_threshold=noise_threshold,
    vectorized=vectorized,
    job_kwargs=job_kwargs,
)

time_stop = time.time()
time_taken = time_stop - time_start
print(f"Time elapsed: {time_taken} seconds")

time_single_core = time_taken

# Ratio above 1 means the multi-core run was faster than the single-core run.
speedup = time_single_core/time_multiprocessing

print(f"Speedup: {speedup:.2f} (times faster)" )


In [None]:
# `time_non_vectorized` was measured on `recording_short`, not on the full recording.
# Extrapolate it linearly to the full duration before comparing with the full-recording
# multi-core run. The previous hard-coded 0.10 factor only matched a 10-second slice.
duration_ratio = recording_to_use.get_duration() / recording_short.get_duration()
estimated_time_non_vectorized = time_non_vectorized * duration_ratio
speedup = estimated_time_non_vectorized / time_multiprocessing

print(f"Speedup with respect to non-vectorized version: {speedup:.2f} (times faster)" )

### Full pipeline spikeinterface vs DiCarlo from Intan and NWB

#### Load NWB Recording

In [None]:
from pathlib import Path
import math 


# Location of the converted NWB file; fail early if the folder or file is missing.
folder_path = Path.home() / "conversion_nwb" 
# folder_path = folder_path / "nwb_stub"
assert folder_path.is_dir()

file_path = folder_path / "pico_20230214_140610.nwb"
assert file_path.is_file()

from spikeinterface.extractors import NwbRecordingExtractor

# This rebinds `recording` to the NWB-backed recording for the benchmark below.
recording = NwbRecordingExtractor(file_path=file_path)


recording_to_use = recording
vectorized = True


# n_jobs=-1 with 1-second chunks.
job_kwargs = dict(n_jobs=-1, verbose=True, progress_bar=True, chunk_duration=1.0)


# `time` and the pipeline parameters (f_notch, bandwidth, f_low, f_high, noise_threshold)
# are not defined in this cell; they come from earlier cells.
time_start = time.time()

spike_times_per_channel = thresholding_pipeline(
    recording=recording_to_use,
    f_notch=f_notch,
    bandwidth=bandwidth,
    f_low=f_low,
    f_high=f_high,
    noise_threshold=noise_threshold,
    vectorized=vectorized,
    job_kwargs=job_kwargs,
)

time_stop = time.time()
time_taken = time_stop - time_start
print(f"Time elapsed: {time_taken} seconds")

time_nwb = time_taken

# Ratio of the earlier multi-core Intan run (`time_multiprocessing`) to this NWB run;
# a value above 1 means the NWB run was faster.
speedup = time_multiprocessing/time_nwb
print(f"Speedup: {speedup:.2f} (times faster)" )


## Running a faithful comparison by using 10 chunks
This can't use multiprocessing because it uses too much memory.

In [None]:
import time
import math 

import numpy as np

from spikeinterface.extractors import IntanRecordingExtractor
from dicarlo_lab_to_nwb.conversion.pipeline import thresholding_pipeline

# Session identifiers (same values as in the data-loading section).
image_set_name = "domain-transfer-2023"
subject = "pico"
session_date = "20230214"
session_time = "140610"

# Parameters of the pipeline
f_notch = 60  # Hz
bandwidth = 10
f_low = 300.0
f_high = 6000.0
noise_threshold = 3  # The number of standard deviations for peak detection

data_folder = Path("/media/heberto/One Touch/DiCarlo-CN-data-share")

intan_file_path = locate_intan_file_path(
    data_folder=data_folder,
    image_set_name=image_set_name,
    subject=subject,
    session_date=session_date,
    session_time=session_time,
)

# Amplifier stream, loaded with the integrity checks ignored.
stream_name = "RHD2000 amplifier channel"
recording = IntanRecordingExtractor(
    file_path=intan_file_path,
    stream_name=stream_name,
    ignore_integrity_checks=True,
    all_annotations=True,
)


recording_to_use = recording
vectorized = True

# Split the whole recording into `num_chunks` chunks; `chunk_size` is the resulting
# number of samples per chunk, rounded up so the chunks cover every sample.
num_chunks = 10
chunk_size = math.ceil(recording.get_num_samples() / num_chunks)

###  If you have a lot of memory you can test adding more jobs, example code on how to get the number of cores.
# import psutil  # You have the pip install psutil
# num_cores = psutil.cpu_count(logical=True)
# n_jobs = min(num_cores, num_chunks)

# Single job. Pass the `chunk_size` computed above: the cell previously passed the stale
# `chunk_duration` from an earlier cell, so the 10-chunk split was never applied.
job_kwargs = dict(n_jobs=1, verbose=True, progress_bar=True, chunk_size=chunk_size)

time_start = time.time()

spike_times_per_channel = thresholding_pipeline(
    recording=recording_to_use,
    f_notch=f_notch,
    bandwidth=bandwidth,
    f_low=f_low,
    f_high=f_high,
    noise_threshold=noise_threshold,
    vectorized=vectorized,
    job_kwargs=job_kwargs,
)