# Data loading

##  Loading Data

### Amplifiers

In [None]:
from pathlib import Path

# Folder that holds the raw Intan sessions. This has to be modified to match your machine.
experiment_path = Path("/media/heberto/One Touch/DiCarlo-CN-data-share/exp_domain-transfer-2023/exp_domain-transfer-2023.sub_pico/raw_files/intanraw")
assert experiment_path.is_dir(), f"Experiment folder not found: {experiment_path}"

# Session used throughout the data-loading cells below.
session_folder = experiment_path / "pico_domain-transfer-2023_230215_161322"
# session_folder = experiment_path / "pico_domain-transfer-2023_230214_140610"  # This file has a timestamp problem
assert session_folder.is_dir(), f"Session folder not found: {session_folder}"

# The `info.rhd` file inside the session folder is what gets passed to the extractor.
file_path = session_folder / "info.rhd"
assert file_path.is_file(), f"Intan file not found: {file_path}"

In [None]:
from spikeinterface.extractors import IntanRecordingExtractor

# Open the amplifier stream of the session file selected above.
# NOTE(review): `ignore_integrity_checks=True` presumably lets sessions with
# timestamp problems load anyway; confirm against the extractor docs.
recording = IntanRecordingExtractor(
    stream_name="RHD2000 amplifier channel",
    file_path=file_path,
    ignore_integrity_checks=True,
    all_annotations=True,
)

# Last expression of the cell, so the notebook displays the object.
recording

### Auxiliary input

In [None]:
# Same session file as the amplifier cell, but reading the auxiliary input stream.
recording_auxiliary_input = IntanRecordingExtractor(
    stream_name="RHD2000 auxiliary input channel",
    file_path=file_path,
    ignore_integrity_checks=True,
    all_annotations=True,
)

# Last expression of the cell, so the notebook displays the object.
recording_auxiliary_input

### ADC input

In [None]:
# Same session file as the amplifier cell, but reading the USB board ADC input stream.
recording_adc_input = IntanRecordingExtractor(
    stream_name="USB board ADC input channel",
    file_path=file_path,
    ignore_integrity_checks=True,
    all_annotations=True,
)

# Last expression of the cell, so the notebook displays the object.
recording_adc_input

### Digital channel 
Requires the neo version from GitHub: https://github.com/NeuralEnsemble/python-neo/

In [None]:
# Same session file as the amplifier cell, but reading the USB board digital input stream.
recording_digital = IntanRecordingExtractor(
    stream_name="USB board digital input channel",
    file_path=file_path,
    ignore_integrity_checks=True,
    all_annotations=True,
)

# Last expression of the cell, so the notebook displays the object.
recording_digital

# Pipeline

## Artificial data

In [None]:
import spikeinterface.widgets as sw

from spikeinterface.core.generate import generate_ground_truth_recording

# Synthetic recording plus its matching sorting: 4 channels, 1 unit, a single
# segment of duration 1, generated with seed=0.
# Note that this rebinds `recording`, replacing the Intan recording loaded earlier.
recording, sorting = generate_ground_truth_recording(
    durations=[1],
    num_channels=4,
    num_units=1,
    seed=0,
)

# Plot the traces and the raster of the generated data over the same (0, 1) window.
w_ts = sw.plot_traces(recording, time_range=(0, 1))
w_rs = sw.plot_rasters(sorting, time_range=(0, 1))

In [None]:
import numpy as np
from dicarlo_lab_to_nwb.conversion.pipeline import di_carlo_peak_detection

# The number of standard deviations for peak detection
noise_threshold = 3

# Job options forwarded to the detection routine (one job, verbose output, progress bar).
job_kwargs = {"n_jobs": 1, "verbose": True, "progress_bar": True, "chunk_duration": 1.0}

# Run the detection on the artificial recording generated in the previous cell.
spike_times_per_channel = di_carlo_peak_detection(
    recording=recording,
    noise_threshold=noise_threshold,
    job_kwargs=job_kwargs,
)

In [None]:
# Spike train of unit 0 from the generated sorting (requested with return_times=True),
# shown so it can be compared by eye with the detection output in the next cell.
sorting.get_unit_spike_train(0, return_times=True)

In [None]:
# Detection result stored under key 0.
# NOTE(review): presumably the first channel; confirm against di_carlo_peak_detection.
spike_times_per_channel[0]

## Your data

In [None]:
from pathlib import Path

import spikeinterface.widgets as sw

from spikeinterface.extractors import IntanRecordingExtractor


# Folder that holds the raw Intan sessions. This has to be modified to match your machine.
experiment_path = Path(
    "/media/heberto/One Touch/DiCarlo-CN-data-share/exp_domain-transfer-2023/exp_domain-transfer-2023.sub_pico/raw_files/intanraw"
)
assert experiment_path.is_dir(), f"Experiment folder not found: {experiment_path}"
session_folder = experiment_path / "pico_domain-transfer-2023_230215_161322"
# session_folder = experiment_path / "pico_domain-transfer-2023_230214_140610"  # This file has a timestamp problem
assert session_folder.is_dir(), f"Session folder not found: {session_folder}"

# The `info.rhd` file inside the session folder is what gets passed to the extractor.
file_path = session_folder / "info.rhd"
assert file_path.is_file(), f"Intan file not found: {file_path}"


recording = IntanRecordingExtractor(
    file_path=file_path,
    stream_name="RHD2000 amplifier channel",
    all_annotations=True,
    ignore_integrity_checks=True,
)

# Keep only the first channel for this example.
# Remove the next two lines to work with every channel of the stream.
channel_ids = recording.get_channel_ids()[0:1]
recording = recording.select_channels(channel_ids=channel_ids)
w_ts = sw.plot_traces(recording, time_range=(0, 1), return_scaled=True)


#### Preprocess

In [None]:
from dicarlo_lab_to_nwb.conversion.pipeline import DiCarloBandPass, DiCarloNotch


# Notch filter settings.
# NOTE(review): 50 Hz is the mains frequency in Europe; confirm it matches the
# mains frequency of the rig where this data was recorded.
f_notch = 50  # Hz
bandwidth = 10

# Band-pass limits (presumably in Hz, like f_notch).
f_low = 300.0
f_high = 6000.0

vectorized = True

# The filters are chained: the notch runs on the raw recording, the band-pass on the notched output.
notched_recording = DiCarloNotch(recording, vectorized=vectorized, f_notch=f_notch, bandwidth=bandwidth)
preprocessed_recording = DiCarloBandPass(notched_recording, vectorized=vectorized, f_low=f_low, f_high=f_high)

# For this instance each array has 96 channels, 400 micrometers apart
w_ts = sw.plot_traces(preprocessed_recording, time_range=(0, 1), return_scaled=True)

#### Run the peak detection on a short portion of the data

In [None]:
from dicarlo_lab_to_nwb.conversion.pipeline import di_carlo_peak_detection

# Cut the preprocessed recording down to its first `seconds_of_data` seconds.
# Note that this rebinds `preprocessed_recording` to the shortened version.
seconds_of_data = 1.0
start_frame = 0
end_frame = int(seconds_of_data * preprocessed_recording.sampling_frequency)
preprocessed_recording = preprocessed_recording.frame_slice(end_frame=end_frame, start_frame=start_frame)

# The number of standard deviations for peak detection
noise_threshold = 3

spike_times_per_channel = di_carlo_peak_detection(
    noise_threshold=noise_threshold,
    recording=preprocessed_recording,
)

# Last expression of the cell, so the notebook displays the result.
spike_times_per_channel

#### Everything can be wrapped up in a couple of lines

In [None]:
from dicarlo_lab_to_nwb.conversion.pipeline import di_carlo_pipeline


# Session to process; these values are combined below into the on-disk folder names.
image_set_name = "domain-transfer-2023"
subject = "pico"
session_date = "20230214"
session_time = "140610"

# Parameters of the pipeline
f_notch = 50  # Hz
bandwidth = 10
f_low = 300.0
f_high = 6000.0
noise_threshold = 3  # The number of standard deviations for peak detection


# `Path` and `IntanRecordingExtractor` are imported in the "Your data" cell above.
# The layout is checked one level at a time so that a missing folder is reported precisely.
data_folder = Path("/media/heberto/One Touch/DiCarlo-CN-data-share")
assert data_folder.is_dir(), f"Data directory not found: {data_folder}"

experiment_folder = data_folder / f"exp_{image_set_name}"
assert experiment_folder.is_dir(), f"Experiment folder not found: {experiment_folder}"

subject_folder = experiment_folder / f"exp_{image_set_name}.sub_{subject}"
assert subject_folder.is_dir(), f"Subject folder not found: {subject_folder}"

raw_data_folder = subject_folder / "raw_files"
assert raw_data_folder.is_dir(), f"Raw files folder not found: {raw_data_folder}"

# The session folder name uses a two-digit year, hence session_date[2:].
intan_session_folder = raw_data_folder / "intanraw" / f"{subject}_{image_set_name}_{session_date[2:]}_{session_time}"
assert intan_session_folder.is_dir(), f"Intan session folder not found: {intan_session_folder}"

intan_file_path = intan_session_folder / "info.rhd"
assert intan_file_path.is_file(), f"Intan file not found: {intan_file_path}"


stream_name = "RHD2000 amplifier channel"
recording = IntanRecordingExtractor(
    file_path=intan_file_path,
    stream_name=stream_name,
    ignore_integrity_checks=True,
    all_annotations=True,
)

spike_times_per_channel_vectorized = di_carlo_pipeline(
    recording=recording.frame_slice(start_frame=0, end_frame=1000),  # Remove frame_slice to run the whole pipeline
    f_notch=f_notch,
    bandwidth=bandwidth,
    f_low=f_low,
    f_high=f_high,
    noise_threshold=noise_threshold,
)

# Display the result computed by this cell, not a leftover from an earlier cell.
spike_times_per_channel_vectorized

## Speed comparisons

In [None]:
import time
from pathlib import Path

import numpy as np

from spikeinterface.extractors import IntanRecordingExtractor
from dicarlo_lab_to_nwb.conversion.pipeline import di_carlo_pipeline

# Session to benchmark; these values are combined below into the on-disk folder names.
image_set_name = "domain-transfer-2023"
subject = "pico"
session_date = "20230214"
session_time = "140610"

# Parameters of the pipeline
f_notch = 50  # Hz
bandwidth = 10
f_low = 300.0
f_high = 6000.0
noise_threshold = 3  # The number of standard deviations for peak detection

# The layout is checked one level at a time so that a missing folder is reported precisely.
data_folder = Path("/media/heberto/One Touch/DiCarlo-CN-data-share")
assert data_folder.is_dir(), f"Data directory not found: {data_folder}"

experiment_folder = data_folder / f"exp_{image_set_name}"
assert experiment_folder.is_dir(), f"Experiment folder not found: {experiment_folder}"

subject_folder = experiment_folder / f"exp_{image_set_name}.sub_{subject}"
assert subject_folder.is_dir(), f"Subject folder not found: {subject_folder}"

raw_data_folder = subject_folder / "raw_files"
assert raw_data_folder.is_dir(), f"Raw files folder not found: {raw_data_folder}"

# The session folder name uses a two-digit year, hence session_date[2:].
intan_session_folder = raw_data_folder / "intanraw" / f"{subject}_{image_set_name}_{session_date[2:]}_{session_time}"
assert intan_session_folder.is_dir(), f"Intan session folder not found: {intan_session_folder}"

intan_file_path = intan_session_folder / "info.rhd"
assert intan_file_path.is_file(), f"Intan file not found: {intan_file_path}"


stream_name = "RHD2000 amplifier channel"
recording = IntanRecordingExtractor(
    file_path=intan_file_path,
    stream_name=stream_name,
    ignore_integrity_checks=True,
    all_annotations=True,
)


# Last expression of the cell, so the notebook displays the object.
recording

#### Pipeline vectorized vs non-vectorized
We do the comparison with only a few seconds of data.

In [None]:
# Benchmark on the first `seconds_of_data` seconds of the recording only.
start_frame = 0
seconds_of_data = 16.0
end_frame = int(recording.sampling_frequency * seconds_of_data)
recording_short = recording.frame_slice(start_frame=start_frame, end_frame=end_frame)
recording_to_use = recording_short

vectorized = True
job_kwargs = dict(n_jobs=1, progress_bar=True, verbose=True, chunk_duration=1.0)

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
time_start = time.perf_counter()

spike_times_per_channel_vectorized = di_carlo_pipeline(
    recording=recording_to_use,
    f_notch=f_notch,
    bandwidth=bandwidth,
    f_low=f_low,
    f_high=f_high,
    noise_threshold=noise_threshold,
    vectorized=vectorized,
    job_kwargs=job_kwargs,
)

time_stop = time.perf_counter()
time_taken = time_stop - time_start
print(f"Time elapsed: {time_taken} seconds")

time_vectorized = time_taken


# Same run with the non-vectorized code path.
vectorized = False
time_start = time.perf_counter()

spike_times_per_channel = di_carlo_pipeline(
    recording=recording_to_use,
    f_notch=f_notch,
    bandwidth=bandwidth,
    f_low=f_low,
    f_high=f_high,
    noise_threshold=noise_threshold,
    vectorized=vectorized,
    job_kwargs=job_kwargs,
)

time_stop = time.perf_counter()
time_taken = time_stop - time_start
print(f"Time elapsed: {time_taken} seconds")

time_non_vectorized = time_taken

speedup = time_non_vectorized / time_vectorized
print(f"Speedup: {speedup:.2f} (times faster)")

# Test that the results are the same
assert (
    spike_times_per_channel.keys() == spike_times_per_channel_vectorized.keys()
), "The two runs did not return the same set of channels"
for channel_index, spike_times in spike_times_per_channel.items():
    spike_times_vectorized = spike_times_per_channel_vectorized[channel_index]
    # Compare shapes first: np.allclose broadcasts its inputs, so arrays of
    # different lengths could otherwise pass (e.g. empty vs. one element) or
    # raise a ValueError instead of the assertion message.
    assert np.shape(spike_times) == np.shape(
        spike_times_vectorized
    ), f"Channel {channel_index} spike counts do not match"
    assert np.allclose(spike_times, spike_times_vectorized), f"Channel {channel_index} spike times do not match"

In [None]:
# Vectorized pipeline on the short recording again, this time with n_jobs=-1.
recording_to_use = recording_short
vectorized = True
job_kwargs = dict(n_jobs=-1, progress_bar=True, verbose=True, chunk_duration=1.0)

# Start the clock only once the setup is done. time.perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals; time.time() can jump if
# the system clock is adjusted mid-run.
time_start = time.perf_counter()

spike_times_per_channel = di_carlo_pipeline(
    recording=recording_to_use,
    f_notch=f_notch,
    bandwidth=bandwidth,
    f_low=f_low,
    f_high=f_high,
    noise_threshold=noise_threshold,
    vectorized=vectorized,
    job_kwargs=job_kwargs,
)

time_stop = time.perf_counter()
time_taken = time_stop - time_start
print(f"Time elapsed: {time_taken} seconds")

time_vectorized_multiprocessing = time_taken

# Compared with the non-vectorized, single-job timing measured in the previous cell.
speedup = time_non_vectorized / time_vectorized_multiprocessing
print(f"Speedup: {speedup:.2f} (times faster)")

### Full recording parallelized across multiple cores

In [None]:

# Run the vectorized pipeline on the whole recording, first with n_jobs=-1 and
# then with n_jobs=1, and compare the two timings.
recording_to_use = recording
vectorized = True

job_kwargs = dict(n_jobs=-1, progress_bar=True, verbose=True, chunk_duration=1.0)

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
time_start = time.perf_counter()

spike_times_per_channel = di_carlo_pipeline(
    recording=recording_to_use,
    f_notch=f_notch,
    bandwidth=bandwidth,
    f_low=f_low,
    f_high=f_high,
    noise_threshold=noise_threshold,
    vectorized=vectorized,
    job_kwargs=job_kwargs,
)

time_stop = time.perf_counter()
time_taken = time_stop - time_start
print(f"Time elapsed: {time_taken} seconds")

time_multiprocessing = time_taken


job_kwargs = dict(n_jobs=1, progress_bar=True, verbose=True, chunk_duration=1.0)
time_start = time.perf_counter()

spike_times_per_channel = di_carlo_pipeline(
    recording=recording_to_use,
    f_notch=f_notch,
    bandwidth=bandwidth,
    f_low=f_low,
    f_high=f_high,
    noise_threshold=noise_threshold,
    vectorized=vectorized,
    job_kwargs=job_kwargs,
)

time_stop = time.perf_counter()
time_taken = time_stop - time_start
print(f"Time elapsed: {time_taken} seconds")

time_single_core = time_taken

speedup = time_single_core / time_multiprocessing

print(f"Speedup: {speedup:.2f} (times faster)")


In [None]:
# `time_non_vectorized` was measured on `recording_short`, so scale it by the
# ratio of the two durations before comparing it with the run on `recording_to_use`.
# Using the measured duration keeps this correct if the slice length changes.
# This assumes the non-vectorized runtime grows linearly with recording duration.
speedup = (
    time_non_vectorized * recording_to_use.get_duration() / recording_short.get_duration()
) / time_multiprocessing

print(f"Speedup with respect to non-vectorized version: {speedup:.2f} (times faster)")

### Full pipeline spikeinterface vs DiCarlo from Intan and NWB

#### Load NWB Recording

In [None]:
from pathlib import Path

from spikeinterface.extractors import NwbRecordingExtractor


# Location of the converted NWB file.
folder_path = Path.home() / "conversion_nwb"
# folder_path = folder_path / "nwb_stub"
assert folder_path.is_dir(), f"NWB folder not found: {folder_path}"

file_path = folder_path / "pico_20230214_140610.nwb"
assert file_path.is_file(), f"NWB file not found: {file_path}"


recording = NwbRecordingExtractor(file_path=file_path)


recording_to_use = recording
vectorized = True

job_kwargs = dict(n_jobs=-1, progress_bar=True, verbose=True, chunk_duration=1.0)

# `time`, `di_carlo_pipeline` and the filter parameters come from the
# "Speed comparisons" cells above.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
time_start = time.perf_counter()

spike_times_per_channel = di_carlo_pipeline(
    recording=recording_to_use,
    f_notch=f_notch,
    bandwidth=bandwidth,
    f_low=f_low,
    f_high=f_high,
    noise_threshold=noise_threshold,
    vectorized=vectorized,
    job_kwargs=job_kwargs,
)

time_stop = time.perf_counter()
time_taken = time_stop - time_start
print(f"Time elapsed: {time_taken} seconds")

time_nwb = time_taken

# `time_multiprocessing` is the n_jobs=-1 timing on the full Intan recording;
# a ratio above 1 means the NWB run was faster.
speedup = time_multiprocessing / time_nwb
print(f"Speedup: {speedup:.2f} (times faster)")
