# Ingest

Ingest all *.rhd files recursively, convert each to a structured numpy array, and upload the results to S3 for easy public download via numpy.DataSource().open(url)

See https://github.com/braingeneers/braingeneers/wiki/File-Formats for details on the source file format

Note: The original Intan code saves the data as float64, which effectively increases the size of the data by 4x vs. its original source of 16 bit samples. The code below is modified to keep things in 16 bit form and, as a result, does not scale the amplifier_data structure, leaving that to the analysis phase. There is still significant overhead added vs. the raw file (82M vs. 44M), but it's much better than just storing the float64.

In [27]:
import os
import glob
import json
import numpy as np

# Work from the shared group directory so that the relative "mea/..." paths
# used by the glob/load calls in the cells below resolve.
os.chdir("/public/groups/braingeneers/")

In [41]:
#! /bin/env python
#
# Michael Gibson 17 July 2015
# Modified Adrian Foy Sep 2018

# Modified to leave amplifier_data as 16 bit unsigned int

import sys, struct, math, os, time
import numpy as np

from intanutil.read_header import read_header
from intanutil.get_bytes_per_data_block import get_bytes_per_data_block
from intanutil.read_one_data_block import read_one_data_block
from intanutil.notch_filter import notch_filter
from intanutil.data_to_result import data_to_result


def read_data(filename):
    """Read an Intan Technologies RHD2000 data file generated by the evaluation board GUI.

    The header is parsed with ``read_header`` and every data block that follows
    it is read with ``read_one_data_block``.  Auxiliary input, supply voltage,
    board ADC and temperature readings are scaled to physical units, but
    ``amplifier_data`` is deliberately left as raw unsigned 16 bit samples
    (scaling is left to the analysis phase to keep the stored data small).

    Args:
        filename: Path of the ``.rhd`` file to read.

    Returns:
        The value produced by ``data_to_result(header, data, data_present)``;
        data are returned in a dictionary, for future extensibility.

    Raises:
        Exception: If the bytes after the header are not a whole number of
            data blocks, or if the file is not fully consumed after reading
            all blocks.
        AssertionError: If the amplifier timestamps are not strictly
            consecutive (missing samples are not handled downstream).
    """

    tic = time.time()

    # The context manager guarantees the file is closed even if parsing the
    # header or a data block raises.
    with open(filename, 'rb') as fid:
        filesize = os.path.getsize(filename)

        header = read_header(fid)

        # (label, header key) pairs for the per-channel-type summary lines.
        channel_summary = (
            ('amplifier channel', 'num_amplifier_channels'),
            ('auxiliary input channel', 'num_aux_input_channels'),
            ('supply voltage channel', 'num_supply_voltage_channels'),
            ('board ADC channel', 'num_board_adc_channels'),
            ('board digital input channel', 'num_board_dig_in_channels'),
            ('board digital output channel', 'num_board_dig_out_channels'),
            ('temperature sensors channel', 'num_temp_sensor_channels'),
        )
        for label, key in channel_summary:
            print('Found {} {}{}.'.format(header[key], label, plural(header[key])))
        print('')

        # Determine how many samples the data file contains.
        bytes_per_block = get_bytes_per_data_block(header)

        # How many data blocks remain in this file?
        bytes_remaining = filesize - fid.tell()
        data_present = bytes_remaining > 0

        if bytes_remaining % bytes_per_block != 0:
            raise Exception('Something is wrong with file size : should have a whole number of data blocks')

        num_data_blocks = int(bytes_remaining // bytes_per_block)

        samples_per_block = header['num_samples_per_data_block']
        num_amplifier_samples = samples_per_block * num_data_blocks
        # Auxiliary inputs are stored at a quarter of the amplifier sample count.
        num_aux_input_samples = int((samples_per_block / 4) * num_data_blocks)
        # One supply voltage (and temperature) reading per data block.
        num_supply_voltage_samples = 1 * num_data_blocks
        num_board_adc_samples = samples_per_block * num_data_blocks
        num_board_dig_in_samples = samples_per_block * num_data_blocks
        num_board_dig_out_samples = samples_per_block * num_data_blocks

        record_time = num_amplifier_samples / header['sample_rate']

        if data_present:
            print('File contains {:0.3f} seconds of data.  Amplifiers were sampled at {:0.2f} kS/s.'.format(record_time, header['sample_rate'] / 1000))
        else:
            print('Header file contains no data.  Amplifiers were sampled at {:0.2f} kS/s.'.format(header['sample_rate'] / 1000))

        if data_present:
            # Pre-allocate memory for data.
            print('')
            print('Allocating memory for data...')

            # NOTE: the builtin int/bool are used instead of np.int/np.bool;
            # those aliases were removed from NumPy and meant the same dtypes.
            data = {}
            if (header['version']['major'] == 1 and header['version']['minor'] >= 2) or (header['version']['major'] > 1):
                data['t_amplifier'] = np.zeros(num_amplifier_samples, dtype=int)
            else:
                data['t_amplifier'] = np.zeros(num_amplifier_samples, dtype=np.uint)

            # NOTE: Changed from uint to uint16
            data['amplifier_data'] = np.zeros([header['num_amplifier_channels'], num_amplifier_samples], dtype=np.uint16)
            data['aux_input_data'] = np.zeros([header['num_aux_input_channels'], num_aux_input_samples], dtype=np.uint)
            data['supply_voltage_data'] = np.zeros([header['num_supply_voltage_channels'], num_supply_voltage_samples], dtype=np.uint)
            data['temp_sensor_data'] = np.zeros([header['num_temp_sensor_channels'], num_supply_voltage_samples], dtype=np.uint)
            data['board_adc_data'] = np.zeros([header['num_board_adc_channels'], num_board_adc_samples], dtype=np.uint)

            # By default, digital events (digital inputs and outputs) are
            # stored as booleans.  If unsigned int values are preferred
            # (0 for False, 1 for True), replace 'dtype=bool' with
            # 'dtype=np.uint' on the two *_data arrays below.
            data['board_dig_in_data'] = np.zeros([header['num_board_dig_in_channels'], num_board_dig_in_samples], dtype=bool)
            data['board_dig_in_raw'] = np.zeros(num_board_dig_in_samples, dtype=np.uint)

            data['board_dig_out_data'] = np.zeros([header['num_board_dig_out_channels'], num_board_dig_out_samples], dtype=bool)
            data['board_dig_out_raw'] = np.zeros(num_board_dig_out_samples, dtype=np.uint)

            # Read sampled data from file.
            print('Reading data from file...')

            # Write positions into each pre-allocated array, advanced per block.
            indices = {
                'amplifier': 0,
                'aux_input': 0,
                'supply_voltage': 0,
                'board_adc': 0,
                'board_dig_in': 0,
                'board_dig_out': 0,
            }

            print_increment = 10
            percent_done = print_increment
            for i in range(num_data_blocks):
                read_one_data_block(data, header, indices, fid)

                # Increment indices
                indices['amplifier'] += samples_per_block
                indices['aux_input'] += int(samples_per_block / 4)
                indices['supply_voltage'] += 1
                indices['board_adc'] += samples_per_block
                indices['board_dig_in'] += samples_per_block
                indices['board_dig_out'] += samples_per_block

                fraction_done = 100 * (1.0 * i / num_data_blocks)
                if fraction_done >= percent_done:
                    print('{}% done...'.format(percent_done))
                    percent_done = percent_done + print_increment

            # Make sure we have read exactly the right amount of data.
            bytes_remaining = filesize - fid.tell()
            if bytes_remaining != 0:
                raise Exception('Error: End of file not reached.')

    if data_present:
        print('Parsing data...')

        # Extract digital input channels to separate variables: each channel
        # is one bit (selected by its native_order) of the raw word.
        for i in range(header['num_board_dig_in_channels']):
            data['board_dig_in_data'][i, :] = np.not_equal(np.bitwise_and(data['board_dig_in_raw'], (1 << header['board_dig_in_channels'][i]['native_order'])), 0)

        # Extract digital output channels to separate variables.
        for i in range(header['num_board_dig_out_channels']):
            data['board_dig_out_data'][i, :] = np.not_equal(np.bitwise_and(data['board_dig_out_raw'], (1 << header['board_dig_out_channels'][i]['native_order'])), 0)

        # Scale voltage levels appropriately.
        # NOTE: amplifier_data is intentionally NOT scaled here to keep the
        # stored data 16 bit.  The stock conversion to microvolts is:
        #     0.195 * (amplifier_data.astype(np.int32) - 32768)
        data['aux_input_data'] = np.multiply(37.4e-6, data['aux_input_data'])               # units = volts
        data['supply_voltage_data'] = np.multiply(74.8e-6, data['supply_voltage_data'])     # units = volts
        if header['eval_board_mode'] == 1:
            data['board_adc_data'] = np.multiply(152.59e-6, (data['board_adc_data'].astype(np.int32) - 32768)) # units = volts
        elif header['eval_board_mode'] == 13:
            data['board_adc_data'] = np.multiply(312.5e-6, (data['board_adc_data'].astype(np.int32) - 32768)) # units = volts
        else:
            data['board_adc_data'] = np.multiply(50.354e-6, data['board_adc_data'])           # units = volts
        data['temp_sensor_data'] = np.multiply(0.01, data['temp_sensor_data'])               # units = deg C

        # Check for gaps in timestamps.  Raised explicitly (rather than with
        # an assert statement) so the check still runs under ``python -O``.
        num_gaps = np.sum(np.not_equal(data['t_amplifier'][1:]-data['t_amplifier'][:-1], 1))
        if num_gaps != 0:
            # We don't handle missing samples in all our downstream analysis
            raise AssertionError('Found {} gap(s) in the amplifier timestamps'.format(num_gaps))

        # Scale time steps (units = seconds).
        data['t_amplifier'] = data['t_amplifier'] / header['sample_rate']
        # Strided slices pick every 4th / every block-th timestamp; .copy()
        # keeps these as independent arrays rather than views.
        data['t_aux_input'] = data['t_amplifier'][::4].copy()
        data['t_supply_voltage'] = data['t_amplifier'][::samples_per_block].copy()
        data['t_board_adc'] = data['t_amplifier']
        data['t_dig'] = data['t_amplifier']
        data['t_temp_sensor'] = data['t_supply_voltage']

        # If the software notch filter was selected during the recording, apply the
        # same notch filter to amplifier data here.
        if header['notch_filter_frequency'] > 0:
            print('Applying notch filter...')

            print_increment = 10
            percent_done = print_increment
            for i in range(header['num_amplifier_channels']):
                # NOTE(review): the filtered row is written back into the
                # uint16 array, so it is cast to uint16 on assignment --
                # confirm notch_filter's output range makes that safe.
                data['amplifier_data'][i,:] = notch_filter(data['amplifier_data'][i,:], header['sample_rate'], header['notch_filter_frequency'], 10)

                fraction_done = 100 * (i / header['num_amplifier_channels'])
                if fraction_done >= percent_done:
                    print('{}% done...'.format(percent_done))
                    percent_done += print_increment
    else:
        data = []

    # Move variables to result struct.
    result = data_to_result(header, data, data_present)

    print('Done!  Elapsed time: {0:0.1f} seconds'.format(time.time() - tic))
    return result

def plural(n):
    """Return the plural suffix for a count: '' when n equals 1, 's' otherwise."""
    return '' if n == 1 else 's'

In [42]:
# Smoke test: parse the first matching recording and keep the result in
# `data` so the following cell can inspect it.
paths = glob.glob("mea/ucsf/11-12-2018/**/*.rhd")
print(paths[0])
data = read_data(paths[0])

mea/ucsf/11-12-2018/organoid 1/recording background 100Hz filter_181208_130030.rhd

Reading Intan Technologies RHD2000 Data File, Version 1.5

n signal groups 7
Found 16 amplifier channels.
Found 3 auxiliary input channels.
Found 1 supply voltage channel.
Found 0 board ADC channels.
Found 0 board digital input channels.
Found 0 board digital output channels.
Found 1 temperature sensors channel.

File contains 41.112 seconds of data.  Amplifiers were sampled at 20.00 kS/s.

Allocating memory for data...
Reading data from file...
10% done...
20% done...
30% done...
40% done...
50% done...
60% done...
70% done...
80% done...
90% done...
Parsing data...
No missing timestamps in data.
Done!  Elapsed time: 17.8 seconds


In [43]:
# Report the size of every entry of the parsed result in whole MiB.
for field_name, field_value in data.items():
    size_mib = sys.getsizeof(field_value) // 2**20
    print(field_name, size_mib)

t_supply_voltage 0
t_amplifier 6
spike_triggers 0
notes 0
supply_voltage_channels 0
amplifier_channels 0
amplifier_data 25
t_aux_input 1
aux_input_channels 0
aux_input_data 4
t_temp_sensor 0
frequency_parameters 0
supply_voltage_data 0


In [44]:
# Convert every recording: the raw 16 bit amplifier samples are saved as a
# .npy file and the remaining fields as a .json sidecar next to the source.
# recursive=True is required for "**" to descend through nested directories.
for path in glob.glob("mea/ucsf/11-12-2018/**/*.rhd", recursive=True):
    data = read_data(path)

    # Take the signal out of the dict; what is left becomes the metadata.
    signal = data.pop("amplifier_data")
    metadata = data

    # Drop the remaining entries that are not written to the .json sidecar.
    # Kept: spike_triggers, notes, supply_voltage_channels,
    # amplifier_channels, frequency_parameters.
    # pop(..., None) is used because the set of keys depends on which channel
    # groups the recording contains.
    for key in (
        "t_supply_voltage",
        "t_amplifier",
        "t_aux_input",
        "aux_input_channels",
        "aux_input_data",
        "t_temp_sensor",
        "supply_voltage_data",
    ):
        metadata.pop(key, None)

    metadata["source_file_path"] = path  # Add annotation of source file path and name

    # Strip the extension properly instead of assuming it is 4 characters.
    base_path = os.path.splitext(path)[0]
    np.save(base_path + ".npy", signal)
    with open(base_path + ".json", "w") as f:
        json.dump(metadata, f)
    


Reading Intan Technologies RHD2000 Data File, Version 1.5

n signal groups 7
Found 16 amplifier channels.
Found 3 auxiliary input channels.
Found 1 supply voltage channel.
Found 0 board ADC channels.
Found 0 board digital input channels.
Found 0 board digital output channels.
Found 1 temperature sensors channel.

File contains 41.112 seconds of data.  Amplifiers were sampled at 20.00 kS/s.

Allocating memory for data...
Reading data from file...
10% done...
20% done...
30% done...
40% done...
50% done...
60% done...
70% done...
80% done...
90% done...
Parsing data...
No missing timestamps in data.
Done!  Elapsed time: 17.7 seconds

Reading Intan Technologies RHD2000 Data File, Version 1.5

n signal groups 7
Found 17 amplifier channels.
Found 3 auxiliary input channels.
Found 1 supply voltage channel.
Found 0 board ADC channels.
Found 0 board digital input channels.
Found 0 board digital output channels.
Found 1 temperature sensors channel.

File contains 60.012 seconds of data.  Ampli

70% done...
80% done...
90% done...
Parsing data...
No missing timestamps in data.
Done!  Elapsed time: 14.5 seconds

Reading Intan Technologies RHD2000 Data File, Version 1.5

n signal groups 7
Found 16 amplifier channels.
Found 3 auxiliary input channels.
Found 1 supply voltage channel.
Found 0 board ADC channels.
Found 0 board digital input channels.
Found 0 board digital output channels.
Found 1 temperature sensors channel.

File contains 60.012 seconds of data.  Amplifiers were sampled at 20.00 kS/s.

Allocating memory for data...
Reading data from file...
10% done...
20% done...
30% done...
40% done...
50% done...
60% done...
70% done...
80% done...
90% done...
Parsing data...
No missing timestamps in data.
Done!  Elapsed time: 26.4 seconds

Reading Intan Technologies RHD2000 Data File, Version 1.5

n signal groups 7
Found 16 amplifier channels.
Found 3 auxiliary input channels.
Found 1 supply voltage channel.
Found 0 board ADC channels.
Found 0 board digital input channels.
Foun

10% done...
20% done...
30% done...
40% done...
50% done...
60% done...
70% done...
80% done...
90% done...
Parsing data...
No missing timestamps in data.
Done!  Elapsed time: 26.3 seconds

Reading Intan Technologies RHD2000 Data File, Version 1.5

n signal groups 7
Found 16 amplifier channels.
Found 3 auxiliary input channels.
Found 1 supply voltage channel.
Found 0 board ADC channels.
Found 0 board digital input channels.
Found 0 board digital output channels.
Found 1 temperature sensors channel.

File contains 60.012 seconds of data.  Amplifiers were sampled at 20.00 kS/s.

Allocating memory for data...
Reading data from file...
10% done...
20% done...
30% done...
40% done...
50% done...
60% done...
70% done...
80% done...
90% done...
Parsing data...
No missing timestamps in data.
Done!  Elapsed time: 25.6 seconds

Reading Intan Technologies RHD2000 Data File, Version 1.5

n signal groups 7
Found 16 amplifier channels.
Found 3 auxiliary input channels.
Found 1 supply voltage channel

In [38]:
# List one organoid directory to compare the generated .npy/.json files with the source .rhd files.
!ls -alh '/public/groups/braingeneers/mea/ucsf/11-12-2018/organoid 4/'

total 266M
drwxr-xr-x 2 30059  614 4.0K Dec 27 05:23 .
drwxr-sr-x 6 30059  614   98 Dec 11 20:52 ..
-rw-r--r-- 1 30059 2000 7.6K Dec 27 05:22 recording_4_wt 40mM KCl_181208_143057.json
-rw-r--r-- 1 30059 2000  37M Dec 27 05:22 recording_4_wt 40mM KCl_181208_143057.npy
-rw-r--r-- 1 30059 2000  44M Dec  8 22:31 recording_4_wt 40mM KCl_181208_143057.rhd
-rw-r--r-- 1 30059 2000 7.6K Dec 27 05:23 recording_4_wt 40mM KCl_181208_143157.json
-rw-r--r-- 1 30059 2000  37M Dec 27 05:23 recording_4_wt 40mM KCl_181208_143157.npy
-rw-r--r-- 1 30059 2000  44M Dec  8 22:32 recording_4_wt 40mM KCl_181208_143157.rhd
-rw-r--r-- 1 30059 2000 7.6K Dec 27 05:23 recording_4_wt 40mM KCl_181208_143257.json
-rw-r--r-- 1 30059 2000  37M Dec 27 05:23 recording_4_wt 40mM KCl_181208_143257.npy
-rw-r--r-- 1 30059 2000  44M Dec  8 22:33 recording_4_wt 40mM KCl_181208_143257.rhd
-rw-r--r-- 1 30059 2000 7.6K Dec 27 05:23 recording_4_wt 40mM KCl_181208_143357.json
-rw-r--r-- 1 30059 2000  13M Dec 27 05:23 r

In [45]:
# Load one converted file back from disk to check the saved array.
np.load("mea/ucsf/11-12-2018/organoid 3/recording_4_wt_40mM KCl_181208_142127.npy")

array([[32056, 32037, 32008, ..., 61045, 61015, 61222],
       [32187, 32146, 32098, ..., 60991, 60990, 61149],
       [31977, 31973, 31930, ..., 61741, 61708, 61855],
       ...,
       [26713, 26656, 26657, ..., 61601, 61722, 61763],
       [25689, 25591, 25587, ..., 59036, 59030, 59023],
       [35515, 35318, 35186, ..., 61631, 62083, 62554]], dtype=uint16)

In [46]:
# Upload all of the .npy files to S3 replacing spaces with -
for path in glob.glob("mea/ucsf/11-12-2018/**/*.npy") + glob.glob("mea/ucsf/11-12-2018/**/*.json"):
    !aws --profile {os.getenv("AWS_PROFILE")} --endpoint {os.getenv("AWS_S3_ENDPOINT")} \
        s3 cp '{path}' s3://braingeneers/{path.replace(" ", "-")} --acl public-read

upload: mea/ucsf/11-12-2018/organoid 1/recording background 100Hz filter_181208_130030.npy to s3://braingeneers/mea/ucsf/11-12-2018/organoid-1/recording-background-100Hz-filter_181208_130030.npy
upload: mea/ucsf/11-12-2018/organoid 1/recording_1_181208_122247.npy to s3://braingeneers/mea/ucsf/11-12-2018/organoid-1/recording_1_181208_122247.npy
upload: mea/ucsf/11-12-2018/organoid 1/recording_1_181208_122347.npy to s3://braingeneers/mea/ucsf/11-12-2018/organoid-1/recording_1_181208_122347.npy
upload: mea/ucsf/11-12-2018/organoid 1/recording_2_100Hz high pass_181208_122907.npy to s3://braingeneers/mea/ucsf/11-12-2018/organoid-1/recording_2_100Hz-high-pass_181208_122907.npy
upload: mea/ucsf/11-12-2018/organoid 1/recording_2_100Hz high pass_181208_123007.npy to s3://braingeneers/mea/ucsf/11-12-2018/organoid-1/recording_2_100Hz-high-pass_181208_123007.npy
upload: mea/ucsf/11-12-2018/organoid 1/recording_3_one ref_181208_124702.npy to s3://braingeneers/mea/ucsf/11-12-2018/organoid-1/recordin

upload: mea/ucsf/11-12-2018/organoid 2/recording_3 wt_181208_135431.json to s3://braingeneers/mea/ucsf/11-12-2018/organoid-2/recording_3-wt_181208_135431.json
upload: mea/ucsf/11-12-2018/organoid 2/recording_3 wt_181208_135531.json to s3://braingeneers/mea/ucsf/11-12-2018/organoid-2/recording_3-wt_181208_135531.json
upload: mea/ucsf/11-12-2018/organoid 2/recording_3 wt_low freq noise_181208_140344.json to s3://braingeneers/mea/ucsf/11-12-2018/organoid-2/recording_3-wt_low-freq-noise_181208_140344.json
upload: mea/ucsf/11-12-2018/organoid 2/recording_3 wt_low freq noise_181208_140444.json to s3://braingeneers/mea/ucsf/11-12-2018/organoid-2/recording_3-wt_low-freq-noise_181208_140444.json
upload: mea/ucsf/11-12-2018/organoid 2/recording_3 wt_low freq noise_2_181208_140551.json to s3://braingeneers/mea/ucsf/11-12-2018/organoid-2/recording_3-wt_low-freq-noise_2_181208_140551.json
upload: mea/ucsf/11-12-2018/organoid 2/recording_3 wt_low freq noise_2_181208_140651.json to s3://braingeneers/

In [48]:
# List everything stored under mea/ucsf/ in the bucket to confirm the uploads.
!aws --profile {os.getenv("AWS_PROFILE")} --endpoint {os.getenv("AWS_S3_ENDPOINT")} \
    s3 ls --recursive s3://braingeneers/mea/ucsf/

2018-12-27 05:48:03       7714 mea/ucsf/11-12-2018/organoid-1/recording-background-100Hz-filter_181208_130030.json
2018-12-27 05:46:44   26311808 mea/ucsf/11-12-2018/organoid-1/recording-background-100Hz-filter_181208_130030.npy
2018-12-27 05:48:04       8104 mea/ucsf/11-12-2018/organoid-1/recording_1_181208_122247.json
2018-12-27 05:46:47   40808288 mea/ucsf/11-12-2018/organoid-1/recording_1_181208_122247.npy
2018-12-27 05:48:06       8104 mea/ucsf/11-12-2018/organoid-1/recording_1_181208_122347.json
2018-12-27 05:46:48    2717408 mea/ucsf/11-12-2018/organoid-1/recording_1_181208_122347.npy
2018-12-27 05:48:07       8121 mea/ucsf/11-12-2018/organoid-1/recording_2_100Hz-high-pass_181208_122907.json
2018-12-27 05:46:51   40808288 mea/ucsf/11-12-2018/organoid-1/recording_2_100Hz-high-pass_181208_122907.npy
2018-12-27 05:48:08       8122 mea/ucsf/11-12-2018/organoid-1/recording_2_100Hz-high-pass_181208_123007.json
2018-12-27 05:46:53    3207008 mea/ucsf/11-12-2018/organoid-1/recording_2_1

In [47]:
# Verify the public download path by reading one numpy file directly from S3.
url = "https://s3.nautilus.optiputer.net/braingeneers/mea/ucsf/11-12-2018/organoid-3/recording_4_wt_40mM-KCl_181208_142127.npy"
source = np.DataSource(None)
with source.open(url, "rb") as remote_file:
    signal = np.load(remote_file)
signal

array([[32056, 32037, 32008, ..., 61045, 61015, 61222],
       [32187, 32146, 32098, ..., 60991, 60990, 61149],
       [31977, 31973, 31930, ..., 61741, 61708, 61855],
       ...,
       [26713, 26656, 26657, ..., 61601, 61722, 61763],
       [25689, 25591, 25587, ..., 59036, 59030, 59023],
       [35515, 35318, 35186, ..., 61631, 62083, 62554]], dtype=uint16)