In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

In [2]:
data_dir = os.path.join("m:", "neuro2voc", "raw_data", "j8v8", "20200228_02")

# Load Frequencies

In [3]:
# Load frequencies
raw_frequencies = np.load(os.path.join(data_dir, 'frequencies.spec.npy'))
frequencies = raw_frequencies[0]

# Load and pad spectrograms
spec_files = ['raw_g0_t0.nidq.bin.spec.npy', 'raw_g1_t0.nidq.bin.spec.npy', 'raw_g2_t0.nidq.bin.spec.npy']
padded_specs = []

for spec_file in spec_files:
    spec = np.load(os.path.join(data_dir, spec_file))
    padding = 4  # 16ms * 250Hz = 40 samples
    padded_spec = np.pad(spec, ((0, 0), (padding, 0)), mode='constant')
    padded_specs.append(padded_spec)

padded_spec_0, padded_spec_1, padded_spec_2 = padded_specs

In [4]:
print(f"Padded spec_0 shape: {padded_spec_0.shape}")

Padded spec_0 shape: (128, 799315)


# Load Annotations

In [5]:
# Load and process annotations
annotation_files = ['annotations_raw_g0_t0.nidq.bin.csv', 'annotations_raw_g1_t0.nidq.bin.csv', 'annotations_raw_g2_t0.nidq.bin.csv']
filtered_dfs = []

for annotation_file in annotation_files:
    annotations_df = pd.read_csv(os.path.join(data_dir, annotation_file))
    annotations_df['onset_sec'] = annotations_df['onset'] / 20000
    annotations_df['duration_sec'] = annotations_df['duration'] / 20000
    filtered_df = annotations_df[annotations_df['cluster_id'].between(2, 8)]
    filtered_dfs.append(filtered_df)

filtered_df_0, filtered_df_1, filtered_df_2 = filtered_dfs

In [6]:
filtered_df_0.head()

Unnamed: 0,file,onset,duration,cluster_id,onset_sec,duration_sec
15,raw_g0_t0.nidq.bin,15521214.0,986.0,2,776.0607,0.0493
16,raw_g0_t0.nidq.bin,15525652.0,979.0,2,776.2826,0.04895
17,raw_g0_t0.nidq.bin,15576333.0,929.0,2,778.81665,0.04645
18,raw_g0_t0.nidq.bin,15583214.0,991.0,2,779.1607,0.04955
19,raw_g0_t0.nidq.bin,15588407.0,2960.0,3,779.42035,0.148


# Load Spike Times

In [7]:
# Load spike times
spike_times_files = ['spiketimes_raw_g0_t0.nidq.bin.npy', 'spiketimes_raw_g1_t0.nidq.bin.npy', 'spiketimes_raw_g2_t0.nidq.bin.npy']
spike_times = []

for spike_file in spike_times_files:
    spike_time = np.load(os.path.join(data_dir, spike_file))[0]
    spike_times.append(spike_time)

In [8]:
# Print some information about the loaded data
for i, (spec, filtered_df, spike_time) in enumerate(zip(padded_specs, filtered_dfs, spike_times)):
    spec_duration = spec.shape[1] / (250*60)
    spike_duration = spike_time.max() / (20000*60)
    print(f"File raw_g{i}_t0.nidq.bin.spec.npy duration: {spec_duration:.2f} minutes")
    print(f"File spiketimes_raw_g{i}_t0.nidq.bin.npy duration: {spike_duration:.2f} minutes")
    print(f"Number of annotations for g{i}: {len(filtered_df)}")
    print()

File raw_g0_t0.nidq.bin.spec.npy duration: 53.29 minutes
File spiketimes_raw_g0_t0.nidq.bin.npy duration: 53.29 minutes
Number of annotations for g0: 417

File raw_g1_t0.nidq.bin.spec.npy duration: 90.06 minutes
File spiketimes_raw_g1_t0.nidq.bin.npy duration: 89.91 minutes
Number of annotations for g1: 191

File raw_g2_t0.nidq.bin.spec.npy duration: 77.90 minutes
File spiketimes_raw_g2_t0.nidq.bin.npy duration: 70.37 minutes
Number of annotations for g2: 2851



In [9]:
spike_times[0][-10]

63896708.468520574

# Process Spike Times

In [10]:
spike_places_files = ['spikecluster_raw_g0_t0.nidq.bin.npy', 'spikecluster_raw_g1_t0.nidq.bin.npy', 'spikecluster_raw_g2_t0.nidq.bin.npy']

spike_dfs = []

for times_file, places_file in zip(spike_times_files, spike_places_files):
    times = np.load(os.path.join(data_dir, times_file))[0]
    places = np.load(os.path.join(data_dir, places_file))[0]
    
    # Convert times to integers and adjust sampling rate from 20000 Hz to 30000 Hz
    # times = np.round(times * 30000 / 20000).astype(int)
    # neural_resolution = "30k"
    
    times = np.round(times)
    neural_resolution = "20k"
    # Create a DataFrame for this file
    df = pd.DataFrame({'time': times, 'place': places})
    spike_dfs.append(df)

# Unpack the DataFrames
df_0, df_1, df_2 = spike_dfs

In [11]:
df_0.head()

Unnamed: 0,time,place
0,1463.0,326.0
1,15893.0,326.0
2,20591.0,326.0
3,25481.0,326.0
4,29877.0,326.0


In [12]:
# Find and remove duplicates with the same time and place
for i in range(3):
    df = spike_dfs[i]
    df['place'] = df['place'].astype(int)
    duplicates = df[df.duplicated(subset=['time', 'place'], keep=False)].sort_values(['time', 'place'])
    print(f"Data {i}:")
    
    if len(duplicates) > 0:
        print("Duplicated spike times with the same place found:")
        print(duplicates)
        print(f"Total number of duplicates: {len(duplicates)}")
        
        # Remove duplicates
        print(f"DataFrame shape before removing duplicates: {df.shape}")
        df_deduped = df.drop_duplicates(subset=['time', 'place'], keep='first')
        removed_count = len(df) - len(df_deduped)
        spike_dfs[i] = df_deduped
        print(f"Removed {removed_count} duplicate rows.")
        print(f"DataFrame shape after removing duplicates: {df_deduped.shape}")
    else:
        print("No duplicated spike times with the same place found.")
    print()

# Update individual DataFrames
df_0, df_1, df_2 = spike_dfs

Data 0:
Duplicated spike times with the same place found:
               time  place
1709741   2412097.0     31
1709742   2412097.0     31
1200553   5969994.0     27
1200554   5969994.0     27
1991691   5972994.0     40
1991692   5972994.0     40
73586     9623895.0    315
73587     9623895.0    315
2433316  11996845.0     55
2433317  11996845.0     55
632104   12627972.0     21
632105   12627972.0     21
79349    18724584.0    315
79350    18724584.0    315
237957   22453711.0    307
237958   22453711.0    307
1840560  38588677.0     35
1840561  38588677.0     35
291478   40034080.0    377
291479   40034080.0    377
3298932  46195650.0    327
3298933  46195650.0    327
Total number of duplicates: 22
DataFrame shape before removing duplicates: (3318179, 2)
Removed 11 duplicate rows.
DataFrame shape after removing duplicates: (3318168, 2)

Data 1:
Duplicated spike times with the same place found:
                time  place
3083952   39213386.0     35
3083953   39213386.0     35
15440  

# Now convert the spike places into index

In [13]:
for i, df in enumerate(spike_dfs):
    unique_places = df['place'].nunique()
    print(f"Data {i}: Number of unique places: {unique_places}")
    print(f"Unique place values: {sorted(df['place'].unique())}")
    print()
    
unique_places = set()
for df in spike_dfs:
    unique_places.update(df['place'].unique())
unique_places = sorted(unique_places)

print(f"Number of unique places across all DataFrames: {len(unique_places)}")

Data 0: Number of unique places: 75
Unique place values: [12, 19, 20, 21, 25, 27, 31, 34, 35, 36, 38, 39, 40, 41, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 72, 73, 81, 85, 92, 106, 107, 109, 110, 111, 284, 287, 291, 293, 298, 307, 311, 315, 319, 326, 327, 330, 346, 356, 370, 377, 385, 389, 396, 401, 403, 406, 411, 416, 419, 425, 428]

Data 1: Number of unique places: 75
Unique place values: [12, 19, 20, 21, 25, 27, 31, 34, 35, 36, 38, 39, 40, 41, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 72, 73, 81, 85, 92, 106, 107, 109, 110, 111, 284, 287, 291, 293, 298, 307, 311, 315, 319, 326, 327, 330, 346, 356, 370, 377, 385, 389, 396, 401, 403, 406, 411, 416, 419, 425, 428]

Data 2: Number of unique places: 75
Unique place values: [12, 19, 20, 21, 25, 27, 31, 34, 35, 36, 38, 39, 40, 41, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 72, 73, 

In [14]:
cluster_info_path = os.path.join(data_dir, "cluster_info.tsv")
cluster_info = pd.read_csv(cluster_info_path, sep='\t')

In [15]:
filtered_cluster_info = cluster_info[cluster_info['id'].isin(unique_places)]
sorted_cluster_info = filtered_cluster_info.sort_values('depth')

num_rows = len(sorted_cluster_info)
print(f"\nNumber of rows in sorted_cluster_info: {num_rows}")

# Reindex with a new index
sorted_cluster_info.reset_index(drop=True, inplace=True)
sorted_cluster_info['new_index'] = sorted_cluster_info.index

# Print reindexed sorted_cluster_info
print("\nReindexed Sorted Cluster Info:")
print(sorted_cluster_info)

print(f"\nNumber of rows (returned): {num_rows}")


Number of rows in sorted_cluster_info: 75

Reindexed Sorted Cluster Info:
     id  Amplitude  ContamPct KSLabel       amp   ch  depth         fr group  \
0   326       43.8       52.3     mua  3.532561   18    200   7.026394   mua   
1   315       47.8       43.4     mua  3.426856   47    480  10.556721   mua   
2   311       47.8       43.4     mua  3.847038   47    480  12.385760   mua   
3    12       71.5        2.9    good  6.398156   65    660  72.496604  good   
4   307       41.3       81.2     mua  3.364714   82    840   8.601645   mua   
..  ...        ...        ...     ...       ...  ...    ...        ...   ...   
70  346       46.8       41.3     mua  3.699689  297   2980   4.820183   mua   
71  330       46.6       99.4     mua  3.491918  317   3180   3.833593   mua   
72   92       47.3       12.4     mua  3.606650  318   3200   5.159582   mua   
73  327       45.5       52.4     mua  2.837812  321   3220   1.932522   mua   
74  428       57.1       20.8     mua  4.5424

In [16]:
id_to_new_index = dict(zip(sorted_cluster_info['id'], sorted_cluster_info['new_index']))
print("Mapping sample:")
print(list(id_to_new_index.items())[:10])
print()

Mapping sample:
[(326, 0), (315, 1), (311, 2), (12, 3), (307, 4), (377, 5), (356, 6), (385, 7), (370, 8), (20, 9)]



In [17]:
def reindex_place(df):
    df['reindexed_place'] = df['place'].map(id_to_new_index)
    return df

for i, df in enumerate(spike_dfs):
    spike_dfs[i] = reindex_place(df)
    print(f"Data {i}: Added reindexed_place")
    print(f"Number of unique reindexed_place values: {df['reindexed_place'].nunique()}")
    print(f"reindexed_place range: {df['reindexed_place'].min()} to {df['reindexed_place'].max()}")
    print(f"Number of NaN values in reindexed_place: {df['reindexed_place'].isna().sum()}")
    print()

# Update individual DataFrames
df_0, df_1, df_2 = spike_dfs

Data 0: Added reindexed_place
Number of unique reindexed_place values: 75
reindexed_place range: 0 to 74
Number of NaN values in reindexed_place: 0

Data 1: Added reindexed_place
Number of unique reindexed_place values: 75
reindexed_place range: 0 to 74
Number of NaN values in reindexed_place: 0

Data 2: Added reindexed_place
Number of unique reindexed_place values: 75
reindexed_place range: 0 to 74
Number of NaN values in reindexed_place: 0



In [18]:
df_0.tail()

Unnamed: 0,time,place,reindexed_place
3318174,63917341.0,428,74
3318175,63918320.0,428,74
3318176,63919842.0,428,74
3318177,63933035.0,428,74
3318178,63942756.0,428,74


# Save data frame

In [19]:
save_dir = os.path.join("m:", "neuro2voc", "task_5", "data")
os.makedirs(save_dir, exist_ok=True)

# Save each DataFrame to a CSV file
# for i, df in enumerate([df_0, df_1, df_2]):
#     file_name = f"processed_spikes_{i}_{neural_resolution}.csv"
#     file_path = os.path.join(save_dir, file_name)
#     df.to_csv(file_path, index=False)
#     print(f"Saved DataFrame {i} to {file_path}")

# print("All DataFrames have been saved successfully.")

# Save binary

In [24]:
def process_dataframes(dataframes, output_dir, merge=True):
    os.makedirs(output_dir, exist_ok=True)
    
    if merge:
        spike_times = np.concatenate([df['time'].values for df in dataframes])
        neuron_ids = np.concatenate([df['reindexed_place'].values for df in dataframes])
        
        max_time = int(np.ceil(spike_times.max()))
        num_neurons = neuron_ids.max() + 1
        time_series = np.zeros((num_neurons, max_time), dtype=np.int16)
        
        for time, neuron in zip(spike_times, neuron_ids):
            time_index = int(time)
            if time_index < max_time:
                time_series[neuron, time_index] = 1
        
        time_series.tofile(os.path.join(output_dir, 'neural_data_merged.bin'))
        print(f"Merged data saved. Shape: {time_series.shape}")
    
    else:
        for i, df in enumerate(dataframes):
            spike_times = df['time'].values
            neuron_ids = df['reindexed_place'].values
            
            max_time = int(np.ceil(spike_times.max()))
            num_neurons = neuron_ids.max() + 1
            time_series = np.zeros((num_neurons, max_time), dtype=np.int16)
            
            for time, neuron in zip(spike_times, neuron_ids):
                time_index = int(time)
                if time_index < max_time:
                    time_series[neuron, time_index] = 1
            
            time_series.tofile(os.path.join(output_dir, f'neural_data_2_{neural_resolution}.bin'))
            print(f"Data 2 has been saved. Shape: {time_series.shape}")
    
    channel_map = np.arange(num_neurons, dtype=np.int32)
    np.save(os.path.join(output_dir, 'channel_map.npy'), channel_map)
    print(f"Channel mapping has been saved. Number of chanels: {num_neurons}")

In [25]:
process_dataframes([df_2], save_dir, merge=False)

Data 2 has been saved. Shape: (75, 84439846)
Channel mapping has been saved. Number of chanels: 75
