# Data pre-processing
Pre-processing of ephys (voltage + timestamps) and position data. 
The pre-processing of ephys data includes:

    (1). The conversion of data from the .dat files into voltage; 
    (2). The removal of the first frame index if necessary (it is for chunk=0)
    (3). The subsampling of data to 1:10;
    (4). Saving the data according to tetrode channels into .csv files;
    
  The pre-processing of ephys timestamps includes:
  
    (1). The removal of the first frame index if necessary (it is for chunk=0)
    (2). Conversion to seconds;
    (3). Subsamplig of timestamps to 1:10.
    
  
The pre-processing of position data includes:

    (1). Visual confirmation of each runs (numbered);
    (2). Creation of a run specifications file with run information (error/correct, sample/test, trial number, described below. Created outside the notebook ans saved in a csv file);
    (3). Collect of ROI limits using the video first frame (start ROI, choice point, corners and reward ports);
    (4). Combine run specifications to the position data. Save all into a .csv file.
    

#### Imports

In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
from tqdm import tqdm
import glob
import matplotlib.pyplot as plt
from ephys_utils import *
from position_utils import * 

#### Definition of variables

In [2]:
sample_rate = 30000  # in Hz
nr_channels = 128
chunk_size = 900000  
path='E:/EPHYS/data/NAPOLEAO_DNMP22_18_trials_20180504094832'

## Ephys data pre-processing
Includes the pre-processing of both voltage and timestamps.

##### Get data file names

In [3]:
amplifier_file, ttl_input_file, timestamp_file = get_data_filenames(path)

Amplifier file: 20180504094832_b_amplifier.dat
TTL input file: 20180504094832_b_ttlinput.dat
TTL input file: 20180504094832_b_tstamp_rhd2000.dat


##### Calculate number of blocks

In [4]:
nr_blocks, leftover=calculate_nr_of_blocks(amplifier_file, nr_channels, chunk_size)
all_chunk_sizes=[chunk_size]*nr_blocks#+[chunk_size*leftover]
#Print number of blocks
print('Number of blocks to process: {}'.format(len(all_chunk_sizes)))

Number of samples per channel: 74620920.0
Number of blocks to process: 82


##### Read, convert and reorganize chunks of ephys data

In [61]:
with open(amplifier_file, 'r') as fileid:
    
    for count, ch_s in tqdm(enumerate(all_chunk_sizes)):
        
        # Open chunk into a dataframe
        chunk = open_and_convert_chunk_to_df(fileid, ch_s, nr_channels)
        
        # Convert values to voltage
        chunk_converted = chunk.astype(int).apply(lambda x: ((x-32768) * 0.195))
        
        if count==0: 
            # Get first video frame index in the TTL input file
            first_frame_index = get_ttl_input_first_index(ttl_input_file)
            chunk_clean=chunk_converted.iloc[first_frame_index:].reset_index(drop=True)
            # Split chunk according to tetrode mapping and save to .csv files 
            chunk_clean_subsampled = chunk_clean.iloc[::10]
            organize_chunk_by_tt_and_save(path, chunk_clean_subsampled, count)
            
        else:
            # Split chunk according to tetrode mapping and save to .csv files  
            chunk_subsampled = chunk_converted.iloc[::10]
            organize_chunk_by_tt_and_save(path, chunk_subsampled, count) 

82it [1:18:32, 57.47s/it]


##### Read, convert and store chunks of ephys timestamps

Clip on TTL input using first_frame_index and split into chunks

In [13]:
# Clip on TTL input using first_frame_index
# Split into chunks
with open(timestamp_file, 'r') as fileid:
    for count, ch_s in tqdm(enumerate(all_chunk_sizes)):
        fileid.seek(0, 1)
        # Open chunk and convert into a dataframe
        timestamps_chunk = np.fromfile(fileid, np.int32, ch_s)
        timestamps_chunk = pd.DataFrame(timestamps_chunk)
        
        if count==0: 
            # Clip on TTL input first frame index
            first_frame_index = get_ttl_input_first_index(ttl_input_file)
            timestamps_chunk_clean=timestamps_chunk.iloc[first_frame_index:].reset_index(drop=True)
            # Get the first_timestamp
            first_timestamp = timestamps_chunk_clean.iloc[0]
            
            # Convert to seconds from start
            timestamps_converted = (timestamps_chunk_clean - first_timestamp)*(1/sample_rate)
        else:
            # Convert to second from start
            timestamps_converted = (timestamps_chunk - first_timestamp) * (1/sample_rate)
            
        timestamps_converted = timestamps_converted.rename(columns={'0':'t'})
        timestamps_subsampled = timestamps_converted.iloc[::10]
        print(count)
        print(len(timestamps_subsampled[timestamps_subsampled.duplicated()]))
        # Save chunk into a .csv file
        filename='timestamps_chunk{}.csv'.format(count)
        timestamps_subsampled.to_csv(os.path.join(path, 'Ephys_timestamps', filename))

0it [00:00, ?it/s]

0
0


1it [00:01,  1.51s/it]

1
0


2it [00:02,  1.26s/it]

2
0


3it [00:03,  1.11s/it]

3
0


4it [00:04,  1.06s/it]

4
0


5it [00:05,  1.03s/it]

5
0


6it [00:06,  1.02it/s]

6
0


7it [00:07,  1.06it/s]

7
0


8it [00:08,  1.06it/s]

8
0


9it [00:09,  1.08it/s]

9
0


10it [00:09,  1.10it/s]

10
0


11it [00:10,  1.06it/s]

11
0


12it [00:11,  1.09it/s]

12
0


13it [00:12,  1.01it/s]

13
0


14it [00:13,  1.04it/s]

14
0


15it [00:15,  1.04s/it]

15
0


16it [00:16,  1.11s/it]

16
0


17it [00:17,  1.06s/it]

17
0


18it [00:18,  1.05s/it]

18
0


19it [00:19,  1.11s/it]

19
0


20it [00:20,  1.10s/it]

20
0


21it [00:21,  1.02s/it]

21
0


22it [00:22,  1.03s/it]

22
0


23it [00:23,  1.01it/s]

23
0


24it [00:24,  1.11s/it]

24
0


25it [00:26,  1.22s/it]

25
0


26it [00:27,  1.28s/it]

26
0


27it [00:28,  1.20s/it]

27
0


28it [00:29,  1.17s/it]

28
0


29it [00:30,  1.12s/it]

29
0


30it [00:31,  1.08s/it]

30
0


31it [00:32,  1.05s/it]

31
0


32it [00:34,  1.09s/it]

32
0


33it [00:35,  1.08s/it]

33
0


34it [00:36,  1.22s/it]

34
0


35it [00:37,  1.17s/it]

35
0


36it [00:38,  1.15s/it]

36
0


37it [00:39,  1.17s/it]

37
0


38it [00:41,  1.17s/it]

38
0


39it [00:42,  1.10s/it]

39
0


40it [00:43,  1.05s/it]

40
0


41it [00:43,  1.03s/it]

41
0


42it [00:45,  1.03s/it]

42
0


43it [00:46,  1.04s/it]

43
0


44it [00:47,  1.06s/it]

44
0


45it [00:48,  1.05s/it]

45
0


46it [00:49,  1.01s/it]

46
0


47it [00:50,  1.01it/s]

47
0


48it [00:51,  1.02it/s]

48
0


49it [00:52,  1.01it/s]

49
0


50it [00:53,  1.01s/it]

50
0


51it [00:54,  1.06s/it]

51
0


52it [00:55,  1.03s/it]

52
0


53it [00:56,  1.00s/it]

53
0


54it [00:57,  1.02s/it]

54
0


55it [00:58,  1.01it/s]

55
0


56it [00:59,  1.04it/s]

56
0


57it [01:00,  1.03s/it]

57
0


58it [01:01,  1.02s/it]

58
0


59it [01:02,  1.05s/it]

59
0


60it [01:03,  1.03s/it]

60
0


61it [01:04,  1.03s/it]

61
0


62it [01:05,  1.01s/it]

62
0


63it [01:06,  1.05s/it]

63
0


64it [01:07,  1.07s/it]

64
0


65it [01:08,  1.05s/it]

65
0


66it [01:09,  1.05s/it]

66
0


67it [01:10,  1.03s/it]

67
0


68it [01:11,  1.06s/it]

68
0


69it [01:12,  1.11s/it]

69
0


70it [01:14,  1.23s/it]

70
0


71it [01:16,  1.34s/it]

71
0


72it [01:17,  1.37s/it]

72
0


73it [01:18,  1.26s/it]

73
0


74it [01:19,  1.16s/it]

74
0


75it [01:20,  1.08s/it]

75
0


76it [01:21,  1.03s/it]

76
0


77it [01:22,  1.04s/it]

77
0


78it [01:23,  1.01it/s]

78
0


79it [01:24,  1.03it/s]

79
0


80it [01:25,  1.05it/s]

80
70128


81it [01:26,  1.05it/s]

81
89904


82it [01:26,  1.06s/it]



# Position data pre-processing

In [None]:
# Clip on TTL input using first_frame_index
# Split into chunks
with open(timestamp_file, 'r') as fileid:
    for count, ch_s in tqdm(enumerate(all_chunk_sizes)):
        fileid.seek(0, 1)
        # Open chunk and convert into a dataframe
        timestamps_chunk = np.fromfile(fileid, np.int32, ch_s)
        timestamps_chunk = pd.DataFrame(timestamps_chunk)
        
        if count==0: 
            # Clip on TTL input first frame index
            first_frame_index = get_ttl_input_first_index(ttl_input_file)
            timestamps_chunk_clean=timestamps_chunk.iloc[first_frame_index:].reset_index(drop=True)
            # Get the first_timestamp
            first_timestamp = timestamps_chunk_clean.iloc[0]
            
            # Convert to seconds from start
            timestamps_converted = (timestamps_chunk_clean - first_timestamp)*(1/sample_rate)
        else:
            # Convert to second from start
            timestamps_converted = (timestamps_chunk - first_timestamp) * (1/sample_rate)
            
        timestamps_converted = timestamps_converted.rename(columns={'0':'t'})
        timestamps_subsampled = timestamps_converted.iloc[::10]
         
        # Save chunk into a .csv file
        filename='timestamps_chunk{}.csv'.format(count)
        timestamps_subsampled.to_csv(os.path.join(path, 'Ephys_timestamps', filename))

#### Visual confirmation of individual runs in the session

In [67]:
visual_check_of_individual_runs(path, -30, 300)

Session code: 20180504094832, Rat code: NAPOLEAO

 Opening timestamps:20180504094832_b_tstamp_image.csv. Length:35078

 Opening  x position:20180504094832_b_xcoord.csv. Length:35078

 Opening y position:20180504094832_b_ycoord.csv. Length:35078

   timestamp          x         y   x_diff  run_nr         session       rat
0   0.000000  116.26666  75.66666  0.00000     1.0  20180504094832  NAPOLEAO
1   0.092365  115.35556  61.04444 -0.91110     1.0  20180504094832  NAPOLEAO
2   0.118362  115.30000  61.00000 -0.05556     1.0  20180504094832  NAPOLEAO
3   0.155866  115.10910  60.92728 -0.19090     1.0  20180504094832  NAPOLEAO
4   0.187123  115.10000  60.90000 -0.00910     1.0  20180504094832  NAPOLEAO


KeyboardInterrupt: 

### Creation of a run specs file:
Confirm that all runs are visually consistent with the rat's trajectories in space. Create a run_specs .csv file (for example: 'xxx(ratname)_RUN_SPECS_DNMPxx_2020-06-30T14_05_16.csv'), containing a mapping between the run numbers and each run information: 

    1st. column - run_nr;  
    2nd column - run_type ('S', sample or 'T', test); 
    3rd column - outcome(0 - error, 1 - correct). 
    4th colum - trial number
If a run does not exist (is not visually consistent with a rat' trajectory, all other columns should contain 'NaN').
Column names should not be included in the file. They are added later on!
Save as a .csv file into the directory containing the position and timestamp files, such that the run specifications can be added later on during the analysis.

### Collect CP and start limits
In this order:

<b>1st: start limits</b>
   - Only the xlim will be used later on. Size of ROI not as important.
   - Use the reference tape to create ROI. 

<b>2nd: CP limits</b>
   - Size of ROI is important and it should align with maze limits.
   - Use maze CP square limits to create ROI. 

<b>3nd: Corner1 limits (left from start region) </b>
   - Size of ROI is important and it should align with maze limits.

<b>4th:  RW1 limits (left from start region) </b>
   - Only the xlim will be used later on. Size of ROI not as important.
   - Target xlim to be in the center of the well.

<b>5th:  Corner2 limits  (right from start region) </b>
   - Size of ROI is important and it should align with maze limits.

<b>6th: RW2 limits (right from start region) </b>
   - Only the xlim will be used later on. Size of ROI not as important.
   - Target xlim to be in the center of the well.

In [33]:
session_code = collect_maze_limits(path)

Directory is empty. Selecting the 1st frame from the videos
E:/EPHYS/data/NAPOLEAO_DNMP22_20180504094832\ROI_Frames\20180504094832_b_movie_1stframe.jpg
(219, 502, 87, 54)
E:/EPHYS/data/NAPOLEAO_DNMP22_20180504094832\ROI_Frames\20180504094832_b_movie_1stframe.jpg
(1068, 505, 73, 55)
E:/EPHYS/data/NAPOLEAO_DNMP22_20180504094832\ROI_Frames\20180504094832_b_movie_1stframe.jpg
(1068, 135, 77, 64)
E:/EPHYS/data/NAPOLEAO_DNMP22_20180504094832\ROI_Frames\20180504094832_b_movie_1stframe.jpg
(880, 131, 25, 64)
E:/EPHYS/data/NAPOLEAO_DNMP22_20180504094832\ROI_Frames\20180504094832_b_movie_1stframe.jpg
(1050, 856, 78, 68)
E:/EPHYS/data/NAPOLEAO_DNMP22_20180504094832\ROI_Frames\20180504094832_b_movie_1stframe.jpg
(880, 864, 31, 73)
      x      y  width  height         session
0  43.8  100.4   17.4    10.8  20180504094832
       x      y  width  height         session
0  213.6  101.0   14.6    11.0  20180504094832
       x     y  width  height         session
0  176.0  26.2    5.0    12.8  20180504

### Add run specifications
All position data from runs with NaNs will be removed from the dataframe

In [34]:
data = add_run_specs(path, session_code)

### Save data into a .csv file

In [35]:
file_path = os.path.join(path,
                        "Timestamped_position", 
                        "%s_timestamped_position_df_clean.csv"%session_code)
data.to_csv(file_path, index=False)

### Calculate the common average

In [82]:
common_average_tts = [1,4,8,12] 

In [83]:
if 'HOMERO' in path:
    
    tt_lfps = {}
    
    # Calculate common average from one channel per tetrode
    for tt in tqdm(common_average_tts): 
        
        tt_path=os.path.join(path, 'TT{}'.format(tt))
        files = get_file_list(tt_path, "*.csv")   
        tt_lfp=list() 
        
        for f in files:
            # Read each ephys raw data file
            file_path = os.path.join(tt_path, f)
            chunk = pd.read_csv(file_path, index_col=0)
                
            # Append chunk dataframe from each file to list
            tt_lfp.append(chunk.iloc[:,0]) 
                
        # Concatenate and store in a dictionary   
        tt_lfps[tt]=pd.concat(tt_lfp)
        
    data= pd.concat(tt_lfps, axis=1)
    common_average = np.mean(data, axis=1)
    file_path = os.path.join(path, "%s_common_average.csv"%session_code)

    common_average.to_csv(file_path, index=False)

100%|████████████████████████████████████████████| 4/4 [00:48<00:00, 12.14s/it]


In [84]:
common_average.shape

(4318861,)