In [1]:
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style("white")
import sys
import os
sys.path.append("engine/")
sys.path.append("funcs/")

from theano import config
import numpy as np
import pickle 
import copy
import time 
import pandas as pd
from operator import itemgetter

%matplotlib inline
sys.setrecursionlimit(10000)

The raw (unpreprocessed) data for datasets 1-5 are available at:
https://www.dropbox.com/s/ugmet5eky8g0l5n/datasets%201-5%20nopreprocessing.zip?dl=0

In [2]:
def get_hz(spikes,hz):
    """Return ``sum(spikes)`` divided by the duration ``len(spikes)/hz``.

    NOTE(review): presumably `spikes` holds per-bin event counts sampled at
    `hz` bins per second, making the result an average rate -- confirm with callers.
    """
    duration = len(spikes)/hz
    n_events = sum(spikes)
    return n_events/duration

def percentile_detrend(trace,wwidth,percentile):
    """Compute a running-percentile trend of `trace`.

    For every index ``t`` the slice
    ``trace[max(0, t - wwidth/2) : min(len(trace), t + wwidth/2)]``
    (bounds truncated to integers) is taken and its `percentile`-th
    percentile is stored at position ``t``.

    Parameters
    ----------
    trace : 1-D array
        Input signal.
    wwidth : number
        Total window width in samples; the window is clipped at both ends
        of the trace.
    percentile : number in [0, 100]
        Percentile of each window used as the trend value.

    Returns
    -------
    Array created with ``np.zeros_like(trace)`` holding the trend. Note that
    the trend itself is returned; it is not subtracted from `trace`.
    """
    n_samples = len(trace)
    half_width = wwidth/2
    trend = np.zeros_like(trace)
    for t in range(n_samples):
        lo = int(max(0, t-half_width))
        hi = int(min(n_samples, t+half_width))
        # Fix: honour the `percentile` argument (it used to be ignored in
        # favour of a hard-coded 5th percentile).
        trend[t] = np.percentile(trace[lo:hi], percentile)
    return trend

def percentile_resize(trace,botperc,topperc):
    """Linearly rescale `trace` between two of its own percentiles.

    The `botperc`-th percentile of `trace` is mapped to 0 and the
    `topperc`-th percentile to 1; all other values follow the same
    affine map. Returns the rescaled trace.
    """
    lower = np.percentile(trace, botperc)
    upper = np.percentile(trace, topperc)
    span = upper - lower
    shifted = trace - lower
    return shifted / span

def dFoF(trace,wwidth,percentile):
    """Relative deviation of `trace` from a running-percentile baseline.

    For every index the baseline is the `percentile`-th percentile of the
    window ``trace[max(0, idx - wwidth/2) : min(len(trace), idx + wwidth/2)]``
    (bounds truncated to integers); the output sample is
    ``(trace[idx] - baseline) / baseline``.

    Returns an array created with ``np.zeros_like(trace)``.
    """
    n_samples = len(trace)
    half_width = wwidth/2
    result = np.zeros_like(trace)
    for idx in range(n_samples):
        start = np.max([0, idx-half_width]).astype(int)
        stop = np.min([n_samples, idx+half_width]).astype(int)
        baseline = np.percentile(trace[start:stop], percentile)
        result[idx] = (trace[idx]-baseline)/baseline
    return result

def list_mean(list):
    """Mean over all elements of a collection of sequences.

    Sums every sequence with ``np.sum`` and divides the grand total by the
    combined length of all sequences.

    Note: the parameter name shadows the builtin ``list`` inside this
    function; it is kept so keyword callers keep working.
    """
    # Single pass over the input: (sum, length) per sequence.
    partials = [(np.sum(seq), len(seq)) for seq in list]
    total = sum(part_sum for part_sum, _ in partials)
    count = sum(part_len for _, part_len in partials)
    return total/count

# CAI-3


## OGB 1

In [3]:
# Dataset 1, training split: read the calcium traces and the matching spike times.
traces = pd.read_csv('1.train.calcium.nopreprocessing.csv')
spiketimes = pd.read_csv('1.train.spike.nopreprocessing.csv')

n_cells = len(traces.columns)

# One list of recordings per cell (column); filled below and in later cells.
traces_raw = [[] for _ in range(n_cells)]
spikes_raw = [[] for _ in range(n_cells)]
traces_prep = [[] for _ in range(n_cells)]

fps = []
spikefps = 300

for i in range(n_cells):
    # Columns are named "1".."n_cells".
    calcium_col = traces[str(i+1)]
    # Row 0 of the column is stored as this cell's fps; the samples follow.
    fps.append(calcium_col[0])
    tra = calcium_col[1:]
    sp = spiketimes[str(i+1)][1:]
    # Drop NaN entries (presumably padding of shorter columns -- TODO confirm).
    tra = np.array(tra[~np.isnan(tra)])
    sp = np.array(sp[~np.isnan(sp)])
    traces_raw[i].append(tra)

    # Histogram the spike times on a grid with step 1000/spikefps covering
    # [0, T), where T = len(tra)*1000/fps (presumably milliseconds -- TODO confirm).
    T = len(tra)*1000/fps[-1]
    bins = np.arange(0,T,1000/spikefps)
    spikes_raw[i].append(np.histogram(sp, bins)[0])

In [4]:
# Dataset 1, test split: only calcium traces are read.
# Note: this rebinds `traces` and `n_cells`, replacing the training-split values.
traces = pd.read_csv('1.test.calcium.nopreprocessing.csv')

n_cells = len(traces.columns)

traces_testset = [[] for _ in range(n_cells)]
traces_testset_prep = [[] for _ in range(n_cells)]

fps_testset = []

for i in range(n_cells):
    calcium_col = traces[str(i+1)]
    # Row 0 of the column is stored as this cell's fps; the samples follow.
    fps_testset.append(calcium_col[0])
    tra = calcium_col[1:]
    # Drop NaN entries before storing the trace.
    tra = np.array(tra[~np.isnan(tra)])
    traces_testset[i].append(tra)

In [5]:
# Rescale every raw training trace so that its 5th percentile maps to 0 and
# its 60th percentile maps to 1, and store the result per cell.
for i, recordings in enumerate(traces_raw):
    for raw in recordings:
        trace = percentile_resize(raw,5,60)
        traces_prep[i].append(trace)

# Same rescaling for the test-set traces.
for i, recordings in enumerate(traces_testset):
    for raw in recordings:
        trace = percentile_resize(raw,5,60)
        traces_testset_prep[i].append(trace)

In [6]:
# Subtract each cell's mean (over all of its recordings) from its rescaled
# training traces and cast to theano's configured float type.
for recordings in traces_prep:
    mean = list_mean(recordings)
    for j, rec in enumerate(recordings):
        recordings[j] = np.array(rec - mean, dtype = config.floatX)

# Same centring for the test-set traces, using their own per-cell mean.
for recordings in traces_testset_prep:
    mean = list_mean(recordings)
    for j, rec in enumerate(recordings):
        recordings[j] = np.array(rec - mean, dtype = config.floatX)

# No manual cropping for this dataset: the training lists are plain deep
# copies of the already-centred data.
spikes_train = copy.deepcopy(spikes_raw)
traces_train = copy.deepcopy(traces_prep)

data_dict = {
    'traces': traces_train,
    'spikes': spikes_train,
    'traces_test': traces_prep,
    'spikes_test': spikes_raw,
    'traces_testset': traces_testset_prep,
    'fps': fps,
    'fps_testset': fps_testset,
    'spike_fps': spikefps,
}

with open('real_cai3np_ogb1.pkl', 'wb') as f:
    pickle.dump(data_dict, f, pickle.HIGHEST_PROTOCOL)

## OGB 2

In [7]:
# Dataset 2, training split: read the calcium traces and the matching spike times.
traces = pd.read_csv('2.train.calcium.nopreprocessing.csv')
spiketimes = pd.read_csv('2.train.spike.nopreprocessing.csv')

n_cells = len(traces.columns)

# One list of recordings per cell (column); filled below and in later cells.
traces_raw = [[] for _ in range(n_cells)]
spikes_raw = [[] for _ in range(n_cells)]
traces_prep = [[] for _ in range(n_cells)]

fps = []
spikefps = 300

for i in range(n_cells):
    # Columns are named "1".."n_cells".
    calcium_col = traces[str(i+1)]
    # Row 0 of the column is stored as this cell's fps; the samples follow.
    fps.append(calcium_col[0])
    tra = calcium_col[1:]
    sp = spiketimes[str(i+1)][1:]
    # Drop NaN entries (presumably padding of shorter columns -- TODO confirm).
    tra = np.array(tra[~np.isnan(tra)])
    sp = np.array(sp[~np.isnan(sp)])
    traces_raw[i].append(tra)

    # Histogram the spike times on a grid with step 1000/spikefps covering
    # [0, T), where T = len(tra)*1000/fps (presumably milliseconds -- TODO confirm).
    T = len(tra)*1000/fps[-1]
    bins = np.arange(0,T,1000/spikefps)
    spikes_raw[i].append(np.histogram(sp, bins)[0])

In [8]:
# Dataset 2, test split: only calcium traces are read.
# Note: this rebinds `traces` and `n_cells`, replacing the training-split values.
traces = pd.read_csv('2.test.calcium.nopreprocessing.csv')

n_cells = len(traces.columns)

traces_testset = [[] for _ in range(n_cells)]
traces_testset_prep = [[] for _ in range(n_cells)]

fps_testset = []

for i in range(n_cells):
    calcium_col = traces[str(i+1)]
    # Row 0 of the column is stored as this cell's fps; the samples follow.
    fps_testset.append(calcium_col[0])
    tra = calcium_col[1:]
    # Drop NaN entries before storing the trace.
    tra = np.array(tra[~np.isnan(tra)])
    traces_testset[i].append(tra)

In [9]:
# Rescale every raw training trace so that its 5th percentile maps to 0 and
# its 60th percentile maps to 1, and store the result per cell.
for i, recordings in enumerate(traces_raw):
    for raw in recordings:
        trace = percentile_resize(raw,5,60)
        traces_prep[i].append(trace)

# Same rescaling for the test-set traces.
for i, recordings in enumerate(traces_testset):
    for raw in recordings:
        trace = percentile_resize(raw,5,60)
        traces_testset_prep[i].append(trace)
        

In [10]:
spikes_train = copy.deepcopy(spikes_raw)
traces_train = copy.deepcopy(traces_prep)

# Manual crops of the training copies only (the *_raw / *_prep lists are
# left intact). Each entry is (cell index, first kept trace sample,
# end trace sample); None leaves that side of the slice open.
# NOTE(review): the sample numbers look hand-picked -- rationale not recorded here.
crops = [
    (7, 300, None),
    (8, None, 3100),
    (12, 800, None),
    (13, 200, None),
]

for cell, first, last in crops:
    traces_train[cell][0] = traces_train[cell][0][first:last]
    # Convert the trace-sample bounds to spike bins with this cell's own fps.
    spike_first = None if first is None else int(first*spikefps/fps[cell])
    spike_last = None if last is None else int(last*spikefps/fps[cell])
    spikes_train[cell][0] = spikes_train[cell][0][spike_first:spike_last]

In [11]:
# Centre the data per cell. The mean is computed from the (possibly cropped)
# training traces and the same value is subtracted from the uncropped
# traces in traces_prep.
for i, recordings in enumerate(traces_train):
    mean = list_mean(recordings)
    for j, rec in enumerate(recordings):
        recordings[j] = np.array(rec - mean, dtype = config.floatX)
        traces_prep[i][j] = np.array(traces_prep[i][j] - mean, dtype = config.floatX)

# The test-set traces are centred on their own per-cell mean.
for recordings in traces_testset_prep:
    mean = list_mean(recordings)
    for j, rec in enumerate(recordings):
        recordings[j] = np.array(rec - mean, dtype = config.floatX)

data_dict = {
    'traces': traces_train,
    'spikes': spikes_train,
    'traces_test': traces_prep,
    'spikes_test': spikes_raw,
    'traces_testset': traces_testset_prep,
    'fps': fps,
    'fps_testset': fps_testset,
    'spike_fps': spikefps,
}

with open('real_cai3np_ogb2.pkl', 'wb') as f:
    pickle.dump(data_dict, f, pickle.HIGHEST_PROTOCOL)

## OGB 4

In [12]:
# Dataset 4, training split: read the calcium traces and the matching spike times.
traces = pd.read_csv('4.train.calcium.nopreprocessing.csv')
spiketimes = pd.read_csv('4.train.spike.nopreprocessing.csv')

n_cells = len(traces.columns)

# One list of recordings per cell (column); filled below and in later cells.
traces_raw = [[] for _ in range(n_cells)]
spikes_raw = [[] for _ in range(n_cells)]
traces_prep = [[] for _ in range(n_cells)]

fps = []
spikefps = 300

for i in range(n_cells):
    # Columns are named "1".."n_cells".
    calcium_col = traces[str(i+1)]
    # Row 0 of the column is stored as this cell's fps; the samples follow.
    fps.append(calcium_col[0])
    tra = calcium_col[1:]
    sp = spiketimes[str(i+1)][1:]
    # Drop NaN entries (presumably padding of shorter columns -- TODO confirm).
    tra = np.array(tra[~np.isnan(tra)])
    sp = np.array(sp[~np.isnan(sp)])
    traces_raw[i].append(tra)

    # Histogram the spike times on a grid with step 1000/spikefps covering
    # [0, T), where T = len(tra)*1000/fps (presumably milliseconds -- TODO confirm).
    T = len(tra)*1000/fps[-1]
    bins = np.arange(0,T,1000/spikefps)
    spikes_raw[i].append(np.histogram(sp, bins)[0])

In [13]:
# Dataset 4, test split: only calcium traces are read.
# Note: this rebinds `traces` and `n_cells`, replacing the training-split values.
traces = pd.read_csv('4.test.calcium.nopreprocessing.csv')

n_cells = len(traces.columns)

traces_testset = [[] for _ in range(n_cells)]
traces_testset_prep = [[] for _ in range(n_cells)]

fps_testset = []

for i in range(n_cells):
    calcium_col = traces[str(i+1)]
    # Row 0 of the column is stored as this cell's fps; the samples follow.
    fps_testset.append(calcium_col[0])
    tra = calcium_col[1:]
    # Drop NaN entries before storing the trace.
    tra = np.array(tra[~np.isnan(tra)])
    traces_testset[i].append(tra)

In [14]:
# Rescale every raw training trace so that its 5th percentile maps to 0 and
# its 60th percentile maps to 1, and store the result per cell.
for i, recordings in enumerate(traces_raw):
    for raw in recordings:
        trace = percentile_resize(raw,5,60)
        traces_prep[i].append(trace)

# Same rescaling for the test-set traces.
for i, recordings in enumerate(traces_testset):
    for raw in recordings:
        trace = percentile_resize(raw,5,60)
        traces_testset_prep[i].append(trace)

In [15]:
# Training copies; no manual cropping is applied for this dataset.
spikes_train = copy.deepcopy(spikes_raw)
traces_train = copy.deepcopy(traces_prep)

# Centre the data per cell: the mean of the training traces is subtracted
# from both the training copy and the corresponding traces in traces_prep.
for i, recordings in enumerate(traces_train):
    mean = list_mean(recordings)
    for j, rec in enumerate(recordings):
        recordings[j] = np.array(rec - mean, dtype = config.floatX)
        traces_prep[i][j] = np.array(traces_prep[i][j] - mean, dtype = config.floatX)

# The test-set traces are centred on their own per-cell mean.
for recordings in traces_testset_prep:
    mean = list_mean(recordings)
    for j, rec in enumerate(recordings):
        recordings[j] = np.array(rec - mean, dtype = config.floatX)

data_dict = {
    'traces': traces_train,
    'spikes': spikes_train,
    'traces_test': traces_prep,
    'spikes_test': spikes_raw,
    'traces_testset': traces_testset_prep,
    'fps': fps,
    'fps_testset': fps_testset,
    'spike_fps': spikefps,
}

with open('real_cai3np_ogb4.pkl', 'wb') as f:
    pickle.dump(data_dict, f, pickle.HIGHEST_PROTOCOL)

## GCaMP6s 3


In [16]:
# Dataset 3, training split: read the calcium traces and the matching spike times.
traces = pd.read_csv('3.train.calcium.nopreprocessing.csv')
spiketimes = pd.read_csv('3.train.spike.nopreprocessing.csv')

n_cells = len(traces.columns)

# One list of recordings per cell (column); filled below and in later cells.
traces_raw = [[] for _ in range(n_cells)]
spikes_raw = [[] for _ in range(n_cells)]
traces_prep = [[] for _ in range(n_cells)]

fps = []
spikefps = 300

for i in range(n_cells):
    # Columns are named "1".."n_cells".
    calcium_col = traces[str(i+1)]
    # Row 0 of the column is stored as this cell's fps; the samples follow.
    fps.append(calcium_col[0])
    tra = calcium_col[1:]
    sp = spiketimes[str(i+1)][1:]
    # Drop NaN entries (presumably padding of shorter columns -- TODO confirm).
    tra = np.array(tra[~np.isnan(tra)])
    sp = np.array(sp[~np.isnan(sp)])
    traces_raw[i].append(tra)

    # Histogram the spike times on a grid with step 1000/spikefps covering
    # [0, T), where T = len(tra)*1000/fps (presumably milliseconds -- TODO confirm).
    T = len(tra)*1000/fps[-1]
    bins = np.arange(0,T,1000/spikefps)
    spikes_raw[i].append(np.histogram(sp, bins)[0])

In [17]:
# Dataset 3, test split: only calcium traces are read.
# Note: this rebinds `traces` and `n_cells`, replacing the training-split values.
traces = pd.read_csv('3.test.calcium.nopreprocessing.csv')

n_cells = len(traces.columns)

traces_testset = [[] for _ in range(n_cells)]
traces_testset_prep = [[] for _ in range(n_cells)]

fps_testset = []

for i in range(n_cells):
    calcium_col = traces[str(i+1)]
    # Row 0 of the column is stored as this cell's fps; the samples follow.
    fps_testset.append(calcium_col[0])
    tra = calcium_col[1:]
    # Drop NaN entries before storing the trace.
    tra = np.array(tra[~np.isnan(tra)])
    traces_testset[i].append(tra)

In [18]:
# Convert every raw training trace with dFoF, using a 10000-sample window
# and the 5th percentile of each window as the baseline.
for i, recordings in enumerate(traces_raw):
    for raw in recordings:
        trace = dFoF(raw,10000,5)
        traces_prep[i].append(trace)

# Same conversion for the test-set traces.
for i, recordings in enumerate(traces_testset):
    for raw in recordings:
        trace = dFoF(raw,10000,5)
        traces_testset_prep[i].append(trace)

In [19]:
spikes_train = copy.deepcopy(spikes_raw)
traces_train = copy.deepcopy(traces_prep)

# Manual crop of the training copy only: keep the first 12500 trace samples
# of cell 4 and the matching leading part of its spike train.
# NOTE(review): the sample number looks hand-picked -- rationale not recorded here.
crop_cell = 4
crop_samples = 12500
traces_train[crop_cell][0] = traces_train[crop_cell][0][:crop_samples]
# Fix: convert trace samples to spike bins with the fps of the cell that is
# actually being cropped. The original used fps[8], an index unrelated to
# cell 4, which gives the wrong length whenever the two rates differ and
# raises IndexError if fewer than nine cells are loaded.
spikes_train[crop_cell][0] = spikes_train[crop_cell][0][:int(crop_samples*spikefps/fps[crop_cell])]

In [20]:
# Centre the data per cell. The mean is computed from the (possibly cropped)
# training traces and the same value is subtracted from the uncropped
# traces in traces_prep.
for i, recordings in enumerate(traces_train):
    mean = list_mean(recordings)
    for j, rec in enumerate(recordings):
        recordings[j] = np.array(rec - mean, dtype = config.floatX)
        traces_prep[i][j] = np.array(traces_prep[i][j] - mean, dtype = config.floatX)

# Drop the first recording of cell 5 from the training copies only;
# traces_prep / spikes_raw still contain it.
# NOTE(review): reason for excluding this recording is not recorded here.
del traces_train[5][0]
del spikes_train[5][0]

# The test-set traces are centred on their own per-cell mean.
for recordings in traces_testset_prep:
    mean = list_mean(recordings)
    for j, rec in enumerate(recordings):
        recordings[j] = np.array(rec - mean, dtype = config.floatX)

data_dict = {
    'traces': traces_train,
    'spikes': spikes_train,
    'traces_test': traces_prep,
    'spikes_test': spikes_raw,
    'traces_testset': traces_testset_prep,
    'fps': fps,
    'fps_testset': fps_testset,
    'spike_fps': spikefps,
}

with open('real_cai3np_gc3.pkl', 'wb') as f:
    pickle.dump(data_dict, f, pickle.HIGHEST_PROTOCOL)

## GCaMP6s 5

In [21]:
# Dataset 5, training split: read the calcium traces and the matching spike times.
traces = pd.read_csv('5.train.calcium.nopreprocessing.csv')
spiketimes = pd.read_csv('5.train.spike.nopreprocessing.csv')

n_cells = len(traces.columns)

# One list of recordings per cell (column); filled below and in later cells.
traces_raw = [[] for _ in range(n_cells)]
spikes_raw = [[] for _ in range(n_cells)]
traces_prep = [[] for _ in range(n_cells)]

fps = []
spikefps = 300

for i in range(n_cells):
    # Columns are named "1".."n_cells".
    calcium_col = traces[str(i+1)]
    # Row 0 of the column is stored as this cell's fps; the samples follow.
    fps.append(calcium_col[0])
    tra = calcium_col[1:]
    sp = spiketimes[str(i+1)][1:]
    # Drop NaN entries (presumably padding of shorter columns -- TODO confirm).
    tra = np.array(tra[~np.isnan(tra)])
    sp = np.array(sp[~np.isnan(sp)])
    traces_raw[i].append(tra)

    # Histogram the spike times on a grid with step 1000/spikefps covering
    # [0, T), where T = len(tra)*1000/fps (presumably milliseconds -- TODO confirm).
    T = len(tra)*1000/fps[-1]
    bins = np.arange(0,T,1000/spikefps)
    spikes_raw[i].append(np.histogram(sp, bins)[0])

In [22]:
# Dataset 5, test split: only calcium traces are read.
# Note: this rebinds `traces` and `n_cells`, replacing the training-split values.
traces = pd.read_csv('5.test.calcium.nopreprocessing.csv')

n_cells = len(traces.columns)

traces_testset = [[] for _ in range(n_cells)]
traces_testset_prep = [[] for _ in range(n_cells)]

fps_testset = []

for i in range(n_cells):
    calcium_col = traces[str(i+1)]
    # Row 0 of the column is stored as this cell's fps; the samples follow.
    fps_testset.append(calcium_col[0])
    tra = calcium_col[1:]
    # Drop NaN entries before storing the trace.
    tra = np.array(tra[~np.isnan(tra)])
    traces_testset[i].append(tra)

In [23]:
# Convert every raw training trace with dFoF, using a 10000-sample window
# and the 5th percentile of each window as the baseline.
for i, recordings in enumerate(traces_raw):
    for raw in recordings:
        trace = dFoF(raw,10000,5)
        traces_prep[i].append(trace)

# Same conversion for the test-set traces.
for i, recordings in enumerate(traces_testset):
    for raw in recordings:
        trace = dFoF(raw,10000,5)
        traces_testset_prep[i].append(trace)

In [24]:
# Independent training copies of the preprocessed traces and binned spikes,
# so later edits to the training data do not touch traces_prep / spikes_raw.
traces_train = copy.deepcopy(traces_prep)
spikes_train = copy.deepcopy(spikes_raw)

In [25]:
# Centre the data per cell: the mean of the training traces is subtracted
# from both the training copy and the corresponding traces in traces_prep.
for i, recordings in enumerate(traces_train):
    mean = list_mean(recordings)
    for j, rec in enumerate(recordings):
        recordings[j] = np.array(rec - mean, dtype = config.floatX)
        traces_prep[i][j] = np.array(traces_prep[i][j] - mean, dtype = config.floatX)

# The test-set traces are centred on their own per-cell mean.
for recordings in traces_testset_prep:
    mean = list_mean(recordings)
    for j, rec in enumerate(recordings):
        recordings[j] = np.array(rec - mean, dtype = config.floatX)

data_dict = {
    'traces': traces_train,
    'spikes': spikes_train,
    'traces_test': traces_prep,
    'spikes_test': spikes_raw,
    'traces_testset': traces_testset_prep,
    'fps': fps,
    'fps_testset': fps_testset,
    'spike_fps': spikefps,
}

with open('real_cai3np_gc5.pkl', 'wb') as f:
    pickle.dump(data_dict, f, pickle.HIGHEST_PROTOCOL)