In [None]:
import glob
import os
import re

import matplotlib.pyplot as plt
import natsort
import numpy as np
import pandas as pd
from sklearn.preprocessing import minmax_scale

# Some signal averaging for Mitchell

## Load files first
The example data files are all placed in the `data` folder. First, write a function that collects the paths of the matching files into a list:

In [None]:
def get_files(pattern):
    """
    Extracts files in alphanumerical order that match the provided pattern

    Args:
        pattern - glob pattern string, or a list/tuple of path components that
                  are joined with os.path.join to form the pattern

    Returns:
        files - list of matching paths, ordered by natsort.natsorted

    Raises:
        FileNotFoundError - if nothing matches the pattern
    """
    # Path components may be given as any list-like of parts, not only a list
    if isinstance(pattern, (list, tuple)):
        pattern = os.path.join(*pattern)

    files = natsort.natsorted(glob.glob(pattern))
    if not files:
        # Report the offending pattern so a wrong data folder is easy to spot
        raise FileNotFoundError('Pattern could not detect file(s): {}'.format(pattern))

    return files

### Inputs Cell:

Set the paths to your input and output directories between the quotes in the cell below.

In [None]:
data_dir = 'data'    # relative path to the folder that holds the input .csv files; replace 'data' with your own folder
output_dir = 'outputs'    # relative path to the folder the result .csv files are written to; replace 'outputs' likewise

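Find the data files and split them by colour channel before loading them as `DataFrames`. File names ending in either `mag.csv` or `magenta.csv` are both treated as magenta.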

In [None]:
# Collect every csv file in the data folder (get_files joins the path parts for us)
data_files = get_files([data_dir, '*.csv'])
print('There are {} data csv files'.format(len(data_files)))

# split into cyan, yellow, magenta by file-name suffix; the trailing `$` makes
# sure the channel name is the very end of the file name
cyan_data = [val for val in data_files if re.search(r'cyan\.csv$', val)]
yellow_data = [val for val in data_files if re.search(r'yellow\.csv$', val)]
magenta_data = [val for val in data_files if re.search(r'(mag|magenta)\.csv$', val)]

# sanity checks: every channel must be present and have the same number of files,
# because later cells pair the channels up by position in these lists
assert cyan_data, 'No *cyan.csv files found in {}'.format(data_dir)
assert len(cyan_data) == len(yellow_data) == len(magenta_data), (
    'Channel file counts differ: cyan={}, yellow={}, magenta={}'.format(
        len(cyan_data), len(yellow_data), len(magenta_data)))

## Create channel specific dataframes

In [None]:
def _stack_value_columns(csv_paths):
    """Read the 'Value' column of every csv and lay them side by side, one column per file.

    Files with fewer rows than the longest one come out of the concat with
    missing entries, which are replaced by 0. Columns are renumbered 0..n-1.
    """
    per_file = (pd.read_csv(path, usecols=['Value']) for path in csv_paths)
    stacked = pd.concat(per_file, axis = 1).fillna(0)
    stacked.columns = np.arange(stacked.shape[1])
    return stacked


df_cyan = _stack_value_columns(cyan_data)
df_yellow = _stack_value_columns(yellow_data)
df_magenta = _stack_value_columns(magenta_data)
# show example:
df_magenta.head()

Now we have three N×56 dataframes (one column per file), which are much easier to work with than the individual `.csv` files.

### Center around max

In [None]:
# find which signal had the smallest number of pixels drawn, use this as standard length
shortest_cyan = np.argmax((df_cyan == 0).astype(int).sum(axis=0).values)
shortest_yellow = np.argmax((df_yellow == 0).astype(int).sum(axis=0).values)
shortest_magenta = np.argmax((df_magenta == 0).astype(int).sum(axis=0).values)
print('Shortest length of cyan is at column {}'.format(shortest_cyan))
print('Shortest length of yellow is at column {}'.format(shortest_yellow))
print('Shortest length of magenta is at column {}'.format(shortest_magenta))

In [None]:
# Report the file behind the shortest cyan column, using the index found above
# rather than a hard-coded file number
print('File {} is {}'.format(shortest_cyan, cyan_data[shortest_cyan]))

In the example data, `_9_4_cyan.csv` is the file with the shortest line. Now we need to find where its maximum is:

In [None]:
# Look at the shortest cyan signal found above instead of a hard-coded column number
cyan_shortest_viz = df_cyan.iloc[:, shortest_cyan].values
cyan_peak = np.argmax(cyan_shortest_viz)
print('Cyan maximum is at index {}'.format(cyan_peak))

# Mask the zero entries so only the drawn samples are plotted
cyan_drawn = np.ma.masked_equal(cyan_shortest_viz, 0)
plt.plot(cyan_drawn)
# Dashed marker at the peak, spanning the value range of the plotted samples
plt.vlines(cyan_peak, ymin=cyan_drawn.min(), ymax=cyan_drawn.max(), linestyles='dashed', colors='r');

Now we write a function to realign the data:

In [None]:
def realign_data(in_data):
    """
    Line up every column's peak with the peak of the longest column, pad with 0's

    The longest column is the one containing the fewest zeros. Each column is
    shifted up or down so that its maximum lands on the row where the longest
    column has its maximum. Rows vacated by the shift are filled with 0 and
    values pushed past either end of the frame are dropped.

    Columns and rows are addressed by position, so the input may carry any
    column labels or row index; the result is labelled 0..n-1 on both axes.

    Args:
        in_data - dataframe with one signal per column

    Returns:
        d - new dataframe with realigned data
        shifts - how each entry was shifted, in rows (positive = towards the end)

    Raises:
        ValueError - if in_data has no rows or no columns
    """
    x, y = in_data.shape
    if x == 0 or y == 0:
        raise ValueError('Cannot realign an empty dataframe')

    d = pd.DataFrame(0, index=np.arange(x), columns = np.arange(y))
    shifts = np.zeros(y)

    # Find longest length sample (fewest zeros) and find its peak
    ind_longest = np.argmin((in_data == 0).sum(axis=0).values)
    peak_longest = np.argmax(in_data.iloc[:, ind_longest].values)

    # arrange the rest of the data's peaks into the new dataframe lining up to longest peak
    for position in range(y):
        signal = in_data.iloc[:, position]
        pdiff = int(peak_longest - np.argmax(signal.values))
        # .values drops the row index so the assignment is positional, not label-aligned
        d[position] = signal.shift(periods=pdiff, fill_value=0).values
        assert np.argmax(d[position].values) == peak_longest, (
            'column {} did not land on the reference peak'.format(position))
        shifts[position] = pdiff
    return d, shifts

def shift_data(in_data, shifts):
    """
    Shift dataframe columns based on input reference shifts

    Column i is moved by shifts[i] rows (positive = towards the end); rows
    vacated by the shift are filled with 0 and values pushed past either end
    are dropped. Columns and rows are addressed by position, so the input may
    carry any labels; the result is labelled 0..n-1 on both axes.

    Args:
        in_data - dataframe with one signal per column
        shifts - array-like with one shift per column; values are cast to int

    Returns:
        d - new shifted dataframe

    Raises:
        ValueError - if shifts does not hold exactly one entry per column
    """
    x, y = in_data.shape
    ind_shifts = np.asarray(shifts).astype(int)
    if ind_shifts.shape != (y,):
        raise ValueError('Expected {} shifts (one per column), got shape {}'.format(y, ind_shifts.shape))

    d = pd.DataFrame(0, index=np.arange(x), columns = np.arange(y))
    for position in range(y):
        shifted = in_data.iloc[:, position].shift(periods=int(ind_shifts[position]), fill_value=0)
        # .values drops the row index so the assignment is positional, not label-aligned
        d[position] = shifted.values

    return d

### Visualize the averaged aligned curve:

In [None]:
# Cyan defines the alignment; yellow and magenta are moved by the shifts computed from cyan
cyan_aligned, shifts = realign_data(df_cyan)
yellow_aligned = shift_data(df_yellow, shifts)
magenta_aligned = shift_data(df_magenta, shifts)

# Per-row average buffers, one entry per aligned row
ave_cyan = np.zeros(cyan_aligned.shape[0])
ave_yellow = np.zeros(yellow_aligned.shape[0])
ave_magenta = np.zeros(magenta_aligned.shape[0])

In [None]:
# Row-wise mean over the non-zero entries only (rows with no non-zero entry give NaN).
# Computed from the aligned values before the per-column scaling below.
ave_cyan = cyan_aligned.where(cyan_aligned != 0).mean(axis=1).to_numpy()

# Scale the non-zero entries of every column to [0, 1] in place; zeros stay untouched.
# NOTE: this overwrites cyan_aligned, and after scaling each column's minimum is
# exactly 0, so re-run the alignment cell before re-running this one.
cyan_aligned = cyan_aligned.astype(float)  # scaled values are floats
for column in cyan_aligned.columns:
    drawn = cyan_aligned[column] != 0
    if drawn.any():  # minmax_scale cannot handle an empty selection
        cyan_aligned.loc[drawn, column] = minmax_scale(cyan_aligned.loc[drawn, column].to_numpy())

fig, ax = plt.subplots()
ax.plot(minmax_scale(ave_cyan))
ax.set(title='Average cyan signal', xlabel='Aligned row index', ylabel='Min-max scaled mean');

In [None]:
# Row-wise mean over the non-zero entries only (rows with no non-zero entry give NaN).
# Computed from the aligned values before the per-column scaling below.
ave_yellow = yellow_aligned.where(yellow_aligned != 0).mean(axis=1).to_numpy()

# Scale the non-zero entries of every column to [0, 1] in place; zeros stay untouched.
# NOTE: this overwrites yellow_aligned, and after scaling each column's minimum is
# exactly 0, so re-run the alignment cell before re-running this one.
yellow_aligned = yellow_aligned.astype(float)  # scaled values are floats
for column in yellow_aligned.columns:
    drawn = yellow_aligned[column] != 0
    if drawn.any():  # minmax_scale cannot handle an empty selection
        yellow_aligned.loc[drawn, column] = minmax_scale(yellow_aligned.loc[drawn, column].to_numpy())

fig, ax = plt.subplots()
ax.plot(minmax_scale(ave_yellow))
ax.set(title='Average yellow signal', xlabel='Aligned row index', ylabel='Min-max scaled mean');

In [None]:
# Row-wise mean over the non-zero entries only (rows with no non-zero entry give NaN).
# Computed from the aligned values before the per-column scaling below.
ave_magenta = magenta_aligned.where(magenta_aligned != 0).mean(axis=1).to_numpy()

# Scale the non-zero entries of every column to [0, 1] in place; zeros stay untouched.
# NOTE: this overwrites magenta_aligned, and after scaling each column's minimum is
# exactly 0, so re-run the alignment cell before re-running this one.
magenta_aligned = magenta_aligned.astype(float)  # scaled values are floats
for column in magenta_aligned.columns:
    drawn = magenta_aligned[column] != 0
    if drawn.any():  # minmax_scale cannot handle an empty selection
        magenta_aligned.loc[drawn, column] = minmax_scale(magenta_aligned.loc[drawn, column].to_numpy())

fig, ax = plt.subplots()
ax.plot(minmax_scale(ave_magenta))
ax.set(title='Average magenta signal', xlabel='Aligned row index', ylabel='Min-max scaled mean');

## Together:

In [None]:
fig, ax = plt.subplots()
# One min-max scaled trace per channel, drawn in the channel's own colour
for channel_name, channel_average, colour in (
    ('cyan', ave_cyan, 'c'),
    ('yellow', ave_yellow, 'y'),
    ('magenta', ave_magenta, 'm'),
):
    ax.plot(minmax_scale(channel_average), c=colour, label=channel_name)

# Label the figure so it can be read on its own
ax.set(title='Average signal per channel', xlabel='Aligned row index', ylabel='Min-max scaled mean')
ax.legend();

### Save as `.csv` files in `outputs/`

Save the averages as one `csv` file in which each column is a colour, and save the aligned data of each individual colour as a separate `csv` file.

In [None]:
# to_csv does not create missing folders, so make sure the output folder exists first
os.makedirs(output_dir, exist_ok=True)

# One column per colour, each average min-max scaled
df_example_average = pd.DataFrame({'cyan': minmax_scale(ave_cyan),
                                   'yellow': minmax_scale(ave_yellow),
                                   'magenta': minmax_scale(ave_magenta)})
df_example_average.to_csv(os.path.join(output_dir, 'averages.csv'), index = False)

# One file per colour holding the aligned per-file data
for file_name, frame in (
    ('cyan_data.csv', cyan_aligned),
    ('yellow_data.csv', yellow_aligned),
    ('magenta_data.csv', magenta_aligned),
):
    frame.to_csv(os.path.join(output_dir, file_name), index=False)