In [None]:
import os
import re
from collections import Counter
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_context('notebook')
sns.set(style='darkgrid')
from pipe import Pipe
import easier as ezr

%matplotlib inline
import pylab as pl
import holoviews as hv
# hv.extension('bokeh')

pd.set_option("display.max_columns",101)

In [None]:
# Folder containing the raw data files for this run
data_dir = './20180318/'

# Base name of the test log; the log is expected to sit inside data_dir
test_log_file_base_name = 'test_log_20180318.csv'

# Upper bound on how many repeated samples are kept per measurement condition
max_samples = 3

# Single-letter channel keys mapped to descriptive names
# (handed to Pipe further down in the notebook)
channel_mapper = {
    'a': 'sig_gen',
    'b': 'res_volt',
    'c': 'rec_volt',
    'd': 'sec_volt',
}

# Full path of the test log, assembled from the directory and base name above
test_log_file = os.path.join(data_dir, test_log_file_base_name)

In [None]:
def find_files(data_dir):
    """
    Find all data files under a specified directory.

    The directory tree is walked recursively.  A file is treated as a data
    file when its *entire* name ends in ``<digits>-<digits>.csv`` or
    ``<digits>-<digits>_<digits>.csv`` (arbitrary leading characters are
    tolerated, as before).

    Parameters
    ----------
    data_dir : str
        Root directory to search.

    Returns
    -------
    pandas.DataFrame
        One row per data file with columns

        * ``file_tag``  -- the file name with the optional ``_<digits>``
          suffix and the ``.csv`` extension removed (this is the value that
          is matched against the log file)
        * ``file_name`` -- the path to the file (``root`` joined with the name)

        The frame is empty (but has both columns) when nothing matches.
    """
    # Pattern identifying a data file.  It is applied with fullmatch() so that
    # names carrying trailing junk (e.g. "12-34.csv.bak") are rejected.
    rex_file = re.compile(r'.*/?\d+\-\d+(_\d+)?\.csv')

    # Pattern for the part removed to form the tag.  The dot is escaped and the
    # pattern is anchored at the end so that only the real suffix/extension is
    # stripped, never a "?csv" sequence elsewhere in the name.
    rex_suffix = re.compile(r'(_\d+)?\.csv$')

    # Recursively collect (file_tag, full path) pairs for every data file
    data_files = [
        (rex_suffix.sub('', file), os.path.join(root, file))
        for root, _dirs, files in os.walk(data_dir)
        for file in files
        if rex_file.fullmatch(file)
    ]

    # create and return the output dataframe
    return pd.DataFrame(data_files, columns=['file_tag', 'file_name'])

In [None]:
def make_log_frame(test_log_file, data_dir, max_samples):
    """
    Build a frame that links every data file to its test-log entry.

    Parameters
    ----------
    test_log_file : str
        Path to the test-log CSV.  It must contain a ``test_no`` column (which
        is dropped) and, after renaming, the columns ``sample``, ``frequency``,
        ``bends``, ``pos`` and ``file_tag``.
    data_dir : str
        Directory searched (via ``find_files``) for data files.
    max_samples : int
        Maximum number of repeated samples kept per measurement condition.

    Returns
    -------
    pandas.DataFrame
        Columns ``sample``, ``frequency``, ``bends``, ``pos``, ``sample_num``
        and ``file_name``; one row per retained data file, ordered by
        condition and then by file name.  ``sample_num`` counts from 1 within
        each (sample, frequency, bends, pos) condition.
    """
    # Read the log, standardize the column names and discard incomplete rows
    # (blank fields in the log are treated as garbage).
    df_log = (
        pd.read_csv(test_log_file)
        .drop(['test_no'], axis=1)
        .rename(columns=dict(primary_position='pos', file_name='file_tag', fatigue_life='life'))
        .dropna()
    )

    # get a frame of all files in the data directory
    df_files = find_files(data_dir)

    # Use the file_tag to link each data file with its log entry.  An inner
    # join is used: files without a log entry cannot be assigned to a
    # condition anyway, and keeping them (right join) would only introduce NaN
    # keys and turn integer columns into floats.
    df_out = pd.merge(df_log, df_files, on=['file_tag'], how='inner')

    # These fields identify a measurement condition (which can be sampled
    # multiple times)
    grouping_fields = ['sample', 'frequency', 'bends', 'pos']

    # Order by file name within each condition so the numbering is deterministic
    df_out = df_out.sort_values(by=grouping_fields + ['file_name'])

    # Number the samples 1..n within each condition.  cumcount() does this
    # directly, without a groupby.apply that mutates each group in place.
    df_out['sample_num'] = df_out.groupby(by=grouping_fields).cumcount() + 1

    # Select the final output fields wanted in the log frame
    df_out = df_out[grouping_fields + ['sample_num', 'file_name']]

    # Don't include more than max_samples for each measurement condition
    return df_out[df_out.sample_num <= max_samples].reset_index(drop=True)

# Build the log frame for the configured data directory
df_log = make_log_frame(test_log_file, data_dir, max_samples=max_samples)

# Quick look: first rows, then the total number of rows
display(df_log.head())
print(df_log.shape[0])

# Distinct values present for each experimental condition
print('sample', sorted(df_log['sample'].unique()))
print('freq', sorted(df_log['frequency'].unique()))
print('bends', sorted(df_log['bends'].unique()))

In [None]:
def get_data(df_log, data_dir, channel_mapper, njobs=1, recompute=False):
    """
    Run the computation across files to extract features, with on-disk caching.

    Results are cached in ``<data_dir>/results.txt`` (CSV format).  The
    computation is run when ``recompute`` is true *or* when no cached results
    file exists yet; otherwise the cached file is simply loaded.

    Parameters
    ----------
    df_log : pandas.DataFrame
        Log frame handed to ``Pipe``.
    data_dir : str
        Directory in which the cached results file lives.
    channel_mapper : dict
        Channel mapping handed to ``Pipe``.
    njobs : int
        Passed to ``Pipe`` as ``n_jobs``.
    recompute : bool
        Force the computation even if cached results exist.

    Returns
    -------
    pandas.DataFrame
        The results as read back from the cache file, so the returned frame
        looks the same whether or not the computation was run in this call.
    """
    # Define the file (in the data directory) that will hold the analysis results
    results_file = os.path.join(data_dir, 'results.txt')

    # These computations can take a really long time, so only run them when
    # asked to, or when there is no cached result to fall back on (previously
    # a missing cache file caused a FileNotFoundError below).
    if recompute or not os.path.isfile(results_file):
        # NOTE: the harmonic number is fixed at 3 here
        p = Pipe(df_log, channel_mapper, n_jobs=njobs, harmonic=3)
        p.process()
        p.df.to_csv(results_file, index=False)

    return pd.read_csv(results_file)

with ezr.Timer('get_data'):
    df = get_data(df_log, data_dir, channel_mapper, njobs=2, recompute=False)

####################
# TODO: Bake this into the log frame
# Rename the "sample" column to "sample_name" and keep only the part of each
# name before the first underscore.  regex=True is required: recent pandas
# versions treat the pattern as a literal string by default, which would leave
# the names untouched.
df = df.rename(columns=dict(sample='sample_name'))
df['sample_name'] = df.sample_name.str.replace('_.*', '', regex=True)
df.head()
###################

In [None]:
def plot_experiment(df, xparams, col_name):
    """
    Plot one figure per value of the overlay parameter, each against a baseline.

    Parameters
    ----------
    df : pandas.DataFrame
        Results frame.  Must contain a ``pos`` column, ``col_name`` and one
        column per key of ``xparams``.
    xparams : dict
        Maps column names to values.  A plain value filters the frame to rows
        where that column equals the value.  A tuple value marks the column as
        the *overlay* column: one figure is drawn for every distinct value in
        that column, and the first element of the tuple is the baseline value
        every figure is compared against.  If several entries are tuples, the
        last one is used as the overlay column.
    col_name : str
        Name of the column to plot.

    Raises
    ------
    ValueError
        If ``xparams`` has no tuple-valued entry, or a tuple entry is empty.

    Notes
    -----
    Sets ``pl.rcParams['figure.figsize']`` and the seaborn context globally.
    """
    # Split xparams into plain filters and the overlay specification up front,
    # so that a bad specification fails with a clear message before anything
    # is plotted or any global plotting state is touched.
    overlay_col = None
    baseline_val = None
    filters = {}
    for k, v in xparams.items():
        if isinstance(v, tuple):
            if len(v) == 0:
                raise ValueError(f'xparams[{k!r}] is an empty tuple; expected (baseline_value,)')
            overlay_col = k
            baseline_val = v[0]
        else:
            filters[k] = v

    if overlay_col is None:
        raise ValueError(
            'xparams must contain exactly one tuple-valued entry, e.g. bends=(0.,), '
            'to select the overlay column and its baseline value'
        )

    pl.rcParams['figure.figsize'] = (20, 6)
    sns.set_context('talk')

    # successively filter the frame to have only params listed in xparams
    key_cols = list(xparams.keys()) + ['pos']
    for k, v in filters.items():
        df = df[df[k] == v]

    # pull out the baseline frame and average repeated samples.
    # numeric_only=True skips text columns (such as file names), which would
    # otherwise make mean() raise on recent pandas versions.
    baseline_batch = df[df[overlay_col] == baseline_val]
    baseline_batch = baseline_batch.groupby(by=key_cols).mean(numeric_only=True).reset_index()

    # Run plotting overlays for all overlay groups
    for key, batch in df.groupby(by=overlay_col):
        mean_batch = batch.groupby(by=key_cols).mean(numeric_only=True).reset_index()

        pl.figure()
        _plot_frame(xparams, batch, baseline_batch, mean_batch, col_name)
    

In [None]:
def _plot_frame(xparams, df, df_baseline, df_mean,  col_name):
    title_val = ' '.join([f'{k}={v[0] if isinstance(v, tuple) else v}' for (k, v) in xparams.items()])
    baseline_col = f'{col_name}_b'
    mean_col = f'{col_name}_m'
    
    df = df[['pos', col_name]]
    dfb = df_baseline[['pos', col_name]]
    dfm = df_mean[['pos', col_name]]
    
    dfj = pd.merge(df, dfb, on='pos', suffixes=['', '_b']).dropna()
    dfj = pd.merge(dfj, dfm, on='pos', suffixes=['', '_m']).dropna()
    
    
    dfj.loc[:, 'delta'] = dfj[col_name] - dfj[baseline_col]
    dfj.loc[:, 'ratio'] = dfj[col_name] / dfj[baseline_col]
    dfj.loc[:, 'delta_m'] = dfj[mean_col] - dfj[baseline_col]
    dfj.loc[:, 'ratio_m'] = dfj[mean_col] / dfj[baseline_col]
    
    pl.subplot(131)
    pl.plot(dfj.pos, dfj[baseline_col])
    pl.plot(dfj.pos, dfj[mean_col])
    pl.plot(dfj.pos, dfj[col_name], 'o')
    pl.xlabel('Position along Pipe (inches)')
    pl.title(title_val)
    pl.ylabel(col_name)
    
    pl.subplot(132)
    pl.plot(dfj.pos, dfj.ratio, 'o')
    pl.plot(dfj.pos, dfj.ratio_m, '-')
    
    pl.title('Ratio')

    pl.subplot(133)
    pl.plot(dfj.pos, dfj.delta, 'o')
    pl.plot(dfj.pos, dfj.delta_m, '-')
    
    pl.title('Difference')
    
    


In [None]:
# Row counts per (sample, frequency, bends) condition, divided by max_samples
display(df_log.groupby(by=['sample', 'frequency', 'bends']).count().div(max_samples))

# Experiment selection: plain values filter the data, the tuple-valued entry
# is the overlay parameter and its first element is the baseline value.
xparams = {
    'sample_name': 'ss',
    'frequency': 1000,
    'bends': (0.,),
}

# Alternative selection: overlay by frequency (baseline 50) at a fixed bends value
# xparams = {
#     'sample_name': 'ss',
#     'frequency': (50,),
#     'bends': 0.,
# }

plot_experiment(df, xparams, col_name='sec_harm_db')
