# Data summarizing code

Save the code snippets that summarize data obtained directly from batch processes such as *df2_kinetics.py*. The snippets here should generate a *summary.csv* file for each day's data (of a specific type). 

## Guidelines
- Each snippet should contain a summarizing part and a retrieval part

## <font color='blue'>Plans</font>

<font color='blue'>
- ...
    
</font>

## Table of contents

1. Correlation length: angle and velocity
2. df2_kinetics
    - GNF curves at different concentrations
    - $\alpha$
    - Order parameter
3. Summarize local correlation between df and energy
4. Summarize energy spectrum data

## 0 Packages

In [1]:
import pandas as pd
import numpy as np
import os
from corrLib import readdata
from corr_utils import *
from IPython.display import clear_output

In [2]:
# Root folder for the final, plot-ready tables: later cells join it with
# r'Research projects\DF\data\...' when writing gnf.csv and energy_spectra-1.csv.
data_master_dir = r'E:\Google Drive' # data folder: Google drive for now, but may change in the future
# Later cells iterate `dirs` as a mapping: each key is passed to int() and used as
# the concentration, each value is indexed as a sequence of 'date-num' strings.
dirs = data_log_mapping(kw='aug') # video info for GNF raw data, obtained from Aug 3rd to Aug 6th

### 1 Correlation length: angle and velocity

In [None]:
# Summarize one day's angle/velocity correlation data into summary.csv.
#
# For every sample subfolder under `corr_data_path`:
#   1. draw `num_sample` random frame pairs from the last 1/3 of the sequence,
#   2. fit a correlation length to the angle (CA) and velocity (CV)
#      correlation of each drawn pair,
#   3. store the mean of the fitted lengths for that sample.
# The result (columns: sample, clA, clV) is written to
# <corr_data_path>/summary.csv.
corr_data_path = r'E:\moreData\08052020\cav_imseq'
num_sample = 20 # number of frames to sample
sample_clA = []
sample_clV = []
sfL = next(os.walk(corr_data_path))[1]
sfL_new = []
for s in sfL:
    corr_data_path_num = os.path.join(corr_data_path, s)

    # `readdata` is imported by name (from corrLib import readdata); the module
    # name `corrLib` itself is not imported by this notebook.
    l = len(readdata(corr_data_path_num))
    if l <= 60:
        raise ValueError('Too few data to sample from. Need at least 60 .csv files.')

    # Files are named '<i>-<i+1>.csv' with even i, so draw integers in
    # [l/3, (l-1)/2) and double them: this yields even frame numbers lying in
    # the last 1/3 of the sequence. Integer bounds are passed explicitly
    # instead of relying on float truncation.
    samples = np.random.randint(l // 3, (l - 1) // 2, num_sample) * 2
    clAL = []
    clVL = []
    for i in samples:
        data_raw = pd.read_csv(os.path.join(corr_data_path_num, '{0:04d}-{1:04d}.csv'.format(i, i+1))) # X Y CA CV
        data_AV = xy_to_r(data_raw).sort_values(by=['R'])
        # corr_length is fed a column named 'C', so rename the relevant
        # correlation column before each fit; the fit object is not needed.
        clA, _ = corr_length(data_AV.rename(columns={'CA': 'C'}), fitting_range=500)
        clV, _ = corr_length(data_AV.rename(columns={'CV': 'C'}), fitting_range=500)
        clAL.append(clA)
        clVL.append(clV)
    sample_clA.append(np.array(clAL).mean())
    sample_clV.append(np.array(clVL).mean())
    sfL_new.append(int(s))
summary = pd.DataFrame({'sample': sfL_new, 'clA': sample_clA, 'clV': sample_clV})
summary.to_csv(os.path.join(corr_data_path, 'summary.csv'), index=False)
# TODO: needs further improvement

In [None]:
# Retrieve the per-day correlation-length summaries (cav_imseq/summary.csv)
# and reduce them to one mean / standard deviation per concentration.
#
# `dirs` maps a concentration key to a list of 'date-num' strings; for each
# entry the row of sample `num` is read from that date's summary.csv.
folder = r'E:\moreData'
subfolder_name = 'cav_imseq'
data = {'conc': [], 'clA_avg': [], 'clV_avg': [], 'clA_std': [], 'clV_std': []}
for kw in dirs:
    if kw == '00':
        continue
    conc = int(kw)
    rows = []
    for dn in dirs[kw]:
        date, num = dn.split('-')
        summary_dir = os.path.join(folder, date, subfolder_name, 'summary.csv')
        rows.append(pd.read_csv(summary_dir, index_col='sample').loc[[int(num)]])
    # DataFrame.append no longer exists in pandas >= 2.0, so the rows are
    # collected and concatenated once. pd.concat raises ValueError on an empty
    # list, which makes a concentration without any data fail loudly instead
    # of silently reusing the previous concentration's rows.
    conc_data = pd.concat(rows)
    data['conc'].append(conc)
    data['clA_avg'].append(conc_data.clA.mean())
    data['clV_avg'].append(conc_data.clV.mean())
    data['clA_std'].append(conc_data.clA.std())
    data['clV_std'].append(conc_data.clV.std())

## 2 df2_kinetics

### 2.1 GNF curves at different concentrations

In [21]:
# Build a small lookup table (conc, date, num) with one row per concentration
# up to 120, sorted by concentration. For every key the *second* 'date-num'
# entry of dirs[key] is the one selected.
selected_keys = [kw for kw in dirs if int(kw) <= 120]
conc = [int(kw) for kw in selected_keys]
date_num_pairs = [dirs[kw][1].split('-') for kw in selected_keys]
date = [d for d, _ in date_num_pairs]
num = [int(n) for _, n in date_num_pairs]
data_log = pd.DataFrame({'conc': conc, 'date': date, 'num': num}).sort_values(by='conc')
# NOTE:
# This cell is a stop-gap. A single, consistent way of loading the experiment log and mapping it to
# the data is still missing; several ad-hoc approaches exist in this project and none is convenient.
# Keep this block for now and replace it later.
# NOTE(review): this rebinds the name `data_log` to a DataFrame, while the alpha-summary cell of this
# notebook calls `data_log()` as a function -- the two cannot both work in the same kernel session
# without re-importing the function in between.

In [22]:
# Collect the GNF curve of the last time segment of every selected sample
# into one wide table (index: N, one column per concentration) and save it
# as gnf.csv. Concentrations 0 and 85 are left out.
lb = 10
data_dir = r'E:\moreData\{0}\df2_kinetics\{1:02d}\kinetics_data.csv'
count = 0
for _, entry in data_log.iterrows():
    if entry.conc == 0 or entry.conc == 85:
        continue
    k_data = pd.read_csv(data_dir.format(entry.date, entry.num))
    last_segment = k_data.segment.max()
    gnf_data = k_data.loc[k_data.segment == last_segment]
    x, y = postprocess_gnf(gnf_data, lb, xlim=[10, 10000], sparse=1)
    # one column named after the concentration, indexed by N
    temp = pd.concat([x, y], axis=1, keys=['N', str(entry.conc)]).set_index('N')
    master = temp if count == 0 else pd.concat([master, temp], axis=1)
    count += 1
master.to_csv(os.path.join(data_master_dir, r'Research projects\DF\data\gnf.csv'))

In [17]:
def postprocess_gnf(gnf_data, lb, xlim=None, sparse=3):
    """
    Postprocess raw GNF data for plotting.

    The x values are ``n / lb**2`` and the y values are ``d / sqrt(n)``.

    Args:
    gnf_data -- DataFrame containing columns ('n', 'd'), generated by df2_nobp.py
    lb -- size of bacteria (pixel, normalizing factor of x axis)
    xlim -- box size beyond which the data get cut off, in units of lb**2:
            if xlim is None, no data are cut off,
            if xlim is a single number, it is the upper limit: only rows with n < xlim*lb**2 are kept,
            if xlim is a sequence of 2 numbers (list, tuple, array), only rows with
            xlim[0]*lb**2 <= n < xlim[1]*lb**2 are kept
    sparse -- the degree to sparsify the data, 1 is doing nothing, 3 means only keep 1/3 of the original data

    Returns:
    x, y -- a tuple that can be plotted directly using plt.plot(x, y)

    Raises:
    ValueError -- if xlim is a sequence whose length is not 2
    """
    area = lb**2

    if xlim is None:
        data = gnf_data
    elif np.ndim(xlim) == 0:
        # any scalar (int, float, numpy scalar) is an upper limit
        data = gnf_data.loc[gnf_data.n < xlim * area]
    elif len(xlim) == 2:
        lower, upper = xlim
        data = gnf_data.loc[(gnf_data.n >= lower * area) & (gnf_data.n < upper * area)]
    else:
        # fail with a clear message instead of leaving `data` unbound
        raise ValueError('xlim must be None, a number, or a sequence of 2 numbers, got {!r}'.format(xlim))

    xx = data.n / area
    yy = data.d / data.n**0.5

    # sparsify: keep every `sparse`-th row, by position
    x = xx.iloc[::sparse]
    y = yy.iloc[::sparse]

    return x, y

### 2.2 $\alpha$

In [None]:
# Per-day summary of alpha: for every sample folder under
# <date>\df2_kinetics, average the 'alpha' values returned by plot_kinetics
# over the late-time window and save one row per sample (columns: n, alpha)
# to <date>\df2_kinetics\summary.csv.
# NOTE(review): `data_log()` is called as a function below, while the GNF-curve cell of this
# notebook binds the name `data_log` to a DataFrame -- make sure the function is the one in scope.
dates = ['08032020', '08042020', '08052020', '08062020']
sampling_range = 1 # fraction (0 to 1) of the longest time; alpha is averaged over t >= max(t) * sampling_range
fitting_range = [10, ] # um
############################################

for date in dates:
    date_folder = r'D:\density_fluctuations\{}'.format(date)
    k_master_folder = os.path.join(date_folder, 'df2_kinetics')
    i_master_folder = os.path.join(date_folder, 'overall_intensity')
    sfL = next(os.walk(k_master_folder))[1]
    n_list = []
    alpha_list = []
    for sf in sfL:
        n = int(sf)
        n_list.append(n)
        fps = data_log()[date]['fps'][n]
        k_folder = r'D:\density_fluctuations\{0}\df2_kinetics\{1:02d}'.format(date, n)
        i_folder = r'D:\density_fluctuations\{0}\overall_intensity\{1:02d}'.format(date, n)
        k_data = pd.read_csv(os.path.join(k_folder, 'kinetics_data.csv'))
        i_data = pd.read_csv(os.path.join(i_folder, 'intensity.csv'))
        data = plot_kinetics(k_data, i_data, tlim=None, xlim=[5, 100], lb=10, mpp=0.33, seg_length=50, fps=fps, plot=False)
        df = pd.DataFrame().assign(t=data['t0'], alpha=data['alpha'])
        # late-time window: rows whose time is at least sampling_range * max(t)
        late = df['t'] >= df['t'].max()*sampling_range
        alpha = df.loc[late, 'alpha'].mean()
        alpha_list.append(alpha)
    data = pd.DataFrame({'n': n_list, 'alpha': alpha_list})
    data.to_csv(os.path.join(k_master_folder, 'summary.csv'), index=False)

### 2.3 Order parameter

In [None]:
# For every sample folder under `eo_folder`, average E and OP over the rows
# whose time t exceeds 90 % of the largest t in energy_order.csv.
eo_folder = r'D:\density_fluctuations\08042020\energy_order'
sfL = next(os.walk(eo_folder))[1]
OP_list = []
E_list = []
for sf in sfL:
    print('summarizing data in {}'.format(sf))
    eo_data = pd.read_csv(os.path.join(eo_folder, sf, 'energy_order.csv'))
    late_rows = eo_data.t > eo_data.t.max()*0.9
    eo_data_crop = eo_data.loc[late_rows]
    E_list.append(eo_data_crop.E.mean())
    OP_list.append(eo_data_crop.OP.mean())

In [None]:
# One row per sample folder: late-time mean energy (E) and order parameter (OP).
data = pd.DataFrame({'sample': sfL, 'E': E_list, 'OP': OP_list})
eo_summary_path = os.path.join(eo_folder, 'eo_summary.csv')
data.to_csv(eo_summary_path, index=False)

## 3 Summarize local correlation between df and energy

In [None]:
# For every sample folder, correlate each local density-fluctuation map
# (.npy files under <sample>\dt=10) with the squared PIV speed u**2 + v**2 of
# the matching frame pair, and average the correlation over the frames whose
# number exceeds 80 % of the largest frame number.
folder = r'D:\density_fluctuations\08062020\local_df'
sfL = next(os.walk(folder))[1]
avg_list = []
for sf in sfL:
    print('summarizing data in {}'.format(sf))
    df_folder = os.path.join(folder, sf, 'dt=10')
    piv_folder = r'D:\density_fluctuations\08062020\piv_imseq\{}'.format(sf)
    l = readdata(df_folder, 'npy')
    frame_numbers = l.Name.astype('int')
    l = l.loc[frame_numbers > frame_numbers.max()*0.8]
    corr_list = []
    for _, entry in l.iterrows():
        df = np.load(entry.Dir)
        n = int(entry.Name)
        piv_name = '{0:04d}-{1:04d}.csv'.format(n, n+1)
        pivData = pd.read_csv(os.path.join(piv_folder, piv_name))
        # PIV rows are reshaped to a fixed 42 x 50 grid before correlating
        speed_sq = np.array(pivData.u**2 + pivData.v**2).reshape((42, 50))
        corr_list.append(corr2d(df, speed_sq))
    avg_corr = np.array(corr_list).mean()
    avg_list.append(avg_corr)
    print('{0}: {1:.3f}'.format(sf, avg_corr))

In [None]:
# One row per sample folder; the column named 'E' holds the averaged
# correlation values collected in avg_list.
data = pd.DataFrame({'sample': sfL, 'E': avg_list})
corr_summary_path = os.path.join(folder, 'corr_E_sl=10.csv')
data.to_csv(corr_summary_path, index=False)

## 4 Summarize energy spectrum data

### 4.1 Level - 1
Summarize each day's data into summary.csv 

In [8]:
def summarize_es(folder):
    """
    Average the energy spectrum data of each subfolder under folder and save all averages to folder/summary.csv.

    For every subfolder, all .csv files listed by readdata are read and averaged element-wise
    (row i of one file with row i of the others, for every column). The averaged table gets an
    extra column 'sample' holding the subfolder name and is appended to summary.csv, so the
    summary is written incrementally, one subfolder at a time. An existing summary.csv is
    overwritten by the first subfolder that contains data.

    Args:
    folder -- directory whose immediate subfolders each hold the energy spectrum .csv files of one sample

    Returns:
    None. The result is the file folder/summary.csv.
    """
    summary_path = os.path.join(folder, 'summary.csv')
    sfL = next(os.walk(folder))[1]
    header_written = False
    for sf in sfL:
        clear_output(wait=True)
        print('Summarizing {0}\\{1} ...'.format(folder, sf))
        es_folder = os.path.join(folder, sf) # for varstep, experiment number '00' needs to be included
        l = readdata(es_folder, 'csv')

        # Accumulate the sum file by file and count the files explicitly, so
        # the result does not depend on how readdata labels its rows.
        total = None
        n_files = 0
        for _, i in l.iterrows():
            frame = pd.read_csv(i.Dir)
            total = frame if total is None else total + frame
            n_files += 1
        if n_files == 0:
            # nothing to average: skip instead of reusing a previous sample's data
            print('No .csv data found in {0}\\{1}, skipped.'.format(folder, sf))
            continue

        # Divide by the number of files that were summed (not by n_files + 1).
        data = total / n_files
        summary = data.assign(sample=sf)

        # The first written sample creates the file with a header, later ones append.
        if header_written:
            summary.to_csv(summary_path, index=False, mode='a', header=False)
        else:
            summary.to_csv(summary_path, index=False)
            header_written = True

In [7]:
# Build summary.csv for the energy spectrum data of each measurement day.
folders = ['08032020', '08042020', '08052020', '08062020']
for day in folders:
    summarize_es(os.path.join(r'E:\moreData', day, 'energy_spectrum'))

Summarizing E:\moreData\08032020\energy_spectrum\00 ...
Summarizing E:\moreData\08032020\energy_spectrum\01 ...
Summarizing E:\moreData\08032020\energy_spectrum\02 ...
Summarizing E:\moreData\08032020\energy_spectrum\03 ...
Summarizing E:\moreData\08032020\energy_spectrum\04 ...
Summarizing E:\moreData\08032020\energy_spectrum\05 ...
Summarizing E:\moreData\08032020\energy_spectrum\06 ...
Summarizing E:\moreData\08032020\energy_spectrum\07 ...
Summarizing E:\moreData\08032020\energy_spectrum\08 ...
Summarizing E:\moreData\08032020\energy_spectrum\09 ...
Summarizing E:\moreData\08032020\energy_spectrum\10 ...
Summarizing E:\moreData\08032020\energy_spectrum\11 ...
Summarizing E:\moreData\08032020\energy_spectrum\12 ...
Summarizing E:\moreData\08032020\energy_spectrum\13 ...
Summarizing E:\moreData\08032020\energy_spectrum\14 ...
Summarizing E:\moreData\08042020\energy_spectrum\00 ...
Summarizing E:\moreData\08042020\energy_spectrum\01 ...
Summarizing E:\moreData\08042020\energy_spectrum

### 4.2 Level - 2
Prepare the data for plotting

In [9]:
# Build the plot-ready energy spectrum table: one column per concentration,
# indexed by wavenumber k, saved as energy_spectra-1.csv.
# conc / folders / sample_num are parallel lists: for concentration conc[i]
# the samples sample_num[i] of day folders[i] are averaged.
conc = [120, 100, 85, 80, 70, 60, 50, 40, 30, 20, 10]
folders = ['08062020', '08062020', '08052020', '08032020', '08042020', '08032020', 
           '08042020', '08032020', '08042020', '08032020', '08042020']
sample_num = [range(3, 6), range(0, 3), range(3, 6), range(0, 3), range(0, 3), 
              range(3, 6), range(3, 6), range(6, 9), range(6, 9), range(9, 12), range(9, 12)]
master = None
# iterate from the lowest to the highest concentration
for c, f, s in zip(conc[::-1], folders[::-1], sample_num[::-1]):
    if c == 85:
        continue
    summary = pd.read_csv(os.path.join(r'E:\moreData',
                           f, r'energy_spectrum\summary.csv'))

    # average the repeated samples of this concentration row by row
    sample_id = summary['sample'].astype('int')
    repeats = [summary.loc[sample_id==num].reset_index(drop=True) for num in s]
    data = repeats[0]
    for extra in repeats[1:]:
        data += extra
    data /= len(repeats)

    x, y = xy_bin(data.k, data.E)
    ###########################################
    # Per the original author's note, the binning step returns, for each bin, the average of E
    # over that bin (confirm against xy_bin). The 1D energy spectrum, however, is the *sum* of the
    # 2D spectrum over a ring of fixed width dk, i.e. mean * circumference (2*pi*k) for constant dk.
    # Here k is x, hence the rescaling below.
    y *= 2*np.pi*x 
    ###########################################

    # Cut off the data x > 0.15
    keep = x<0.15
    xx = x[keep]
    yy = y[keep]

    temp = pd.DataFrame(data={str(c): yy}, index=range(len(yy)))
    master = temp if master is None else pd.concat([master, temp], axis=1)
# the k axis of the last processed concentration is used as the common index
master = master.assign(k=xx).set_index('k').dropna() 
master.to_csv(os.path.join(data_master_dir, r'Research projects\DF\data\energy_spectra-1.csv'))

                E         k  sample
0     8116.858922  0.000000      10
1     1191.220488  0.002396      10
2     1191.220488  0.002396      10
3      958.556400  0.002853      10
4      958.556400  0.002853      10
...           ...       ...     ...
2095     2.266821  0.082730      10
2096     2.266821  0.082730      10
2097     2.158107  0.083046      10
2098     2.158107  0.083046      10
2099     2.046609  0.084723      10

[2100 rows x 3 columns]
                E         k  sample
0     5429.047983  0.000000      11
1     1010.685720  0.002396      11
2     1010.685720  0.002396      11
3      918.630744  0.002853      11
4      918.630744  0.002853      11
...           ...       ...     ...
2095     0.959304  0.082730      11
2096     0.959304  0.082730      11
2097     1.046436  0.083046      11
2098     1.046436  0.083046      11
2099     1.014198  0.084723      11

[2100 rows x 3 columns]
                 E         k  sample
0     14982.173566  0.000000      10
1     10560.

  np.histogram(xo, x)[0])
  np.histogram(xo, x)[0])
  np.histogram(xo, x)[0])
  np.histogram(xo, x)[0])
  np.histogram(xo, x)[0])
  np.histogram(xo, x)[0])


                  E         k  sample
0     413303.282827  0.000000       1
1     622923.855525  0.002403       1
2     622923.855525  0.002403       1
3     385019.194678  0.002861       1
4     385019.194678  0.002861       1
...             ...       ...     ...
2095      23.104398  0.082959       1
2096      23.104398  0.082959       1
2097      20.967080  0.083276       1
2098      20.967080  0.083276       1
2099      20.579615  0.084958       1

[2100 rows x 3 columns]
                  E         k  sample
0     749285.715571  0.000000       2
1     756384.591627  0.002403       2
2     756384.591627  0.002403       2
3     504014.920389  0.002861       2
4     504014.920389  0.002861       2
...             ...       ...     ...
2095      22.480511  0.082959       2
2096      22.480511  0.082959       2
2097      22.050292  0.083276       2
2098      22.050292  0.083276       2
2099      21.278089  0.084958       2

[2100 rows x 3 columns]
                  E         k  sample


  np.histogram(xo, x)[0])
  np.histogram(xo, x)[0])
  np.histogram(xo, x)[0])
  np.histogram(xo, x)[0])
