# ABOVE | AC9 Data Processing
***
## 01 Read AC9 files
***

**Authors:** Catherine Kuhn and Elena Terzić   
**Last Updated:** August 15th, 2018
***

This code reads in raw ac9 .dat files and parses sample information from the filename and from the header information inside the file. For each input file, the output is a *.csv* table of summary statistics with one row per wavelength. This code was built for a workflow in which the a and c sides are sampled separately. File names should contain: date, site, sample type, a or c side, rep and water temperature. 

File names are formatted like **AC9_dddddd_sit_sam_s_r_TXX_X.dat** where:

- **dddddd** = date (071718)
- **sit** = three letter site code (fai)
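- **sam** = three letter sample type (cal, raw, fil) for calibration, raw (unfiltered) water or filtered water
- **s** = side of the instrument that was sampled (a or c)
- **r** = numbered replicate (1, 2, 3) 
- **TXX_X** = water temperature in Celsius, with an underscore in place of the decimal point (T17_3 = 17.3)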
    
**Ex:** AC9_071618_y17_raw_a_1_T17_6.dat



### Load required packages

In [60]:
### Import the required python libraries
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import glob
import os
import sys
import csv

### Import files

In [61]:
# Change to the directory that holds the raw AC9 .dat files to be processed.
# `%cd` is an IPython magic, so this line only runs inside IPython/Jupyter.
# The file search in the following cell uses a relative pattern, so it is
# resolved against this directory.
%cd /Users/ckuhn/Documents/ABOVE/Data/AC9/ac9_data/1_1_renamed_originals/cals/

/Users/ckuhn/Documents/ABOVE/Data/AC9/ac9_data/1_1_renamed_originals/cals


In [62]:
# Collect every AC9 .dat file in the working directory, then order the
# resulting list of names alphabetically
AC9 = glob.glob('AC9*.dat')
AC9.sort()

#### Use the wavelength labels listed in the header (the upper part) of each file to name and organize the columns of the main dataframe.

In [63]:
# Display the first file name in the sorted list (quick check of the file search)
AC9[0]

'AC9_070718_lab_cal_a_1_T20_6.dat'

In [64]:
# Presumably the number of header lines that precede the wavelength labels.
# NOTE(review): this variable is not referenced by the processing loop below,
# which skips 10 rows before reading the wavelength labels -- confirm which
# value is intended.
skiprows_wavel = 9   # Skip the first 9 lines

In [65]:
# Summarise every AC9 file in the list `AC9`: parse the sample information
# from the file name, read the wavelength labels from the file header, and
# export the per-wavelength mean, standard deviation, median and IQR of the
# measured time series as a tab-separated table.

# File-layout constants
wl_skiprows = 10      # lines skipped before the block of wavelength labels
data_skiprows = 31    # lines skipped before the measured time series
n_wl = 9              # number of wavelengths on each side (a or c)
stats_folder = '/Users/ckuhn/Documents/ABOVE/Data/AC9/ac9_data/2_summarystats/cal/'

for file, fname in enumerate(AC9):

    # ------------------------------------------------------------------
    # 1. Sample information from the file name
    #    Pattern: AC9_dddddd_sit_sam_s_r_TXX_X.dat
    # ------------------------------------------------------------------
    # Split on underscores instead of slicing fixed character positions:
    # a fixed slice picks up the first letter of the sample type ('c' of
    # 'cal') in place of the a/c side, and breaks on temperatures that are
    # not exactly four characters long.
    stem = os.path.splitext(os.path.basename(fname))[0]
    fields = stem.split('_')
    if (len(fields) < 7
            or not fields[6].startswith('T')
            or fields[4].lower() not in ('a', 'c')):
        raise ValueError('Unexpected AC9 file name (expected '
                         'AC9_dddddd_sit_sam_s_r_TXX_X.dat): ' + fname)

    date, site, sample_type, a_or_c, rep = fields[1:6]
    temp_string = '_'.join(fields[6:])[1:]                # 'T20_6' -> '20_6'
    temperature = float(temp_string.replace('_', '.'))    # '20_6'  -> 20.6 (raises if not numeric)

    # ------------------------------------------------------------------
    # 2. Wavelength labels from the file header
    # ------------------------------------------------------------------
    # names=range(100) gives the parser a fixed, wide set of columns so the
    # ragged header lines do not raise a parsing error
    read_wl = pd.read_csv(fname, skiprows=wl_skiprows, names=range(100), delimiter='\t')

    # The first column holds the 18 labels (e.g. 'a650'); the first 9 are
    # taken as the a block and the next 9 as the c block
    wavelengths_str = read_wl[0].iloc[:2 * n_wl].tolist()
    wl_a_str = wavelengths_str[:n_wl]
    wl_c_str = wavelengths_str[n_wl:]

    # Characters 1-3 of each label are the wavelength in nm ('a650' -> 650.0).
    # The builtin float replaces np.float, which newer NumPy no longer has.
    wl_a = [float(label[1:4]) for label in wl_a_str]
    wl_c = [float(label[1:4]) for label in wl_c_str]
    wl_a_sorted = np.asarray(sorted(wl_a))
    wl_c_sorted = np.asarray(sorted(wl_c))

    # ------------------------------------------------------------------
    # 3. Measured time series
    # ------------------------------------------------------------------
    # read_csv uses the first line after the skipped ones as a header row,
    # and the first parsed row is dropped as well (iloc[1:]).
    # NOTE(review): confirm that both of those lines are non-data lines.
    df1 = pd.read_csv(fname, skiprows=data_skiprows, delimiter='\t')

    # Keep columns 1-18: column 0 is the timestamp and everything from
    # column 19 onwards is ragged trailing content
    df4 = df1.iloc[1:, 1:1 + 2 * n_wl].copy()
    df4.columns = wavelengths_str

    # Order the columns by label, so all 'a...' columns come first and each
    # side is in ascending wavelength order
    # (reindex replaces the removed DataFrame.reindex_axis)
    df4 = df4.reindex(columns=sorted(df4.columns))

    # Force every column to numeric; unparseable entries become NaN
    # (replaces the removed DataFrame.convert_objects(convert_numeric=True))
    df4 = df4.apply(pd.to_numeric, errors='coerce')

    no_cols = len(df4.columns) // 2                       # should always be 9

    # Split into the a and c halves and relabel the columns with the numeric
    # wavelengths. Both sides are labelled with the a wavelengths; selecting
    # the c frame by wl_c_sorted then fails loudly if the c labels differ.
    df_a = df4.iloc[:, :no_cols].copy()
    df_a.columns = wl_a_sorted
    df_a = df_a[wl_a_sorted]
    df_c = df4.iloc[:, no_cols:].copy()
    df_c.columns = wl_a_sorted
    df_c = df_c[wl_c_sorted]

    # ------------------------------------------------------------------
    # 4. Summary statistics per wavelength (column-wise over the time series)
    # ------------------------------------------------------------------
    a_mean = df_a.mean(axis=0)
    a_std = df_a.std(axis=0)
    a_median = df_a.median(axis=0)
    a_Q1 = df_a.quantile(0.25)
    a_Q3 = df_a.quantile(0.75)
    a_IQR = a_Q3 - a_Q1

    c_mean = df_c.mean(axis=0)
    c_std = df_c.std(axis=0)
    c_median = df_c.median(axis=0)
    c_Q1 = df_c.quantile(0.25)
    c_Q3 = df_c.quantile(0.75)
    c_IQR = c_Q3 - c_Q1

    # ------------------------------------------------------------------
    # 5. Export
    # ------------------------------------------------------------------
    # Specify the output file name and full output path
    outputname = '_'.join(['AC9', date, site, sample_type, a_or_c, rep,
                           'T' + temp_string]) + '.csv'
    outputdir = stats_folder + outputname

    # Build the table (one row per wavelength) for the side named in the
    # file name; .values drops the wavelength index so the rows align by
    # position with the sorted wavelength array
    if a_or_c.lower() == 'a':
        out_columns = ['wl', 'a_mean', 'a_std', 'a_median', 'a_IQR']
        output_df = pd.DataFrame({'wl': wl_a_sorted,
                                  'a_mean': a_mean.values,
                                  'a_std': a_std.values,
                                  'a_median': a_median.values,
                                  'a_IQR': a_IQR.values},
                                 columns=out_columns)
    else:
        out_columns = ['wl', 'c_mean', 'c_std', 'c_median', 'c_IQR']
        output_df = pd.DataFrame({'wl': wl_c_sorted,
                                  'c_mean': c_mean.values,
                                  'c_std': c_std.values,
                                  'c_median': c_median.values,
                                  'c_IQR': c_IQR.values},
                                 columns=out_columns)

    # The file is tab-separated even though it carries a .csv extension
    output_df.to_csv(outputdir, sep='\t')


