# ABOVE | AC9 Data Processing
***
## 01 Read AC9 files
***

**Authors:** Catherine Kuhn and Elena Terzić and Anna Simpson
**Last Updated:** August, 29th, 2018
***

This code reads in raw ac9 .dat files and parses sample information from the filename and header information inside the file. The output is a table of summary statistics formatted as a *.csv* file for each wavelength for each file. This code was built for a worflow in which a and c sides are sampled separately. File names should contain: date, site, rep, a or c side and water temperature. 

File names are formatted like ** AC9_dddddd_sit_sam_s_r_TXX_XX.dat** where:

- **dddddd** = date (071718)
- **sit** = three letter site code (fai)
- **sam** = three letter sample type (cal, raw, fil) for calibration, raw water (unfiltered) or filtered (fil)
- **r** = numbered replicate (1, 2, 3) 
- **TXX_X** = temperature in Celcius (T17_3)
    
**Ex:** AC9_071618_y17_raw_a_1_T17_6.dat



### Load required packages

In [1]:
### Import the required python libraries
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import glob
import os
import sys
import csv

### Functions

In [8]:
def get_subdirectories(directory=os.getcwd()):
    if directory in os.getcwd():
        directories = os.listdir(os.getcwd())
    else:
        directories = os.listdir(directory)
    directories = [subdirectory for subdirectory in directories if not subdirectory.startswith('.')]
    return directories

In [None]:
get_subdirectories('')

### Inputs

In [2]:
### Define directory where raw data is located
raw_data_directory = "1_1_renamed_originals"
### New directory name to store summary data
new_dir_name = '2_summary_stats'

### Import file names

In [3]:
### Will be operating from Code directory - this gets parent directory path
parent_directory = os.path.abspath('..')
### Get path to raw data
raw_data_file_path = parent_directory+'/Data/'+raw_data_directory
# Get a list of all the subdirectories in the directory
directories = os.listdir(raw_data_file_path)
# This removes any hidden files that os.listdir is picking up from your list of subdirectories
# (was picking up .DS_store)
directories = [subdirectory for subdirectory in directories if not subdirectory.startswith('.')]

In [14]:
date = []
site = []
sample_type = []
abs_type = []
rep = []
temp = []

for i in get_subdirectories(raw_data_file_path):
    file_list = sorted(glob.glob(raw_data_file_path+'/'+i+'/*.dat'))
    for j in file_list:
        basename = j.split('/')[-1].split('.')[0]
        Sensor, Date, Site, Sample_Type, Abs_Type, Rep, T1, T2= basename.split('_')
        T = float(T1.lstrip('T')+'.'+T2)
        date.append(Date)
        site.append(Site)
        sample_type.append(Sample_Type)
        abs_type.append(Abs_Type)
        
        print basename
        print T

AC9_071318_bai_cal_a_1_T23_8
23.8
AC9_071318_bai_cal_a_2_T24_1
24.1
AC9_071318_bai_cal_a_3_T24_0
24.0
AC9_071318_bai_cal_c_1_T23_7
23.7
AC9_071318_bai_cal_c_2_T23_8
23.8
AC9_071318_bai_cal_c_3_T23_9
23.9
AC9_071418_bai_fil_a_1_T13_4
13.4
AC9_071418_bai_fil_a_2_T19_2
19.2
AC9_071418_bai_fil_a_3_T18_5
18.5
AC9_071418_bai_fil_c_1_T13_4
13.4
AC9_071418_bai_fil_c_2_T19_2
19.2
AC9_071418_bai_fil_c_3_T18_5
18.5
AC9_071418_bai_raw_a_1_T19_8
19.8
AC9_071418_bai_raw_a_2_T19_8
19.8
AC9_071418_bai_raw_a_3_T17_8
17.8
AC9_071418_bai_raw_c_1_T19_8
19.8
AC9_071418_bai_raw_c_2_T19_8
19.8
AC9_071418_bai_raw_c_3_T17_8
17.8
AC9_071618_cbr_cal_a_1_T19_9
19.9
AC9_071618_cbr_cal_a_2_T19_3
19.3
AC9_071618_cbr_cal_a_3_T19_0
19.0
AC9_071618_cbr_cal_c_1_T19_9
19.9
AC9_071618_cbr_cal_c_2_T19_3
19.3
AC9_071618_cbr_cal_c_3_T19_0
19.0
AC9_071618_cbr_fil_a_1_T17_5
17.5
AC9_071618_cbr_fil_a_2_T17_5
17.5
AC9_071618_cbr_fil_a_3_T17_5
17.5
AC9_071618_cbr_fil_c_1_T17_5
17.5
AC9_071618_cbr_fil_c_2_T17_5
17.5
AC9_071618_cbr

ValueError: too many values to unpack

In [4]:
# Create dictionary  storing subdirectory/site names (keys) 
# and lists of files within each subdirectory (entries)
site_files_dict = {}
# Iterate through subdirectories
for i in directories:
    path=raw_data_file_path+'/'+i+'/'
    # Get a list of the files in each subdirectory as the entry for each site name key
    site_files_dict[i] = sorted(glob.glob(path+'*.dat'))

### Define the new directory where summary stats will be stored

#### Default action: summary stat files will be stored in subdirectories (defined by site names) in the new directory you have defined here

In [5]:
## Get full path name of new directory
new_dir_path = os.path.abspath('..')+'/Data/'+new_dir_name

## Generate new directory if it doesn't already exist
if not os.path.exists(new_dir_path):
    os.makedirs(new_dir_path)

### Generate summary statistics and make new csv files for each sample

#### Use the columns in the upper part of the document for the wavelengths. Just to organize your main dataframe. 

In [7]:
# iterate through dictionary key entries (site names)
for site_name in site_files_dict:
    # define the path for each new subdirectory that will be made based on site name
    individual_site_directory_path = new_dir_path+'/'+site_name
    # generate the new subdirectory for each site if it doesn't already exist
    if not os.path.exists(individual_site_directory_path):
        os.makedirs(individual_site_directory_path)
    # Get the list of original raw files for this site
    site_files = site_files_dict[site_name]
    # Iterate through that list of files so that:
    # each file can be imported ad each summary stat file can be generated
    for file in range(len(site_files)):
    # brings in file
        filename = site_files[file].split('/')[-1]
        read_wl = pd.read_csv(site_files[file], skiprows=10, names=range(100), delimiter= '\t')  # use names= range (100) to clip dangling columns
        # reads in a and c wavelength values from the first column of data
        a_c_wl = read_wl[0][0:18]  ; a_wl = read_wl[0][0:9] ; c_wl = read_wl[0][9:18];

        # parses temperature, samplename, a or c side and site from the file name
        # 0 = A, 0:3 takes AC9 (letters 0, 1, 2 not 3)
        temperature = float(filename[24:28].replace('_', '.'))
        temp_string = filename[24:28]; sample_type = filename[15:18];  a_or_c = filename[19:20];   
        date = filename[4:10];         site = filename[11:14];   rep = filename[21:22];

        # make empty objects for your new variables of the wavelength value and name
        # Example: wl_a: 650.0; wl_a_str: a650
        wl_a = []    ; wl_c = []  ; wl_a_str = []  ; wl_c_str = []

        # makes a list of the 9 wavelengths formatted as floats
        for i in range(len(a_wl)):
            wl_a.append(np.float(a_wl[i][1:4]))
            wl_a_str.append(a_wl[i])
        for j in range(len(c_wl)):
            wl_c.append(np.float(c_wl[j+9][1:4]))
            wl_c_str.append(c_wl[j+9])

         # Unsorted list of wavelengths (412) and wavelength strings (a676)
        wavelist = wl_a + wl_c                   ; wavelist_str = wl_a_str + wl_c_str   
        # Unsorted list of wavelengths (412) and wavelength strings (a676) as arrays
        wavelengths = np.asarray(wavelist)       ; wavelengths_str = np.asarray(wavelist_str)  
        # Sorted list of a and c wavelengths as floats in an array (ex: 412, 440, etc)
        wl_a_sorted = np.asarray(sorted(wl_a))   ; wl_c_sorted = np.asarray(sorted(wl_c))

        # Now read back in the data, skipping all the header information  
        # The time series of measured values starts in the 32th row
        df1 = pd.read_csv(site_files[file], skiprows=31, delimiter= '\t') 

        # drops all the ragged extra columns dangling to the right
        columns = df1.columns[19:len(df1.columns)]                 
        df2 = df1.drop(columns, axis=1)    # you should have 19 cols left ~ array size [ntimesteps, 19]                       

        # drops the first column of the timestamp (ntimesteps)
        columns1 = df2.columns[0]
        df3 = df2.drop(columns1, axis=1)                    

        # makes a new header from the list of wavelengths you parsed earlier  
        wl_header = wavelengths_str 

        # Clean and reindex
        df4 = df3[1:]                                       # take the data (row 1- n) less the header row (row 0)
        df4.columns = wavelengths_str                       # set the header row as list of wavelengths
        df4 = df4.reindex_axis(sorted(df4.columns), axis=1) # reindex them by the new sorted wavelengths
        df4=df4.convert_objects(convert_numeric=True)       # Just to make sure that all elements are floats!

        no_cols = int(len(df4.columns)/2.)                  # no_col should always be 9 (one for each wavelength)         

        # Sort your dataframe with ascending walues of your wavelengths
        # at this point the wl_a and wl_c are the same wavelengths so 
        # it doesn't matter which one you use here
        new_header = wl_a_sorted

        # reindex to reshape the data
        df_a_aux = df4.iloc[:, :no_cols];  df_a_aux.columns = new_header ;  df_a = df_a_aux.reindex_axis(sorted(df_a_aux.columns), axis = 1)
        df_c_aux = df4.iloc[:, no_cols:];  df_c_aux.columns = new_header ;  df_c = df_c_aux.reindex_axis(sorted(df_c_aux.columns), axis = 1)

        # calculate the me(di)an, stdev, IQR for the time series - per each column
        a_mean = df_a[wl_a_sorted].mean(axis=0)         ; c_mean = df_c[wl_c_sorted].mean(axis=0)   
        a_std  = df_a[wl_a_sorted].std(axis=0)          ; c_std  = df_c[wl_c_sorted].std(axis=0)
        a_median = df_a[wl_a_sorted].median(axis=0)     ; c_median = df_c[wl_c_sorted].median(axis=0)

        # Computing IQR
        a_Q1 = df_a[wl_a_sorted].quantile(0.25)         ; c_Q1 = df_c[wl_c_sorted].quantile(0.25)
        a_Q3 = df_a[wl_a_sorted].quantile(0.75)         ; c_Q3 = df_c[wl_c_sorted].quantile(0.75)
        a_IQR = a_Q3 - a_Q1                             ; c_IQR = c_Q3 - c_Q1

        # Specifiy the output file name and directory
        outputname = 'AC9_' + str(date) + '_'+ str(site) +'_' + str(sample_type) + '_' + str(a_or_c) +'_' + str(rep)+ '_'  + 'T' + str(temp_string) + '.csv'
        outputdir = new_dir_path+'/'+site_name+'/'+ outputname 

        # make a new dataframe from the summary statistics and export
        if a_or_c == 'a':
            output_df = pd.DataFrame([wl_a_sorted, a_mean, a_std, a_median, a_IQR]).swapaxes(0,1)
            output_df.columns = ('wl', 'a_mean', 'a_std', 'a_median', 'a_IQR')
            output_df.to_csv(outputdir, sep='\t')
        else:
            output_df = pd.DataFrame([wl_c_sorted, c_mean, c_std, c_median, c_IQR]).swapaxes(0,1)
            output_df.columns = ('wl', 'c_mean', 'c_std', 'c_median', 'c_IQR')
            output_df.to_csv(outputdir, sep='\t')


