# <font color=blue>DAPD Normalization part1 </font>
DAPD Normalization part 1 reads the csv files generated by the image processing module. Each csv file stores the time and phenotyping measurements, such as rosette area and leaf number. This program filters out outlier plants based on the ExpX_label.csv file and the curve fitting error. It then merges all the files into two files per accession (an area file and a leaf number file).

### Importing modules
The Python and custom modules are loaded using the import command

In [1]:
import numpy as np
import data
import path
import fitting
import matplotlib.pylab as plt
import matplotlib.transforms as mtransforms
import os
import pandas as pd
from scipy.optimize import curve_fit
from scipy.fftpack import fft, ifft
import sys
from scipy import interpolate
import save
from scipy import stats
import ecoTable
import ecoPlot

### Variable declaration
__expID__ is the experiment ID. __daySowing__ is the date on which the seeds were sown into the soil. __dayStart__ is the first day of image acquisition, counted from the sowing day. __dayStop__ is the last day of image acquisition, counted from the sowing day. __krnl__ is the filter kernel size.

In [2]:
# Experiment identifier; used to build the directory and file names below.
expID = 'Exp4'
# Date on which the seeds were sown.
daySowing = '2017-11-01'
# First and last day of image acquisition, counted from the sowing day.
dayStart = 12
dayStop = 34
# Filter kernel size.
krnl = 71

### Experiment Directories
Each experiment has its own set of files which are stored in a specific directory. They are loaded based on the full file directory.

In [3]:
# Root folder of the experiment; every other path is derived from it.
root = 'C:/Users/dlozano/OneDrive - LA TROBE UNIVERSITY/PhD - GitHub/DAPD_Normalization/' + expID + '/'
# Label sheet of the experiment (<expID>_label.csv).
labelDir = root + expID + '_label.csv'
# Output folders for plots and datasets; both keep a trailing '/'.
pathPlot = root + expID + '_' + 'plots' + '/'
pathData = root + expID + '_' + 'datasets' + '/'

# exist_ok=True creates a folder only when it is missing, without a separate
# check-then-create step.
os.makedirs(pathPlot, exist_ok=True)
os.makedirs(pathData, exist_ok=True)

### Read csv files 
This section reads the csv files which were generated by the image processing main module. Each csv file has the time and phenotyping measurements such as rosette area and leaf number.

In [4]:
# Placeholders for names that are filled in by later cells.
table, ecoNamesAll = [], []

# --- Label sheet ---------------------------------------------------------
label = pd.read_csv(labelDir)
# Tray numbers become 'T'-prefixed tags padded to width 2.
label['tray'] = 'T' + label['tray'].map('{:02}'.format).apply(str)
# Position key '<tray>_<position>'; csvFiles gets a column of the same name.
label['posn'] = label['tray'].str.cat('_' + label['position'])
# Drop the 'Checker' rows, then order by accession and position.
label = (
    label.drop(label[label['ecotype'] == 'Checker'].index)
    .sort_values(by=['ecotype', 'posn'])
    .reset_index(drop=True)
)

# --- csv file listing ----------------------------------------------------
csvFiles = pd.DataFrame(path.files(root + expID + '_csvFiles/'))
csvFiles.columns = ['folder', 'file']
# File names are parsed as <ecotype>_T<tray>_<position>_<camera>.<extension>.
aux1 = csvFiles['file'].str.split('_T', n=1, expand=True)
aux2 = aux1[1].str.split('_', n=3, expand=True)
csvFiles['ecotype'] = aux1[0]
csvFiles['posn'] = 'T' + aux2[0] + '_' + aux2[1]
# Keep only the part of the third field that precedes the first '.'.
csvFiles['camera'] = aux2[2].str.split('.', n=1).str[0]
# Default flags for every file.
csvFiles['status'] = 'non'
csvFiles['outlier'] = ''

This section filters out outlier plants based on the ExpX_label.csv file.

In [5]:
# Copy the status/outlier flags of each labelled position onto the csv files
# that share that position. Rows are selected with a boolean mask, so the
# update does not rely on csvFiles having a 0..n-1 index.
for posn, status, outlier in zip(label['posn'], label['status'], label['outlier']):
    atPosn = csvFiles['posn'] == posn
    csvFiles.loc[atPosn, 'status'] = status
    csvFiles.loc[atPosn, 'outlier'] = outlier

# Positions flagged as outliers (sorted, without repeats).
flagged = csvFiles['outlier'] == 'yes'
outliers1 = csvFiles.loc[flagged, 'posn'].drop_duplicates().sort_values().tolist()

# Discard files whose status is 'non' and files flagged as outliers, then
# renumber the remaining rows from 0.
discard = (csvFiles['status'] == 'non') | flagged
csvFiles = csvFiles.loc[~discard].reset_index(drop=True)

This section filters out outlier plants based on the curve fitting error.

In [6]:
# ecoTable.detectDuplicate receives a copy of the file table; its result is
# read below through the 'duplicate', 'outlier', 'fitError', 'posn' and
# 'ecotype' columns.
# NOTE(review): the meaning of the trailing 100 is not visible here - confirm
# against ecoTable.
table = ecoTable.detectDuplicate('area', csvFiles.copy(), dayStart, dayStop, 100)

# Remove duplicate time-series.
isDuplicate = table['duplicate'] == 'yes'
table.drop(table[isDuplicate].index, inplace=True)

# Positions rejected at this stage: flagged as outlier, or a fit error above 1.0.
poorFit = table['fitError'] > 1.0
rejected = (table['outlier'] == 'yes') | poorFit
outliers2 = table.loc[rejected, 'posn'].drop_duplicates()
outliers2 = list(outliers2.sort_values().unique())

# Remove the time-series that have a large curve fitting error, then order
# the table by accession and position and renumber it from 0.
table.drop(table[poorFit].index, inplace=True)
table.sort_values(by=['ecotype', 'posn'], inplace=True)
table.reset_index(drop=True, inplace=True)

# Union of the positions rejected by the label sheet and by this stage.
outlierAll = list(set(outliers1 + outliers2))

This section merges and consolidates all csv files that belong to an accession into a single csv file.

In [7]:
# Unique accessions, in order of first appearance in the table.
ecoNamesAll = table['ecotype'].unique()

for line in ecoNamesAll:
    # Rows of this accession, renumbered from 0.
    ecotype = table[table['ecotype'].values == line].reset_index(drop=True)

    # Collect the area and leaf series of the accession.
    timeSeries, areaSeries, leafSeries, timeVect, locat = ecoTable.seriesSize(
        'area', 'leaf', ecotype, krnl, daySowing, dayStart, dayStop)
    areaRaw, leafRaw = ecoTable.seriesOriginal(
        timeSeries, areaSeries, leafSeries, timeVect, locat, daySowing)

    # One output per trait, named <expID>_<accession>_<trait>_raw.
    for trait, raw in (('area', areaRaw), ('leaf', leafRaw)):
        save.CVS(pathData, expID + '_' + line + '_' + trait + '_' + 'raw', raw)