# VERIFICATION

# Idea:
- get typical (aggregated) verification data for both regimes (100 m box model structure)
- instead of aggregating to one year, get verification data over 5 years for each regime


# What to do first:
1. get the necessary raw data,
2. then integrate over the entire time series & depth (if necessary)
3. create aggregated raw forcing CSVs for the model (simply include proper Date information, in addition to yday)
4. date truncation & aggregation can happen in the Python code of the model!

In [1]:
### create one large file with all possible verification? (depends on where data is from, don't overcomplicate)

Let's write up what I need:
  > 1. N_box, NH4_box - mean(100m)
  > 2. Si_box
  > 3. P_box
4. HPLC - integrated(100m) # possibly keep 0-55 and 55-100 separated for potential later analysis (but then other components might need the same? no?)
  > 4. FluorChla  - integrated(100m)
5. Zooplankton data
6. Export Flux (!)
  > 7. Primary Production (?) [NISKIN]
  > 8. PN for Detritus comparison (?) (maybe also POC, DOC, whatever makes sense, or just all of them)
  > 9. Euphotic Zone depth
10. 


In [2]:
# the most important thing is not the number of datasets,
# but that they are structured and named conservatively, for easy reuse of functions

# NISKIN DATA

In [3]:
require(ncdf4, warn.conflicts = FALSE);
require(cowplot, warn.conflicts = FALSE);
require(tidyverse, warn.conflicts = FALSE);
require(oce, warn.conflicts = FALSE);


require(lubridate, warn.conflicts = FALSE);

Loading required package: ncdf4
Loading required package: cowplot
Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang

********************************************************
Note: As of version 1.0.0, cowplot does not change the
  default ggplot2 theme anymore. To recover the previous
  behavior, execute:
  theme_set(theme_cowplot())
********************************************************

Loading required package: tidyverse
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1       ✔ purrr   0.3.3  
✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
✔ tidyr   0.8.3       ✔ stringr 1.4.0  
✔ readr   1.3.1       ✔ forcats 0.4.0  
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks

 # ALL Niskin Data created in FinalForcingData script!

# CTD DATA

# HPLC Data

## TOTAL CHLA

In [4]:
TotChlA <- read.csv("CARIACO Data Chl a Contour Plot.csv")
# units: ng l^-1

In [5]:
tail(TotChlA)

Unnamed: 0,Time.1,Depth.1,Chla.1,Time.2,Depth.2,Chla.2
486,,,,,,
487,,,,,,
488,,,,,,
489,,,,,,
490,,,,,,
491,,,,,,


In [6]:
# Parse the time stamps of both column sets (Time.1, Time.2) into Date
# columns (Date.1, Date.2). Each column is converted to character first and
# then read with the month/day/two-digit-year format.
TotChlA[c("Date.1", "Date.2")] <- lapply(
    TotChlA[c("Time.1", "Time.2")],
    function(time_col) as.Date(as.character(time_col), format="%m/%d/%y")
)

In [7]:
# Give each (Date, Depth, Chla) column set its own data frame with identical
# column names: Date, depth, Tchla.
TCHLA1 <- with(TotChlA, data.frame(Date = Date.1, depth = Depth.1, Tchla = Chla.1))
TCHLA2 <- with(TotChlA, data.frame(Date = Date.2, depth = Depth.2, Tchla = Chla.2))

In [8]:
TCHLATDF <- rbind(TCHLA1, TCHLA2)

In [9]:
head(TCHLATDF)

Date,depth,Tchla
1995-12-14,1,107
1995-12-14,7,139
1995-12-14,15,163
1995-12-14,25,408
1995-12-14,35,241
1995-12-14,55,88


In [10]:
# Read one verification variable from a data source and return it in long format.
#
# Args:
#   variable:   name(s) of the column(s) to extract, given as character.
#   datasource: which data source to read. Only 'niskin' is implemented; it
#               reads the global data frame `niskdat_df`, which must already
#               exist in the session (it is not created in this notebook).
#
# Returns:
#   A long-format data frame with the columns date, depth, key and value.
#   `depth` is taken from the source column `Depth_real`; rows in which
#   `variable` is NA are dropped.
#
# Raises:
#   An error if `datasource` is not 'niskin' or if `variable` is not a column
#   of the source data.
prepdataframe <- function(variable='', datasource='niskin'){
    # Fail early with a readable message instead of the former
    # "object 'VarDF' not found" that an unknown datasource produced.
    if(!identical(datasource, 'niskin')){
        stop("Unsupported datasource '", paste(datasource, collapse = ", "),
             "': only 'niskin' is implemented", call. = FALSE)
    }

    DF <- niskdat_df
    DF$depth <- DF$Depth_real

    missing_cols <- setdiff(variable, names(DF))
    if(length(missing_cols) > 0){
        stop("Unknown variable(s) for datasource 'niskin': ",
             paste0("'", missing_cols, "'", collapse = ", "), call. = FALSE)
    }

    # Select by name with base indexing: passing the string vector straight to
    # select() is ambiguous when the data also has a column called `variable`.
    has_value <- complete.cases(DF[variable])
    VarDF <- DF[has_value, c('date', variable, 'depth'), drop = FALSE] %>%
      gather(key='key',value = "value", -date, -depth)

    return(VarDF)
}

In [11]:
# integration functions (if necessary)

# Trapezoidal integration of y over x (adapted from PINCKNEY).
#
# Args:
#   x: numeric vector of sample positions (the depths when called from intgrtJAY).
#   y: numeric vector of values at those positions; same length as x.
#
# Returns:
#   The sum over consecutive pairs of (x[j] - x[j-1]) * (y[j] + y[j-1]) / 2.
#   Points are used in the order given (no sorting is done here).
#   NA_real_ is returned when fewer than two points are supplied, because no
#   interval exists to integrate over. The former loop `2:N1` counted
#   backwards in that case and returned a zero-length vector.
#
# Raises:
#   An error if x and y differ in length.
integrateJAYfunc <- function(x, y){
    if(length(x) != length(y)){
        stop("x and y must have the same length", call. = FALSE)
    }

    n_points <- length(y)
    if(n_points < 2){
        return(NA_real_)
    }

    # Width of each interval times the mean of its two end values.
    interval_width <- diff(x)
    interval_mean <- (y[-1] + y[-n_points]) / 2

    sum(interval_width * interval_mean)
}

# Depth-integrate every date's profile with integrateJAYfunc().
#
# Args:
#   DFtest: data frame with the columns date, depth and value.
#   reg:    label stored unchanged in the output column `reg`.
#   var:    label stored unchanged in the output column `var`.
#
# Returns:
#   An ungrouped data frame with one row per date and the columns
#   date, value_int, reg and var.
intgrtJAY <- function(DFtest, reg, var){
    # Builds the one-row result for a single date's profile.
    integrate_profile <- function(profile){
        data.frame("value_int" = with(profile, integrateJAYfunc(x=depth, y=value)),
                   "reg" = reg,
                   "var" = var)
    }

    per_date <- do(group_by(DFtest, date), integrate_profile(.))

    ungroup(per_date)
}

In [12]:
# Interpolate every date's profile onto a regular 1 m depth grid (0 to 200).
#
# Args:
#   DF:   long-format data frame with the columns date, depth and value.
#   func: interpolation method. Only 'oce-rr' is implemented, which calls
#         oceApprox() from the oce package with method "rr".
#
# Returns:
#   A data frame, still grouped by date, with the columns date, value_int
#   (the interpolated value) and depth (the grid 0, 1, ..., 200).
#
# Raises:
#   An error if `func` is not 'oce-rr'. Previously this case ended in
#   "object 'IntDF' not found".
interpolateDF <- function(DF,func='oce-rr'){
    if(!identical(func, 'oce-rr')){
        stop("Unsupported interpolation method '", paste(func, collapse = ", "),
             "': only 'oce-rr' is implemented", call. = FALSE)
    }

    # Target depth grid shared by all dates.
    zz <- seq(0, 200, 1)

    IntDF <- DF %>%
        group_by(date) %>%
        do(data.frame(value_int = with(.,oceApprox(depth, value, zz, "rr")), depth = zz))

    return(IntDF)
}

# Mean of `value_int` per date within a depth interval.
#
# Args:
#   DF_int:    data frame with the columns date, depth and value_int.
#   type:      currently unused; kept so existing calls (type='fixed') still work.
#   depthFROM: lower bound of the interval (exclusive).
#   depthTO:   upper bound of the interval (exclusive).
#
# Returns:
#   A data frame with one row per date and the columns date and mean.
#   NA values are ignored when averaging.
#
# NOTE(review): both bounds are strict, so with the defaults the depths 0 and
# 100 themselves are left out of the mean - confirm this is intended.
meanTOdepth <- function(DF_int,type='fixed',depthFROM=0,depthTO=100){
    DF_means <- DF_int %>%
        group_by(date) %>% # one mean per sampling date
        filter(depthFROM < depth & depth < depthTO) %>% # keep only depths strictly inside the interval
        summarize('mean'=mean(value_int, na.rm=TRUE))

    return(DF_means)
}

In [13]:
# Extract one variable, interpolate it per date and average it over depth.
#
# Args:
#   x:     name of the variable; passed on to prepdataframe().
#   below: if TRUE, also compute the mean between depthFROM = 100 and
#          depthTO = 150 and merge it with the box mean.
#
# Returns:
#   A data frame with the columns date and <x>_Box (mean from meanTOdepth()
#   with its default bounds) and, when `below` is TRUE, <x>_AtDepth.
extrctintrplt <- function(x, below=TRUE){
    profiles <- interpolateDF(prepdataframe(x))

    # Mean over the default interval of meanTOdepth(); the second column is
    # the mean and is renamed after the variable.
    box_mean <- meanTOdepth(profiles, type='fixed')
    names(box_mean)[2] <- paste0(x, '_Box')

    if(!(below==TRUE)){
        return(box_mean)
    }

    # Mean over the layer underneath the box.
    deep_mean <- meanTOdepth(profiles, type='fixed', depthFROM = 100, depthTO = 150)
    names(deep_mean)[2] <- paste0(x, '_AtDepth')

    # Full outer join, so dates present in only one of the two stay in the result.
    merge(box_mean, deep_mean, all=TRUE)
}

In [14]:
DF <- TCHLATDF
names(DF) <- c('date', 'depth', 'variable')

In [15]:
head(DF)

date,depth,variable
1995-12-14,1,107
1995-12-14,7,139
1995-12-14,15,163
1995-12-14,25,408
1995-12-14,35,241
1995-12-14,55,88


In [16]:
# Drop rows without a measured value and bring the frame into the long
# layout (date, depth, key, value).
VarDF <- DF %>%
  filter(complete.cases(variable)) %>%
  select(date, variable, depth) %>%
  gather(key='key',value = "value", -date, -depth)
head(VarDF)

date,depth,key,value
1995-12-14,1,variable,107
1995-12-14,7,variable,139
1995-12-14,15,variable,163
1995-12-14,25,variable,408
1995-12-14,35,variable,241
1995-12-14,55,variable,88


In [17]:
xxxx <- interpolateDF(VarDF)

In [18]:
meansxxxx <- meanTOdepth(xxxx)

In [19]:
# Rescale the per-date mean before inspecting it.
# NOTE(review): presumably this converts the mean concentration (ng l^-1, see
# the unit note where the CSV is read) to mg m^-3 (/ 1000) and multiplies by a
# 100 m box depth to obtain a depth-integrated value in mg m^-2 - TODO confirm.
head(meansxxxx$mean / 1000 * 100)

# Largest rescaled value over all dates, ignoring NA means.
max(meansxxxx$mean / 1000 * 100, na.rm=T)

## FT CHLA

In [20]:
PinckINTchla <- read.csv("IntegratedValues_PINCKNEY.csv", dec=',', sep=';')

In [21]:
head(PinckINTchla)
# units: mg m^-2

Year,Month,Day,Date,Depth,Date.1,Tchla,Prasino3,Dino1,Crypto1,Hapto6,Chloro1,Cyano4,Diatom1
1995,Dec,14,12.14.95,0 - 55 m,14-Dec-95,11.34,1.93,0.85,0.66,2.13,0.0,0.66,5.1
1996,Jan,13,01.13.96,0 - 55 m,13. Jan 96,41.94,0.96,6.23,1.08,2.55,0.59,0.4,27.86
1996,Feb,14,02.14.96,0 - 55 m,14. Feb 96,25.44,1.08,1.79,1.81,1.79,0.0,0.25,18.14
1996,0,13,03.13.96,0 - 55 m,13-Mar-96,20.21,0.37,1.8,0.92,1.84,0.09,0.25,14.23
1996,Apr,17,04.17.96,0 - 55 m,17. Apr 96,167.78,0.84,2.72,2.76,50.01,11.79,1.35,98.3
1996,May,10,05.10.96,0 - 55 m,10-May-96,42.38,0.92,0.01,0.22,9.38,1.03,0.53,30.14


In [22]:
PinckINTchla$Date <- as.Date(as.character(PinckINTchla$Date), format="%m.%d.%y")

In [23]:
PINT1 <- PinckINTchla[PinckINTchla$Depth == "0 - 55 m",]
PINT2 <- PinckINTchla[PinckINTchla$Depth == "55 - 100 m",]

In [24]:
PINT <- merge(PINT1,PINT2, by="Date")

In [25]:
# Total per pigment group = value of the first merged layer (.x, "0 - 55 m")
# plus value of the second merged layer (.y, "55 - 100 m").
PINT$Tchla <- PINT$Tchla.x + PINT$Tchla.y
PINT$Prasino3 <- PINT$Prasino3.x + PINT$Prasino3.y
# Dino1 is a column of the input file but was not summed before, so the
# groups did not add up to Tchla.
PINT$Dino1 <- PINT$Dino1.x + PINT$Dino1.y
PINT$Crypto1 <- PINT$Crypto1.x + PINT$Crypto1.y
PINT$Hapto6 <- PINT$Hapto6.x + PINT$Hapto6.y
PINT$Chloro1 <- PINT$Chloro1.x + PINT$Chloro1.y
PINT$Cyano4 <- PINT$Cyano4.x + PINT$Cyano4.y
PINT$Diatom1 <- PINT$Diatom1.x + PINT$Diatom1.y

In [26]:
# One frame per pigment group with the columns date, val (both layers summed),
# the two layer values and the group name in `spec`.
# data.frame() turns the names '0-55m' / '55-100m' into X0.55m / X55.100m.
Tchla    <- data.frame('date'=PINT$Date, 'val'=PINT$Tchla,   '0-55m'=PINT$Tchla.x,   '55-100m'=PINT$Tchla.y,   'spec'='Tchla')
Prasino3 <- data.frame('date'=PINT$Date, 'val'=PINT$Prasino3,'0-55m'=PINT$Prasino3.x,'55-100m'=PINT$Prasino3.y,'spec'='Prasino3')
# Dino1 is present in the input but was missing from the output; its total is
# computed here directly from the two layer columns.
Dino1    <- data.frame('date'=PINT$Date, 'val'=PINT$Dino1.x + PINT$Dino1.y, '0-55m'=PINT$Dino1.x, '55-100m'=PINT$Dino1.y, 'spec'='Dino1')
Crypto1  <- data.frame('date'=PINT$Date, 'val'=PINT$Crypto1, '0-55m'=PINT$Crypto1.x, '55-100m'=PINT$Crypto1.y, 'spec'='Crypto1')
Hapto6   <- data.frame('date'=PINT$Date, 'val'=PINT$Hapto6,  '0-55m'=PINT$Hapto6.x,  '55-100m'=PINT$Hapto6.y,  'spec'='Hapto6')
Chloro1  <- data.frame('date'=PINT$Date, 'val'=PINT$Chloro1, '0-55m'=PINT$Chloro1.x, '55-100m'=PINT$Chloro1.y, 'spec'='Chloro1')
Cyano4   <- data.frame('date'=PINT$Date, 'val'=PINT$Cyano4,  '0-55m'=PINT$Cyano4.x,  '55-100m'=PINT$Cyano4.y,  'spec'='Cyano4')
Diatom1  <- data.frame('date'=PINT$Date, 'val'=PINT$Diatom1, '0-55m'=PINT$Diatom1.x, '55-100m'=PINT$Diatom1.y, 'spec'='Diatom1')

In [27]:
# Stack all groups (now including Dino1) into one long frame.
PINTDF <- rbind(Tchla, Prasino3, Dino1, Crypto1, Hapto6, Chloro1, Cyano4, Diatom1)

In [28]:
head(PINTDF)

date,val,X0.55m,X55.100m,spec
1995-12-14,13.54,11.34,2.2,Tchla
1996-01-13,44.21,41.94,2.27,Tchla
1996-02-14,28.77,25.44,3.33,Tchla
1996-03-13,21.26,20.21,1.05,Tchla
1996-04-17,171.92,167.78,4.14,Tchla
1996-05-10,44.24,42.38,1.86,Tchla


In [29]:
#PINTDF$date <- as.POSIXlt(PINTDF$Date)

In [30]:
PINTDF$yday <- yday(PINTDF$date)

PINTDF$month <- month(PINTDF$date)

In [31]:
write.csv(PINTDF,"ProcessedDATA/HPLCPinckneyTotAndSpec_02.csv")

In [32]:
#write.csv(PINTDF, file = "PINTDF_integratedChlorophyll.csv")

# ZOOPLANKTON DATA

In [33]:
'zooplankton.netcdf'

In [34]:
# open a NetCDF file
zooplankton <- nc_open("zooplankton.netcdf")
#print(zooplankton)

In [35]:
# Copy every variable of the opened NetCDF file into one column of a data frame.
# "Num" is a running index with one entry per record of the unlimited dimension;
# seq_len() keeps this correct (an empty index) when the dimension has length 0.
zooplankton_df <- data.frame("Num" = seq_len(zooplankton$dim$unlimited$len))
print('start')
# Iterating over the variable names directly avoids `1:nvars`, which would
# run with the indices 1 and 0 if the file contained no variables.
for(var_name in names(zooplankton$var)){
    zooplankton_df[var_name] <- ncvar_get(zooplankton, var_name)
}
print('done reading')
head(zooplankton_df)

[1] "start"
[1] "done reading"


Num,Cruise,Cruise_ID,Day,Month,Year,Date,Latitude,Longitude,Analyst,...,PROTOZOAN,RADIOLARIAN,APPENDICULARIANS,SALPS,DOLIOLIDS,BIVALVES,H_CRUSTACEA,L_ANFIOXUS,LUCIFER,JUV_GASTROPOD
1,71,CAR-071,9,10,2001,2001-10-09,10.5,-64.664,Javier Gutierrez,...,39.9409,0,91.2935,8.55877,0,0.0,0.0,,2.85292,
2,71,CAR-071,9,10,2001,2001-10-09,10.5,-64.664,Javier Gutierrez,...,7.53171,0,9.35758,1.82587,0,0.0,0.0,0.0,0.0,0.0
3,72,CAR-072,6,11,2001,2001-11-06,10.498,-64.666,Javier Gutierrez,...,0.0,0,0.51794,0.51794,0,0.0,0.0,0.0,0.0,0.0
4,72,CAR-072,6,11,2001,2001-11-06,10.498,-64.666,Javier Gutierrez,...,34.7595,0,18.7889,0.0,0,0.0,0.0,,0.0,
5,73,CAR-073,11,12,2001,2001-12-11,10.501,-64.668,Javier Gutierrez,...,0.0,0,24.6831,4.93661,0,0.0,0.0,,0.0,
6,73,CAR-073,11,12,2001,2001-12-11,10.501,-64.668,Javier Gutierrez,...,0.0620957,0,0.217335,0.0310479,0,0.0310479,0.263907,0.0,0.0,0.0


In [36]:
cat(names(zooplankton_df))

Num Cruise Cruise_ID Day Month Year Date Latitude Longitude Analyst Mesh_Size TOTAL_DENSITY BIOMASS ASH COPEPODS CALANOIDS CYCLOPOIDA HAPARCTICOIDA POECILOSTOMATOIDA L_FISH H_FISH CHAETOGNATHA CLADOCEROS OSTRACODA FORAMINIFERA AMPHIPODS ISOPODS EUPHAUSIIDS MISYDACEA SERGESTID L_CRUSTACEA N_COPEPOD C_COPEPOD N_CIRRIPEDIA C_CIRRIPEDIA L_DECAPODA L_STOMATOPODA L_EUPHAUSIID L_SERGESTID POLICHAETES L_CYPHONAUTES MEDUSA SIPHONOPHORES CTENOPHORES L_ECHINODERMS PTEROPODS HETEROPODS GASTROPODS L_CEPHALOPODA L_BIVALVE L_GASTROPOD PROTOZOAN RADIOLARIAN APPENDICULARIANS SALPS DOLIOLIDS BIVALVES H_CRUSTACEA L_ANFIOXUS LUCIFER JUV_GASTROPOD

In [37]:
zooplankton_df$date <- as.Date(zooplankton_df$Date, format="%Y-%m-%d")

In [38]:
# Long layout of the three bulk zooplankton measures: one row per date,
# mesh size and measure (key), with the measurement coerced to numeric.
ZOOdf <- zooplankton_df %>%
  select(date, TOTAL_DENSITY, BIOMASS, ASH, Mesh_Size) %>%
  gather(key='key',value = "value", -date, -Mesh_Size) %>%
  mutate(value = as.numeric(value))

head(ZOOdf)

date,Mesh_Size,key,value
2001-10-09,200,TOTAL_DENSITY,1112.64
2001-10-09,500,TOTAL_DENSITY,212.942
2001-11-06,500,TOTAL_DENSITY,13.4266
2001-11-06,200,TOTAL_DENSITY,282.303
2001-12-11,200,TOTAL_DENSITY,572.647
2001-12-11,500,TOTAL_DENSITY,8.78655


In [39]:
ZOOdf$yday <- yday(ZOOdf$date)

ZOOdf$month <- month(ZOOdf$date)

In [40]:
write.csv(ZOOdf,"ProcessedDATA/ZooplanktonData_02.csv")

# EXPORT FLUX

## NOTE: Export Flux data is quite messy, will need to spend more time here, if I need it!

In [None]:
"Sediment_Trap.netcdf"
"std_car_TRAPflux"

In [64]:
# NOTE(review): this call fails (see the warning and error printed below).
# The bytes quoted in the error, d0 cf e0 a1 b1 1a e1, look like the header of
# a binary legacy Excel (.xls) file, which read.csv() cannot parse as text.
# Reading it would need an Excel reader or a CSV export of the sheet.
read.csv('std_car_TRAPflux.xls')

“line 5 appears to contain embedded nulls”

ERROR: Error in make.names(col.names, unique = TRUE): invalid multibyte string at '<d0><cf><e0><a1><b1><1a><e1>'


In [62]:
# open a NetCDF file
Sediment_Trap <- nc_open("Sediment_Trap.netcdf")
print(Sediment_Trap)

File Sediment_Trap.netcdf (NC_FORMAT_CLASSIC):

     16 variables (excluding dimension variables):
        char trap_ID[maxlen_,unlimited]   
        double depth_trap[unlimited]   
        double lon[unlimited]   
        double lat[unlimited]   
        double sample_num[unlimited]   
        double date_open[unlimited]   
        double Year[unlimited]   
        double Month[unlimited]   
        double Day[unlimited]   
        double duration_d[unlimited]   
        char MF_Total[maxlen_,unlimited]   
        char MF_Corg[maxlen_,unlimited]   
        char MF_CaCO3[maxlen_,unlimited]   
        char MF_Sibio[maxlen_,unlimited]   
        char MF_Terr[maxlen_,unlimited]   
        char MF_N[maxlen_,unlimited]   

     2 dimensions:
        unlimited  Size:1573   *** is unlimited ***
        maxlen_  Size:513


In [63]:
# Copy every variable of the opened sediment-trap NetCDF file into one column
# of a data frame. "Num" is a running index with one entry per record of the
# unlimited dimension; seq_len() stays correct when that length is 0.
# The MF_* variables are declared as char in the file (see the printed
# header), so those columns arrive as strings.
Sediment_Trap_df <- data.frame("Num" = seq_len(Sediment_Trap$dim$unlimited$len))
print('start')
# Iterating over the variable names directly avoids `1:nvars`, which would
# run with the indices 1 and 0 if the file contained no variables.
for(var_name in names(Sediment_Trap$var)){
    Sediment_Trap_df[var_name] <- ncvar_get(Sediment_Trap, var_name)
}
print('done reading')
head(Sediment_Trap_df)

[1] "start"
[1] "done reading"


Num,trap_ID,depth_trap,lon,lat,sample_num,date_open,Year,Month,Day,duration_d,MF_Total,MF_Corg,MF_CaCO3,MF_Sibio,MF_Terr,MF_N
1,Trap_Z,152,-64.67,10.5,196,20031106,2003,11,6,15,,,,,,
2,Trap_Z,152,-64.67,10.5,197,20031121,2003,11,21,15,0.319,0.074,0.079,0.015,0.04,0.015
3,Trap_Z,152,-64.67,10.5,198,20031206,2003,12,6,15,0.668,0.079,0.4,0.057,0.014,0.015
4,Trap_Z,152,-64.67,10.5,199,20031221,2003,12,21,15,0.162,0.014,0.049,0.018,0.06,0.002
5,Trap_Z,152,-64.67,10.5,200,20040105,2004,1,5,15,,,,,,
6,Trap_Z,152,-64.67,10.5,201,20040120,2004,1,20,15,0.185,0.043,0.053,0.014,0.011,0.007


In [53]:
cat(names(zooplankton_df))

Num Cruise Cruise_ID Day Month Year Date Latitude Longitude Analyst Mesh_Size TOTAL_DENSITY BIOMASS ASH COPEPODS CALANOIDS CYCLOPOIDA HAPARCTICOIDA POECILOSTOMATOIDA L_FISH H_FISH CHAETOGNATHA CLADOCEROS OSTRACODA FORAMINIFERA AMPHIPODS ISOPODS EUPHAUSIIDS MISYDACEA SERGESTID L_CRUSTACEA N_COPEPOD C_COPEPOD N_CIRRIPEDIA C_CIRRIPEDIA L_DECAPODA L_STOMATOPODA L_EUPHAUSIID L_SERGESTID POLICHAETES L_CYPHONAUTES MEDUSA SIPHONOPHORES CTENOPHORES L_ECHINODERMS PTEROPODS HETEROPODS GASTROPODS L_CEPHALOPODA L_BIVALVE L_GASTROPOD PROTOZOAN RADIOLARIAN APPENDICULARIANS SALPS DOLIOLIDS BIVALVES H_CRUSTACEA L_ANFIOXUS LUCIFER JUV_GASTROPOD

In [56]:
zooplankton_df$date <- as.Date(zooplankton_df$Date, format="%Y-%m-%d")

# TODO:
- add units to all dataframes & vars for consistency, and to prevent errors!