### Importing and Processing CO2 Respiration Data from GC-MS###
> This script converts the GC-MS output from multiple sampling timepoints into a single table. It can also calculate moles of C from peak area and prepare graphs with ggplot2.


###### Last Modified by R. Wilhelm on October 20th, 2017 ######


# Step 1: User Input

In [None]:
## Provide the directory that contains the timepoint sub-directories
# example: '/home/roli/PROJECT/', which would contain sub-directories named after the
# timepoints (T1, T2, T3 ...), each holding the '.txt' output files from the GC-MS.
# The sub-directory name is used as the 'timepoint' of every sample found inside it, and
# each '.txt' file name (without its extension) is used as the 'sampleID'.

directory = '/home/roli/scripts/gcms/example_data/'

## Name the output for GC-MS data refinement (still raw, but in tabular form)
# Used as the prefix of the files written into 'directory':
# '<output_name>.raw.co2.table.tsv', '<output_name>.final.csv' and '<output_name>.final.rds'
output_name = 'example'

## Provide a tab-separated 'events' table (located inside 'directory') with at least three
## columns whose headers are spelled exactly: 'timepoint', 'date' and 'time'
# note: It is critical to include T0 (i.e. the start date and time); durations are measured from it.
# note: 'timepoint' values must match the sub-directory names, since the tables are merged on that column.
# note: dates are parsed as dd/mm/yy, dd/mm/yyyy or dd-mm-yy; times as HH:MM (extra characters such as seconds are not used).

#timepoint  date       time
#T0         21/04/17   21:30:00
#T1         22/04/17   09:30:00

events = 'events.tsv'

## Provide information on the volume of microcosm sampled (in L)
# note: you'll have to alter the code for calculating mol.C if you've used mixed container sizes.
microcosm_size = 0.25 # litres; passed as 'volume.L' when converting ppm to mol C

## Provide name and concentration of each standard (in ppm)
# The index labels are matched against the sampleID (file name) of the standard runs,
# and any sampleID containing the text "standard" is treated as a standard.
import pandas as pd
standards = pd.DataFrame({'ppm': [0, 855, 1701, 3422, 8510, 17227, 34488]},
                         index=['standard1','standard2','standard3','standard4','standard5','standard6','standard7'])

# Define Import Function

In [None]:
import os, re, glob, sys
from collections import defaultdict

def import_me(directory):
    """Collect the GC-MS '.txt' files found in the sub-directories of ``directory``.

    Parameters
    ----------
    directory : str
        Root folder whose sub-directories (one per timepoint) hold the '.txt' files.
        It may be given with or without a trailing path separator.

    Returns
    -------
    collections.defaultdict(list)
        Maps each sub-directory path, relative to ``directory`` (e.g. ``'T1'``), to a
        list of ``[name, file]`` pairs, where ``name`` is the file name without its
        '.txt' extension and ``file`` is the path to the file. Sub-directories without
        any '.txt' file get no entry; files directly inside ``directory`` are ignored.
    """
    import_dict = defaultdict(list)

    for dir_path, _sub_dirs, _file_names in os.walk(directory):
        # Use path arithmetic rather than re.sub(): the paths are not regular
        # expressions, and a missing trailing slash must not leak into the key.
        dir_name = os.path.relpath(dir_path, directory)
        if dir_name == os.curdir:
            continue  # the root itself is not a timepoint

        # glob.escape() keeps characters such as '[' in a folder name literal;
        # sorting makes the order of samples reproducible between runs.
        pattern = os.path.join(glob.escape(dir_path), "*.txt")
        for file in sorted(glob.glob(pattern)):
            # Strip only the real extension (the old pattern ".txt" also removed
            # things like "_txt" from the middle of a name).
            name = os.path.splitext(os.path.basename(file))[0]
            import_dict[dir_name].append([name, file])

    return import_dict

# Step 2: Convert GC-MS Raw Data to Table

In [None]:
# Gather every GC-MS text file, grouped by timepoint sub-directory.
input_dictionary = import_me(directory)

# Only the rows describing the total ion current and the two CO2 ions are kept.
wanted_row = re.compile("TIC|m/z 44|m/z 45")

# 'with' guarantees that the table and every sample file are closed again,
# even when a malformed file raises part-way through.
with open(directory+"/"+output_name+".raw.co2.table.tsv","w") as output:
    output.write("timepoint\tsampleID\tion\tconcentration\tpeak area\trt\n")

    for timepoint, sample_files in input_dictionary.items():

        for name, file in sample_files:

            with open(file,"r") as sample:
                for line in sample:
                    if not wanted_row.search(line):
                        continue

                    fields = line.strip().split("\t")

                    # Column positions within a tab-separated GC-MS report row
                    ion = fields[1]
                    retention = fields[5]
                    area = fields[9]
                    concentration = fields[11]

                    output.write("\t".join([timepoint, name, ion, concentration, area, retention])+"\n")

# Step 3: Import and Work-up in R

In [None]:
## Setup R-Magic for Jupyter Notebooks
import rpy2
import pandas as pd
%load_ext rpy2.ipython

## Description: use Pandas to create a dataframe and then pipe that to R

# Import CO2 Data
co2_data = pd.read_csv(directory+"/"+output_name+".raw.co2.table.tsv", sep="\t")
%R -i co2_data

# Segregate Standards
%R raw_standards <- co2_data[grep("standard",co2_data$sampleID),]
%R co2_data <- co2_data[-grep("standard",co2_data$sampleID),]

# Import Standards
%R -i standards

# Import Microcosm Size
%R -i microcosm_size

# Import Analysis Directory
%R -i directory
%R -i output_name

# Import Events
try:
    events = pd.read_csv(directory+"/"+events, sep="\t")
    %R -i events
    
except:
    pass

# Define Functions Used in R

In [None]:
%%R

########################
## Convert ppm to mols C

calc_mol_C <- function(ppm, ion, volume.L) {
  # Convert a CO2 concentration (ppm) in a container of volume.L litres into the
  # 'mol.C' value used by the rest of the work-up, via the ideal gas law n = PV / RT.
  #
  # Args:
  #   ppm:      CO2 concentration in ppm; numeric, or character coercible to numeric.
  #   ion:      ion of the measurement (44 or 45); numeric or character. Anything
  #             other than 44 is treated as ion 45.
  #   volume.L: container volume in litres.
  #
  # Returns:
  #   Numeric mol.C. Scalars behave as before; vectors are now also accepted and are
  #   processed element-wise (an NA ion gives NA instead of an error).
  temp.K <- 294.261
  pressure.atm <- 1
  R <- 0.08206

  ppm <- as.numeric(ppm)
  ion <- as.numeric(ion)

  mol.volume <- (pressure.atm * volume.L) / (R * temp.K)  # mol gas in container
  mol.CO2 <- mol.volume * (ppm / 1000000)                 # fraction of mol as CO2

  # NOTE(review): 12/44 and 13/45 look like the mass fraction of C in CO2, yet they
  # are applied to a mole quantity (one mole of CO2 holds one mole of C) - confirm
  # this is intended. The arithmetic is deliberately left as it was.
  is.44 <- ion == 44
  numerator <- ifelse(is.44, 12, 13)
  denominator <- ifelse(is.44, 44, 45)

  mol.CO2 * numerator / denominator
}

#########################
#Calculate Standard Error
# Taken from http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/

summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE, conf.interval=.95, .drop=TRUE) {
  # Summarise one column of a data frame per group: count, mean, standard deviation,
  # standard error of the mean and confidence-interval half-width.
  #
  # Args:
  #   data:          data frame to summarise.
  #   measurevar:    name (string) of the column to summarise.
  #   groupvars:     character vector of grouping column names.
  #   na.rm:         if TRUE, NA values are ignored (and not counted in N).
  #   conf.interval: confidence level used for the 'ci' column.
  #   .drop:         passed on to plyr::ddply().
  #
  # Returns:
  #   Data frame with the grouping columns followed by N, <measurevar> (the group
  #   mean), sd, se and ci.

  # Still attached, as before, for callers that rely on plyr being on the search path
  # after this call. The calls below are namespace-qualified so that a 'rename' or
  # 'ddply' from another package (e.g. dplyr::rename) can never be picked up instead.
  library(plyr)

  # New version of length which can handle NA's: if na.rm==TRUE, don't count them
  length2 <- function(x, na.rm=FALSE) {
    if (na.rm) sum(!is.na(x))
    else       length(x)
  }

  # This does the summary. For each group's data frame, return a vector with
  # N, mean, and sd
  datac <- plyr::ddply(data, groupvars, .drop=.drop,
                       .fun = function(xx, col) {
                         c(N    = length2(xx[[col]], na.rm=na.rm),
                           mean = mean   (xx[[col]], na.rm=na.rm),
                           sd   = sd     (xx[[col]], na.rm=na.rm)
                         )
                       },
                       measurevar
  )

  # Rename the "mean" column to the name of the measured variable
  datac <- plyr::rename(datac, c("mean" = measurevar))

  datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

  # Confidence interval multiplier for standard error
  # Calculate t-statistic for confidence interval:
  # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
  ciMult <- qt(conf.interval/2 + .5, datac$N-1)
  datac$ci <- datac$se * ciMult

  datac
}

##############################################
### Join Date and Time into a Single POSIX Timestamp
   
time_converter <- function(date, time) {
  # Combine parallel vectors of dates and times into POSIXct timestamps.
  #
  # Args:
  #   date: vector of dates written as dd/mm/yy, dd/mm/yyyy or dd-mm-yy.
  #   time: vector of times written as HH:MM (trailing characters such as seconds
  #         are ignored by strptime); paired with 'date' by position.
  #
  # Returns:
  #   Data frame with one POSIXct column 'posix' and one row per element of 'date'.
  #   Entries that match none of the formats are NA. Empty input gives a zero-row
  #   data frame (the former 1:length(date) loop failed on it).

  # Pair by position; a 'time' shorter than 'date' yields NA stamps, as before.
  stamps <- paste(date, time[seq_along(date)], sep = " ")

  # Accepted layouts, tried in this order; the first one that parses wins.
  formats <- c("%d/%m/%y %R", "%d/%m/%Y %R", "%d-%m-%y %R")

  posix <- as.POSIXct(rep(NA, length(stamps)))
  for (fmt in formats) {
    unparsed <- is.na(posix)
    if (!any(unparsed)) {
      break
    }
    # Wall-clock fields are interpreted in the fixed "EST" zone, as before.
    posix[unparsed] <- as.POSIXct(strptime(stamps[unparsed], fmt), tz = "EST")
  }

  data.frame(posix = posix)
}
      
####################################################
### Calculate Duration from Start for All Timepoints
      
duration_calculator <- function(start, time_series) {
  # Hours elapsed between 'start' and every entry of 'time_series'
  # (negative for entries that lie before 'start').
  elapsed <- difftime(time_series, start)
  as.numeric(elapsed, units = "hours")
}

# Step 4: Calculate Durations

In [None]:
%%R

# Fail early, with a readable message, when the events table did not make it into R
# or its headers are not spelled the way the code below expects.
if (!exists("events") || !is.data.frame(events)) {
  stop("The 'events' table is not available in R; check the events import in Step 3.",
       call. = FALSE)
}
missing_columns <- setdiff(c("timepoint", "date", "time"), names(events))
if (length(missing_columns) > 0) {
  stop("The events table lacks the column(s): ",
       paste(missing_columns, collapse = ", "), call. = FALSE)
}

# Combine all dates and times into single 'POSIX' time-stamp
posix <- time_converter(events$date, events$time)
events <- cbind(events, posix)

if (anyNA(events$posix)) {
  warning("Some event dates/times could not be parsed; their durations will be NA.",
          call. = FALSE)
}

# Calculate Duration for Each Timepoint
start <- subset(events, timepoint == "T0")

# Exactly one start row is required: none gives nothing to measure from, and several
# would be recycled against the timepoints and silently give wrong durations.
if (nrow(start) != 1) {
  stop("The events table must contain exactly one 'T0' row (found ", nrow(start), ").",
       call. = FALSE)
}

events <- subset(events, timepoint != "T0")
events$duration <- duration_calculator(start$posix, events$posix) # duration_calculator(start_time, all_time_points)

# Merge CO2 Data with duration
co2_data <- merge(co2_data, events, by = "timepoint")

# print current work-up
print(head(co2_data))

# Step 5: Calculate Standard Curve

In [None]:
%%R
## Note: This script assumes very low inter-run variability
## From my experience, preparing the standards by hand introduces greater inter-run variability than the instrument.
## Therefore, this script will average all standard data and calculate ppm based off of this average

## ggplot2 is needed for the plots below; the plyr functions are called through
## their namespace, so plyr only has to be installed.
library(ggplot2)

## a bit of clean-up: drop the "m/z " prefix and the total-ion-current rows
raw_standards$ion <- gsub("m/z ","",raw_standards$ion)
raw_standards <- subset(raw_standards, ion != "TIC")

## Concentrations are calculated based on total CO2  (converting 13C to 12C-equivalent is negligible)
# sum ion 44 and ion 45
raw_standards <- plyr::ddply(raw_standards, ~ timepoint + sampleID, plyr::summarise,
                             total.peak.area = sum(peak.area))
raw_standards$combo <- paste(raw_standards$timepoint, raw_standards$sampleID, sep="_")

## regress standards: attach the known ppm of each standard by its name
standards$sampleID <- rownames(standards)
raw_standards <- merge(raw_standards, standards, by = "sampleID")

## Plot Curve
curve_plot <- ggplot(raw_standards, aes(total.peak.area, ppm, label = combo)) +
  geom_point() +
  geom_smooth(method=lm, se=FALSE) +
  ggtitle("Raw Standards")
print(curve_plot + geom_label(size = 4, hjust = -0.1))

## Remove Outliers from Standard Curve (labels are '<timepoint>_<sampleID>')
remove_me <- c("T2_standard7","T2_standard6")

if (length(remove_me) > 0){
    # Logical mask instead of -which(): when none of the labels is present, -which()
    # yields an empty index and would throw away ALL standards.
    refined_standards <- raw_standards[!(raw_standards$combo %in% remove_me), ]
    curve_plot <- ggplot(refined_standards, aes(total.peak.area, ppm, label = combo)) +
      geom_point() +
      geom_smooth(method=lm, se=FALSE) +
      ggtitle("Refined Standards")
    print(curve_plot + geom_label(size = 4, hjust = -0.1))
} else {
    refined_standards <- raw_standards
}

## Calculate regression coefficients (force through zero)
m <- as.numeric(coef(lm(ppm ~ total.peak.area -1, refined_standards))[1])
#b <- as.numeric(coef(lm(ppm ~ total.peak.area, refined_standards))[1])

# Step 6: Convert peak area to ppm

In [None]:
%%R

## Tidy the ion labels: remove the "m/z " prefix, then discard the rows whose
## ion is "TIC" (rows with a missing ion are discarded as well)
co2_data$ion <- gsub("m/z ", "", co2_data$ion)
keep_rows <- which(co2_data$ion != "TIC")
co2_data <- co2_data[keep_rows, , drop = FALSE]

## Apply the standard curve: concentration (ppm) = slope 'm' x peak area
co2_data$adj.conc <- co2_data$peak.area * m

print(head(co2_data))

# Step 7: Convert ppm to mols C

In [None]:
%%R

# Calculate mol
# The rows are walked by index rather than with apply(): apply() first turns the
# mixed numeric/character columns into a character matrix, which rounds 'adj.conc'
# to the number of printed digits before it is converted back to a number.
co2_data$mol.C <- vapply(
  seq_len(nrow(co2_data)),
  function(row) {
    calc_mol_C(co2_data$adj.conc[row], co2_data$ion[row], microcosm_size)
  },
  numeric(1)
)

print(head(co2_data))

# Step 8: Calculate Cumulative Respiration

In [None]:
%%R

## Calculate Cumulative Respiration
# For every sample and each CO2 ion (44 and 45), order the measurements by duration
# and accumulate mol.C. The pieces are collected in a pre-sized list and bound once
# at the end, instead of growing a data frame with rbind() inside the loop.
sample_ids <- unique(co2_data$sampleID)
target_ions <- c(44, 45)

pieces <- vector("list", length(sample_ids) * length(target_ions))
slot <- 0L

for (sample_id in sample_ids){
    for (target_ion in target_ions){
        # Explicit column references: the loop variables cannot be confused with
        # (or shadowed by) columns of co2_data, and 'sample' no longer masks base::sample
        in_group <- which(co2_data$sampleID == sample_id & co2_data$ion == target_ion)
        piece <- co2_data[in_group, , drop = FALSE]
        piece <- piece[order(piece$duration), , drop = FALSE]
        piece$cum.mol.C <- cumsum(piece$mol.C)

        slot <- slot + 1L
        pieces[[slot]] <- piece
    }
}

# NULL when there was nothing to process (previously 'cumulative' stayed undefined)
cumulative <- do.call(rbind, pieces)

print(head(cumulative))


# Step 9: Plot Curves

In [None]:
%%R
# Both figures share the same trend line: a linear model on a cubic B-spline basis,
# drawn without a confidence band. A fresh layer is built for each plot.
spline_trend <- function() {
  geom_smooth(method = "lm", formula = y ~ splines::bs(x, 3), se = FALSE)
}

# Plot Curve 1 : Respiration over Sampling Intervals
# (mean mol.C per sample, duration and ion)
plot_me <- summarySE(co2_data, measurevar="mol.C", groupvars=c("sampleID","duration","ion"))
interval_plot <- ggplot(plot_me, aes(duration, mol.C, color = sampleID)) +
  geom_point() +
  spline_trend() +
  facet_grid(~ion) +
  ggtitle("Net CO2 Flux across Sampling Intervals")
print(interval_plot)

# Plot Curve 2 : Cumulative Respiration
cumulative_plot <- ggplot(cumulative, aes(duration, cum.mol.C, color = sampleID)) +
  geom_point() +
  spline_trend() +
  facet_grid(~ion) +
  ggtitle("Cumulative CO2 Over Time")
print(cumulative_plot)

# Step 10: Export and Save Dataset

In [None]:
%%R

## Both exports are written next to the input data as '<directory>/<output_name><suffix>'
export_path <- function(suffix) {
  paste0(directory, "/", output_name, suffix)
}

## '.csv' copy for safe-keeping
write.csv(co2_data, file = export_path(".final.csv"))

## '.rds' copy for further analysis in R
saveRDS(co2_data, file = export_path(".final.rds"))