# Format metadata tables for NCBI upload

In [1]:
library(tidyr)
library(dplyr)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [2]:
#Metadata file
SamData = '~/Hyphosphere/3exp_metadata.txt'

# read sample data
samdf = read.delim(file = SamData, header = TRUE, sep = '\t', row.names = "SampleID")
head(samdf)
rownames(samdf)[1:4]

Unnamed: 0,Sample,Experiment,Plant,Fungus,Soil,SampleType,Treatment,TimePoint,DAI,Rep,Concentration_ng.ul
HCBN1_S211,HCBN1,1,N1,Gv,Lansing,CS,HN,,81,1,6.08
HCBN2_S223,HCBN2,1,N2,Gv,Lansing,CS,HN,,81,2,7.58
HCBN3_S235,HCBN3,1,N3,Gv,Lansing,CS,HN,,81,3,7.05
HCBF1_S247,HCBF1,1,F1,Gv,Florence,CS,HN,,81,1,2.32
HCBF2_S259,HCBF2,1,F2,Gv,Florence,CS,HN,,81,2,2.08
HCBF3_S271,HCBF3,1,F3,Gv,Florence,CS,HN,,81,3,1.89


# Format biosample attributes

In [3]:
samdf$SampleType %>% levels()

In [19]:
SampleType = c('BH', 'BP', 'BS', 'CH', 'CS', 'EB', 'GV', 'MK', 'PB', 'RH', 'RS', 'RT', 'T0')
sample_type = c("Hyphosphere soil", "Particulate organic matter", "Bulk soil", "Hyphae", "Soil slurry", "Extraction blank",
               'G. versiforme spores', 'Mock community','PCR blank', "Hyphae - root compartment", "Sand slurry - root compartment", 'Roots', 
                'T0 soil samples')

stlookup = cbind(SampleType, sample_type) %>% as.data.frame()
stlookup


SampleType,sample_type
BH,Hyphosphere soil
BP,Particulate organic matter
BS,Bulk soil
CH,Hyphae
CS,Soil slurry
EB,Extraction blank
GV,G. versiforme spores
MK,Mock community
PB,PCR blank
RH,Hyphae - root compartment


In [4]:
colnames(tmp)

ERROR: Error in is.data.frame(x): object 'tmp' not found


In [43]:
SampleAttributes = samdf %>% 
    mutate(sample_name = row.names(.),
          bioproject_accession = NA,
          organism = ifelse(SampleType == "RT", 'rhizosphere metagenome', 
                           ifelse(SampleType %in% c('BH','BP','BS','CH','CS', 'RH', 'GV', 'RS','T0'),
                            'soil metagenome',
                            ifelse(SampleType == 'MK', 'synthetic metagenome', 'not applicable'))),
          host = ifelse(SampleType == 'RT', 'Brachypodium distachyon',
                       ifelse(SampleType %in% c('BH','BP','BS','CS', 'RS','T0'),'soil', 
                       ifelse(SampleType == "GV", "Glomus versiforme",
                       ifelse(SampleType %in% c("RH", "CH") & Fungus == "Gv", "Glomus versiforme",
                       ifelse(SampleType %in% c("RH", "CH") & Fungus == "RI", "Rhizophagus irregularis", 
                       ifelse(SampleType %in% c("EB", "PB", 'MK'), 'not applicable', NA)))))),
          collection_date = ifelse(SampleType %in% c("PB", "EB", "MK"),'not applicable',
                            ifelse(Experiment == 1, "03/10/2017",
                            ifelse(SampleType == 'T0', '05/02/2018',
                            ifelse(SampleType %in% 'GV', '2018',
                            ifelse(Experiment == 2 & Rep %in% c(1,2), '08/06/2018',
                            ifelse(Experiment == 2 & Rep %in% c(3,4), '08/16/2018',
                            ifelse(Experiment == 3 & DAI == '14', '06/08/2018',
                            ifelse(Experiment == 3 & DAI == '24', '06/18/2018',
                            ifelse(Experiment == 3 & DAI == '35', '06/29/2018',
                            ifelse(Experiment == 3 & DAI == '45', '07/09/2018',
                            ifelse(Experiment == 3 & DAI == '65', '07/29/2018', 'not applicable'))))))))))),
          geo_loc_name = ifelse(SampleType %in% c('PB', 'EB', 'MK'), 'not applicable', 
                                ifelse(SampleType == 'T0', 'United States:New York;Freeville',
                                ifelse(Soil == "Lansing", 'United States:New York;Freeville',
                                ifelse(Soil == "Pendelton", 'United States:South Carolina;Pendelton', 
                                ifelse(Soil == "Florence", 'United States:South Carolina;Florence',
                                       'United States:New York;Ithaca;Laboratory'))))),
          lat_lon = ifelse(SampleType == 'T0', '42.5213 N 76.3309 W', 
                        ifelse(SampleType %in% c('BH', 'BP', 'BS', 'CH', 'CS', 'RH', 'RS', 'RT') & 
                               Soil == "Lansing", '42.5213 N  76.3309 W',
                        ifelse(SampleType %in% c('BH', 'BP', 'BS', 'CH', 'CS',  'RH', 'RS', 'RT') & 
                               Soil == "Florence", '34.3113 N  79.7542 W',
                        ifelse(SampleType %in% c('BH', 'BP', 'BS', 'CH', 'CS', 'RH', 'RS', 'RT') & 
                               Soil == "Pendelton", '34.6270 N  82.7410 W', 
                        ifelse(SampleType %in% c('BH', 'BP', 'BS', 'CH', 'CS',  'RH', 'RS', 'RT') & 
                               Soil == "Sand" | SampleType == "GV", '42.444 N  76.5019 W',
                               
                           ifelse(SampleType %in% c('PB', 'EB', 'MK'), 'not applicable', NA)))))),
          fertilizer_regm = ifelse(Experiment == 1, "1/4x Hoagland’s solution with 20 uM PO4-; 3x weekly",
                                 ifelse(Experiment == 2 & Treatment == "HN", "1/4x Hoagland’s solution with 20 uM PO4-; 50ml, 3x weekly",
                                  ifelse(Experiment == 2 & Treatment == "HP", "1/4x Hoagland’s solution with 1/20x N & 200 uM PO4-; 50ml, 3x weekly",
                                 ifelse(Experiment == 3, '1/4x Hoagland’s solution with 20 uM PO4-; 25ml; 3x weekly', NA)))),
          Fungus = ifelse(Fungus == "Gv", "Glomus versiforme", 
                         ifelse(Fungus == "RI", 'Rhizophagus irregularis', NA)),
          Plant = 'Brachypodium distachyon',
          DaysAfterPlanting = DAI + 49) %>% 
        rename(DaysAfterInsert = DAI) %>%
left_join(., stlookup, by = "SampleType") %>%
select(-SampleType) %>%
rename(SampleType = sample_type) %>%
select(sample_name, bioproject_accession, organism, host, collection_date, geo_loc_name, lat_lon, Sample, 
    Experiment, Plant, Fungus, Soil, SampleType, Treatment, TimePoint, DaysAfterInsert, DaysAfterPlanting, Rep, fertilizer_regm)
    

In [44]:
head(SampleAttributes)

sample_name,bioproject_accession,organism,host,collection_date,geo_loc_name,lat_lon,Sample,Experiment,Plant,Fungus,Soil,SampleType,Treatment,TimePoint,DaysAfterInsert,DaysAfterPlanting,Rep,fertilizer_regm
HCBN1_S211,,soil metagenome,soil,03/10/2017,United States:New York;Freeville,42.5213 N 76.3309 W,HCBN1,1,Brachypodium distachyon,Glomus versiforme,Lansing,Soil slurry,HN,,81,130,1,1/4x Hoagland’s solution with 20 uM PO4-; 3x weekly
HCBN2_S223,,soil metagenome,soil,03/10/2017,United States:New York;Freeville,42.5213 N 76.3309 W,HCBN2,1,Brachypodium distachyon,Glomus versiforme,Lansing,Soil slurry,HN,,81,130,2,1/4x Hoagland’s solution with 20 uM PO4-; 3x weekly
HCBN3_S235,,soil metagenome,soil,03/10/2017,United States:New York;Freeville,42.5213 N 76.3309 W,HCBN3,1,Brachypodium distachyon,Glomus versiforme,Lansing,Soil slurry,HN,,81,130,3,1/4x Hoagland’s solution with 20 uM PO4-; 3x weekly
HCBF1_S247,,soil metagenome,soil,03/10/2017,United States:South Carolina;Florence,34.3113 N 79.7542 W,HCBF1,1,Brachypodium distachyon,Glomus versiforme,Florence,Soil slurry,HN,,81,130,1,1/4x Hoagland’s solution with 20 uM PO4-; 3x weekly
HCBF2_S259,,soil metagenome,soil,03/10/2017,United States:South Carolina;Florence,34.3113 N 79.7542 W,HCBF2,1,Brachypodium distachyon,Glomus versiforme,Florence,Soil slurry,HN,,81,130,2,1/4x Hoagland’s solution with 20 uM PO4-; 3x weekly
HCBF3_S271,,soil metagenome,soil,03/10/2017,United States:South Carolina;Florence,34.3113 N 79.7542 W,HCBF3,1,Brachypodium distachyon,Glomus versiforme,Florence,Soil slurry,HN,,81,130,3,1/4x Hoagland’s solution with 20 uM PO4-; 3x weekly


In [45]:
SA1 = SampleAttributes %>% filter(Experiment == 1)
SA2 = SampleAttributes %>% filter(Experiment == 2)
SA3 = SampleAttributes %>% filter(Experiment == 3)

write.table(SampleAttributes, file = '~/Hyphosphere/SRA_biosample_attributes.txt', sep = '\t', row.names = FALSE)
write.table(SA1, file = '~/Hyphosphere/SRA_biosample_attributes_Exp1.txt', sep = '\t', row.names = FALSE)
write.table(SA2, file = '~/Hyphosphere/SRA_biosample_attributes_Exp2.txt', sep = '\t', row.names = FALSE)
write.table(SA3, file = '~/Hyphosphere/SRA_biosample_attributes_Exp3.txt', sep = '\t', row.names = FALSE)

# Format metadata for NCBI upload

In [51]:
samdf %>% head()


Unnamed: 0,Sample,Experiment,Plant,Fungus,Soil,SampleType,Treatment,TimePoint,DAI,Rep,Concentration_ng.ul
HCBN1_S211,HCBN1,1,N1,Gv,Lansing,CS,HN,,81,1,6.08
HCBN2_S223,HCBN2,1,N2,Gv,Lansing,CS,HN,,81,2,7.58
HCBN3_S235,HCBN3,1,N3,Gv,Lansing,CS,HN,,81,3,7.05
HCBF1_S247,HCBF1,1,F1,Gv,Florence,CS,HN,,81,1,2.32
HCBF2_S259,HCBF2,1,F2,Gv,Florence,CS,HN,,81,2,2.08
HCBF3_S271,HCBF3,1,F3,Gv,Florence,CS,HN,,81,3,1.89


In [63]:
MetaData = samdf %>% 
    mutate(sample_name = row.names(.),
          library_ID = sample_name,
          title = paste('16S amplicon of ', SampleType, ' sample from mesocosm with', Soil, ' soil and ', Fungus, '; Experiment ', Experiment, 
                       '. Samples collected ', DAI, ' days after soil in-growth cores inserted in mesocosm', sep = ''),
          library_strategy = "AMPLICON", 
          library_source = "METAGENOMIC",
          library_selection = "PCR",
          library_layout = 'paired',
          platform = 'ILLUMINA',
          instrument = 'Illumina MiSeq',
          design_description = ifelse(Experiment == 1, 'Prepared at University of Minnesota Genomics Center',
                                      "Multiplexed dual barcoded MiSeq library prepared as in Kozich et al. 2013"),
          filetype = "fastq",
          filename = ifelse(Experiment == 1, paste(sample_name, '_R1_001.fastq.gz', sep = ''), 
                                                    paste(sample_name, '.R1.fq.gz', sep = '')),
          filename2 = ifelse(Experiment == 1, paste(sample_name, '_R2_001.fastq.gz', sep = ''), 
                                                    paste(sample_name, '.R2.fq.gz', sep = ''))) %>%
    select(sample_name, library_ID, title, library_strategy, library_source, library_selection, library_layout,
          platform, instrument, design_description, filetype, filename, filename2, Experiment) 
head(MetaData)


sample_name,library_ID,title,library_strategy,library_source,library_selection,library_layout,platform,instrument,design_description,filetype,filename,filename2,Experiment
HCBN1_S211,HCBN1_S211,16S amplicon of CS sample from mesocosm withLansing soil and Gv; Experiment 1. Samples collected 81 days after soil in-growth cores inserted in mesocosm,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Prepared at University of Minnesota Genomics Center,fastq,HCBN1_S211_R1_001.fastq.gz,HCBN1_S211_R2_001.fastq.gz,1
HCBN2_S223,HCBN2_S223,16S amplicon of CS sample from mesocosm withLansing soil and Gv; Experiment 1. Samples collected 81 days after soil in-growth cores inserted in mesocosm,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Prepared at University of Minnesota Genomics Center,fastq,HCBN2_S223_R1_001.fastq.gz,HCBN2_S223_R2_001.fastq.gz,1
HCBN3_S235,HCBN3_S235,16S amplicon of CS sample from mesocosm withLansing soil and Gv; Experiment 1. Samples collected 81 days after soil in-growth cores inserted in mesocosm,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Prepared at University of Minnesota Genomics Center,fastq,HCBN3_S235_R1_001.fastq.gz,HCBN3_S235_R2_001.fastq.gz,1
HCBF1_S247,HCBF1_S247,16S amplicon of CS sample from mesocosm withFlorence soil and Gv; Experiment 1. Samples collected 81 days after soil in-growth cores inserted in mesocosm,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Prepared at University of Minnesota Genomics Center,fastq,HCBF1_S247_R1_001.fastq.gz,HCBF1_S247_R2_001.fastq.gz,1
HCBF2_S259,HCBF2_S259,16S amplicon of CS sample from mesocosm withFlorence soil and Gv; Experiment 1. Samples collected 81 days after soil in-growth cores inserted in mesocosm,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Prepared at University of Minnesota Genomics Center,fastq,HCBF2_S259_R1_001.fastq.gz,HCBF2_S259_R2_001.fastq.gz,1
HCBF3_S271,HCBF3_S271,16S amplicon of CS sample from mesocosm withFlorence soil and Gv; Experiment 1. Samples collected 81 days after soil in-growth cores inserted in mesocosm,AMPLICON,METAGENOMIC,PCR,paired,ILLUMINA,Illumina MiSeq,Prepared at University of Minnesota Genomics Center,fastq,HCBF3_S271_R1_001.fastq.gz,HCBF3_S271_R2_001.fastq.gz,1


In [65]:
MD1 = MetaData %>% filter(Experiment == 1)
MD2 = MetaData %>% filter(Experiment == 2)
MD3 = MetaData %>% filter(Experiment == 3)
Metadata = MetaData %>% select(-Experiment)
write.table(MetaData, file = '~/Hyphosphere/SRA_metadata.txt', sep = '\t', row.names = FALSE)
write.table(MD1, file = '~/Hyphosphere/SRA_metadata_Exp1.txt', sep = '\t', row.names = FALSE)
write.table(MD2, file = '~/Hyphosphere/SRA_metadata_Exp2.txt', sep = '\t', row.names = FALSE)
write.table(MD3, file = '~/Hyphosphere/SRA_metadata_Exp3.txt', sep = '\t', row.names = FALSE)

# Filter to each experiment in case separate uploading is necessary

In [3]:
samdf %>% filter(Experiment == 1)

Unnamed: 0,Sample,Experiment,Plant,Fungus,Soil,SampleType,Treatment,TimePoint,DAI,Rep,Concentration_ng.ul
HCBN1_S211,HCBN1,1,N1,Gv,Lansing,CS,HN,,81.0,1.0,6.08
HCBN2_S223,HCBN2,1,N2,Gv,Lansing,CS,HN,,81.0,2.0,7.58
HCBN3_S235,HCBN3,1,N3,Gv,Lansing,CS,HN,,81.0,3.0,7.05
HCBF1_S247,HCBF1,1,F1,Gv,Florence,CS,HN,,81.0,1.0,2.32
HCBF2_S259,HCBF2,1,F2,Gv,Florence,CS,HN,,81.0,2.0,2.08
HCBF3_S271,HCBF3,1,F3,Gv,Florence,CS,HN,,81.0,3.0,1.89
HCBP1_S283,HCBP1,1,P1,Gv,Pendelton,CS,HN,,81.0,1.0,9.18
HCBP2_S200,HCBP2,1,P2,Gv,Pendelton,CS,HN,,81.0,2.0,6.87
HCBP3_S212,HCBP3,1,P3,Gv,Pendelton,CS,HN,,81.0,3.0,3.85
HCBS1_S224,HCBS1,1,S1,Gv,Sand,CS,HN,,81.0,1.0,0.05


In [None]:
# 