# Concatenate Occurrence Table with Metadata and Generate Frequency Tables and Plots
 __Author:__ Alan K. Jarmusch (ajarmusch@ucsd.edu)<br>
 __Version:__ 1.0 <br>
 __Date of Last Revision:__ 07-02-2018 <br>
 __Input:__ Downloaded Occurrence Table ("View Compounds and File Occurrence") output from GNPS (link below). Occurrence Table is filtered using a Curated Information Table (manually generated and uploaded). Metadata. <br>
 headers are filenames with .mzXML extension; first column is LibraryID with annotations; output is transposed and N/A is deleted
 example: CompoundOccuranceTable_filenamesClipped.txt<br>
 __Function:__ Filters the occurrence table against a Curated Information Table to eliminate multiple GNPS annotations of the same molecule (arising from hits to different libraries or library spectra). The resulting occurrence table is then concatenated with metadata. The outputs are frequency tables, which specify the molecular diversity (number of unique molecules of a particular category) and observations (either an absolute number of observations or a relative observation rate, i.e. observations divided by the number of samples), and plots. <br>
 __GNPS:__ http://gnps.ucsd.edu <br>

In [3]:
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(grid))
suppressMessages(library(gridExtra))
suppressMessages(library(tidyr))
suppressMessages(library(labeling))
#suppressMessages(library(reshape2))
suppressMessages(library(Formula))
#suppressMessages(library(Hmisc))
suppressMessages(library(psych))

# Input Occurrence Table (downloaded from GNPS)
    Columns are reordered such that the GNPS annotation and frequency (sum of occurrence in files, i.e. by row) come first. The header is changed to reflect the filename (barcode) without additional information added by analysis (e.g. injection number, well position, etc.).
    https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=a005e8462be34652962da6691b95be58

In [4]:
# Load the GNPS occurrence table. Everything is read as character and without
# a header so that the first row (LibraryID followed by the file names) can be
# promoted to the column names below.
dmat_occurance <- read.delim(
  "Data/20180702_GNPSOccurrenceTable.txt",
  sep = "\t", header = FALSE, colClasses = "character"
)
colnames(dmat_occurance) <- dmat_occurance[1, ]
# Drop the header row (now held in the column names) and the second column.
dmat_occurance <- dmat_occurance[-1, -2]
rownames(dmat_occurance) <- NULL
# Keep only rows whose LibraryID is present and is neither the "N/A"
# placeholder nor one of the two annotations excluded by name.
dmat_occurance <- dmat_occurance[
  which(
    !is.na(dmat_occurance$LibraryID) &
      !(dmat_occurance$LibraryID %in% c(
        "N/A",
        "B10A30 Faulkner legacy library looks like sterol or lipid needs to be verified",
        "MoNA:788880 Gabapentin Related Compound E"
      ))
  ), ,
  drop = FALSE
]
# Preview the first rows of the filtered table.
head(dmat_occurance)
# Report rows x columns of the filtered table.
dim(dmat_occurance)

LibraryID,AA3594_1_10_RA10_01_37671.mzXML,AA3594_1_1_RA1_01_37666.mzXML,AA3594_1_2_RA2_01_37667.mzXML,AA3594_1_3_RA3_01_37668.mzXML,AA3594_1_4_RA4_01_37669.mzXML,AA3594_1_5_diluted_RC5_01_37659.mzXML,AA3594_1_6_diluted_RC6_01_37661.mzXML,AA3594_1_7_RA7_01_37664.mzXML,AA3594_1_8_RA8_01_37665.mzXML,⋯,WW5157_2_10_RE10_01_37486.mzXML,WW5157_2_1_RE1_01_37476.mzXML,WW5157_2_2_RE2_01_37477.mzXML,WW5157_2_3_RE3_01_37478.mzXML,WW5157_2_4_RE4_01_37479.mzXML,WW5157_2_5_RE5_01_37480.mzXML,WW5157_2_6_RE6_01_37481.mzXML,WW5157_2_7_RE7_01_37483.mzXML,WW5157_2_8_RE8_01_37484.mzXML,WW5157_2_9_RE9_01_37485.mzXML
Ethyl 3-hydroxybenzoate,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
3-(2-Hydroxyphenyl)propionic acid,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
Massbank: Dextromethorphan,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
Massbank:FIO00071 (-)-Epicatechin,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
"Massbank:PR100263 (+)-Epicatechin|EpCt-pl|ent-Epicatechin|Epicatechol|(+)-3,3',4',5,7-Pentahydroxyflavan",0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,1
"MoNA:585772 [(2R)-2-ethylhexyl] 2-cyano-3,3-diphenylprop-2-enoate",0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


## Curated Information Table

In [5]:
# Load the manually curated source-information table (tab separated, with a
# header row).
mmat_sourceinfo <- read.delim(
  "Data/GNPS_Curated_Source_Information_Version2.0 - Curated_Source_Information_Table (18).tsv",
  sep = "\t", header = TRUE
)
# Discard the first three data rows.
mmat_sourceinfo <- mmat_sourceinfo[-(1:3), ]
# Keep only rows whose curated annotation is present and is not flagged as
# "disregard" or "literature limited".
mmat_sourceinfo <- mmat_sourceinfo[
  which(
    !is.na(mmat_sourceinfo$Curated_GNPS_Annotation) &
      !(mmat_sourceinfo$Curated_GNPS_Annotation %in%
        c("disregard", "literature limited"))
  ), ,
  drop = FALSE
]
# Preview the first rows of the curated table.
head(mmat_sourceinfo)
# Report rows x columns of the curated table.
dim(mmat_sourceinfo)

Unnamed: 0,GNPS_annotation,Curated_GNPS_Annotation,Inchi_whole.string,SMILES_canonical,Structural_Class,Structural_Subclass,Structural_Subclass2,Source_GFOP,Source,Source_Sub1,⋯,Drug_Info_2,Bioactivity_boolean,Bioactivity_Known,Origin_Known,Sample_Type_Detected_In_Food,Sample_Type_Detected_In_HumanFeces,Project_Vital_Foodomics,Project_Vital_AGP1,Comment,Reference
4,()11-HEDE,"(12E,14Z)-11-Hydroxyicosa-12,14-dienoic acid","InChI=1S/C20H36O3/c1-2-3-4-5-7-10-13-16-19(21)17-14-11-8-6-9-12-15-18-20(22)23/h7,10,13,16,19,21H,2-6,8-9,11-12,14-15,17-18H2,1H3,(H,22,23)/b10-7-,16-13+",CCCCCC=CC=CC(CCCCCCCCCC(=O)O)O,lipid,fatty acid,,food,food,food,⋯,,,,,yes,yes,yes,yes,"not documented; produced by non-enzymatic oxidation of 11,14-eicosadienoic acid",https://www.caymanchem.com/product/37500
5,"(2.beta.,3.alpha.,5.alpha.,16.beta.,17.beta.)-2-(4-Morpholinyl)-16-(1-pyrrolidinyl)androstane-3,17-diol","(2,3,5,16,17)-2-(4-Morpholinyl)-16-(1-pyrrolidinyl)androstane-3,17-diol",literature limited,literature limited,literature limited,literature limited,literature limited,literature limited,literature limited,literature limited,⋯,literature limited,,,,yes,yes,yes,yes,,
6,Ethyl 3-hydroxybenzoate,Ethyl 3-hydroxybenzoate,"InChI=1S/C9H10O3/c1-2-12-9(11)7-4-3-5-8(10)6-7/h3-6,10H,2H2,1H3",CCOC(=O)C1=CC(=CC=C1)O,,,,,food,food additive,⋯,,,,,no,no,no,no,,https://pubchem.ncbi.nlm.nih.gov/compound/Ethyl_3-hydroxybenzoate#section=Top
7,3-(2-Hydroxyphenyl)propionic acid,2-Hydroxybenzenepropanoic acid,"InChI=1S/C9H10O3/c10-8-4-2-1-3-7(8)5-6-9(11)12/h1-4,10H,5-6H2,(H,11,12)",C1=CC=C(C(=C1)CCC(=O)O)O,,,,,food,food,⋯,,,,,no,no,no,no,,https://pubchem.ncbi.nlm.nih.gov/compound/3-_2-Hydroxyphenyl_propionic_acid
8,(Phenoxymethyl)penicilloic acid,(Phenoxymethyl)penicilloic acid,"InChI=1S/C16H20N2O6S/c1-16(2)12(15(22)23)18-13(25-16)11(14(20)21)17-10(19)8-24-9-6-4-3-5-7-9/h3-7,11-13,18H,8H2,1-2H3,(H,17,19)(H,20,21)(H,22,23)",CC1(C(NC(S1)C(C(=O)O)NC(=O)COC2=CC=CC=C2)C(=O)O)C,drug,drug,,antimicrobial,drug,drug metabolite,⋯,penicillin metabolite,yes,pharmaceutical,,yes,yes,yes,yes,,https://pubchem.ncbi.nlm.nih.gov/compound/96439#section=Top
9,Spectral Match to 1-(1Z-Hexadecenyl)-sn-glycero-3-phosphocholine from NIST14,1-(1Z-Hexadecenyl)-sn-glycero-3-phosphocholine,"InChI=1S/C24H50NO6P/c1-5-6-7-8-9-10-11-12-13-14-15-16-17-18-20-29-22-24(26)23-31-32(27,28)30-21-19-25(2,3)4/h18,20,24,26H,5-17,19,21-23H2,1-4H3/b20-18-/t24-/m1/s1",CCCCCCCCCCCCCC\C=C/OC[C@@H](O)COP([O-])(=O)OCC[N+](C)(C)C,lipid,phospholipid,phosphocholine,multiple sources,multiple sources,multiple sources,⋯,,,,,yes,yes,yes,yes,,http://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:73850


# Metadata

In [6]:
# Sample metadata: a comma-separated file with a header row; every column is
# kept as character.
metadata <- read.csv(
  "Data/SI_Table_1.csv",
  header = TRUE, colClasses = "character"
)
# Disabled (and incomplete) filter on body habitat, kept for reference:
#metadata <- subset(metadata,metadata$body_habitat == "uberon:oral cavity") !=
# Report rows x columns, then preview the first rows.
dim(metadata)
head(metadata)
#is(metadata)

filename,unique_sample_ID,unique_sample_ID.1,ATTRIBUTE_sample_ID,ATTRIBUTE_Meds_number,ATTRIBUTE_prescribed_acetaminophen,ATTRIBUTE_prescribed_albuterol,ATTRIBUTE_prescribed_allopurinol,ATTRIBUTE_prescribed_amlodipine,ATTRIBUTE_prescribed_aspirin,⋯,time_of_last_dose,date_of_whole_blood_sample_collection,time_of_whole_blood_sample_collection,Drug_1_Level_whole_blood_ngml,ATTRIBUTE_Timepoint,ATTRIBUTE_Visit,ATTRIBUTE_Body_Site,ATTRIBUTE_Sample_Location_General_Text,ATTRIBUTE_Sample_Location_General_Sub1_Text,ATTRIBUTE_Sample_Location_General_Sub2_Text
AA3594_2_9_RB9_01_37685.mzXML,AA3594_2_9_RB9,AA3594_2_9,AA3594,22,0,0,1,0,0,⋯,7:00:00 PM,3/19/2017,7:35:00 AM,11.6,2,lab,9,Hand,Backhand,L_Backhand
AA3594_2_6_diluted_RD6_01_37680.mzXML,AA3594_2_6_diluted,AA3594_2_6,AA3594,22,0,0,1,0,0,⋯,7:00:00 PM,3/19/2017,7:35:00 AM,11.6,2,lab,6,Axillary,Axillary,L_Axillary
AA3594_2_8_RB8_01_37684.mzXML,AA3594_2_8_RB8,AA3594_2_8,AA3594,22,0,0,1,0,0,⋯,7:00:00 PM,3/19/2017,7:35:00 AM,11.6,2,lab,8,Hand,Backhand,R_Backhand
AA3594_1_1_RA1_01_37666.mzXML,AA3594_1_1_RA1,AA3594_1_1,AA3594,22,0,0,1,0,0,⋯,7:00:00 PM,3/19/2017,7:35:00 AM,11.6,1,clinic,1,Face,Forehead,R_Forehead
AA3594_2_3_RB3_01_37675.mzXML,AA3594_2_3_RB3,AA3594_2_3,AA3594,22,0,0,1,0,0,⋯,7:00:00 PM,3/19/2017,7:35:00 AM,11.6,2,lab,3,Face,Nose,R_Nose
AA3594_1_9_RA9_01_37670.mzXML,AA3594_1_9_RA9,AA3594_1_9,AA3594,22,0,0,1,0,0,⋯,7:00:00 PM,3/19/2017,7:35:00 AM,11.6,1,clinic,9,Hand,Backhand,L_Backhand


## Merge  Occurrence Table and Curated Information Table

In [7]:
# Join the first 22 curated-information columns to the occurrence table on the
# GNPS annotation name; annotations missing from either table are dropped by
# merge()'s default behaviour.
merge_mmat_sourceinfo_dmat_occurance <- merge(
  x = mmat_sourceinfo[, 1:22],
  y = dmat_occurance,
  by.x = "GNPS_annotation",
  by.y = "LibraryID"
)
# Transpose so that each annotation becomes a column.
merge_mmat_sourceinfo_dmat_occurance <- t(merge_mmat_sourceinfo_dmat_occurance)
# Move the row labels (former column names) into a leading data column and
# strip all dimnames from the matrix.
names <- rownames(merge_mmat_sourceinfo_dmat_occurance)
rownames(merge_mmat_sourceinfo_dmat_occurance) <- NULL
merge_mmat_sourceinfo_dmat_occurance <- cbind(
  names,
  merge_mmat_sourceinfo_dmat_occurance
)
colnames(merge_mmat_sourceinfo_dmat_occurance) <- NULL
# Preview the transposed, merged matrix.
head(merge_mmat_sourceinfo_dmat_occurance)
# Optional export of the merged matrix (disabled):
#write.table(merge_mmat_sourceinfo_dmat_occurance,"20180121_merge_mmat_sourceinfo_dmat_occurance_AmericanGutPhase1.txt", row.names=FALSE, sep="\t")

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
GNPS_annotation,.alpha.-Cyclodextrin,.alpha.-Hexylcinnamaldehyde,.alpha.-Ionone,(-)-Catechin,(-)-Secoisolariciresinol,"(.+/-.)-8-Hydroxy-5Z,9E,11Z,14Z,17Z-eicosapentaenoic acid",(+)-Catechin,2-(Cyclohexylamino)ethanesulfonic acid,3-(2-Hydroxyphenyl)propionic acid,⋯,Spectral Match to Sulfamethizole from NIST14,Spectral Match to Timolol from NIST14,Spectral Match to Triphenyl phosphate from NIST14,Spectral Match to Tris(2-butoxyethyl) phosphate from NIST14,Spectral Match to Tyr-Pro from NIST14,Spectral Match to Undecaethylene glycol from NIST14,Sulfachloropyridazine,Sulfamethoxazole,Syringaldehyde,Syringic acid
Curated_GNPS_Annotation,alpha-Cyclodextrin,alpha-Hexylcinnamaldehyde,Ionone,Catechin,Secoisolariciresinol,"8-Hydroxy-5,9,11,14,17-eicosapentaenoic acid",Catechin,2-(Cyclohexylamino)ethanesulfonic acid,2-Hydroxybenzenepropanoic acid,⋯,Sulfamethizole,Timolol,Triphenyl phosphate,Tris(2-butoxyethyl) phosphate,Tyr Pro,Undecaethylene glycol,Sulfachloropyridazine,Sulfamethoxazole,Syringaldehyde,Syringic acid
Inchi_whole.string,"InChI=1S/C36H60O30/c37-1-7-25-13(43)19(49)31(55-7)62-26-8(2-38)57-33(21(51)15(26)45)64-28-10(4-40)59-35(23(53)17(28)47)66-30-12(6-42)60-36(24(54)18(30)48)65-29-11(5-41)58-34(22(52)16(29)46)63-27-9(3-39)56-32(61-25)20(50)14(27)44/h7-54H,1-6H2/t7-,8-,9-,10-,11-,12-,13-,14-,15-,16-,17-,18-,19-,20-,21-,22-,23-,24-,25-,26-,27-,28-,29-,30-,31-,32-,33-,34-,35-,36-/m1/s1","InChI=1S/C15H20O/c1-2-3-4-6-11-15(13-16)12-14-9-7-5-8-10-14/h5,7-10,12-13H,2-4,6,11H2,1H3/b15-12+","InChI=1S/C13H20O/c1-10-6-5-9-13(3,4)12(10)8-7-11(2)14/h6-8,12H,5,9H2,1-4H3/b8-7+","InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15+/m0/s1","InChI=1S/C20H26O6/c1-25-19-9-13(3-5-17(19)23)7-15(11-21)16(12-22)8-14-4-6-18(24)20(10-14)26-2/h3-6,9-10,15-16,21-24H,7-8,11-12H2,1-2H3/t15-,16-/m0/s1","InChI=1S/C20H30O3/c1-2-3-4-5-6-7-8-9-10-13-16-19(21)17-14-11-12-15-18-20(22)23/h3-4,6-7,9-11,13-14,16,19,21H,2,5,8,12,15,17-18H2,1H3,(H,22,23)/b4-3-,7-6-,10-9-,14-11-,16-13+","InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15+/m0/s1","InChI=1S/C8H17NO3S/c10-13(11,12)7-6-9-8-4-2-1-3-5-8/h8-9H,1-7H2,(H,10,11,12)","InChI=1S/C9H10O3/c10-8-4-2-1-3-7(8)5-6-9(11)12/h1-4,10H,5-6H2,(H,11,12)",⋯,"InChI=1S/C9H10N4O2S2/c1-6-11-12-9(16-6)13-17(14,15)8-4-2-7(10)3-5-8/h2-5H,10H2,1H3,(H,12,13)",,"InChI=1S/C18H15O4P/c19-23(20-16-10-4-1-5-11-16,21-17-12-6-2-7-13-17)22-18-14-8-3-9-15-18/h1-15H","InChI=1S/C18H39O7P/c1-4-7-10-20-13-16-23-26(19,24-17-14-21-11-8-5-2)25-18-15-22-12-9-6-3/h4-18H2,1-3H3",,,"InChI=1S/C10H9ClN4O2S/c11-9-5-6-10(14-13-9)15-18(16,17)8-3-1-7(12)2-4-8/h1-6H,12H2,(H,14,15)","InChI=1S/C10H11N3O3S/c1-7-6-10(12-16-7)13-17(14,15)9-4-2-8(11)3-5-9/h2-6H,11H2,1H3,(H,12,13)",,"InChI=1S/C9H10O5/c1-13-6-3-5(9(11)12)4-7(14-2)8(6)10/h3-4,10H,1-2H3,(H,11,12)"
SMILES_canonical,C(C1C2C(C(C(O1)OC3C(OC(C(C3O)O)OC4C(OC(C(C4O)O)OC5C(OC(C(C5O)O)OC6C(OC(C(C6O)O)OC7C(OC(O2)C(C7O)O)CO)CO)CO)CO)CO)O)O)O,CCCCCCC(=CC1=CC=CC=C1)C=O,CC1=CCCC(C1C=CC(=O)C)(C)C,C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C3)O)O)O,COC1=C(C=CC(=C1)CC(CO)C(CC2=CC(=C(C=C2)O)OC)CO)O,CCC=CCC=CCC=CC=CC(CC=CCCCC(=O)O)O,C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C3)O)O)O,C1CCC(CC1)NCCS(=O)(=O)O,C1=CC=C(C(=C1)CCC(=O)O)O,⋯,CC1=NN=C(S1)NS(=O)(=O)C2=CC=C(C=C2)N,,C1=CC=C(C=C1)OP(=O)(OC2=CC=CC=C2)OC3=CC=CC=C3,CCCCOCCOP(=O)(OCCOCCCC)OCCOCCCC,,,C1=CC(=CC=C1N)S(=O)(=O)NC2=NN=C(C=C2)Cl,CC1=CC(=NO1)NS(=O)(=O)C2=CC=C(C=C2)N,,COC1=CC(=CC(=C1O)OC)C(=O)O
Structural_Class,carbohydrate,,lipid,phytochemical,phytochemical,lipid,phytochemical,other,,⋯,drug,drug,,,amino acid / peptide,,drug,drug,,phytochemical
Structural_Subclass,polysaccharide,,isoprenoid,flavonoid,phenol,eicosanoid,flavonoid,,,⋯,drug,drug,,,dipeptide,,drug,drug,,phenol


# Handle Duplicate GNPS Annotations
    Collapse duplicate GNPS annotations for the same molecule into a single annotation, preserving (without duplication) the samples in which the molecule was detected.

In [8]:
# Work on a copy of the merged matrix and label its columns with the values of
# its second row, so that columns sharing a label can be detected as
# duplicates.
remove_dups <- merge_mmat_sourceinfo_dmat_occurance
colnames(remove_dups) <- remove_dups[2, ]
# Preview the relabelled matrix.
head(remove_dups)
# Column labels that occur more than once, each listed a single time.
dup <- unique(colnames(remove_dups)[duplicated(colnames(remove_dups))])
dup

Curated_GNPS_Annotation,alpha-Cyclodextrin,alpha-Hexylcinnamaldehyde,Ionone,Catechin,Secoisolariciresinol,"8-Hydroxy-5,9,11,14,17-eicosapentaenoic acid",Catechin.1,2-(Cyclohexylamino)ethanesulfonic acid,2-Hydroxybenzenepropanoic acid,⋯,Sulfamethizole,Timolol,Triphenyl phosphate,Tris(2-butoxyethyl) phosphate,Tyr Pro,Undecaethylene glycol,Sulfachloropyridazine,Sulfamethoxazole,Syringaldehyde,Syringic acid
GNPS_annotation,.alpha.-Cyclodextrin,.alpha.-Hexylcinnamaldehyde,.alpha.-Ionone,(-)-Catechin,(-)-Secoisolariciresinol,"(.+/-.)-8-Hydroxy-5Z,9E,11Z,14Z,17Z-eicosapentaenoic acid",(+)-Catechin,2-(Cyclohexylamino)ethanesulfonic acid,3-(2-Hydroxyphenyl)propionic acid,⋯,Spectral Match to Sulfamethizole from NIST14,Spectral Match to Timolol from NIST14,Spectral Match to Triphenyl phosphate from NIST14,Spectral Match to Tris(2-butoxyethyl) phosphate from NIST14,Spectral Match to Tyr-Pro from NIST14,Spectral Match to Undecaethylene glycol from NIST14,Sulfachloropyridazine,Sulfamethoxazole,Syringaldehyde,Syringic acid
Curated_GNPS_Annotation,alpha-Cyclodextrin,alpha-Hexylcinnamaldehyde,Ionone,Catechin,Secoisolariciresinol,"8-Hydroxy-5,9,11,14,17-eicosapentaenoic acid",Catechin,2-(Cyclohexylamino)ethanesulfonic acid,2-Hydroxybenzenepropanoic acid,⋯,Sulfamethizole,Timolol,Triphenyl phosphate,Tris(2-butoxyethyl) phosphate,Tyr Pro,Undecaethylene glycol,Sulfachloropyridazine,Sulfamethoxazole,Syringaldehyde,Syringic acid
Inchi_whole.string,"InChI=1S/C36H60O30/c37-1-7-25-13(43)19(49)31(55-7)62-26-8(2-38)57-33(21(51)15(26)45)64-28-10(4-40)59-35(23(53)17(28)47)66-30-12(6-42)60-36(24(54)18(30)48)65-29-11(5-41)58-34(22(52)16(29)46)63-27-9(3-39)56-32(61-25)20(50)14(27)44/h7-54H,1-6H2/t7-,8-,9-,10-,11-,12-,13-,14-,15-,16-,17-,18-,19-,20-,21-,22-,23-,24-,25-,26-,27-,28-,29-,30-,31-,32-,33-,34-,35-,36-/m1/s1","InChI=1S/C15H20O/c1-2-3-4-6-11-15(13-16)12-14-9-7-5-8-10-14/h5,7-10,12-13H,2-4,6,11H2,1H3/b15-12+","InChI=1S/C13H20O/c1-10-6-5-9-13(3,4)12(10)8-7-11(2)14/h6-8,12H,5,9H2,1-4H3/b8-7+","InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15+/m0/s1","InChI=1S/C20H26O6/c1-25-19-9-13(3-5-17(19)23)7-15(11-21)16(12-22)8-14-4-6-18(24)20(10-14)26-2/h3-6,9-10,15-16,21-24H,7-8,11-12H2,1-2H3/t15-,16-/m0/s1","InChI=1S/C20H30O3/c1-2-3-4-5-6-7-8-9-10-13-16-19(21)17-14-11-12-15-18-20(22)23/h3-4,6-7,9-11,13-14,16,19,21H,2,5,8,12,15,17-18H2,1H3,(H,22,23)/b4-3-,7-6-,10-9-,14-11-,16-13+","InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15+/m0/s1","InChI=1S/C8H17NO3S/c10-13(11,12)7-6-9-8-4-2-1-3-5-8/h8-9H,1-7H2,(H,10,11,12)","InChI=1S/C9H10O3/c10-8-4-2-1-3-7(8)5-6-9(11)12/h1-4,10H,5-6H2,(H,11,12)",⋯,"InChI=1S/C9H10N4O2S2/c1-6-11-12-9(16-6)13-17(14,15)8-4-2-7(10)3-5-8/h2-5H,10H2,1H3,(H,12,13)",,"InChI=1S/C18H15O4P/c19-23(20-16-10-4-1-5-11-16,21-17-12-6-2-7-13-17)22-18-14-8-3-9-15-18/h1-15H","InChI=1S/C18H39O7P/c1-4-7-10-20-13-16-23-26(19,24-17-14-21-11-8-5-2)25-18-15-22-12-9-6-3/h4-18H2,1-3H3",,,"InChI=1S/C10H9ClN4O2S/c11-9-5-6-10(14-13-9)15-18(16,17)8-3-1-7(12)2-4-8/h1-6H,12H2,(H,14,15)","InChI=1S/C10H11N3O3S/c1-7-6-10(12-16-7)13-17(14,15)9-4-2-8(11)3-5-9/h2-6H,11H2,1H3,(H,12,13)",,"InChI=1S/C9H10O5/c1-13-6-3-5(9(11)12)4-7(14-2)8(6)10/h3-4,10H,1-2H3,(H,11,12)"
SMILES_canonical,C(C1C2C(C(C(O1)OC3C(OC(C(C3O)O)OC4C(OC(C(C4O)O)OC5C(OC(C(C5O)O)OC6C(OC(C(C6O)O)OC7C(OC(O2)C(C7O)O)CO)CO)CO)CO)CO)O)O)O,CCCCCCC(=CC1=CC=CC=C1)C=O,CC1=CCCC(C1C=CC(=O)C)(C)C,C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C3)O)O)O,COC1=C(C=CC(=C1)CC(CO)C(CC2=CC(=C(C=C2)O)OC)CO)O,CCC=CCC=CCC=CC=CC(CC=CCCCC(=O)O)O,C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C3)O)O)O,C1CCC(CC1)NCCS(=O)(=O)O,C1=CC=C(C(=C1)CCC(=O)O)O,⋯,CC1=NN=C(S1)NS(=O)(=O)C2=CC=C(C=C2)N,,C1=CC=C(C=C1)OP(=O)(OC2=CC=CC=C2)OC3=CC=CC=C3,CCCCOCCOP(=O)(OCCOCCCC)OCCOCCCC,,,C1=CC(=CC=C1N)S(=O)(=O)NC2=NN=C(C=C2)Cl,CC1=CC(=NO1)NS(=O)(=O)C2=CC=C(C=C2)N,,COC1=CC(=CC(=C1O)OC)C(=O)O
Structural_Class,carbohydrate,,lipid,phytochemical,phytochemical,lipid,phytochemical,other,,⋯,drug,drug,,,amino acid / peptide,,drug,drug,,phytochemical
Structural_Subclass,polysaccharide,,isoprenoid,flavonoid,phenol,eicosanoid,flavonoid,,,⋯,drug,drug,,,dipeptide,,drug,drug,,phenol


In [9]:
# Collapse columns of `remove_dups` that share the same column name into a
# single column. For every row, the surviving (first) column is set to "1" if
# any of the duplicate columns holds "1"; the remaining duplicates are removed.
#
# `dup` lists every repeated occurrence of a name, so a name appearing three or
# more times is listed more than once; the guard below skips names that have
# already been collapsed.
dup <- colnames(remove_dups)[duplicated(colnames(remove_dups))]
# seq_along() yields an empty sequence when there are no duplicates. The
# previous 1:length(dup) iterated over c(1, 0) in that case and ended up
# dropping every column of the matrix.
for (i in seq_along(dup)) {
  idx <- which(colnames(remove_dups) == dup[i])
  # Fewer than two matching columns: nothing (left) to collapse.
  if (length(idx) < 2) next
  # TRUE for each row in which at least one duplicate column equals "1";
  # missing values are treated as "not detected".
  lvec <- rowSums(remove_dups[, idx, drop = FALSE] == "1", na.rm = TRUE) > 0
  remove_dups[lvec, idx[1]] <- "1"
  # Drop all but the first of the duplicate columns; drop = FALSE keeps the
  # two-dimensional structure even if a single column remains.
  remove_dups <- remove_dups[, -idx[-1], drop = FALSE]
}

In [10]:
# Show the matrix dimensions before (merged matrix) and after (remove_dups)
# collapsing duplicate column names, for a visual sanity check.
dim(merge_mmat_sourceinfo_dmat_occurance)
dim(remove_dups)
# Preview the first rows of the de-duplicated matrix.
head(remove_dups)

Curated_GNPS_Annotation,alpha-Cyclodextrin,alpha-Hexylcinnamaldehyde,Ionone,Catechin,Secoisolariciresinol,"8-Hydroxy-5,9,11,14,17-eicosapentaenoic acid",2-(Cyclohexylamino)ethanesulfonic acid,2-Hydroxybenzenepropanoic acid,3-Methoxycinnamic acid,⋯,Sulfamethizole,Timolol,Triphenyl phosphate,Tris(2-butoxyethyl) phosphate,Tyr Pro,Undecaethylene glycol,Sulfachloropyridazine,Sulfamethoxazole,Syringaldehyde,Syringic acid
GNPS_annotation,.alpha.-Cyclodextrin,.alpha.-Hexylcinnamaldehyde,.alpha.-Ionone,(-)-Catechin,(-)-Secoisolariciresinol,"(.+/-.)-8-Hydroxy-5Z,9E,11Z,14Z,17Z-eicosapentaenoic acid",2-(Cyclohexylamino)ethanesulfonic acid,3-(2-Hydroxyphenyl)propionic acid,3-Methoxycinnamic acid,⋯,Spectral Match to Sulfamethizole from NIST14,Spectral Match to Timolol from NIST14,Spectral Match to Triphenyl phosphate from NIST14,Spectral Match to Tris(2-butoxyethyl) phosphate from NIST14,Spectral Match to Tyr-Pro from NIST14,Spectral Match to Undecaethylene glycol from NIST14,Sulfachloropyridazine,Sulfamethoxazole,Syringaldehyde,Syringic acid
Curated_GNPS_Annotation,alpha-Cyclodextrin,alpha-Hexylcinnamaldehyde,Ionone,Catechin,Secoisolariciresinol,"8-Hydroxy-5,9,11,14,17-eicosapentaenoic acid",2-(Cyclohexylamino)ethanesulfonic acid,2-Hydroxybenzenepropanoic acid,3-Methoxycinnamic acid,⋯,Sulfamethizole,Timolol,Triphenyl phosphate,Tris(2-butoxyethyl) phosphate,Tyr Pro,Undecaethylene glycol,Sulfachloropyridazine,Sulfamethoxazole,Syringaldehyde,Syringic acid
Inchi_whole.string,"InChI=1S/C36H60O30/c37-1-7-25-13(43)19(49)31(55-7)62-26-8(2-38)57-33(21(51)15(26)45)64-28-10(4-40)59-35(23(53)17(28)47)66-30-12(6-42)60-36(24(54)18(30)48)65-29-11(5-41)58-34(22(52)16(29)46)63-27-9(3-39)56-32(61-25)20(50)14(27)44/h7-54H,1-6H2/t7-,8-,9-,10-,11-,12-,13-,14-,15-,16-,17-,18-,19-,20-,21-,22-,23-,24-,25-,26-,27-,28-,29-,30-,31-,32-,33-,34-,35-,36-/m1/s1","InChI=1S/C15H20O/c1-2-3-4-6-11-15(13-16)12-14-9-7-5-8-10-14/h5,7-10,12-13H,2-4,6,11H2,1H3/b15-12+","InChI=1S/C13H20O/c1-10-6-5-9-13(3,4)12(10)8-7-11(2)14/h6-8,12H,5,9H2,1-4H3/b8-7+","InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15+/m0/s1","InChI=1S/C20H26O6/c1-25-19-9-13(3-5-17(19)23)7-15(11-21)16(12-22)8-14-4-6-18(24)20(10-14)26-2/h3-6,9-10,15-16,21-24H,7-8,11-12H2,1-2H3/t15-,16-/m0/s1","InChI=1S/C20H30O3/c1-2-3-4-5-6-7-8-9-10-13-16-19(21)17-14-11-12-15-18-20(22)23/h3-4,6-7,9-11,13-14,16,19,21H,2,5,8,12,15,17-18H2,1H3,(H,22,23)/b4-3-,7-6-,10-9-,14-11-,16-13+","InChI=1S/C8H17NO3S/c10-13(11,12)7-6-9-8-4-2-1-3-5-8/h8-9H,1-7H2,(H,10,11,12)","InChI=1S/C9H10O3/c10-8-4-2-1-3-7(8)5-6-9(11)12/h1-4,10H,5-6H2,(H,11,12)","InChI=1S/C10H10O3/c1-13-9-4-2-3-8(7-9)5-6-10(11)12/h2-7H,1H3,(H,11,12)/b6-5+",⋯,"InChI=1S/C9H10N4O2S2/c1-6-11-12-9(16-6)13-17(14,15)8-4-2-7(10)3-5-8/h2-5H,10H2,1H3,(H,12,13)",,"InChI=1S/C18H15O4P/c19-23(20-16-10-4-1-5-11-16,21-17-12-6-2-7-13-17)22-18-14-8-3-9-15-18/h1-15H","InChI=1S/C18H39O7P/c1-4-7-10-20-13-16-23-26(19,24-17-14-21-11-8-5-2)25-18-15-22-12-9-6-3/h4-18H2,1-3H3",,,"InChI=1S/C10H9ClN4O2S/c11-9-5-6-10(14-13-9)15-18(16,17)8-3-1-7(12)2-4-8/h1-6H,12H2,(H,14,15)","InChI=1S/C10H11N3O3S/c1-7-6-10(12-16-7)13-17(14,15)9-4-2-8(11)3-5-9/h2-6H,11H2,1H3,(H,12,13)",,"InChI=1S/C9H10O5/c1-13-6-3-5(9(11)12)4-7(14-2)8(6)10/h3-4,10H,1-2H3,(H,11,12)"
SMILES_canonical,C(C1C2C(C(C(O1)OC3C(OC(C(C3O)O)OC4C(OC(C(C4O)O)OC5C(OC(C(C5O)O)OC6C(OC(C(C6O)O)OC7C(OC(O2)C(C7O)O)CO)CO)CO)CO)CO)O)O)O,CCCCCCC(=CC1=CC=CC=C1)C=O,CC1=CCCC(C1C=CC(=O)C)(C)C,C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C3)O)O)O,COC1=C(C=CC(=C1)CC(CO)C(CC2=CC(=C(C=C2)O)OC)CO)O,CCC=CCC=CCC=CC=CC(CC=CCCCC(=O)O)O,C1CCC(CC1)NCCS(=O)(=O)O,C1=CC=C(C(=C1)CCC(=O)O)O,COC1=CC=CC(=C1)C=CC(=O)O,⋯,CC1=NN=C(S1)NS(=O)(=O)C2=CC=C(C=C2)N,,C1=CC=C(C=C1)OP(=O)(OC2=CC=CC=C2)OC3=CC=CC=C3,CCCCOCCOP(=O)(OCCOCCCC)OCCOCCCC,,,C1=CC(=CC=C1N)S(=O)(=O)NC2=NN=C(C=C2)Cl,CC1=CC(=NO1)NS(=O)(=O)C2=CC=C(C=C2)N,,COC1=CC(=CC(=C1O)OC)C(=O)O
Structural_Class,carbohydrate,,lipid,phytochemical,phytochemical,lipid,other,,phytochemical,⋯,drug,drug,,,amino acid / peptide,,drug,drug,,phytochemical
Structural_Subclass,polysaccharide,,isoprenoid,flavonoid,phenol,eicosanoid,,,phenylpropanoid,⋯,drug,drug,,,dipeptide,,drug,drug,,phenol


# Combine Occurrence Table after Duplicate Handling with Metadata

In [8]:
# Attach the sample metadata to the de-duplicated occurrence matrix. Rows are
# matched on the metadata `filename` and the matrix's first column (named
# "Curated_GNPS_Annotation"); only keys present on both sides are kept by
# merge()'s default behaviour.
final_matrix <- merge(
  x = metadata,
  y = remove_dups,
  by.x = "filename",
  by.y = "Curated_GNPS_Annotation"
)
# Dimensions of both inputs and of the merged result, for a sanity check.
dim(remove_dups)
dim(metadata)
dim(final_matrix)
# Preview the merged table.
head(final_matrix)
# Persist the merged occurrence + metadata table.
write.csv(
  final_matrix,
  file = "Tables_MS2/20180829_Immuno15Skin_FinalOccuranceTablewithMetadata.csv",
  row.names = FALSE
)

filename,unique_sample_ID,unique_sample_ID.1,ATTRIBUTE_sample_ID,ATTRIBUTE_Meds_number,ATTRIBUTE_prescribed_acetaminophen,ATTRIBUTE_prescribed_albuterol,ATTRIBUTE_prescribed_allopurinol,ATTRIBUTE_prescribed_amlodipine,ATTRIBUTE_prescribed_aspirin,⋯,Sulfamethizole,Timolol,Triphenyl phosphate,Tris(2-butoxyethyl) phosphate,Tyr Pro,Undecaethylene glycol,Sulfachloropyridazine,Sulfamethoxazole,Syringaldehyde,Syringic acid
AA3594_1_1_RA1_01_37666.mzXML,AA3594_1_1_RA1,AA3594_1_1,AA3594,22,0,0,1,0,0,⋯,0,0,0,0,1,0,1,0,0,1
AA3594_1_10_RA10_01_37671.mzXML,AA3594_1_10_RA10,AA3594_1_10,AA3594,22,0,0,1,0,0,⋯,0,0,0,0,0,0,1,1,1,0
AA3594_1_2_RA2_01_37667.mzXML,AA3594_1_2_RA2,AA3594_1_2,AA3594,22,0,0,1,0,0,⋯,0,0,0,0,1,0,1,0,1,1
AA3594_1_3_RA3_01_37668.mzXML,AA3594_1_3_RA3,AA3594_1_3,AA3594,22,0,0,1,0,0,⋯,0,0,0,0,1,0,1,0,1,1
AA3594_1_4_RA4_01_37669.mzXML,AA3594_1_4_RA4,AA3594_1_4,AA3594,22,0,0,1,0,0,⋯,0,0,0,0,1,0,1,0,1,1
AA3594_1_5_diluted_RC5_01_37659.mzXML,AA3594_1_5_diluted,AA3594_1_5,AA3594,22,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0


----------------------------------------------------------------------------------------------------------------------------
# Frequency Tables and Plots
----------------------------------------------------------------------------------------------------------------------------

In [9]:
# File names that survived the metadata merge (first column of final_matrix).
AGP_vector <- final_matrix[[1]]
remove_dups <- as.data.frame(remove_dups)
# Stack the first 22 rows of remove_dups (presumably the curated-information
# rows produced by the 22-column merge -- confirm) on top of the rows whose
# first column is one of the retained file names.
AGP_test <- rbind(
  remove_dups[1:22, ],
  remove_dups[remove_dups$Curated_GNPS_Annotation %in% AGP_vector, ]
)
# Transpose back so that each row is one annotation, then promote the first
# row to the column names and remove it from the data.
AGP <- t(AGP_test)
rownames(AGP) <- NULL
colnames(AGP) <- AGP[1, ]
AGP <- AGP[-1, ]
AGP_frequency <- as.data.frame(AGP)
# Persist the per-annotation information table.
write.csv(
  AGP_frequency,
  file = "Tables_MS2/20180829_Immuno15Skin_SourceTracking_ALL_InfoTable.csv",
  row.names = FALSE
)

In [10]:
# List the distinct values found in several columns of AGP_frequency.
# Each result is stored and then echoed so it is displayed in the notebook.

# Distinct values of the Source column.
unique_Source <- unique(AGP_frequency$Source)
unique_Source
# Distinct values of the Source_Sub1 column.
unique_SourceSub1 <- unique(AGP_frequency$Source_Sub1)
unique_SourceSub1
# Distinct values of the Source_Sub2 column.
unique_SourceSub2 <- unique(AGP_frequency$Source_Sub2)
unique_SourceSub2
# Distinct values of the Bioactivity_Known column.
unique_Bioactivity_Known <- unique(AGP_frequency$Bioactivity_Known)
unique_Bioactivity_Known
# Distinct values of the Drug_Info_1 column.
unique_DrugInfo1 <- unique(AGP_frequency$Drug_Info_1)
unique_DrugInfo1
# Distinct values of the Drug_Info_2 column.
unique_DrugInfo2 <- unique(AGP_frequency$Drug_Info_2)
unique_DrugInfo2

# GFOP Phase 1 - Frequency Table

In [11]:
# Re-load the information table written earlier in this notebook.
AGPonly_Frequency_from_OccurenceTable <- read.csv(
  "Tables_MS2/20180829_Immuno15Skin_SourceTracking_ALL_InfoTable.csv",
  header = TRUE
)
# Prepend "Row_Sum": the per-row total over every column after the first 22
# (i.e. the number of observations of each annotation).
AGPonly_Frequency_from_OccurenceTable <- cbind(
  Row_Sum = rowSums(AGPonly_Frequency_from_OccurenceTable[, -(1:22)]),
  AGPonly_Frequency_from_OccurenceTable
)
# Annotations with at least one observation, renumbered from 1.
abc <- AGPonly_Frequency_from_OccurenceTable[
  which(AGPonly_Frequency_from_OccurenceTable$Row_Sum != 0), ,
  drop = FALSE
]
rownames(abc) <- NULL
# Number of annotations with zero observations.
sum(AGPonly_Frequency_from_OccurenceTable$Row_Sum == 0)
# Dimensions before and after removing the zero-observation annotations.
dim(AGPonly_Frequency_from_OccurenceTable)
dim(abc)
# Preview the filtered table.
head(abc)
# Continue with the filtered table only.
AGPonly_Frequency_from_OccurenceTable <- abc

Row_Sum,GNPS_annotation,Curated_GNPS_Annotation,Inchi_whole.string,SMILES_canonical,Structural_Class,Structural_Subclass,Structural_Subclass2,Source_GFOP,Source,⋯,WW5157_2_10_RE10_01_37486.mzXML,WW5157_2_1_RE1_01_37476.mzXML,WW5157_2_2_RE2_01_37477.mzXML,WW5157_2_3_RE3_01_37478.mzXML,WW5157_2_4_RE4_01_37479.mzXML,WW5157_2_5_RE5_01_37480.mzXML,WW5157_2_6_RE6_01_37481.mzXML,WW5157_2_7_RE7_01_37483.mzXML,WW5157_2_8_RE8_01_37484.mzXML,WW5157_2_9_RE9_01_37485.mzXML
3,.alpha.-Cyclodextrin,alpha-Cyclodextrin,"InChI=1S/C36H60O30/c37-1-7-25-13(43)19(49)31(55-7)62-26-8(2-38)57-33(21(51)15(26)45)64-28-10(4-40)59-35(23(53)17(28)47)66-30-12(6-42)60-36(24(54)18(30)48)65-29-11(5-41)58-34(22(52)16(29)46)63-27-9(3-39)56-32(61-25)20(50)14(27)44/h7-54H,1-6H2/t7-,8-,9-,10-,11-,12-,13-,14-,15-,16-,17-,18-,19-,20-,21-,22-,23-,24-,25-,26-,27-,28-,29-,30-,31-,32-,33-,34-,35-,36-/m1/s1",C(C1C2C(C(C(O1)OC3C(OC(C(C3O)O)OC4C(OC(C(C4O)O)OC5C(OC(C(C5O)O)OC6C(OC(C(C6O)O)OC7C(OC(O2)C(C7O)O)CO)CO)CO)CO)CO)O)O)O,carbohydrate,polysaccharide,maltodextrin,multiple sources,multiple sources,⋯,0,0,0,0,0,0,0,0,0,0
15,.alpha.-Hexylcinnamaldehyde,alpha-Hexylcinnamaldehyde,"InChI=1S/C15H20O/c1-2-3-4-6-11-15(13-16)12-14-9-7-5-8-10-14/h5,7-10,12-13H,2-4,6,11H2,1H3/b15-12+",CCCCCCC(=CC1=CC=CC=C1)C=O,,,cinnamaldehyde,food,food,⋯,0,0,0,0,0,0,0,0,0,0
5,.alpha.-Ionone,Ionone,"InChI=1S/C13H20O/c1-10-6-5-9-13(3,4)12(10)8-7-11(2)14/h6-8,12H,5,9H2,1-4H3/b8-7+",CC1=CCCC(C1C=CC(=O)C)(C)C,lipid,isoprenoid,,unknown,unknown,⋯,0,0,0,0,0,0,0,1,0,0
91,(-)-Catechin,Catechin,"InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15+/m0/s1",C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C3)O)O)O,phytochemical,flavonoid,,food,food,⋯,0,1,0,1,0,1,0,1,1,0
3,(-)-Secoisolariciresinol,Secoisolariciresinol,"InChI=1S/C20H26O6/c1-25-19-9-13(3-5-17(19)23)7-15(11-21)16(12-22)8-14-4-6-18(24)20(10-14)26-2/h3-6,9-10,15-16,21-24H,7-8,11-12H2,1-2H3/t15-,16-/m0/s1",COC1=C(C=CC(=C1)CC(CO)C(CC2=CC(=C(C=C2)O)OC)CO)O,phytochemical,phenol,,food,food,⋯,0,0,1,0,0,0,0,0,0,0
10,"(.+/-.)-8-Hydroxy-5Z,9E,11Z,14Z,17Z-eicosapentaenoic acid","8-Hydroxy-5,9,11,14,17-eicosapentaenoic acid","InChI=1S/C20H30O3/c1-2-3-4-5-6-7-8-9-10-13-16-19(21)17-14-11-12-15-18-20(22)23/h3-4,6-7,9-11,13-14,16,19,21H,2,5,8,12,15,17-18H2,1H3,(H,22,23)/b4-3-,7-6-,10-9-,14-11-,16-13+",CCC=CCC=CCC=CC=CC(CC=CCCCC(=O)O)O,lipid,eicosanoid,,multiple sources,multiple sources,⋯,0,0,0,0,0,0,0,0,0,0
