Prior to loading the dataset into R, it was pre-processed in Excel. The raw elemental intensities were each divided by the Rh column, which scaled each element, and a new column for each element was generated. Then, all amounts less than or equal to 0 were replaced with 0.0001, to avoid any issues with NaNs in the dataset.

# Load libraries

In [1]:
# try installing vctrs fresh, since it is giving me errors
#install.packages("vctrs", dependencies = TRUE, repos = 'http://cran.us.r-project.org')
library(vctrs)

# caret package is used to create partition of the dataset, generating a stratified train & test set
#install.packages("caret", dependencies = TRUE, repos = 'http://cran.us.r-project.org')
library(caret)

# Dicer library is used to run ensemble clustering, which generates interim labels for unknown samples
#install.packages("diceR", dependencies = TRUE, repos = 'http://cran.us.r-project.org')
library(diceR)

# try installing Biobase, since I got error messages
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("Biobase")
library(Biobase)

# randomForest is the classification system which identifies unknown samples
#install.packages("randomForest", dependencies = TRUE, repos = 'http://cran.us.r-project.org')
library(randomForest)


The downloaded binary packages are in
	/var/folders/pr/t9901z0n4z7dbkstysncwp0m0000gn/T//RtmpiNPk8M/downloaded_packages


“dependency ‘superpc’ is not available”



The downloaded binary packages are in
	/var/folders/pr/t9901z0n4z7dbkstysncwp0m0000gn/T//RtmpiNPk8M/downloaded_packages


Loading required package: lattice

Loading required package: ggplot2




The downloaded binary packages are in
	/var/folders/pr/t9901z0n4z7dbkstysncwp0m0000gn/T//RtmpiNPk8M/downloaded_packages


Bioconductor version 3.11 (BiocManager 1.30.10), R 4.0.2 (2020-06-22)

Installing package(s) 'Biobase'




The downloaded binary packages are in
	/var/folders/pr/t9901z0n4z7dbkstysncwp0m0000gn/T//RtmpiNPk8M/downloaded_packages


Old packages: 'backports', 'callr', 'covr', 'devtools', 'DT', 'fs', 'glue',
  'htmlwidgets', 'IRkernel', 'jsonlite', 'knitr', 'MASS', 'mgcv', 'nlme',
  'openssl', 'pkgbuild', 'processx', 'ps', 'RcppArmadillo', 'remotes',
  'stringi', 'survival', 'sys', 'usethis', 'withr', 'xfun'

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, 


The downloaded binary packages are in
	/var/folders/pr/t9901z0n4z7dbkstysncwp0m0000gn/T//RtmpiNPk8M/downloaded_packages


randomForest 4.6-14

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘randomForest’


The following object is masked from ‘package:Biobase’:

    combine


The following object is masked from ‘package:BiocGenerics’:

    combine


The following object is masked from ‘package:ggplot2’:

    margin




In [2]:
# set the seed
set.seed(24924)

# Load train dataset

In [3]:
train <- read.csv("AllSamples.csv", header = TRUE)
# remove silicified wood & limestone types from the dataset
train <- train[(train$Vis != "Silicified Wood" & train$Vis != "Limestone"),]

head(train)

Unnamed: 0_level_0,X,is_known,Vis,Ag,Al,As,Au,Ca,Cu,Fe,⋯,Si.Rh,Sn.Rh,Sr.Rh,Th.Rh,Ti.Rh,U.Rh,V.Rh,Y.Rh,Zn.Rh,Zr.Rh
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,71.11.1-89.01,Guess,Gravel Cherts,1368,1,0.0001,54,1515,331,5335,⋯,0.0623453,0.030244431,0.13691213,0.0232828,0.015702351,0.027769183,0.005491955,0.00363552,0.090423886,7.74e-09
2,71.11.1-89.02,Guess,Gravel Cherts,1269,15,55.0,25,682,300,2670,⋯,0.06373444,0.019170124,0.18995851,0.01294606,0.013609959,0.016431535,0.006887967,8.3e-05,0.07186722,8.3e-09
3,71.11.1-89.03,Guess,Gravel Cherts,1165,32,4.0,46,607,523,6119,⋯,0.05545573,0.010944136,0.09882391,0.007350539,0.018703038,0.008330611,0.005308723,0.004573669,0.039692911,0.002531852
4,71.11.1-89.04,Guess,Gravel Cherts,1299,40,42.0,45,338,410,5307,⋯,0.06011742,0.010667328,0.04589432,0.01174233,0.021582734,0.024559663,0.001901927,0.011080791,0.032746217,0.004796163
5,71.11.1-89.05,Guess,Gravel Cherts,1724,35,0.0001,51,201,301,1745,⋯,0.07111721,0.002502422,0.08411366,0.006780756,0.010251857,0.010494026,0.005247013,0.012431385,0.009283177,8.07e-09
6,71.11.1-89.06,Guess,Gravel Cherts,1246,139,159.0,18,188,292,4858,⋯,0.07108405,0.009646556,0.09575393,7.91e-09,0.008381434,0.006167471,0.001897683,7.91e-05,0.014627975,0.00585119


In [4]:
dim(train)

In [5]:
# restrict dataset to only important columns: sample id, is_known, and the elemental intensities
train_df = train[,c(1,2,3,32:58)]
# rename X to sample_id
names(train_df)[1] <- "sample_id"
# preview dataset
head(train_df)

Unnamed: 0_level_0,sample_id,is_known,Vis,Ag.Rh,Al.Rh,As.Rh,Au.Rh,Ca.Rh,Cu.Rh,Fe.Rh,⋯,Si.Rh,Sn.Rh,Sr.Rh,Th.Rh,Ti.Rh,U.Rh,V.Rh,Y.Rh,Zn.Rh,Zr.Rh
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,71.11.1-89.01,Guess,Gravel Cherts,0.10581683,7.74e-05,7.74e-09,0.00417698,0.1171875,0.02560334,0.4126702,⋯,0.0623453,0.030244431,0.13691213,0.0232828,0.015702351,0.027769183,0.005491955,0.00363552,0.090423886,7.74e-09
2,71.11.1-89.02,Guess,Gravel Cherts,0.1053112,0.001244813,0.004564315,0.002074689,0.05659751,0.02489627,0.2215768,⋯,0.06373444,0.019170124,0.18995851,0.01294606,0.013609959,0.016431535,0.006887967,8.3e-05,0.07186722,8.3e-09
3,71.11.1-89.03,Guess,Gravel Cherts,0.09514864,0.002613525,0.000326691,0.003756942,0.0495753,0.0427148,0.499755,⋯,0.05545573,0.010944136,0.09882391,0.007350539,0.018703038,0.008330611,0.005308723,0.004573669,0.039692911,0.002531852
4,71.11.1-89.04,Guess,Gravel Cherts,0.10741751,0.003307699,0.003473084,0.003721161,0.02795005,0.03390391,0.4388489,⋯,0.06011742,0.010667328,0.04589432,0.01174233,0.021582734,0.024559663,0.001901927,0.011080791,0.032746217,0.004796163
5,71.11.1-89.05,Guess,Gravel Cherts,0.13916694,0.002825315,8.07e-09,0.004116887,0.01622538,0.02429771,0.1408621,⋯,0.07111721,0.002502422,0.08411366,0.006780756,0.010251857,0.010494026,0.005247013,0.012431385,0.009283177,8.07e-09
6,71.11.1-89.06,Guess,Gravel Cherts,0.09852139,0.010990749,0.01257215,0.001423262,0.01486518,0.02308848,0.3841227,⋯,0.07108405,0.009646556,0.09575393,7.91e-09,0.008381434,0.006167471,0.001897683,7.91e-05,0.014627975,0.00585119


# Choose a clustering method

Please see the notebooks "cluster_alg_selection.ipynb" & "cluster_alg_selection_2.ipynb" for more information on this step. The result of those two notebooks is that we selected the gmm algorithm with 5 clusters to create the labels for the artifacts.

# Run ensemble clustering on train dataset

In [6]:
cluster <- consensus_cluster(train_df[,4:30], nk=2:5, p.item=1, reps=1, 
                             algorithms=c("gmm"), scale = FALSE)








In [7]:
# save the results of the algorithm as a dataframe
gmm <- cluster[,,"GMM",1:4]
head(gmm)

Unnamed: 0,2,3,4,5
1,1,3,4,5
2,1,1,1,5
3,1,1,1,1
4,1,1,1,1
5,1,1,1,1
6,1,1,1,1


In [8]:
# Each number in the table refers to the group that each sample has been assigned to.
# Edit each value in the tables by adding "Group_" to each of the numbers so that they are strings and can be counted, not summed, by crosstab
gmm[,1:4] <- paste("Group", gmm[,1:4], sep = "_")
# turn this matrix into a dataframe
gmm <- as.data.frame(gmm)
head(gmm)

Unnamed: 0_level_0,2,3,4,5
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
1,Group_1,Group_3,Group_4,Group_5
2,Group_1,Group_1,Group_1,Group_5
3,Group_1,Group_1,Group_1,Group_1
4,Group_1,Group_1,Group_1,Group_1
5,Group_1,Group_1,Group_1,Group_1
6,Group_1,Group_1,Group_1,Group_1


In [9]:
# Assign the appropriate labels to each of the Groups
# first add the vis & is_known columns back in
gmm$is_known <- train$is_known
gmm$vis <- train$Vis
# then create an ftable with the sums of each of the known labels
gmm_raw <- gmm[gmm$is_known == "Known",c(4,6)]
gmm_raw <- ftable(gmm_raw[])
gmm_raw <- as.data.frame(gmm_raw)
gmm_raw

X5,vis,Freq
<fct>,<fct>,<int>
Group_1,Alibates,15
Group_2,Alibates,0
Group_3,Alibates,2
Group_4,Alibates,3
Group_5,Alibates,3
Group_1,Edwards Plateau Chert,46
Group_2,Edwards Plateau Chert,1
Group_3,Edwards Plateau Chert,0
Group_4,Edwards Plateau Chert,11
Group_5,Edwards Plateau Chert,14


In [10]:
gmm_art <- gmm[gmm$is_known != "Known",c(4,6)]
gmm_art <- ftable(gmm_art[])
gmm_art <- as.data.frame(gmm_art)
gmm_art

X5,vis,Freq
<fct>,<fct>,<int>
Group_1,Agate,33
Group_2,Agate,0
Group_3,Agate,3
Group_4,Agate,8
Group_5,Agate,6
Group_1,Alibates,2
Group_2,Alibates,0
Group_3,Alibates,0
Group_4,Alibates,1
Group_5,Alibates,0


In [11]:
# add the results back into the train_df
train_df$gmm_label <- gmm[,4]
head(train_df)

Unnamed: 0_level_0,sample_id,is_known,Vis,Ag.Rh,Al.Rh,As.Rh,Au.Rh,Ca.Rh,Cu.Rh,Fe.Rh,⋯,Sn.Rh,Sr.Rh,Th.Rh,Ti.Rh,U.Rh,V.Rh,Y.Rh,Zn.Rh,Zr.Rh,gmm_label
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,71.11.1-89.01,Guess,Gravel Cherts,0.10581683,7.74e-05,7.74e-09,0.00417698,0.1171875,0.02560334,0.4126702,⋯,0.030244431,0.13691213,0.0232828,0.015702351,0.027769183,0.005491955,0.00363552,0.090423886,7.74e-09,Group_5
2,71.11.1-89.02,Guess,Gravel Cherts,0.1053112,0.001244813,0.004564315,0.002074689,0.05659751,0.02489627,0.2215768,⋯,0.019170124,0.18995851,0.01294606,0.013609959,0.016431535,0.006887967,8.3e-05,0.07186722,8.3e-09,Group_5
3,71.11.1-89.03,Guess,Gravel Cherts,0.09514864,0.002613525,0.000326691,0.003756942,0.0495753,0.0427148,0.499755,⋯,0.010944136,0.09882391,0.007350539,0.018703038,0.008330611,0.005308723,0.004573669,0.039692911,0.002531852,Group_1
4,71.11.1-89.04,Guess,Gravel Cherts,0.10741751,0.003307699,0.003473084,0.003721161,0.02795005,0.03390391,0.4388489,⋯,0.010667328,0.04589432,0.01174233,0.021582734,0.024559663,0.001901927,0.011080791,0.032746217,0.004796163,Group_1
5,71.11.1-89.05,Guess,Gravel Cherts,0.13916694,0.002825315,8.07e-09,0.004116887,0.01622538,0.02429771,0.1408621,⋯,0.002502422,0.08411366,0.006780756,0.010251857,0.010494026,0.005247013,0.012431385,0.009283177,8.07e-09,Group_1
6,71.11.1-89.06,Guess,Gravel Cherts,0.09852139,0.010990749,0.01257215,0.001423262,0.01486518,0.02308848,0.3841227,⋯,0.009646556,0.09575393,7.91e-09,0.008381434,0.006167471,0.001897683,7.91e-05,0.014627975,0.00585119,Group_1


In [12]:
# Change the labels from Group_n to the appropriate label
# known samples retain their labels, guesses get the new labels from the clustering method
# excepting the El Sauz CHert labels, because those are not actually guesses, we know them to be ESC because of their characteristics

train_df$gmm_label[train_df$is_known == "Known" & train_df$Vis == "El Sauz Chert"] <- "ESC"
train_df$gmm_label[train_df$is_known == "Known" & train_df$Vis == "Oman Chert"] <- "Oman"
train_df$gmm_label[train_df$is_known == "Known" & train_df$Vis == "Edwards Plateau Chert"] <- "EPC"
train_df$gmm_label[train_df$is_known == "Known" & train_df$Vis == "Knife River Flint"] <- "KRF"
train_df$gmm_label[train_df$is_known == "Known" & train_df$Vis == "Alibates"] <- "Alibates"
train_df$gmm_label[train_df$is_known == "Guess" & train_df$Vis == "El Sauz Chert"] <- "ESC"
train_df$gmm_label[train_df$is_known == "Guess" & train_df$gmm_label == "Group_1"] <- "Chert_1"
train_df$gmm_label[train_df$is_known == "Guess" & train_df$gmm_label == "Group_2"] <- "Igneous"
train_df$gmm_label[train_df$is_known == "Guess" & train_df$gmm_label == "Group_3"] <- "ESC"
train_df$gmm_label[train_df$is_known == "Guess" & train_df$gmm_label == "Group_4"] <- "Chert_3"
train_df$gmm_label[train_df$is_known == "Guess" & train_df$gmm_label == "Group_5"] <- "Chert_2"

head(train_df)

Unnamed: 0_level_0,sample_id,is_known,Vis,Ag.Rh,Al.Rh,As.Rh,Au.Rh,Ca.Rh,Cu.Rh,Fe.Rh,⋯,Sn.Rh,Sr.Rh,Th.Rh,Ti.Rh,U.Rh,V.Rh,Y.Rh,Zn.Rh,Zr.Rh,gmm_label
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,71.11.1-89.01,Guess,Gravel Cherts,0.10581683,7.74e-05,7.74e-09,0.00417698,0.1171875,0.02560334,0.4126702,⋯,0.030244431,0.13691213,0.0232828,0.015702351,0.027769183,0.005491955,0.00363552,0.090423886,7.74e-09,Chert_2
2,71.11.1-89.02,Guess,Gravel Cherts,0.1053112,0.001244813,0.004564315,0.002074689,0.05659751,0.02489627,0.2215768,⋯,0.019170124,0.18995851,0.01294606,0.013609959,0.016431535,0.006887967,8.3e-05,0.07186722,8.3e-09,Chert_2
3,71.11.1-89.03,Guess,Gravel Cherts,0.09514864,0.002613525,0.000326691,0.003756942,0.0495753,0.0427148,0.499755,⋯,0.010944136,0.09882391,0.007350539,0.018703038,0.008330611,0.005308723,0.004573669,0.039692911,0.002531852,Chert_1
4,71.11.1-89.04,Guess,Gravel Cherts,0.10741751,0.003307699,0.003473084,0.003721161,0.02795005,0.03390391,0.4388489,⋯,0.010667328,0.04589432,0.01174233,0.021582734,0.024559663,0.001901927,0.011080791,0.032746217,0.004796163,Chert_1
5,71.11.1-89.05,Guess,Gravel Cherts,0.13916694,0.002825315,8.07e-09,0.004116887,0.01622538,0.02429771,0.1408621,⋯,0.002502422,0.08411366,0.006780756,0.010251857,0.010494026,0.005247013,0.012431385,0.009283177,8.07e-09,Chert_1
6,71.11.1-89.06,Guess,Gravel Cherts,0.09852139,0.010990749,0.01257215,0.001423262,0.01486518,0.02308848,0.3841227,⋯,0.009646556,0.09575393,7.91e-09,0.008381434,0.006167471,0.001897683,7.91e-05,0.014627975,0.00585119,Chert_1


## Random Forests

In this section we will create two random forest classifications. The first will be run using the labels generated by visual classification, which we know are not optimal. The second will be built with the labels generated by the clustering algorithm selected in the previous section.

1. RF built with vis labels
2. RF built with chosen cluster labels

In [13]:
# take a portion of train and create a test dataset, to be added to the test set already in csv form, since there are so few samples in the test dataset
trainIndex <- createDataPartition(train_df$gmm_label, p = .8, 
                                  list = FALSE, times = 1)

lithTrain <- train_df[ trainIndex,]
lithTest  <- train_df[-trainIndex,]

head(lithTrain)

Unnamed: 0_level_0,sample_id,is_known,Vis,Ag.Rh,Al.Rh,As.Rh,Au.Rh,Ca.Rh,Cu.Rh,Fe.Rh,⋯,Sn.Rh,Sr.Rh,Th.Rh,Ti.Rh,U.Rh,V.Rh,Y.Rh,Zn.Rh,Zr.Rh,gmm_label
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,71.11.1-89.01,Guess,Gravel Cherts,0.10581683,7.74e-05,7.74e-09,0.00417698,0.1171875,0.02560334,0.4126702,⋯,0.030244431,0.13691213,0.0232828,0.015702351,0.027769183,0.005491955,0.00363552,0.090423886,7.74e-09,Chert_2
2,71.11.1-89.02,Guess,Gravel Cherts,0.1053112,0.001244813,0.004564315,0.002074689,0.05659751,0.02489627,0.2215768,⋯,0.019170124,0.18995851,0.01294606,0.013609959,0.016431535,0.006887967,8.3e-05,0.07186722,8.3e-09,Chert_2
3,71.11.1-89.03,Guess,Gravel Cherts,0.09514864,0.002613525,0.000326691,0.003756942,0.0495753,0.0427148,0.499755,⋯,0.010944136,0.09882391,0.007350539,0.018703038,0.008330611,0.005308723,0.004573669,0.039692911,0.002531852,Chert_1
5,71.11.1-89.05,Guess,Gravel Cherts,0.13916694,0.002825315,8.07e-09,0.004116887,0.01622538,0.02429771,0.1408621,⋯,0.002502422,0.08411366,0.006780756,0.010251857,0.010494026,0.005247013,0.012431385,0.009283177,8.07e-09,Chert_1
6,71.11.1-89.06,Guess,Gravel Cherts,0.09852139,0.010990749,0.01257215,0.001423262,0.01486518,0.02308848,0.3841227,⋯,0.009646556,0.09575393,7.91e-09,0.008381434,0.006167471,0.001897683,7.91e-05,0.014627975,0.00585119,Chert_1
7,71.11.1-89.07,Guess,Gravel Cherts,0.11442869,0.002752294,0.002668891,0.007923269,0.01050876,0.02460384,0.2135947,⋯,0.018432027,0.12335279,0.01209341,0.006839033,0.019599666,0.004587156,8.34e-05,0.081067556,0.005421184,Chert_2


In [14]:
# 1. RF - vis
# the predicted value (Y) is the Vis column 
RF_vis_Y <- lithTrain$Vis

# the values we will use to predict (x) are the elemental concentrations
RF_x <- lithTrain[,4:30]

head(RF_x)

Unnamed: 0_level_0,Ag.Rh,Al.Rh,As.Rh,Au.Rh,Ca.Rh,Cu.Rh,Fe.Rh,K.Rh,Mg.Rh,Mn.Rh,⋯,Si.Rh,Sn.Rh,Sr.Rh,Th.Rh,Ti.Rh,U.Rh,V.Rh,Y.Rh,Zn.Rh,Zr.Rh
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0.10581683,7.74e-05,7.74e-09,0.00417698,0.1171875,0.02560334,0.4126702,0.010597153,0.009127475,0.003326114,⋯,0.0623453,0.030244431,0.13691213,0.0232828,0.015702351,0.027769183,0.005491955,0.00363552,0.090423886,7.74e-09
2,0.1053112,0.001244813,0.004564315,0.002074689,0.05659751,0.02489627,0.2215768,0.005394191,0.007883817,0.010373444,⋯,0.06373444,0.019170124,0.18995851,0.01294606,0.013609959,0.016431535,0.006887967,8.3e-05,0.07186722,8.3e-09
3,0.09514864,0.002613525,0.000326691,0.003756942,0.0495753,0.0427148,0.499755,0.015436132,0.003430252,0.005553741,⋯,0.05545573,0.010944136,0.09882391,0.007350539,0.018703038,0.008330611,0.005308723,0.004573669,0.039692911,0.002531852
5,0.13916694,0.002825315,8.07e-09,0.004116887,0.01622538,0.02429771,0.1408621,8.07e-05,0.002179529,0.007830158,⋯,0.07111721,0.002502422,0.08411366,0.006780756,0.010251857,0.010494026,0.005247013,0.012431385,0.009283177,8.07e-09
6,0.09852139,0.010990749,0.01257215,0.001423262,0.01486518,0.02308848,0.3841227,0.001265122,0.007748873,0.009725627,⋯,0.07108405,0.009646556,0.09575393,7.91e-09,0.008381434,0.006167471,0.001897683,7.91e-05,0.014627975,0.00585119
7,0.11442869,0.002752294,0.002668891,0.007923269,0.01050876,0.02460384,0.2135947,0.000834028,0.011342786,0.007506255,⋯,0.06964137,0.018432027,0.12335279,0.01209341,0.006839033,0.019599666,0.004587156,8.34e-05,0.081067556,0.005421184


In [15]:
RF_output_vis <- randomForest(y = as.factor(RF_vis_Y), x = RF_x, importance = TRUE, 
                              ntree = 10001, proximity = TRUE)
RF_output_vis


Call:
 randomForest(x = RF_x, y = as.factor(RF_vis_Y), ntree = 10001,      importance = TRUE, proximity = TRUE) 
               Type of random forest: classification
                     Number of trees: 10001
No. of variables tried at each split: 5

        OOB estimate of  error rate: 22.97%
Confusion matrix:
                      Agate Alibates Black Cherts Black Metamorphic
Agate                     0        0            0                 0
Alibates                  0        0            0                 0
Black Cherts              0        0            0                 2
Black Metamorphic         0        0            0                 8
Edwards Plateau Chert     0        0            0                 0
El Sauz Chert             0        0            0                 0
Gravel Cherts             0        0            0                 7
Knife River Flint         0        0            0                 0
Oman Chert                0        0            0                 0
Undiff

In [16]:
# 2. RF - gmm classification
# the predicted value (Y) is the gmm_label column 
RF_gmm_Y <- lithTrain$gmm_label

RF_output_gmm <- randomForest(y = as.factor(RF_gmm_Y), x = RF_x, importance = TRUE, 
                              ntree = 10001, proximity = TRUE)
RF_output_gmm


Call:
 randomForest(x = RF_x, y = as.factor(RF_gmm_Y), ntree = 10001,      importance = TRUE, proximity = TRUE) 
               Type of random forest: classification
                     Number of trees: 10001
No. of variables tried at each split: 5

        OOB estimate of  error rate: 14.42%
Confusion matrix:
         Alibates Chert_1 Chert_2 Chert_3 EPC ESC Igneous KRF Oman class.error
Alibates        0      14       2       0   2   1       0   0    0  1.00000000
Chert_1         0     344       5       0   2   0       0   0    0  0.01994302
Chert_2         0      24     104       0   0   0       0   0    0  0.18750000
Chert_3         0      10       8      10   1  11       0   0    0  0.75000000
EPC             0      24       6       0  28   0       0   0    0  0.51724138
ESC             0       1       2       0   0 275       0   0    0  0.01079137
Igneous         0       0       4       1   0   3      27   0    0  0.22857143
KRF             0       4       3       0   1   1     

In [17]:
# the results of this random forest is much better than the results when including silicified wood & limestone
# I will calculate the accuracy & precision of this subset instead of the entire dataset

Load test dataset, remove the silicified wood & limestone samples, and restrict the values to only the elemental concentrations

In [18]:
test <- read.csv("Test.csv", header = TRUE)

# remove silicified wood & limestone
test <- test[(test$Vis != "Silicified Wood" & test$Vis != "Limestone"),]

# fix the name of the sample_id column
names(test)[1] <- "sample_id"

# add the generated labels from the earlier classification where X in this table matches sample_id in the other
test$gmm_label <- train_df$gmm_label[match(test$sample_id, train_df$sample_id)]

# restrict the dataset to only the 
test = test[,c(1,2,31:58)]

# add the test set generated with the caret package
# first, drop the is_known column
lithTest <- subset(lithTest, select = -c(is_known))
lithTest <- rbind(test, lithTest)

head(lithTest)

Unnamed: 0_level_0,sample_id,Vis,Ag.Rh,Al.Rh,As.Rh,Au.Rh,Ca.Rh,Cu.Rh,Fe.Rh,K.Rh,⋯,Sn.Rh,Sr.Rh,Th.Rh,Ti.Rh,U.Rh,V.Rh,Y.Rh,Zn.Rh,Zr.Rh,gmm_label
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,BH 001,El Sauz Chert,0.10972569,0.003075644,0.011637573,0.007481297,0.01246883,0.02236077,0.11197008,0.003657523,⋯,0.015128845,0.06118038,0.002078138,0.07057357,0.000665004,0.002576891,8.3126e-05,0.010307564,0.247215295,ESC
2,BH 002,Gravel Cherts,0.11635067,0.001458435,0.002025604,8.1024e-05,0.01450332,0.0241452,0.05777022,0.000162048,⋯,0.008102415,0.39734241,0.011019284,0.02130935,0.052827743,0.00761627,8.1024e-05,0.006076811,0.0,Chert_1
3,BH 003,Undiff. igneous,0.11377399,0.005714286,0.009381663,0.002132196,0.02626866,0.02328358,1.65637527,0.0154371,⋯,0.011428571,0.07565032,0.015692964,0.0365032,0.007590618,0.007420043,8.5288e-05,0.009722814,0.07130064,Chert_3
4,BH 004,Undiff. igneous,0.10462904,0.00262705,0.231452124,0.006794094,0.02074463,0.0292599,11.30491892,0.001992934,⋯,0.020654045,0.06585742,0.025726968,0.03116224,0.019295226,0.008062324,0.003170577,0.063773893,0.0,Igneous
5,BH 005,Gravel Cherts,0.09101322,0.000881057,0.005550661,0.007929515,0.08317181,0.0307489,1.91048458,0.02660793,⋯,0.022819383,0.93788546,0.007929515,0.07444934,0.034713656,0.017180617,0.03876652,0.021409692,0.094449339,Chert_3
6,BH 006,Gravel Cherts,0.10854994,0.003678274,0.0,0.010053948,0.0161844,0.02354095,0.51356874,0.002533922,⋯,0.009645251,0.06457414,0.018881805,0.01790093,0.017573974,0.008664378,8.1739e-05,0.006457414,0.000980873,Chert_1


In [19]:
dim(lithTest)

In [20]:
dim(test)

In [21]:
# restrict dataset to only the elemental concentrations
test_df <- lithTest[,3:29]

head(test_df)

Unnamed: 0_level_0,Ag.Rh,Al.Rh,As.Rh,Au.Rh,Ca.Rh,Cu.Rh,Fe.Rh,K.Rh,Mg.Rh,Mn.Rh,⋯,Si.Rh,Sn.Rh,Sr.Rh,Th.Rh,Ti.Rh,U.Rh,V.Rh,Y.Rh,Zn.Rh,Zr.Rh
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0.10972569,0.003075644,0.011637573,0.007481297,0.01246883,0.02236077,0.11197008,0.003657523,0.006234414,0.00482128,⋯,0.07298421,0.015128845,0.06118038,0.002078138,0.07057357,0.000665004,0.002576891,8.3126e-05,0.010307564,0.247215295
2,0.11635067,0.001458435,0.002025604,8.1024e-05,0.01450332,0.0241452,0.05777022,0.000162048,8.1024e-05,0.001782531,⋯,0.06919462,0.008102415,0.39734241,0.011019284,0.02130935,0.052827743,0.00761627,8.1024e-05,0.006076811,0.0
3,0.11377399,0.005714286,0.009381663,0.002132196,0.02626866,0.02328358,1.65637527,0.0154371,0.008784648,0.016801706,⋯,0.05569296,0.011428571,0.07565032,0.015692964,0.0365032,0.007590618,0.007420043,8.5288e-05,0.009722814,0.07130064
4,0.10462904,0.00262705,0.231452124,0.006794094,0.02074463,0.0292599,11.30491892,0.001992934,0.0082435,0.024639913,⋯,0.07464444,0.020654045,0.06585742,0.025726968,0.03116224,0.019295226,0.008062324,0.003170577,0.063773893,0.0
5,0.09101322,0.000881057,0.005550661,0.007929515,0.08317181,0.0307489,1.91048458,0.02660793,0.015594714,0.056475771,⋯,0.06176211,0.022819383,0.93788546,0.007929515,0.07444934,0.034713656,0.017180617,0.03876652,0.021409692,0.094449339
6,0.10854994,0.003678274,0.0,0.010053948,0.0161844,0.02354095,0.51356874,0.002533922,0.003351316,0.007193068,⋯,0.06547327,0.009645251,0.06457414,0.018881805,0.01790093,0.017573974,0.008664378,8.1739e-05,0.006457414,0.000980873


Evaluate the results of the random forest trained on the labeled data. 

In [26]:
preds <- predict(RF_output_gmm, test_df)
aa <- table(lithTest$gmm_label, preds)
aa

          preds
           Alibates Chert_1 Chert_2 Chert_3 EPC ESC Igneous KRF Oman
  Alibates        0       1       2       1   0   0       0   0    0
  Chert_1         0     140       9       0   1   0       0   0    0
  Chert_2         0      14      34       2   4   0       0   0    0
  Chert_3         0       2       4       8   3   4       0   0    0
  EPC             0      10       1       0   3   0       0   0    0
  ESC             0       4       0       0   0  74       0   0    0
  Igneous         0       0       3       1   0   3      10   0    0
  KRF             0       1       0       0   1   0       0   0    0
  Oman            0       0       0       0   0   0       0   0    4

## Perform Validation Measures

### Accuracy

In [27]:
accuracy <- (aa[1,1]+aa[2,2]+aa[3,3]+aa[4,4]+aa[5,5]
             +aa[6,6]+aa[7,7]+aa[8,8]+aa[9,9])/sum(aa)
accuracy

### Precision

In [28]:
Prec_Ali <- aa[1,1]/sum(aa[1,])
Prec_Ch1 <- aa[2,2]/sum(aa[2,])
Prec_Ch2 <- aa[3,3]/sum(aa[3,])
Prec_Ch3 <- aa[4,4]/sum(aa[4,])
Prec_EPC <- aa[5,5]/sum(aa[5,])
Prec_ESC <- aa[6,6]/sum(aa[6,])
Prec_Ign <- aa[7,7]/sum(aa[7,])
Prec_KRF <- aa[8,8]/sum(aa[8,])
Prec_Oma <- aa[9,9]/sum(aa[9,])

print(paste0("Alibates: ", round(Prec_Ali, 3)))
print(paste0("Chert_1: ", round(Prec_Ch1, 3)))
print(paste0("Chert_2: ", round(Prec_Ch2, 3)))
print(paste0("Chert_3: ", round(Prec_Ch3, 3)))
print(paste0("Edwards Plateau Chert: ", round(Prec_EPC, 3)))
print(paste0("El Sauz Chert: ", round(Prec_ESC, 3)))
print(paste0("Igneous: ", round(Prec_Ign, 3)))
print(paste0("Knife River Flint: ", round(Prec_KRF, 3)))
print(paste0("Oman Chert: ", round(Prec_Oma, 3)))

[1] "Alibates: 0"
[1] "Chert_1: 0.933"
[1] "Chert_2: 0.63"
[1] "Chert_3: 0.381"
[1] "Edwards Plateau Chert: 0.214"
[1] "El Sauz Chert: 0.949"
[1] "Igneous: 0.588"
[1] "Knife River Flint: 0"
[1] "Oman Chert: 1"


### Recall

In [29]:
Recall_Ali <- aa[1,1]/sum(aa[,1])
Recall_Ch1 <- aa[2,2]/sum(aa[,2])
Recall_Ch2 <- aa[3,3]/sum(aa[,3])
Recall_Ch3 <- aa[4,4]/sum(aa[,4])
Recall_EPC <- aa[5,5]/sum(aa[,5])
Recall_ESC <- aa[6,6]/sum(aa[,6])
Recall_Ign <- aa[7,7]/sum(aa[,7])
Recall_KRF <- aa[8,8]/sum(aa[,8])
Recall_Oma <- aa[9,9]/sum(aa[,9])

print(paste0("Alibates: ", round(Recall_Ali, 3)))
print(paste0("Chert_1: ", round(Recall_Ch1, 3)))
print(paste0("Chert_2: ", round(Recall_Ch2, 3)))
print(paste0("Chert_3: ", round(Recall_Ch3, 3)))
print(paste0("Edwards Plateau Chert: ", round(Recall_EPC, 3)))
print(paste0("El Sauz Chert: ", round(Recall_ESC, 3)))
print(paste0("Igneous: ", round(Recall_Ign, 3)))
print(paste0("Knife River Flint: ", round(Recall_KRF, 3)))
print(paste0("Oman Chert: ", round(Recall_Oma, 3)))

[1] "Alibates: NaN"
[1] "Chert_1: 0.814"
[1] "Chert_2: 0.642"
[1] "Chert_3: 0.667"
[1] "Edwards Plateau Chert: 0.25"
[1] "El Sauz Chert: 0.914"
[1] "Igneous: 1"
[1] "Knife River Flint: NaN"
[1] "Oman Chert: 1"


### F1 Score

In [32]:
F1_Ali <- (2*Prec_Ali*Recall_Ali)/(Prec_Ali+Recall_Ali)
F1_Ch1 <- (2*Prec_Ch1*Recall_Ch1)/(Prec_Ch1+Recall_Ch1)
F1_Ch2 <- (2*Prec_Ch2*Recall_Ch2)/(Prec_Ch2+Recall_Ch2)
F1_Ch3 <- (2*Prec_Ch3*Recall_Ch3)/(Prec_Ch3+Recall_Ch3)
F1_EPC <- (2*Prec_EPC*Recall_EPC)/(Prec_EPC+Recall_EPC)
F1_ESC <- (2*Prec_ESC*Recall_ESC)/(Prec_ESC+Recall_ESC)
F1_Ign <- (2*Prec_Ign*Recall_Ign)/(Prec_Ign+Recall_Ign)
F1_KRF <- (2*Prec_KRF*Recall_KRF)/(Prec_KRF+Recall_KRF)
F1_Oma <- (2*Prec_Oma*Recall_Oma)/(Prec_Oma+Recall_Oma)

print(paste0("Alibates: ", round(F1_Ali, 3)))
print(paste0("Chert_1: ", round(F1_Ch1, 3)))
print(paste0("Chert_2: ", round(F1_Ch2, 3)))
print(paste0("Chert_3: ", round(F1_Ch3, 3)))
print(paste0("Edwards Plateau Chert: ", round(F1_EPC, 3)))
print(paste0("El Sauz Chert: ", round(F1_ESC, 3)))
print(paste0("Igneous: ", round(F1_Ign, 3)))
print(paste0("Knife River Flint: ", round(F1_KRF, 3)))
print(paste0("Oman Chert: ", round(F1_Oma, 3)))

[1] "Alibates: NaN"
[1] "Chert_1: 0.87"
[1] "Chert_2: 0.636"
[1] "Chert_3: 0.485"
[1] "Edwards Plateau Chert: 0.231"
[1] "El Sauz Chert: 0.931"
[1] "Igneous: 0.741"
[1] "Knife River Flint: NaN"
[1] "Oman Chert: 1"
