-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_mining.R
160 lines (124 loc) · 4.77 KB
/
data_mining.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# read trait information from .docx
library(qdapTools)
raw_traits <- qdapTools::read_docx(
  "C:/Users/bruno/Documents/RESEARCH/PhD/EvolutionProject/taxon_traits.docx"
)

# number of morphological traits coded for every taxon
n_traits <- 360

# Parse one entry of the trait document into a character vector of length
# 1 + n_traits: the taxon name followed by one state per trait.
#
# line:     a single string of the form "<taxon name> <coding string>"
# n_traits: how many trait states to read from the coding string
#
# Every symbol of the coding string is one trait state and "?" is stored
# as NA. An uncertain coding such as [12] is resolved by drawing one of the
# bracketed candidates with sample(), so the result depends on the RNG
# state (call set.seed() beforehand if a reproducible table is needed).
# Stops with an error if the coding string ends before n_traits states
# have been read, or if it contains an empty [].
parse_trait_line <- function(line, n_traits) {
  # the taxon name is everything before the first space
  taxon_name <- sub("\\ .*", "", line)
  # the coding string is everything after the last space, one symbol each
  symbols <- strsplit(sub(".*\\ ", "", line), "")[[1]]
  states <- rep(NA_character_, n_traits)
  # candidate states collected while we are inside a []
  cands <- character(0)
  in_brackets <- FALSE
  # position in the coding string, and index of the trait being filled;
  # they differ because a whole [..] group fills a single trait
  pos <- 1
  trait <- 1
  while (trait <= n_traits) {
    if (pos > length(symbols)) {
      stop(
        "coding string for ", taxon_name, " ends after ", trait - 1,
        " of ", n_traits, " traits",
        call. = FALSE
      )
    }
    sym <- symbols[pos]
    pos <- pos + 1
    if (sym == "[") {
      # start collecting candidates
      in_brackets <- TRUE
    } else if (!in_brackets) {
      # plain symbol: it is the trait state (remembering ? = NA)
      states[trait] <- if (sym == "?") NA_character_ else sym
      trait <- trait + 1
    } else if (sym == "]") {
      if (length(cands) == 0) {
        stop("empty [] in coding string for ", taxon_name, call. = FALSE)
      }
      # sample one of the candidates to be the trait value
      states[trait] <- sample(cands, 1)
      cands <- character(0)
      in_brackets <- FALSE
      trait <- trait + 1
    } else {
      # interior of a []: one more candidate, including NA if it is a ?
      cands <- c(cands, if (sym == "?") NA_character_ else sym)
    }
  }
  c(taxon_name, states)
}

# the first entry only contributes its name; its traits are left as NA
first_row <- c(
  sub("\\ .*", "", raw_traits[[1]]),
  rep(NA_character_, n_traits)
)
# parse the remaining entries in document order (lapply runs sequentially,
# so the sample() draws happen in the same order as a plain loop would)
trait_rows <- lapply(
  seq_along(raw_traits)[-1],
  function(i) parse_trait_line(raw_traits[[i]], n_traits)
)
# bind all rows once instead of growing the data frame inside the loop
df <- as.data.frame(
  do.call(rbind, c(list(first_row), trait_rows)),
  stringsAsFactors = FALSE
)
# informative column names
colnames(df) <- c("species", paste0("trait_", seq_len(n_traits)))
# get the taxon list with time + geographic data (from Table 1)
taxon <- read.csv(
  "C:/Users/bruno/Documents/RESEARCH/PhD/EvolutionProject/taxon_list.txt",
  sep = "\t", header = FALSE
)
# keep only the taxon names (first column)
taxon_list <- as.character(taxon[, 1])
# make the genus/species separation a _ instead of a space, so the names
# can be matched against the species column of df (sub is vectorised and
# replaces only the first space of each name)
taxon_list <- sub("\\ ", "_", taxon_list)
# keep only the rows of df whose species is in the list (the caninae);
# a single logical subset replaces the row-by-row rbind
final_df <- df[df[, 1] %in% taxon_list, ]
# number of species that were matched (78 rows are expected)
print(nrow(final_df))
# fewer rows than expected means some taxa are missing, so print which
# names of the list were not found among the matched species
missing_taxa <- taxon_list[!(taxon_list %in% final_df[, 1])]
for (taxon_name in missing_taxa) {
  print(taxon_name)
}
# looking over Appendix S2 and the paper, seems like there were
# some typos and/or name disagreements, so we correct that.
# lookup: spelling found in the trait data -> spelling used in the paper
# (the latter is needed to agree with the tree)
name_fixes <- c(
  Lupullella_adusta = "Lupulella_adusta",
  Lupullella_mesomelas = "Lupulella_mesomelas",
  Xenocyon_lycanoides = "Xenocyon_lycaonoides"
)
# add the remaining species
final_df <- rbind(final_df, df[df$species %in% names(name_fixes), ])
# rename through the lookup so that every row receives the correction for
# its own name, whatever order the rows happen to be in (assigning a plain
# vector of new names would pair them with rows purely by position)
to_fix <- final_df$species %in% names(name_fixes)
final_df$species[to_fix] <- unname(name_fixes[final_df$species[to_fix]])
# finally, make all trait values numeric (every column but the first)
trait_cols <- seq_len(ncol(final_df))[-1]
final_df[trait_cols] <- lapply(final_df[trait_cols], as.numeric)
# for how many traits do we have complete information?
# trait columns are every column of final_df except the first (species)
trait_cols <- seq_len(ncol(final_df))[-1]
# info[k] = number of taxa with a non-missing value for the k-th trait
info <- vapply(
  trait_cols,
  function(j) sum(!is.na(final_df[, j])),
  integer(1)
)
which(info == nrow(final_df))
# 193 (194 in df) is body size, so we have a continuous trait
# categorical traits for which we have every taxa are
# really low variance, so we find a vector of variances
# (computed over the non-missing values of each trait)
vars <- vapply(
  trait_cols,
  function(j) var(final_df[, j], na.rm = TRUE),
  numeric(1)
)
# and find traits such that we have more than 0.1 variance
# and more than 95% information on taxa
which((vars > 0.1) & (info > 0.95 * nrow(final_df)))
# 146 (147 in df) seems to work well