-
Notifications
You must be signed in to change notification settings - Fork 1
/
read.hhr.R
executable file
·104 lines (97 loc) · 4.67 KB
/
read.hhr.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#' Read hhr file (HH-suite output format)
#'
#' @param fname input file name
#' @import dplyr
#' @import tidyr
#' @import stringr
#' @export
read.hhr <- function(fname) {
all.lines <- readLines(fname)
param.start <- 1
data.start <- which(grepl("^ *No Hit", all.lines)) %>% first %>% `+`(1)
align.start <- which(grepl("^No 1", all.lines)) %>% first
param.end <- data.start - 2
data.end <- align.start - 1
align.end <- length(all.lines)
if (is.na(align.start)) {
data <- tibble(
No = integer(),
Hit.ID = character(),
Hit.Description = character(),
Q.ss_pred = character(),
Q.query = character(),
Q.consensus = character(),
Q.Start = integer(),
Q.End = integer(),
Q.Length = integer(),
T.consensus = character(),
T.Start = integer(),
T.End = integer(),
T.Length = integer(),
T.hit = character(),
T.ss_dssp = character(),
T.ss_pred = character(),
Aligned_cols = integer(),
E.value = numeric(),
Identities = numeric(),
Probab = numeric(),
Score = numeric(),
Similarity = numeric(),
Sum_probs = numeric(),
Template_Neff = numeric()
)
} else {
data <- data.frame(line = all.lines[align.start:align.end], stringsAsFactors = F) %>%
filter(line != "") %>%
extract(line, into = c("name", "value"), regex = "([^ ]+) ?(.+)?", remove = F) %>%
mutate(No = ifelse(name == "No", value, NA) %>% as.integer) %>%
mutate(Hit.ID = ifelse(substr(name, 1, 1) == ">", substr(name, 2, nchar(.)), NA)) %>%
mutate(Hit.Description = ifelse(substr(name, 1, 1) == ">", value, NA)) %>%
mutate(Match = ifelse(grepl("=", name), line, NA)) %>%
mutate(name = ifelse(grepl("Q Consensus", lag(line)) & grepl("T Consensus", lead(line)), "M", name)) %>%
mutate(value = ifelse(name == "M", line, value)) %>%
fill(No) %>%
group_by(No) %>%
summarize(
Hit.ID = na.omit(Hit.ID) %>% first,
Hit.Description = na.omit(Hit.Description) %>% first,
Match = na.omit(Match) %>% first,
Q.ss_pred = value[name == "Q" & grepl("^ss_pred ", value)] %>% substr(., 16, nchar(.)) %>% paste(collapse = "") %>% gsub(" +", "", .),
Q.query = value[name == "Q" & grepl("^Consensus ", lead(value))] %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
Q.consensus = value[name == "Q" & grepl("^Consensus ", value)] %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
T.consensus = value[name == "T" & grepl("^Consensus ", value)] %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
T.hit = value[name == "T" & grepl("^Consensus ", lag(value))] %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
T.ss_dssp = value[name == "T" & grepl("^ss_dssp ", value)] %>% substr(., 16, nchar(.)) %>% paste(collapse = " ") %>% gsub(" +", "", .),
T.ss_pred = value[name == "T" & grepl("^ss_pred ", value)] %>% substr(., 16, nchar(.)) %>% paste(collapse = "") %>% gsub(" ", "", .),
.groups = "drop"
) %>%
extract(Q.consensus, into = c("Q.Start", "Q.End", "Q.Length"), regex = "^ *(\\d+) .+ (\\d+) +[(](\\d+)[)]$", remove = F, convert = T) %>%
extract(T.consensus, into = c("T.Start", "T.End", "T.Length"), regex = "^ *(\\d+) .+ (\\d+) +[(](\\d+)[)]$", remove = F, convert = T) %>%
mutate(
Q.consensus = gsub("[0-9() ]+", "", Q.consensus),
Q.query = gsub("[0-9() ]+", "", Q.query),
T.consensus = gsub("[0-9() ]+", "", T.consensus),
T.hit = gsub("[0-9() ]+", "", T.hit),
) %>%
#extract(Hit.Description, into = "Hit.Organism", regex = "[{]([^}]+)[}]", remove = F) %>%
#extract(Hit.Description, into = "Hit.Description", regex = "([^;]+)", remove = F) %>%
#extract(Hit.Description, into = "Hit.Keywords", regex = "[^;]+; ([^;]+)", remove = F) %>%
mutate(Match = str_split(Match, " +")) %>%
unnest(cols = Match) %>%
separate(Match, into = c("key", "value"), "=") %>%
mutate(value = sub("%", "", value) %>% as.numeric) %>%
spread(key, value) %>%
rename(E.value = `E-value`) %>%
mutate(Aligned_cols = as.integer(Aligned_cols))
}
attributes(data) <- data.frame(key = all.lines[param.start:param.end]) %>%
mutate(value = substr(key, 14, 10000) %>% trimws, key = substr(key, 1, 14) %>% trimws) %>%
filter(key != "") %>%
{setNames(.$value, .$key)} %>%
as.list %>%
c(attributes(data))
attr(data, "Match_columns") <- attr(data, "Match_columns") %>% as.integer
attr(data, "Neff") <- attr(data, "Neff") %>% as.integer
attr(data, "Searched_HMMs") <- attr(data, "Searched_HMMs") %>% as.integer
return(data)
}