In [None]:
# Plotting, reshaping and string-handling packages used throughout the notebook.
# Attach order is left untouched: packages attached later can mask functions
# from packages attached earlier.
library(ggplot2)
library(reshape2)
library(RColorBrewer)
suppressMessages(library(dplyr))
library(stringr)
suppressMessages(library(tidyr))
# Use the black-and-white ggplot2 theme as the default for every plot.
theme_set(theme_bw())
library(scales)
# Default plot dimensions (repr.plot.* options); later cells override them.
options(repr.plot.width=7, repr.plot.height=4)
# Isotype labels (the 20 standard amino acids plus iMet).
isotypes = c('Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile', 'iMet', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val')

# Named color palette keyed by identity: single bases and base pairs.
# A pair and its reverse (e.g. 'A:U' / 'U:A') share one color; '-' and '-:-'
# (presumably gaps/deletions -- confirm against the parser output) are gray.
fills = c('A'='#ffd92f', 'C'='#4daf4a', 'G'='#e41a1c', 'U'='#377eb8', 'A:U'='#93da69', 'U:A'='#93da69', 'G:C'='#c1764a', 'C:G'='#c1764a', 'G:U'='#b26cbd', 'U:G'='#b26cbd', '-'='gray60', '-:-'='gray60')
# Biostrings provides the DNAStringSet / FASTA read-write helpers.
suppressMessages(library(Biostrings))

In [None]:
# Restore previously saved R objects into the workspace. The files are loaded
# in this fixed order, since a later file may overwrite an object of the same
# name restored by an earlier one.
load('best-freqs.RData')
load('clade-isotype-specific.RData')
load('isotype-specific.RData')
load('consensus-IDEs.RData')
load('clade-isotype-specific-freqs.RData')

# Per-tRNA table: coerce both flag columns to logical, then redefine `quality`
# so that a tRNA only counts as quality when it is not also flagged `restrict`.
identities = read.delim('identities.tsv', sep='\t') %>%
  mutate(quality = as.logical(quality),
         restrict = as.logical(restrict)) %>%
  mutate(quality = quality & !restrict)

# Headerless genome table; column names are supplied explicitly.
genome_table = read.delim(
  'genome_table+.txt',
  sep = '\t',
  header = FALSE,
  stringsAsFactors = FALSE,
  col.names = c("species_short", "species", "species_long", "domain", "clade")
)

# Weird tRNAs in context of sequence features

List of weird tRNAs:

- Susan Ackerman's [Arg-UCU](http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi19/genes/tRNA-Arg-TCT-4-1.html) in mouse. 
    - Only Arg-UCU without an intron.
    - See alignment for details, but it also includes C20a instead of T20a, G20b, C40 instead of T40, etc. 
    - Most importantly, it contains C50; mutation of C50 to T50 in the B6J mouse strain likely causes misfolding.
    - Conserved in human, and potentially all the way to zebrafish and Drosophila.
- [Lys-CUU](http://gtrnadb.ucsc.edu/genomes/eukaryota/Scere3/genes/tRNA-Lys-CTT-1-1.html) in yeast. It is the only isodecoder for Lys-CUU, and it can fold into an "F"-form that can be imported into the mitochondria.
- [Gly-GCC](http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi19/genes/tRNA-Gly-GCC-2-1.html), which is downregulated in B cell lymphoma cell lines (Maute paper). It represses RPA1, which is involved in DNA repair, DNA replication, and the DNA damage response. Its overexpression reduces proliferation and sensitizes cells to etoposide-induced DNA damage.

## Code for extracting position-specific scores

We need to (a) filter for the subset of tRNAs we're interested in, (b) align that subset and build a covariance model from the alignment (no need to calibrate), (c) align the input tRNA to that model, (d) parse the alignment output into per-position scores, and (e) visualize.

In [1]:
# Score an input tRNA, position by position, against a covariance model built
# from a subset of the quality-filtered tRNAs in the global `identities` table.
#
# Steps: (1) pick the subset and write its sequences to FASTA, (2) align the
# subset and build a covariance model, (3) align the input tRNA to that model
# and parse per-position bit scores, (4) do the same for a sequence emitted
# from the model (the "consensus"), (5) report input-minus-consensus scores.
#
# Arguments:
#   seq       - input tRNA sequence (character; converted with DNAStringSet()).
#   seqname   - name written to the input FASTA record.
#   clade     - if not "", keep only tRNAs whose `clade` equals this value.
#   isotype   - if not "", keep only tRNAs whose `isotype` equals this value.
#   anticodon - if not "" AND isotype is not "", also filter on `anticodon`.
#
# Returns a data frame with one row per position and the columns:
#   Position  - factor over the fixed position/base-pair labels below.
#   Bits      - input score minus consensus score, capped at 0.
#   Consensus - consensus identity (replaced by the input identity wherever the
#               input out-scored the emitted consensus).
#   Identity  - identity of the input tRNA at that position.
#
# Side effects: reads the euk.fa FASTA and the euk-num-092016.cm model from
# their hard-coded paths, runs cmalign/cmbuild/cmemit and parse-parsetree.py,
# and creates scratch files in the working directory. The scratch files are
# removed when the function exits, including when it exits with an error.
# Stops with an error if the subset is empty, if a subset seqname is missing
# from the FASTA, or if any external command returns a non-zero exit status.
calculate_position_specific_scores = function(seq = "", seqname = "", clade = "", isotype = "", anticodon = "") {
  # Position labels kept from the parser output, in plotting order. Rows whose
  # label is not listed here become NA and are dropped.
  position_levels = c('1:72', '2:71', '3:70', '4:69', '5:68', '6:67', '7:66', '8', '9', '10:25', '11:24', '12:23', '13:22', '14', '15', '16', '17', '18', '19', '20', '20a', '21', '26', '27:43', '28:42', '29:41', '30:40', '31:39', '32', '33', '34', '35', '36', '37', '38', '44', '45', '46', '47', '48', '49:65', '50:64', '51:63', '52:62', '53:61', '54', '55', '56', '57', '58', '59', '60', '73')

  # Scratch files live in the working directory under fixed names; register
  # their removal up front so they do not leak when a later step fails.
  scratch_files = c('subset.sto', 'subset.cm', 'subset.fa',
                    'subset-cons.fa', 'subset-cons.bits', 'subset-cons.tfile', 'subset-cons.sto',
                    'my-tRNA.fa', 'my-tRNA.tfile', 'my-tRNA.bits', 'my-tRNA.sto')
  on.exit(unlink(scratch_files), add = TRUE)

  # Run a shell command and fail loudly instead of continuing with missing or
  # stale intermediate files.
  run_step = function(command) {
    status = system(command)
    if (status != 0) {
      stop("External command failed (exit status ", status, "): ", command, call. = FALSE)
    }
    invisible(status)
  }

  # Read a parsed score file (columns: position label, bits, identity) and tag
  # each row with where it came from.
  read_bits = function(path, label) {
    read.table(path, header = FALSE) %>%
      mutate(Position = factor(V1, position_levels)) %>%
      mutate(Bits = V2) %>%
      mutate(Identity = V3) %>%
      mutate(Source = label) %>%
      select(-V1, -V2) %>%
      filter(!is.na(Position))
  }

  # get subset of tRNAs and write to file
  # Plain logical indexing is used so the filter values are compared as data
  # rather than pasted into an expression string; %in% treats NA as no match,
  # matching what a dplyr filter would drop.
  keep = identities$quality %in% TRUE
  if (clade != "") keep = keep & (identities$clade %in% clade)
  if (isotype != "") keep = keep & (identities$isotype %in% isotype)
  if (isotype != "" && anticodon != "") keep = keep & (identities$anticodon %in% anticodon)
  wanted = as.character(identities$seqname[keep])
  if (length(wanted) == 0) {
    stop("No quality tRNAs match clade = '", clade, "', isotype = '", isotype,
         "', anticodon = '", anticodon, "'", call. = FALSE)
  }

  euk_seqs = readDNAStringSet(filepath = '/projects/lowelab/users/blin/identity/euk-isotypes/fasta/euk.fa', format = 'fasta')
  # FASTA names are matched on their first whitespace-delimited token.
  hits = match(wanted, str_extract(names(euk_seqs), '\\S+'))
  if (any(is.na(hits))) {
    stop(sum(is.na(hits)), " selected tRNA(s) not found in euk.fa, e.g. '",
         wanted[is.na(hits)][1], "'", call. = FALSE)
  }
  writeXStringSet(euk_seqs[hits], filepath = 'subset.fa')

  # create covariance model
  run_step('cmalign -g --notrunc --matchonly -o subset.sto /projects/lowelab/users/blin/tRNAscan/models/domain-specific/euk-num-092016.cm subset.fa')
  run_step('cmbuild --hand --enone -F subset.cm subset.sto')

  # align our tRNA
  seq = DNAStringSet(seq)
  names(seq) = seqname
  writeXStringSet(seq, filepath = "my-tRNA.fa")
  run_step('cmalign -g --notrunc --matchonly --tfile my-tRNA.tfile -o my-tRNA.sto subset.cm my-tRNA.fa')

  # parse output
  run_step('python parse-parsetree.py my-tRNA.tfile > my-tRNA.bits')
  input_bits = read_bits('my-tRNA.bits', "Input")

  # emit and read in consensus sequence
  # cmemit is not good at putting deletions/insertions into consensus emit (-c switch), so exponentiate instead
  run_step('cmemit -N 1 --exp 7 -o subset-cons.fa subset.cm')
  run_step('cmalign -g --notrunc --matchonly --tfile subset-cons.tfile -o subset-cons.sto subset.cm subset-cons.fa')
  run_step('python parse-parsetree.py subset-cons.tfile > subset-cons.bits')
  consensus_bits = read_bits('subset-cons.bits', "Consensus")

  # compare consensus and our tRNA; fix instances where cmemit did not output most likely identity
  # Sorting by Source puts "Consensus" before "Input", so within each position
  # element 1 is the consensus row and element 2 is the input row.
  bits = rbind(input_bits, consensus_bits) %>%
    group_by(Position) %>%
    arrange(Source) %>%
    summarize(Bits = Bits[2] - Bits[1], Consensus = Identity[1], Identity = Identity[2]) %>%
    ungroup() %>%
    # A positive difference means the input scored higher than the emitted
    # sequence: take the input identity as the consensus and cap the score at 0.
    mutate(Consensus = ifelse(Bits > 0, as.character(Identity), as.character(Consensus))) %>%
    mutate(Bits = ifelse(Bits > 0, 0, Bits))

  return(bits)
}
    
# Score one tRNA against several nested reference sets and stack the results.
#
# The input is always scored against all tRNAs (labelled Clade "Eukaryota",
# Isotype "All isotypes"). Depending on which of `clade`, `isotype` and
# `anticodon` are non-empty, it is additionally scored against:
#   - the clade, all isotypes
#   - the isotype across all clades
#   - the isotype within the clade
#   - the isotype + anticodon across all clades
#   - the isotype + anticodon within the clade
#
# Arguments:
#   seq, seqname - input sequence and its name, passed through unchanged.
#   clade, isotype, anticodon - "" means "do not restrict on this".
#
# Returns the row-bound output of calculate_position_specific_scores() for
# every comparison, with added Clade, Isotype, Anticodon label columns and a
# tRNA column holding `seqname`.
calculate_scores_multiplex = function(seq = "", seqname = "", clade = "", isotype = "", anticodon = "") {
  # Scalar flags; combined below with && (the scalar, short-circuiting form
  # intended for `if` conditions).
  has_clade = clade != ""
  has_isotype = isotype != ""
  has_anticodon = anticodon != ""

  # Score against one reference set and attach the labels used for faceting.
  # The *_filter values go to the scoring function; the *_label values are
  # only written to the output columns.
  score_against = function(clade_filter, isotype_filter, anticodon_filter,
                           clade_label, isotype_label, anticodon_label) {
    calculate_position_specific_scores(seq, seqname, clade = clade_filter, isotype = isotype_filter, anticodon = anticodon_filter) %>%
      mutate(Clade = clade_label, Isotype = isotype_label, Anticodon = anticodon_label)
  }

  bits = score_against("", "", "", "Eukaryota", "All isotypes", "")

  if (has_clade) {
    bits = rbind(bits, score_against(clade, "", "", clade, "All isotypes", ""))
  }
  if (has_isotype) {
    bits = rbind(bits, score_against("", isotype, "", "Eukaryota", isotype, "(all isodecoders)"))
  }
  if (has_clade && has_isotype) {
    bits = rbind(bits, score_against(clade, isotype, "", clade, isotype, "(all isodecoders)"))
  }
  if (has_isotype && has_anticodon) {
    bits = rbind(bits, score_against("", isotype, anticodon, "Eukaryota", isotype, anticodon))
  }
  if (has_clade && has_isotype && has_anticodon) {
    bits = rbind(bits, score_against(clade, isotype, anticodon, clade, isotype, anticodon))
  }

  return(bits %>% mutate(tRNA = seqname))
}
  
# Score several tRNAs against the same reference set and stack the results.
#
# Arguments:
#   seqs      - character vector of sequences; its names are used as the tRNA
#               names.
#   clade, isotype, anticodon - passed unchanged to
#               calculate_position_specific_scores() for every sequence and
#               also written to the Clade / Isotype / Anticodon columns.
#
# Returns the row-bound per-sequence results with added Clade, Isotype,
# Anticodon and tRNA columns. An empty `seqs` yields a zero-row data frame
# with those columns instead of an error.
#
# NOTE(review): the default clade = "Eukaryota" is passed on as a filter value,
# whereas calculate_scores_multiplex() uses "Eukaryota" only as a label for the
# unfiltered set (clade = ""). Confirm that "Eukaryota" actually occurs in
# identities$clade before relying on the default.
calculate_scores_multiseq = function(seqs = "", clade = "Eukaryota", isotype = "", anticodon = "") {
  # seq_along() below is safe for empty input, but there is nothing to bind in
  # that case, so return a typed empty result.
  if (length(seqs) == 0) {
    return(data.frame(Position = character(0), Bits = numeric(0), Consensus = character(0),
                      Identity = character(0), Clade = character(0), Isotype = character(0),
                      Anticodon = character(0), tRNA = character(0), stringsAsFactors = FALSE))
  }

  # Collect one result per sequence, then bind once instead of growing a data
  # frame inside the loop.
  per_seq = lapply(seq_along(seqs), function(i) {
    calculate_position_specific_scores(seqs[i], names(seqs)[i], clade, isotype, anticodon) %>%
      mutate(Clade = clade, Isotype = isotype, Anticodon = anticodon, tRNA = names(seqs)[i])
  })

  return(do.call(rbind, per_seq))
}

# Arg-UCU

In [None]:
options(repr.plot.width=10, repr.plot.height=13)
# Score the intron-less Arg-TCT tRNA against every nested reference set
# (all tRNAs, Mammalia, Arg, Arg-TCT, and their combinations).
bits = calculate_scores_multiplex(seq = "GTCTCTGTGGCGCAATGGAcgAGCGCGCTGGACTTCTAATCCAGAGGtTCCGGGTTCGAGTCCCGGCAGAGATG",
                                  seqname = "hg19_chr1.trna84-ArgTCT",
                                  clade = "Mammalia", 
                                  isotype = "Arg",
                                  anticodon = "TCT")

# Convert label columns to factors and build a combined "Isotype Anticodon"
# label used in the facet titles.
df = bits %>% mutate(Clade = as.factor(Clade), Isotype = as.factor(Isotype), Anticodon = as.factor(Anticodon), Consensus = as.factor(Consensus), Identity = as.factor(Identity), tRNA = as.factor(tRNA)) %>%
  mutate(Isodecoder = as.factor(paste0(Isotype, " ", Anticodon))) %>%
  select(Position, Clade, tRNA, Bits, Consensus, Identity, Isodecoder)

# Score range, computed once. na.rm = TRUE keeps a single missing score from
# turning the label positions and axis limits into NA.
score_range = range(df$Bits, na.rm = TRUE)

# One panel per reference set: bars show the score difference, input
# identities are printed below the lowest score and consensus identities above
# the highest score.
ggplot(df) + geom_bar(aes(x = Position, y = Bits, fill = Identity), size = 0.1, color = 'gray20', width = 0.8, position = 'dodge', stat = 'identity') + 
  facet_wrap( ~ paste0(Clade, " / ", Isodecoder), ncol = 3) + 
  geom_text(aes(x = Position, color = Identity, label = Identity), size = 3, y = score_range[1] - 0.4) +
  geom_text(aes(x = Position, color = Consensus, label = Consensus), size = 3, y = score_range[2] + 0.4) +
  coord_flip() +
  scale_fill_manual(values=fills) +
  scale_color_manual(values=fills) + 
  scale_x_discrete(limits = rev(levels(df$Position))) + 
  scale_y_continuous(limits = c(score_range[1] - 0.6, score_range[2] + 0.6)) + 
  theme(legend.position='none') + 
  ylab("Score difference between consensus and input (bits)")

In [None]:
options(repr.plot.width=12, repr.plot.height=7)
# Score four human Arg-TCT genes against the same reference set
# (Mammalia / Arg / TCT).
bits = calculate_scores_multiseq(seqs = c("hg19-tRNA-Arg-TCT-4-1" = "GTCTCTGTGGCGCAATGGAcgAGCGCGCTGGACTTCTAATCCAGAGGtTCCGGGTTCGAGTCCCGGCAGAGATG",
                                          "hg19-tRNA-Arg-TCT-1-1" = "GGCTCCGTGGCGCAATGGAtAGCGCATTGGACTTCTAgaggctgaaggcATTCAAAGGtTCCGGGTTCGAGTCCCGGCGGAGTCG",
                                          "hg19-tRNA-Arg-TCT-2-1" = "GGCTCTGTGGCGCAATGGAtAGCGCATTGGACTTCTAgtgacgaatagagcaATTCAAAGGtTGTGGGTTCGAATCCCACCAGAGTCG",
                                          "hg19-tRNA-Arg-TCT-3-1" = "GGCTCTGTGGCGCAATGGAtAGCGCATTGGACTTCTAgctgagcctagtgtggtcATTCAAAGGtTGTGGGTTCGAGTCCCACCAGAGTCG"),
                                 clade = "Mammalia", isotype = "Arg", anticodon = "TCT")

# Score range, computed once. na.rm = TRUE keeps a single missing score from
# turning the label positions and axis limits into NA.
score_range = range(bits$Bits, na.rm = TRUE)

# One panel per tRNA: bars show the score difference, input identities are
# printed below the lowest score and consensus identities above the highest.
bits %>% select(Position, Bits, Identity, Consensus, tRNA) %>%
  ggplot() + geom_bar(aes(x = Position, y = Bits, fill = Identity), size = 0.1, color = 'gray20', width = 0.8, position = 'dodge', stat = 'identity') + 
  facet_wrap( ~ tRNA, nrow = 1) +
  geom_text(aes(x = Position, color = Identity, label = Identity), size = 3, y = score_range[1] - 0.4) +
  geom_text(aes(x = Position, color = Consensus, label = Consensus), size = 3, y = score_range[2] + 0.4) +
  coord_flip() +
  scale_fill_manual(values=fills) +
  scale_color_manual(values=fills) + 
  scale_x_discrete(limits = rev(levels(bits$Position))) + 
  scale_y_continuous(limits = c(score_range[1] - 0.6, score_range[2] + 0.6)) + 
  theme(legend.position='none') + 
  ylab("Score difference between consensus and input (bits)")