In [1]:
library(ggplot2)
library(reshape2)
library(RColorBrewer)
suppressMessages(library(dplyr))
library(stringr)
suppressMessages(library(tidyr))
theme_set(theme_bw())
library(scales)
options(repr.plot.width=7, repr.plot.height=4)
isotypes = c('Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile', 'iMet', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val')

paired_positions = c('X1.72'='1:72', 'X2.71'='2:71', 'X3.70'='3:70', 'X4.69'='4:69', 'X5.68'='5:68', 'X6.67'='6:67', 'X7.66'='7:66', 'X8.14'='*8:14', 'X9.23'='*9:23', 'X10.25'='10:25', 'X10.45'='*10:45', 'X11.24'='11:24', 'X12.23'='12:23', 'X13.22'='13:22', 'X15.48'='*15:48','X18.55'='*18:55', 'X19.56'='*19:56', 'X22.46'='*22:46', 'X26.44'='*26:44', 'X27.43'='27:43', 'X28.42'='28:42', 'X29.41'='29:41', 'X30.40'='30:40', 'X31.39'='31:39', 'X49.65'='49:65', 'X50.64'='50:64', 'X51.63'='51:63', 'X52.62'='52:62', 'X53.61'='53:61', 'X54.58'='*54:58')
paired_identities = c('GC', 'AU', 'UA', 'CG', 'GU', 'UG', 'PurinePyrimidine', 'PyrimidinePurine', 'StrongPair', 'WeakPair', 'AminoKeto', 'KetoAmino', 'Wobble', 'Paired', 'Bulge', 'Mismatched', 'Absent')

single_positions = c('X8'='8', 'X9'='9', 'X14'='14', 'X15'='15', 'X16'='16', 'X17'='17', 'X17a'='17a', 'X17b'='17b', 'X18'='18', 'X19'='19', 'X20'='20', 'X20a'='20a', 'X20b'='20b', 'X21'='21', 'X26'='26', 'X32'='32', 'X33'='33', 'X34'='34', 'X35'='35', 'X36'='36', 'X37'='37', 'X38'='38', 'X44'='44', 'X45'='45', 'X46'='46', 'X47'='47', 'X48'='48', 'X54'='54', 'X55'='55', 'X56'='56', 'X57'='57', 'X58'='58', 'X59'='59', 'X60'='60', 'X73'='73')
single_identities = c('A', 'C', 'G', 'U', 'Purine', 'Pyrimidine', 'Weak', 'Strong', 'Amino', 'Keto', 'B', 'D', 'H', 'V', 'Absent')

colors = c('A'='gray20', 'C'='gray20', 'G'='gray20', 'U'='gray20', 'Absent'='gray20', 'Purine'='gray20', 'Pyrimidine'='gray20', 'Weak'='gray20', 'Strong'='gray20', 'Amino'='gray20', 'Keto'='gray20', 'B'='gray50', 'D'='gray50', 'H'='gray50', 'V'='gray50', 'GC'='gray20', 'AU'='gray20', 'UA'='gray20', 'CG'='gray20', 'GU'='gray20', 'UG'='gray20', 'PurinePyrimidine'='gray20', 'PyrimidinePurine'='gray20', 'StrongPair'='gray20', 'WeakPair'='gray20', 'AminoKeto'='gray20', 'KetoAmino'='gray20', 'Wobble'='gray20', 'Paired'='gray20', 'Bulge'='gray20', 'Mismatched'='gray20')
fills = c('A'='#ffd92f', 'C'='#4daf4a', 'G'='#e41a1c', 'U'='#377eb8', 'Purine'='#ff8300', 'Pyrimidine'='#66c2a5', 'Weak'='#b3de69', 'Strong'='#fb72b2', 'Amino'='#c1764a', 'Keto'='#b26cbd', 'B'='#e5c494', 'D'='#ccebd5', 'H'='#ffa79d', 'V'='#a6cdea', 'Absent'='gray60', 'Mismatched'='gray30', 'Paired'='#ffffcc')

In [61]:
load('best-freqs.RData')
load('clade-isotype-specific.RData')
load('isotype-specific.RData')
load('consensus-IDEs.RData')
load('clade-isotype-specific-freqs.RData')

In [41]:
identities = read.delim('identities.tsv', sep='\t')
identities$quality = as.logical(identities$quality)
identities$restrict = as.logical(identities$restrict)
identities = identities %>% mutate(quality=quality & (!restrict | isotype == "iMet"))

# Introduction

Strategy 1: find rejected (a) consensus, (b) isotype-specific, or (c) clade-isotype-specific elements. Set a cutoff at 95% presence, but fails (i) isotype check, (ii) clade check, or (iii) species check. Spin in previous work, if any. Bonus points if previous work _conflicts_ with consensus definition.

Strategy 2: highlight instances where previous work conflicts.

Some of these will have been covered in `first-pass-consensus`.

# Strategy 1A

## Find targets

In [34]:
code_groups = c('A'=1, 'C'=1, 'G'=1, 'U'=1, 'Absent'=1, 
                'Purine'=2, 'Pyrimidine'=2,
                'Weak'=3, 'Strong'=3, 'Amino'=3, 'Keto'=3,
                'B'=4, 'D'=4, 'H'=4, 'V'=4,
                'GC'=1, 'AU'=1, 'UA'=1, 'CG'=1, 'GU'=1, 'UG'=1,
                'StrongPair'=2, 'WeakPair'=2, 'Wobble'=2,
                'PurinePyrimidine'=3, 'PyrimidinePurine'=3, 'AminoKeto'=3, 'KetoAmino'=3,
                'Paired'=4, 'Mismatched'=4, 'Bulge'=4)

cutoff_freqs = data.frame()
for (cutoff in c(0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1.0)) {
  df = clade_iso_ac_freqs %>%
    group_by(positions, variable) %>%
    summarize(count=sum(value), freq=sum(value)/sum(total)) %>%
    filter(freq >= cutoff) %>%
    mutate(cutoff=as.character(cutoff)) %>%
    select(positions, variable, freq, cutoff) %>%
    group_by(positions) %>%
    arrange(code_groups[variable], desc(freq)) %>%
    filter(row_number(positions) == 1)
  if (nrow(cutoff_freqs) == 0) cutoff_freqs = df
  else cutoff_freqs = rbind(cutoff_freqs, df)
}

In [35]:
cutoff_freqs %>% 
  filter(positions %in% c(names(single_positions), names(paired_positions))) %>%
  select(positions, cutoff, variable) %>%
  spread(cutoff, variable)

Unnamed: 0,positions,0.5,0.6,0.7,0.8,0.9,0.95,0.99
1,X10.25,GC,GC,GC,Paired,Paired,Paired,Paired
2,X10.45,Mismatched,Mismatched,Mismatched,Mismatched,,,
3,X11.24,CG,CG,PyrimidinePurine,PyrimidinePurine,PyrimidinePurine,PyrimidinePurine,Paired
4,X12.23,PyrimidinePurine,PyrimidinePurine,Paired,Paired,Paired,Paired,
5,X13.22,PyrimidinePurine,Paired,,,,,
6,X14,A,A,A,A,A,A,A
7,X15,G,G,G,G,Purine,Purine,Purine
8,X15.48,GC,GC,GC,PurinePyrimidine,PurinePyrimidine,Paired,
9,X16,U,U,U,Pyrimidine,Pyrimidine,B,
10,X17,Absent,Absent,,,,,


In [36]:
consensus

Unnamed: 0,positions,identity
1,X11.24,Paired
2,X14,A
3,X18,V
4,X19,G
5,X19.56,GC
6,X21,H
7,X2.71,Paired
8,X30.40,Paired
9,X32,B
10,X33,Pyrimidine


For strategy 1A, some good candidates here: 
- R9 (95%)
- G18, U55, and G18:U55 (99%)
- A21 and A14/A21 (99%)
- U33 (95%)
- R46 (99%)
- G53:C61 (99%)
- U55 (99%)

## R9 and R46

### What's known

- Marck and Grosjean have a purine for iMet and no consensus for elongator, though they are mostly R or V. R9 is also conserved in initiators for archaea/bacteria.
- 9-12-23 is a tertiary interaction. [Gautheret et al. (1995)](http://dx.doi.org/10.1006/jmbi.1995.0200) has these frequencies from the Sprinzl 1991 database: ![9-12-23 frequency matrix](figures/9-12-23-gautheret.png)

- Position 9 is known to be modified with m$^1$G (along with 37) in a wide range of eukaryotes. 

### Our data

- Here's our frequencies:

In [44]:
table(paste0(identities[identities$quality, ]$X12, ':', identities[identities$quality, ]$X23), identities[identities$quality, ]$X9)

     
          -     A     C     G     U
  A:A     0     6     8     9     0
  A:C     0    22     1   578     0
  A:G     0     2     2    11     0
  A:U     0  1853   641    96     5
  C:A     0     6     0   321     1
  C:C     0     1     0     2     0
  C:G     0   644   174 13514    54
  C:U     0     0     0     3     0
  G:A     0     1     0     2     0
  G:C     0  2845     7  9278     5
  G:G     0     0     0     8     0
  G:U     0    13     1    34     0
  U:A     1 13537     8  3849    53
  U:C     0     1     0     3     0
  U:G     0    23     0    51     0
  U:U     0     6     0     1     0

#### Which isotypes/clades/species fail the consensus checks?

In [73]:
## Clade/isotype check
best_freqs %>% filter(positions == 'X9') %>% group_by(clade, isotype) %>%
  summarize(status=sum((variable %in% c("G", "A", "Purine"))*count)/sum(count) == 1) %>%
  filter(!status)

## Species check
identities %>% select(clade, species, isotype, X9) %>% group_by(clade, species, isotype) %>%
  summarize(status=sum(X9 %in% c("G", "A", "Purine")/n()) >= 0.5,
            freq=sum(X9 %in% c("G", "A", "Purine")), ntRNAs=n()) %>%
  filter(!status)

Unnamed: 0,clade,isotype,status
1,Insecta,Glu,False
2,Insecta,His,False
3,Mammalia,His,False
4,Spermatophyta,Glu,False
5,Streptophyta,Glu,False
6,Vertebrata,His,False


Unnamed: 0,clade,species,isotype,status,freq,ntRNAs
1,Fungi,botrCine_B05_10,Tyr,False,0,5
2,Fungi,crypGatt_WM276,Gly,False,0,2
3,Fungi,crypNeof_VAR_GRUBII_H99,Gly,False,0,2
4,Fungi,crypNeof_VAR_NEOFORMANS_B_3501,Gly,False,0,2
5,Fungi,crypNeof_VAR_NEOFORMANS_JEC21,Gly,False,0,2
6,Insecta,dm6,Glu,False,6,19
7,Insecta,dm6,His,False,0,5
8,Insecta,dp4,His,False,0,5
9,Insecta,droSim1,Glu,False,3,13
10,Insecta,droSim1,His,False,1,5


### What's new

The 9:12:23 ratios are a bit different. But it shows the same thing: there is some selectivity for what base triples are allowed, but enough tolerance that there are a few interactions that persist.

The default hypothesis is that the 9-23 interaction has a conserved interaction type that explains the frequencies shown. We can marginalize as such because 12:23 is always a WC pair. This is not the case - although some (R:R) have a a strong trans WC-Hoogsteen pair of hydrogen bonds, others (C:R) do not. The hoogsteen side of C is just two stable carbons unlikely to hydrogen bond.

Examining the frequencies _without_ histidine shows that the C9:U23 is conserved in His, and A9:U23 is conserved in Asp, for some clades. (See tertiary interactions figure).

So here's what we know.
- 12:23 is almost always WC paired, so we can marginalize it.
- 9:23 is thought to be a _trans_ interaction with two hydrogen bonds. It's most commonly a purine:purine.
- His and Asp deviate from the common purine:purine interaction in some clades. There are many more (see tertiary interactions figure). 

**Conclusion 1**: Position 9 was thought to be a purine. Instead, it varies by isotype and clade. Much of the variation can be explained by looking at 9:23 is an clade/isotype discriminating tertiary interaction.

We can leave it at that. But we should also look into compensatory interactions if 9:23 is disrupted.

#### tRNA covariation frequencies for 9:23, 22:46, and 10:45

In [126]:
df = table(paste0(identities[identities$quality, ]$X9, ':', identities[identities$quality, ]$X23),
           paste0(identities[identities$quality, ]$X46, ':', identities[identities$quality, ]$X22, ' / ', identities[identities$quality, ]$X45, ':', identities[identities$quality, ]$X10),
           identities[identities$quality, ]$isotype)

In [162]:
as.data.frame(df) %>% group_by(Var3) %>% filter(Freq > 50)

Unnamed: 0,Var1,Var2,Var3,Freq
1,A:A,G:G / A:G,Ala,116
2,A:A,G:G / G:G,Ala,2589
3,G:A,G:G / G:G,Ala,53
4,G:C,G:G / G:G,Ala,351
5,G:A,G:U / G:G,Ala,147
6,G:C,G:U / G:G,Ala,122
7,G:C,A:A / G:G,Arg,884
8,G:G,A:A / G:G,Arg,1623
9,G:G,A:C / G:G,Arg,91
10,A:A,G:G / G:G,Arg,69


This is messy. For each isotype there may be a different tertiary interaction compensating for the lack of A:A. It seems that purine:purine interactions are unusually enriched among all three of these except in valine. 

![classical interaction structure](figures/3d-interactions-oliva.png)

#### Average number of purine:purine interactions by isotype and clade

In [169]:
RRs = c("A:A", "A:G", "G:G", "G:A")
identities %>% select(isotype, clade, quality, X9.23, X22.46, X10.45, X26.44) %>% 
  filter(quality) %>%
  rowwise() %>% 
  mutate(nRR=(X9.23 %in% RRs) + (X22.46 %in% RRs) + (X10.45 %in% RRs) + (X26.44 %in% RRs)) %>%
  group_by(isotype, clade) %>% 
  summarize(nRR=signif(mean(nRR), 3)) %>%
  spread(isotype, nRR)

: Grouping rowwise data frame strips rowwise nature

Unnamed: 0,clade,Ala,Arg,Asn,Asp,Cys,Gln,Glu,Gly,His,Ile,iMet,Leu,Lys,Met,Phe,Pro,Ser,Thr,Trp,Tyr,Val
1,Fungi,2.76,3.26,3.72,3.21,2.9,1.37,1.78,1.96,2.84,3.82,2.46,1.92,3.5,3.91,3.91,2.02,1.85,3.91,3.01,3.76,2.22
2,Insecta,4.0,3.44,3.0,2.0,4.0,2.0,3.33,2.68,2.05,3.9,3.0,2.0,4.0,3.0,4.0,2.0,2.0,4.0,3.0,4.0,2.0
3,Mammalia,3.98,3.51,2.98,2.04,3.98,1.99,2.98,2.6,2.01,3.68,3.0,2.0,3.96,2.99,3.96,2.0,2.03,3.99,2.99,4.0,2.04
4,Nematoda,4.0,3.48,2.99,2.0,4.0,1.98,2.83,2.44,1.99,3.83,3.0,1.99,2.6,2.99,3.97,2.0,2.0,3.98,2.97,3.99,1.99
5,Spermatophyta,4.0,3.52,3.87,2.01,3.93,2.5,3.57,3.5,3.0,4.0,3.0,2.0,3.01,3.03,4.0,2.0,1.19,3.74,3.0,4.0,2.0
6,Streptophyta,3.99,3.58,3.77,2.0,3.67,2.4,3.45,3.48,3.0,4.0,3.0,2.0,3.0,2.99,3.99,2.03,1.34,3.79,3.0,3.84,2.0
7,Vertebrata,3.97,3.42,2.96,2.0,3.97,1.99,2.97,2.4,1.99,3.66,3.0,1.99,3.97,2.98,3.81,2.0,2.04,3.98,2.99,3.97,1.98


#### Average number of purine:purine interactions by position/isotype

In [175]:
RRs = c("A:A", "A:G", "G:G", "G:A")
identities %>% select(isotype, quality, X9.23, X22.46, X10.45, X26.44) %>% 
  filter(quality) %>%
  rowwise() %>% 
  mutate(X9.23=X9.23 %in% RRs, X22.46=X22.46 %in% RRs, X10.45=X10.45 %in% RRs, X26.44=X26.44 %in% RRs) %>%
  gather(position, RR, X9.23, X22.46, X10.45, X26.44, -isotype, -quality) %>%
  group_by(isotype, position) %>%
  summarize(RR=round(mean(RR), digits=1)) %>%
  spread(isotype, RR)

Unnamed: 0,position,Ala,Arg,Asn,Asp,Cys,Gln,Glu,Gly,His,Ile,iMet,Leu,Lys,Met,Phe,Pro,Ser,Thr,Trp,Tyr,Val
1,X10.45,1.0,1.0,1.0,1.0,1.0,1.0,0.9,0.8,1.0,1.0,0.9,0,0.9,1.0,1,1,0.1,1,1.0,1,0.9
2,X22.46,0.9,1.0,1.0,1.0,1.0,0.8,0.8,0.9,1.0,1.0,1.0,1,1.0,1.0,1,0,0.9,1,1.0,1,0.0
3,X26.44,1.0,1.0,1.0,0.1,0.9,0.0,0.3,0.1,0.2,1.0,1.0,0,0.8,1.0,1,0,0.0,1,0.8,1,1.0
4,X9.23,0.9,0.5,0.3,0.2,0.9,0.1,0.7,0.7,0.2,0.8,0.0,1,0.9,0.2,1,1,0.9,1,0.2,1,0.2


#### Are non-purine-purine interactions enriched for some other type of interaction?

In [188]:
identities %>% select(isotype, quality, X9.23, X22.46, X10.45, X26.44) %>% 
  filter(quality) %>%
  gather(position, identity, X9.23, X22.46, X10.45, X26.44, -isotype, -quality) %>%
  filter(!(identity %in% RRs)) %>%
  group_by(position, identity) %>%
  summarize(count=n()) %>%
  filter(count > 50) %>%
  spread(identity, count, fill=0)


: attributes are not identical across measure variables; they will be dropped

Unnamed: 0,position,A:C,A:U,C:-,C:A,C:G,C:U,G:-,G:C,G:U,U:-,U:A,U:C,U:G,U:U
1,X10.45,0,0,287,0,0,0,6213,455,572,399,0,0,55,0
2,X22.46,0,64,0,123,60,0,0,0,0,0,1081,0,5728,70
3,X26.44,150,1348,0,1131,461,94,0,660,6850,0,4142,3415,420,644
4,X9.23,2869,1872,0,0,176,642,0,9861,134,0,54,0,54,0


**Conclusion 2**: Tertiary interactions within this "tRNA core region" along the D stem are enriched in purine-purine interactions, though the extent varies by isotype and position. Meanwhile, the interactions that are _not_ purine-purine are also isotype, position, and clade-specific.