# Analysing distributions of closely-related virus pairs

## Import modules/Setup inputs 

In [1]:
from __future__ import division
from os.path import expanduser as eu
from os import listdir as ls
import re 
import sys 

class params:
    """Settings shared by the parsing cells of this notebook."""

    # 1 -> keep only the same-passage outputs (files whose name lacks 'SP0') and
    # write 'pair_distributions_same-pas.csv'; any other value -> skip files
    # whose name contains 'SP1' and write 'pair_distributions.csv'
    same_passage_binary = 1

    # minimum / maximum child age thresholds to analyse
    child_min_age_limits = [0]
    child_max_age_limits = [5, 7, 10, 12, 15]

    # minimum / maximum adult age thresholds to analyse
    adult_min_age_limits = [20, 25, 30, 35]
    adult_max_age_limits = [120]

## Retrieve data from pairs_distribution outputs for all age thresholds analysed

## Generate CSV file as R input

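The cell below collects the pairs-distribution outputs of all four subtypes (A/H3N2, A/H1N1pdm09, B/Victoria, B/Yamagata) in a single pass and writes them to `pair_distributions.csv`, or to `pair_distributions_same-pas.csv` when `same_passage_binary` is 1.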

In [2]:
# output filename
if params.same_passage_binary == 1:
    outfname = 'pair_distributions_same-pas.csv'
else:
    outfname = 'pair_distributions.csv'

# One input directory per subtype under ./files; the order is kept so that
# `files` is built exactly as before.
SUBTYPES = ('H3N2', 'H1N1pdm09', 'BVic', 'BYam')
files = []
for subtype_dir in SUBTYPES:
    files.extend(ls(eu('./files/{}'.format(subtype_dir))))

# Patterns applied to file names.
PD_FILE_RE = re.compile(r'^.+pairs-distribution.+\.txt$')
CHILD_AGE_RE = re.compile(r'(\d+)C(\d+)')   # <childmin>C<childmax>
ADULT_AGE_RE = re.compile(r'(\d+)A(\d+)')   # <adultmin>A<adultmax>
SUBTYPE_RE = re.compile(r'(H3N2|H1N1pdm09|BVic|BYam)')

# Patterns applied to file contents.
# Pair-frequency row: substitution count (0-5) followed by the CA, AC, CC and
# AA pair counts, tab separated.
PAIR_ROW_RE = re.compile(r'(0|1|2|3|4|5)\t(\d+)\t(\d+)\t(\d+)\t(\d+)')
WHOLE_HEADER_RE = re.compile(
    r'Association analyses between pair-type \(end-in-adult v\. end-in-child\) '
    r'and propensity for substitution\(s\) - whole HA')
RBS_HEADER_RE = re.compile(
    r'Association analyses between pair-type \(end-in-adult v\. end-in-child\) '
    r'and propensity for substitution\(s\) - RBS/AS')
# The optional exponent keeps values such as 2.3e-10 from being truncated to
# their mantissa ("2.3"), which an unanchored \d+\.\d+ would capture.
NUMBER_PATTERN = r'\d+\.\d+(?:[eE][-+]?\d+)?'
# 'InF' / 'NaN' are matched exactly as spelled here -- presumably the tokens
# written by the upstream script; TODO confirm.
OR_ROW_RE = re.compile(
    r'^OR\t(' + NUMBER_PATTERN + r'|InF|NaN)\tp-value\t(' + NUMBER_PATTERN + r')')

# age_key -> {substitution count: (CA, AC, CC, AA) pair counts}
agekey_to_pd = {}
# age_key -> {'CA'|'AC'|'CC'|'AA': pair count summed over all substitution counts}
agekey_to_pt_totcount = {}
# age_key -> {'whole': (OR, p-value), 'rbas': (OR, p-value)}, values kept as strings
agekey_to_pd_stats = {}

for fname in files:
    if not PD_FILE_RE.search(fname):
        continue

    # same-passage analysis skips the 'SP0' outputs, otherwise skip the 'SP1' ones
    unwanted_tag = 'SP0' if params.same_passage_binary == 1 else 'SP1'
    if unwanted_tag in fname:
        continue

    # age thresholds and subtype are encoded in the file name; files that lack
    # any of them are skipped instead of raising AttributeError on a failed match
    child_match = CHILD_AGE_RE.search(fname)
    adult_match = ADULT_AGE_RE.search(fname)
    subtype_match = SUBTYPE_RE.search(fname)
    if child_match is None or adult_match is None or subtype_match is None:
        continue

    childmin, childmax = child_match.group(1, 2)
    adultmin, adultmax = adult_match.group(1, 2)
    if (int(childmin) not in params.child_min_age_limits
            or int(childmax) not in params.child_max_age_limits
            or int(adultmin) not in params.adult_min_age_limits
            or int(adultmax) not in params.adult_max_age_limits):
        continue

    subtype = subtype_match.group()
    age_key = (childmin, childmax, adultmin, adultmax, subtype)

    # parser state for this file
    in_pair_table = True      # still reading the pair-frequency rows (0..5)
    expect_whole_or = False   # whole-HA header seen, OR line not yet read
    expect_rbs_or = False     # RBS/AS header seen, OR line not yet read

    # NOTE: 'rU' is Python 2 universal-newline mode (this notebook targets
    # Python 2); Python 3.11+ rejects it and needs plain 'r'.
    with open(eu('./files/{}/{}'.format(subtype, fname)), 'rU') as fhandle:
        for line in fhandle:
            if in_pair_table:
                # grab pair frequencies for each pair type and substitution count
                row = PAIR_ROW_RE.search(line)
                if row is not None:
                    subno, ca, ac, cc, aa = (int(value) for value in row.groups())

                    totals = agekey_to_pt_totcount.setdefault(
                        age_key, {'CA': 0, 'AC': 0, 'CC': 0, 'AA': 0})
                    totals['CA'] += ca
                    totals['AC'] += ac
                    totals['CC'] += cc
                    totals['AA'] += aa

                    agekey_to_pd.setdefault(age_key, {})[subno] = (ca, ac, cc, aa)

                    # the row for 5 substitutions closes the table
                    if subno == 5:
                        in_pair_table = False
                    continue
            # section headers are only looked for once the pair table is complete
            elif WHOLE_HEADER_RE.search(line):
                expect_whole_or = True
                continue
            elif RBS_HEADER_RE.search(line):
                expect_rbs_or = True
                continue

            if not (expect_whole_or or expect_rbs_or):
                continue

            # grab association analysis results
            stat = OR_ROW_RE.search(line)
            if stat is None:
                continue
            OR, pval = stat.group(1, 2)
            if expect_whole_or:
                agekey_to_pd_stats[age_key] = {'whole': (OR, pval)}
                expect_whole_or = False
            else:
                # setdefault: keep the RBS/AS result even if no whole-HA result
                # was recorded, rather than dropping it on a swallowed KeyError
                agekey_to_pd_stats.setdefault(age_key, {})['rbas'] = (OR, pval)
                expect_rbs_or = False

In [3]:
# Write one CSV row per (age thresholds, subtype, substitution count, pair type).
# `prop` is the pair count divided by the total count of that pair type for the
# same age key; the association statistics are repeated on every row of the key.
PAIR_TYPES = ('CA', 'AC', 'CC', 'AA')  # order of the counts stored in agekey_to_pd

with open(outfname, 'w') as output:
    output.write('childmin,childmax,adultmin,adultmax,sub_no,prop,pt,OR_whole,pval_whole,OR_rbas,pval_rbas,subtype\n')
    for age_key in sorted(agekey_to_pd):
        childmin, childmax, adultmin, adultmax, subtype = age_key

        # look the statistics up once per key and fail with a message that
        # names the offending key
        stats = agekey_to_pd_stats.get(age_key, {})
        if 'whole' not in stats or 'rbas' not in stats:
            raise KeyError('association statistics missing for {}'.format(age_key))
        or_whole, pval_whole = stats['whole']
        or_rbas, pval_rbas = stats['rbas']

        totals = agekey_to_pt_totcount[age_key]
        for subno in sorted(agekey_to_pd[age_key]):
            for pt, count in zip(PAIR_TYPES, agekey_to_pd[age_key][subno]):
                try:
                    # float() keeps this a true division even without the
                    # __future__ import of the first cell
                    proportion = float(count) / totals[pt]
                except ZeroDivisionError:
                    # no pair of this type was observed for this age key
                    proportion = 'NaN'
                fields = [childmin, childmax, adultmin, adultmax, subno, proportion, pt,
                          or_whole, pval_whole, or_rbas, pval_rbas, subtype]
                output.write('{}\n'.format(','.join(str(field) for field in fields)))

## R code to generate lattice plots 

In [3]:
# load modules
require(ggplot2)
require(lattice)
require(latticeExtra)
library(RColorBrewer)
require(gridExtra)

# Load the CSV written by the Python cells; which file is read depends on
# whether the same-passage subset was analysed.
same_passage_binary <- 1
input_csv <- if (same_passage_binary == 1) "pair_distributions_same-pas.csv" else "pair_distributions.csv"
data <- read.csv(input_csv)
data

childmin,childmax,adultmin,adultmax,sub_no,prop,pt,OR_whole,pval_whole,OR_rbas,pval_rbas,subtype
0,10,20,120,0,0.544378698,CA,1.064278,0.7213860,0.7146455,0.1259215,BVic
0,10,20,120,0,0.577142857,AC,1.064278,0.7213860,0.7146455,0.1259215,BVic
0,10,20,120,0,0.472803347,CC,1.064278,0.7213860,0.7146455,0.1259215,BVic
0,10,20,120,0,0.465686275,AA,1.064278,0.7213860,0.7146455,0.1259215,BVic
0,10,20,120,1,0.378698225,CA,1.064278,0.7213860,0.7146455,0.1259215,BVic
0,10,20,120,1,0.320000000,AC,1.064278,0.7213860,0.7146455,0.1259215,BVic
0,10,20,120,1,0.384937238,CC,1.064278,0.7213860,0.7146455,0.1259215,BVic
0,10,20,120,1,0.382352941,AA,1.064278,0.7213860,0.7146455,0.1259215,BVic
0,10,20,120,2,0.053254438,CA,1.064278,0.7213860,0.7146455,0.1259215,BVic
0,10,20,120,2,0.085714286,AC,1.064278,0.7213860,0.7146455,0.1259215,BVic


## Build lattice plots for age sensitivity analyses 

In [4]:
# Colour palette: 10 colours from the RColorBrewer "RdYlBu" scheme, passed as
# `col.regions` to levelplot() in generate_lattice_plot().
z_col <- brewer.pal(10, "RdYlBu")
# 11 break points (passed as `at`) delimiting the 10 colour bins: steps of 0.01
# up to 0.05, then coarser bins up to 1.0.
z_break <- c(0,0.01,0.02,0.03,0.04,0.05,0.2,0.4,0.6,0.8,1.0)

# generate lattice plot 
# Build a lattice heat map of association p-values for one subtype.
#
# Arguments:
#   st         subtype code as stored in data$subtype (e.g. 'H3N2')
#   st_label   title printed above the panel
#   RASBINARY  truthy (1): use the RBS/AS columns (pval_rbas, OR_rbas);
#              falsy (0):  use the whole-HA columns (pval_whole, OR_whole)
#
# Reads the globals `data`, `z_col` and `z_break`.
#
# Returns a trellis object with one cell per (childmax, adultmin) pair,
# coloured by p-value; cells with p < 0.05 also show the odds ratio rounded to
# two decimals.
generate_lattice_plot <- function(st, st_label, RASBINARY){
  # columns holding the statistics to plot
  pval_col <- if (RASBINARY) "pval_rbas" else "pval_whole"
  or_col <- if (RASBINARY) "OR_rbas" else "OR_whole"

  # filter data for subtype; `data` repeats the statistics on every
  # sub_no x pt row, so keep a single row per grid cell to avoid drawing each
  # cell (and its text) many times over
  fildat <- subset(data, subtype==st)
  fildat <- unique(data.frame(childmax=fildat$childmax,
                              adultmin=fildat$adultmin,
                              PVAL=fildat[[pval_col]],
                              ODDS=fildat[[or_col]]))

  # X-axis (max child age): map the distinct ages to evenly spaced positions
  # 1..k so that unevenly spaced thresholds give equally wide cells
  dictX <- data.frame(key=sort(unique(fildat$childmax)))
  dictX$value <- seq_along(dictX$key)
  fildat$sortX <- dictX$value[match(fildat$childmax, dictX$key)]

  # odds ratio is printed only in significant cells; a missing p-value leaves
  # the cell blank
  significant <- !is.na(fildat$PVAL) & fildat$PVAL < 0.05
  fildat$CELLVALUE <- ifelse(significant, format(round(fildat$ODDS, 2), nsmall=2), '')

  latplot <- levelplot(PVAL ~ sortX*adultmin, data=fildat,
                       # tick positions and labels come from the data, so they
                       # stay correct whatever child age thresholds are present
                       scales=list(x=list(at=dictX$value, labels=as.character(dictX$key), cex=1.5), y=list(cex=1.5)),
                       xlab = list(label="Maximum children age (Y)", cex=1.8), ylab = list(label="Minimum adult age (Y)", cex=1.8),
                       col.regions = z_col,
                       border = 'black',
                       at=z_break,
                       main=list(label=st_label, cex=1.8),
                       colorkey = list(labels=list(at=c(0,0.05, 0.2, 0.4, 0.6, 0.8, 1.0), cex=1)),
                       legend=list(top=list(fun=grid::textGrob("p-values", y=-0.3, x=1.125))))

  # overlay the odds-ratio text on the cells
  latplot <- latplot + layer(panel.text(sortX, adultmin, CELLVALUE, cex=1.65), data=fildat)
  return (latplot)
}

# Output file names; the same-passage analysis gets its own pair of files
fname_whole <- if (same_passage_binary == 1) {
    "age_sensitivity_analysis_wholeHA_same-passage.png"
} else {
    "age_sensitivity_analysis_wholeHA.png"
}
fname_rbas <- if (same_passage_binary == 1) {
    "age_sensitivity_analysis_RBAS_same-passage.png"
} else {
    "age_sensitivity_analysis_RBAS.png"
}

# Panels based on the whole HA protein, one per subtype
H3plot <- generate_lattice_plot(st='H3N2', st_label='A/H3N2', RASBINARY=0)
H1plot <- generate_lattice_plot(st='H1N1pdm09', st_label='A/H1N1pdm09', RASBINARY=0)
BVicplot <- generate_lattice_plot(st='BVic', st_label='B/Victoria', RASBINARY=0)
BYamplot <- generate_lattice_plot(st='BYam', st_label='B/Yamagata', RASBINARY=0)

# Four panels side by side in one PNG
png(filename=fname_whole, width=24, height=5, units="in", res=600)
grid.arrange(H3plot, H1plot, BVicplot, BYamplot, ncol=4)
dev.off()

# Same panels restricted to the RBS/AS columns
H3plot <- generate_lattice_plot(st='H3N2', st_label='A/H3N2', RASBINARY=1)
H1plot <- generate_lattice_plot(st='H1N1pdm09', st_label='A/H1N1pdm09', RASBINARY=1)
BVicplot <- generate_lattice_plot(st='BVic', st_label='B/Victoria', RASBINARY=1)
BYamplot <- generate_lattice_plot(st='BYam', st_label='B/Yamagata', RASBINARY=1)

png(filename=fname_rbas, width=24, height=5, units="in", res=600)
grid.arrange(H3plot, H1plot, BVicplot, BYamplot, ncol=4)
dev.off()

### Age sensitivity analysis (across whole HA protein)
![](./age_sensitivity_analysis_wholeHA_same-passage.png)

### Age sensitivity analysis (canonical receptor-binding/antigenic sites only)
![](./age_sensitivity_analysis_RBAS_same-passage.png)

## Pair distribution stratified by number of amino acid substitutions
From sensitivity lattice plots, the age thresholds with the most significant association with observation of amino acid substitution(s) are: 

* A/H3N2 - Child (C) <= 5, Adult (A) >= 35\*
* A/H1N1pdm09 - C <= 5, A >= 25 (based on whole HA protein)
* B/Vic - C <= 15, A >= 20 (based on RBS/AS only)
* B/Yam - C <= 5, A >= 35\*

\*No significant association was observed for these subtypes, so we use the largest age difference between children and adults.

In [5]:
require(scales)

# Plot, for the age thresholds chosen per subtype, the proportion of pairs of
# each pair type against the number of amino acid substitutions, one facet per
# subtype, and save the figure as a PNG.

# filter data: one (childmax, adultmin) combination per subtype.
# H3N2 uses adultmin 35, as stated in the threshold list preceding this cell
# (largest child/adult age difference when no significance is observed) and as
# already done for B/Yamagata; the previous value of 30 contradicted both.
fildat <- subset(data, ((subtype=="H3N2" & childmax==5 & adultmin==35)|
                        (subtype=="H1N1pdm09" & childmax==5 & adultmin==25)|
                        (subtype=="BVic" & childmax==15 & adultmin==20)|
                        (subtype=="BYam" & childmax==5 & adultmin==35)))

# change labels: display names for the facet strips, in plotting order
fildat$subtype <- factor(fildat$subtype, levels=c('H3N2', 'H1N1pdm09', 'BVic', 'BYam'), labels=c("A/H3N2", "A/H1N1pdm09", "B/Victoria", "B/Yamagata"))

# change levels: fixes the legend order and the colour assigned to each pair type
fildat$pt <- factor(fildat$pt, levels=c('CA','AA','AC','CC'))

# base plot
p <- ggplot(fildat, aes(x=sub_no, y=prop, color=pt)) + geom_point(size=1) + geom_line(size=0.5)

# legend color, matched to the pt levels above (CA, AA, AC, CC)
p <- p + scale_color_manual(values=c('#d7191c', '#fdae61', '#abd9e9', '#2c7bb6'))

# axes labels
p <- p + labs(x="No. of amino acid substitutions", y="Proportion of pairs", color="Pair type")

# clear grid and background (including legend)
p <- p + theme(panel.grid.major = element_line(colour = "gray95"), panel.grid.minor = element_blank(),
               panel.background = element_blank(), axis.line = element_line(colour = "black"),
               legend.key=element_blank(),
               text = element_text(size=7))

# tick marks: y axis shown as percentages from 0% to 80% in steps of 20%
y_min <- 0
y_max <- 0.8
p <- p + scale_y_continuous(breaks = seq(y_min, y_max,by=0.2), limits=c(y_min, y_max), labels = percent)

# filename
if (same_passage_binary == 1){
    fname <- "pair_distribution_stratified_by_aa_sub_same-pas.png"
}else{
    fname <- "pair_distribution_stratified_by_aa_sub.png"
}

# facet by subtype and write the figure to the PNG device
png(file=fname, height=2, width=7, units = "in", res=600)
p + facet_grid(~subtype, labeller=label_parsed)
dev.off()

Loading required package: scales


![](./pair_distribution_stratified_by_aa_sub_same-pas.png)