In [1]:
import os

# Locations of the external command-line tools used by the shell cells below.
# Each default is the original install location; export an environment variable
# with the same name before starting the kernel to point at another installation.
FASTQC_PATH = os.environ.get("FASTQC_PATH", "/home/analytics/distr/FastQC/fastqc")
TRIMMOMATIC_JAR = os.environ.get("TRIMMOMATIC_JAR", "/home/analytics/bin/trimmomatic.jar")
BOWTIE_DIR = os.environ.get("BOWTIE_DIR", "/home/analytics/distr/bowtie2-2.3.4.3-linux-x86_64/")
SAMTOOLS_DIR = os.environ.get("SAMTOOLS_DIR", "/home/analytics/distr/samtools-1.9/build/bin")
VARSCAN_DIR = os.environ.get("VARSCAN_DIR", "/home/analytics/distr/varscan/")

In [2]:
# Download the gzipped FASTQ for run SRR1705851 into ../data/week2/ and decompress it.
# wget's stderr is discarded, so a failed download produces no message here.
!wget -P ../data/week2/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR170/001/SRR1705851/SRR1705851.fastq.gz 2> /dev/null
!gunzip ../data/week2/SRR1705851.fastq.gz

In [3]:
# Download the reference sequence KF848938.1 (FASTA) into ../data/week2/; wget's stderr is discarded.
!wget -P ../data/week2/ http://public.dobzhanskycenter.ru/mrayko/Week2/KF848938.1.fasta 2> /dev/null

In [4]:
# Run FastQC on the raw reads (stderr discarded). The report files
# SRR1705851_fastqc.html / .zip are written next to the input FASTQ.
!$FASTQC_PATH ../data/week2/SRR1705851.fastq 2> /dev/null

Analysis complete for SRR1705851.fastq


In [5]:
from IPython.display import IFrame

In [6]:
from boto.s3 import connect_to_region
from boto import connect_s3
from boto.s3.key import Key

In [7]:
# Base URL of the static-website endpoint that serves the uploaded files.
AWS_S3_ROOT = "http://bioinf-workshop.s3-website.eu-central-1.amazonaws.com/"


def _read_credential(path):
    """Return the content of a credential file with surrounding whitespace removed."""
    # `with` guarantees the handle is closed even if reading fails.
    with open(path, "r") as credential_file:
        return credential_file.read().strip()


def perform_file_upload(file_, file_name):
    """Upload an HTML document to the ``bioinf-workshop`` S3 bucket.

    Parameters
    ----------
    file_ : str
        Content of the document; it is stored with ``Content-Type: text/html``.
    file_name : str
        Object key inside the bucket (e.g. ``"week_2/fastqc_1.html"``).

    Returns
    -------
    str
        ``AWS_S3_ROOT + file_name``, the public URL of the uploaded object.

    The AWS credentials are read from ``../.access_key`` and ``../.secret_key``
    (paths relative to the notebook's working directory) on every call.
    """
    conn = connect_to_region("eu-central-1",
                             aws_access_key_id=_read_credential("../.access_key"),
                             aws_secret_access_key=_read_credential("../.secret_key"))

    bucket = conn.get_bucket("bioinf-workshop")
    key = Key(bucket)
    key.key = file_name
    key.set_contents_from_string(file_, headers={"Content-Type": "text/html"})
    # Make the object world-readable so it can be embedded through its public URL.
    key.set_acl('public-read')
    return AWS_S3_ROOT + file_name # public url to access the uploaded file

In [8]:
# Publish the FastQC report for the raw reads and embed the hosted copy.
# The report is read inside `with` so the file handle is closed right away.
with open("../data/week2/SRR1705851_fastqc.html", "r") as report:
    f = perform_file_upload(report.read(), "week_2/fastqc_1.html")
IFrame(src=f, width=1000, height=600)

In [9]:
# List the working directory to check which input and report files are present.
!ls ../data/week2/

KF848938.1.fasta  SRR1705851_fastqc.html
SRR1705851.fastq  SRR1705851_fastqc.zip


In [10]:
# Trim the raw reads with Trimmomatic in single-end (SE) mode, Phred+33 qualities.
# The trimming steps are listed on the command line in this order:
# HEADCROP:17, CROP:128, SLIDINGWINDOW:4:25, MINLEN:30.
# Input: SRR1705851.fastq; output: trimmed.fastq; log: trim.log.
!java -jar $TRIMMOMATIC_JAR SE \
    -phred33 \
    -trimlog ../data/week2/trim.log \
    ../data/week2/SRR1705851.fastq \
    ../data/week2/trimmed.fastq \
    HEADCROP:17 \
    CROP:128 \
    SLIDINGWINDOW:4:25 \
    MINLEN:30

TrimmomaticSE: Started with arguments: -phred33 -trimlog ../data/week2/trim.log ../data/week2/SRR1705851.fastq ../data/week2/trimmed.fastq HEADCROP:17 CROP:128 SLIDINGWINDOW:4:25 MINLEN:30
Automatically using 16 threads
Input Reads: 358265 Surviving: 340185 (94.95%) Dropped: 18080 (5.05%)
TrimmomaticSE: Completed successfully


In [11]:
# Re-run FastQC on the trimmed reads (stderr discarded) to check the effect of trimming.
!$FASTQC_PATH ../data/week2/trimmed.fastq 2> /dev/null

Analysis complete for trimmed.fastq


In [12]:
# Publish the FastQC report for the trimmed reads and embed the hosted copy.
# The report is read inside `with` so the file handle is closed right away.
with open("../data/week2/trimmed_fastqc.html", "r") as report:
    f = perform_file_upload(report.read(), "week_2/fastqc_2.html")
IFrame(src=f, width=1000, height=600)

In [13]:
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Collect nucleotide sequences that follow a "Sequence" table header.

    Once a text node containing "Sequence" is seen directly after a ``<th>``
    start tag, collection is switched on for the rest of the document: every
    later text node made up solely of the letters A, C, G and T is appended
    to ``self.sequences``.
    """

    # Alphabet a text node must be limited to in order to count as a sequence.
    NUCLEOTIDES = frozenset("ACGT")

    def __init__(self):
        super().__init__()
        self.p = False          # True once the "Sequence" header cell has been seen
        self.sequences = []     # collected sequences, in document order
        # Initialised here so handle_data() is safe to call before any start tag.
        self.last_tag = None
        self.last_attrs = []

    def handle_starttag(self, tag, attrs):
        """Remember the most recently opened tag and its attributes."""
        self.last_tag = tag
        self.last_attrs = attrs

    def handle_data(self, data):
        """Switch collection on at the header cell and store pure-ACGT text nodes."""
        is_sequence_header = "Sequence" in data and self.last_tag == "th"
        if is_sequence_header or self.p:
            self.p = True
            # Non-empty and composed only of A/C/G/T characters.
            if data and set(data) <= self.NUCLEOTIDES:
                self.sequences.append(data)

# Parse the FastQC report of the raw reads; the collected sequences end up in
# parser.sequences. The file is read inside `with` so the handle is closed.
parser = MyHTMLParser()
with open("../data/week2/SRR1705851_fastqc.html", "r") as report:
    parser.feed(report.read())

In [14]:
# Write the collected sequences as a FASTA file, numbering records from 1.
# The identifier follows ">" immediately (no space), so that tools which take
# the ID directly after ">" do not see an empty identifier.
with open("../data/week2/overrep_seqs.fa", "w") as f:
    for line_i, line in enumerate(parser.sequences, start=1):
        f.write(">sequence#{}\n".format(line_i))
        f.write(line + "\n")

In [15]:
# Build a Bowtie2 index of the reference. The index files are written with the
# prefix ../data/KF848938.1.idx (i.e. in ../data/, not ../data/week2/).
!$BOWTIE_DIR/bowtie2-build ../data/week2/KF848938.1.fasta ../data/KF848938.1.idx

Settings:
  Output files: "../data/KF848938.1.idx.*.bt2"
  Line rate: 6 (line is 64 bytes)
  Lines per side: 1 (side is 64 bytes)
  Offset rate: 4 (one in 16)
  FTable chars: 10
  Strings: unpacked
  Max bucket size: default
  Max bucket size, sqrt multiplier: default
  Max bucket size, len divisor: 4
  Difference-cover sample period: 1024
  Endianness: little
  Actual local endianness: little
  Sanity checking: disabled
  Assertions: disabled
  Random seed: 0
  Sizeofs: void*:8, int:4, long:8, size_t:8
Input files DNA, FASTA:
  ../data/week2/KF848938.1.fasta
Building a SMALL index
Reading reference sizes
  Time reading reference sizes: 00:00:00
Calculating joined length
Writing header
Reserving space for joined string
Joining reference sequences
  Time to join reference sequences: 00:00:00
bmax according to bmaxDivN setting: 416
Using parameters --bmax 312 --dcv 1024
  Doing ahead-of-time memory usage test
  Passed!  Constructing with these parameters: --bmax 312 --dcv 1024
Constructi

In [16]:
# Align the trimmed unpaired reads (-U) to the reference index (-x) using 20 threads (-p);
# the SAM output on stdout is redirected to alignment.sam.
!$BOWTIE_DIR/bowtie2 -x ../data/KF848938.1.idx \
    -U ../data/week2/trimmed.fastq \
    -p 20 > ../data/week2/alignment.sam

340185 reads; of these:
  340185 (100.00%) were unpaired; of these:
    12190 (3.58%) aligned 0 times
    327995 (96.42%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
96.42% overall alignment rate


In [18]:
# Convert the SAM alignment to BAM.
!$SAMTOOLS_DIR/samtools view -S -b ../data/week2/alignment.sam > ../data/week2/alignment.bam

# Sort the BAM and write the result to alignment_sorted.bam.
!$SAMTOOLS_DIR/samtools sort \
    ../data/week2/alignment.bam \
    -o ../data/week2/alignment_sorted.bam
    
# Index the sorted BAM.
!$SAMTOOLS_DIR/samtools index ../data/week2/alignment_sorted.bam

# Print mapping statistics (computed on the unsorted BAM).
!$SAMTOOLS_DIR/samtools flagstat ../data/week2/alignment.bam

340185 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
327995 + 0 mapped (96.42% : N/A)
0 + 0 paired in sequencing
0 + 0 read1
0 + 0 read2
0 + 0 properly paired (N/A : N/A)
0 + 0 with itself and mate mapped
0 + 0 singletons (N/A : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)


In [19]:
# Build a pileup of the sorted alignment against the reference FASTA (-f).
# -d 100000 sets the maximum per-position depth, far above samtools' default,
# which matters here because the expected coverage is very deep.
!$SAMTOOLS_DIR/samtools mpileup \
    -f ../data/week2/KF848938.1.fasta \
    -d 100000 \
    ../data/week2/alignment_sorted.bam >  ../data/week2/alignment.mpileup

[mpileup] 1 samples in 1 input files


In [20]:
# Call SNPs from the pileup with VarScan (mpileup2snp): minimum variant
# frequency 0.001 (0.1%), variant positions only (--variants), VCF output
# redirected to VarScan_results.vcf.
!java -jar $VARSCAN_DIR/VarScan.v2.4.0.jar \
    mpileup2snp ../data/week2/alignment.mpileup --min-var-freq .001 \
    --variants \
    --output-vcf 1 > ../data/week2/VarScan_results.vcf

Only SNPs will be reported
Min coverage:	8
Min reads2:	2
Min var freq:	0.001
Min avg qual:	15
P-value thresh:	0.01
Reading input from ../data/week2/alignment.mpileup
1665 bases in pileup file
16 variant positions (10 SNP, 6 indel)
0 were failed by the strand-filter
10 variant positions reported (10 SNP, 0 indel)


In [21]:
# Print the VCF produced by VarScan for the sample.
!cat ../data/week2/VarScan_results.vcf

##fileformat=VCFv4.1
##source=VarScan2
##INFO=<ID=ADP,Number=1,Type=Integer,Description="Average per-sample depth of bases with Phred score >= 15">
##INFO=<ID=WT,Number=1,Type=Integer,Description="Number of samples called reference (wild-type)">
##INFO=<ID=HET,Number=1,Type=Integer,Description="Number of samples called heterozygous-variant">
##INFO=<ID=HOM,Number=1,Type=Integer,Description="Number of samples called homozygous-variant">
##INFO=<ID=NC,Number=1,Type=Integer,Description="Number of samples not called">
##FILTER=<ID=str10,Description="Less than 10% or more than 90% of variant supporting reads on one strand">
##FILTER=<ID=indelError,Description="Likely artifact due to indel reads at this position">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=SDP,Number=1,Type=Integer,Description="Raw Read Depth as reported by SAMtools">
##FORMAT=<ID=DP,Number=1,Type=Integer,Descript

In [22]:
# Download three more runs (SRR1705858, SRR1705859, SRR1705860) into ../data/week2/;
# they are processed below as rep_1, rep_2 and rep_3. wget's stderr is discarded.
!wget -P ../data/week2/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR170/008/SRR1705858/SRR1705858.fastq.gz 2> /dev/null
!wget -P ../data/week2/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR170/009/SRR1705859/SRR1705859.fastq.gz 2> /dev/null
!wget -P ../data/week2/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR170/000/SRR1705860/SRR1705860.fastq.gz 2> /dev/null

In [23]:
# Decompress the three downloaded FASTQ archives.
!gunzip ../data/week2/SRR1705858.fastq.gz
!gunzip ../data/week2/SRR1705859.fastq.gz
!gunzip ../data/week2/SRR1705860.fastq.gz

In [25]:
# Run FastQC on the raw reads of each of the three runs (stderr discarded).
!$FASTQC_PATH ../data/week2/SRR1705858.fastq 2> /dev/null
!$FASTQC_PATH ../data/week2/SRR1705859.fastq 2> /dev/null
!$FASTQC_PATH ../data/week2/SRR1705860.fastq 2> /dev/null

Analysis complete for SRR1705858.fastq
Analysis complete for SRR1705859.fastq
Analysis complete for SRR1705860.fastq


In [26]:
# Publish the raw-read FastQC report for run SRR1705858 and embed the hosted copy.
# The report is read inside `with` so the file handle is closed right away.
with open("../data/week2/SRR1705858_fastqc.html", "r") as report:
    f = perform_file_upload(report.read(), "week_2/fastqc_3_1.html")
IFrame(src=f, width=1000, height=600)

In [27]:
# Publish the raw-read FastQC report for run SRR1705859 and embed the hosted copy.
# The report is read inside `with` so the file handle is closed right away.
with open("../data/week2/SRR1705859_fastqc.html", "r") as report:
    f = perform_file_upload(report.read(), "week_2/fastqc_3_2.html")
IFrame(src=f, width=1000, height=600)

In [28]:
# Publish the raw-read FastQC report for run SRR1705860 and embed the hosted copy.
# The report is read inside `with` so the file handle is closed right away.
with open("../data/week2/SRR1705860_fastqc.html", "r") as report:
    f = perform_file_upload(report.read(), "week_2/fastqc_3_3.html")
IFrame(src=f, width=1000, height=600)

In [37]:
# Measure the reference length: read the FASTA, skip its first (header) line
# and add up the lengths of the remaining lines after stripping whitespace.
with open("../data/week2/KF848938.1.fasta", "r") as f:
    lines = f.readlines()

reference_len = sum(len(sequence_line.strip()) for sequence_line in lines[1:])
print("Reference len:", reference_len)

Reference len: 1665


Approximate coverage (number of reads × read length / reference length): $\frac{2.3 \times 10^5 \times 120}{1665} \approx 1.7 \times 10^4$

In [43]:
# Trim the three additional runs with the same Trimmomatic settings as the first
# sample (SE mode, Phred+33; HEADCROP:17, CROP:128, SLIDINGWINDOW:4:25, MINLEN:30).
# Each run writes its own -trimlog file; with a shared path every run would
# overwrite the log of the previous one.
!java -jar $TRIMMOMATIC_JAR SE \
    -phred33 \
    -trimlog ../data/week2/rep_1_trim.log \
    ../data/week2/SRR1705858.fastq \
    ../data/week2/rep_1_trimmed.fastq \
    HEADCROP:17 \
    CROP:128 \
    SLIDINGWINDOW:4:25 \
    MINLEN:30

!java -jar $TRIMMOMATIC_JAR SE \
    -phred33 \
    -trimlog ../data/week2/rep_2_trim.log \
    ../data/week2/SRR1705859.fastq \
    ../data/week2/rep_2_trimmed.fastq \
    HEADCROP:17 \
    CROP:128 \
    SLIDINGWINDOW:4:25 \
    MINLEN:30

!java -jar $TRIMMOMATIC_JAR SE \
    -phred33 \
    -trimlog ../data/week2/rep_3_trim.log \
    ../data/week2/SRR1705860.fastq \
    ../data/week2/rep_3_trimmed.fastq \
    HEADCROP:17 \
    CROP:128 \
    SLIDINGWINDOW:4:25 \
    MINLEN:30

TrimmomaticSE: Started with arguments: -phred33 -trimlog ../data/week2/trim.log ../data/week2/SRR1705858.fastq ../data/week2/rep_1_trimmed.fastq HEADCROP:17 CROP:128 SLIDINGWINDOW:4:25 MINLEN:30
Automatically using 16 threads
Input Reads: 256586 Surviving: 240872 (93.88%) Dropped: 15714 (6.12%)
TrimmomaticSE: Completed successfully
TrimmomaticSE: Started with arguments: -phred33 -trimlog ../data/week2/trim.log ../data/week2/SRR1705859.fastq ../data/week2/rep_2_trimmed.fastq HEADCROP:17 CROP:128 SLIDINGWINDOW:4:25 MINLEN:30
Automatically using 16 threads
Input Reads: 233327 Surviving: 218708 (93.73%) Dropped: 14619 (6.27%)
TrimmomaticSE: Completed successfully
TrimmomaticSE: Started with arguments: -phred33 -trimlog ../data/week2/trim.log ../data/week2/SRR1705860.fastq ../data/week2/rep_3_trimmed.fastq HEADCROP:17 CROP:128 SLIDINGWINDOW:4:25 MINLEN:30
Automatically using 16 threads
Input Reads: 249964 Surviving: 235190 (94.09%) Dropped: 14774 (5.91%)
TrimmomaticSE: Completed successfull

In [45]:
# Run FastQC on the trimmed reads of each replicate (stderr discarded).
!$FASTQC_PATH ../data/week2/rep_1_trimmed.fastq 2> /dev/null
!$FASTQC_PATH ../data/week2/rep_2_trimmed.fastq 2> /dev/null
!$FASTQC_PATH ../data/week2/rep_3_trimmed.fastq 2> /dev/null

Analysis complete for rep_1_trimmed.fastq
Analysis complete for rep_2_trimmed.fastq
Analysis complete for rep_3_trimmed.fastq


In [47]:
# Publish the FastQC report for the trimmed reads of replicate 1 and embed the hosted copy.
# The report is read inside `with` so the file handle is closed right away.
with open("../data/week2/rep_1_trimmed_fastqc.html", "r") as report:
    f = perform_file_upload(report.read(), "week_2/fastqc_4_1.html")
IFrame(src=f, width=1000, height=600)

In [48]:
# Publish the FastQC report for the trimmed reads of replicate 2 and embed the hosted copy.
# The report is read inside `with` so the file handle is closed right away.
with open("../data/week2/rep_2_trimmed_fastqc.html", "r") as report:
    f = perform_file_upload(report.read(), "week_2/fastqc_4_2.html")
IFrame(src=f, width=1000, height=600)

In [49]:
# Publish the FastQC report for the trimmed reads of replicate 3 and embed the hosted copy.
# The report is read inside `with` so the file handle is closed right away.
with open("../data/week2/rep_3_trimmed_fastqc.html", "r") as report:
    f = perform_file_upload(report.read(), "week_2/fastqc_4_3.html")
IFrame(src=f, width=1000, height=600)

In [50]:
# Align the trimmed reads of replicate 1 to the reference index (unpaired, 20 threads);
# SAM output is redirected to rep_1_alignment.sam.
!$BOWTIE_DIR/bowtie2 -x ../data/KF848938.1.idx \
    -U ../data/week2/rep_1_trimmed.fastq \
    -p 20 > ../data/week2/rep_1_alignment.sam

240872 reads; of these:
  240872 (100.00%) were unpaired; of these:
    5615 (2.33%) aligned 0 times
    235257 (97.67%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
97.67% overall alignment rate


In [51]:
# Align the trimmed reads of replicate 2 to the reference index (unpaired, 20 threads);
# SAM output is redirected to rep_2_alignment.sam.
!$BOWTIE_DIR/bowtie2 -x ../data/KF848938.1.idx \
    -U ../data/week2/rep_2_trimmed.fastq \
    -p 20 > ../data/week2/rep_2_alignment.sam

218708 reads; of these:
  218708 (100.00%) were unpaired; of these:
    5041 (2.30%) aligned 0 times
    213667 (97.70%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
97.70% overall alignment rate


In [52]:
# Align the trimmed reads of replicate 3 to the reference index (unpaired, 20 threads);
# SAM output is redirected to rep_3_alignment.sam.
!$BOWTIE_DIR/bowtie2 -x ../data/KF848938.1.idx \
    -U ../data/week2/rep_3_trimmed.fastq \
    -p 20 > ../data/week2/rep_3_alignment.sam

235190 reads; of these:
  235190 (100.00%) were unpaired; of these:
    5845 (2.49%) aligned 0 times
    229345 (97.51%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
97.51% overall alignment rate


In [53]:
# Replicate 1: convert the SAM alignment to BAM.
!$SAMTOOLS_DIR/samtools view -S -b ../data/week2/rep_1_alignment.sam > ../data/week2/rep_1_alignment.bam

# Sort the BAM and write the result to rep_1_alignment_sorted.bam.
!$SAMTOOLS_DIR/samtools sort \
    ../data/week2/rep_1_alignment.bam \
    -o ../data/week2/rep_1_alignment_sorted.bam
    
# Index the sorted BAM.
!$SAMTOOLS_DIR/samtools index ../data/week2/rep_1_alignment_sorted.bam

# Print mapping statistics (computed on the unsorted BAM).
!$SAMTOOLS_DIR/samtools flagstat ../data/week2/rep_1_alignment.bam

240872 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
235257 + 0 mapped (97.67% : N/A)
0 + 0 paired in sequencing
0 + 0 read1
0 + 0 read2
0 + 0 properly paired (N/A : N/A)
0 + 0 with itself and mate mapped
0 + 0 singletons (N/A : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)


In [54]:
# Replicate 2: convert the SAM alignment to BAM.
!$SAMTOOLS_DIR/samtools view -S -b ../data/week2/rep_2_alignment.sam > ../data/week2/rep_2_alignment.bam

# Sort the BAM and write the result to rep_2_alignment_sorted.bam.
!$SAMTOOLS_DIR/samtools sort \
    ../data/week2/rep_2_alignment.bam \
    -o ../data/week2/rep_2_alignment_sorted.bam
    
# Index the sorted BAM.
!$SAMTOOLS_DIR/samtools index ../data/week2/rep_2_alignment_sorted.bam

# Print mapping statistics (computed on the unsorted BAM).
!$SAMTOOLS_DIR/samtools flagstat ../data/week2/rep_2_alignment.bam

218708 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
213667 + 0 mapped (97.70% : N/A)
0 + 0 paired in sequencing
0 + 0 read1
0 + 0 read2
0 + 0 properly paired (N/A : N/A)
0 + 0 with itself and mate mapped
0 + 0 singletons (N/A : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)


In [55]:
# Replicate 3: convert the SAM alignment to BAM.
!$SAMTOOLS_DIR/samtools view -S -b ../data/week2/rep_3_alignment.sam > ../data/week2/rep_3_alignment.bam

# Sort the BAM and write the result to rep_3_alignment_sorted.bam.
!$SAMTOOLS_DIR/samtools sort \
    ../data/week2/rep_3_alignment.bam \
    -o ../data/week2/rep_3_alignment_sorted.bam
    
# Index the sorted BAM.
!$SAMTOOLS_DIR/samtools index ../data/week2/rep_3_alignment_sorted.bam

# Print mapping statistics (computed on the unsorted BAM).
!$SAMTOOLS_DIR/samtools flagstat ../data/week2/rep_3_alignment.bam

235190 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
229345 + 0 mapped (97.51% : N/A)
0 + 0 paired in sequencing
0 + 0 read1
0 + 0 read2
0 + 0 properly paired (N/A : N/A)
0 + 0 with itself and mate mapped
0 + 0 singletons (N/A : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)


In [56]:
# Replicate 1: build a pileup of the sorted alignment against the reference FASTA (-f),
# with the maximum per-position depth set to 100000 (-d).
!$SAMTOOLS_DIR/samtools mpileup \
    -f ../data/week2/KF848938.1.fasta \
    -d 100000 \
    ../data/week2/rep_1_alignment_sorted.bam >  ../data/week2/rep_1_alignment.mpileup

[mpileup] 1 samples in 1 input files


In [57]:
# Replicate 2: build a pileup of the sorted alignment against the reference FASTA (-f),
# with the maximum per-position depth set to 100000 (-d).
!$SAMTOOLS_DIR/samtools mpileup \
    -f ../data/week2/KF848938.1.fasta \
    -d 100000 \
    ../data/week2/rep_2_alignment_sorted.bam >  ../data/week2/rep_2_alignment.mpileup

[mpileup] 1 samples in 1 input files


In [58]:
# Replicate 3: build a pileup of the sorted alignment against the reference FASTA (-f),
# with the maximum per-position depth set to 100000 (-d).
!$SAMTOOLS_DIR/samtools mpileup \
    -f ../data/week2/KF848938.1.fasta \
    -d 100000 \
    ../data/week2/rep_3_alignment_sorted.bam >  ../data/week2/rep_3_alignment.mpileup

[mpileup] 1 samples in 1 input files


In [59]:
# Replicate 1: call SNPs with VarScan (mpileup2snp), minimum variant frequency 0.001,
# variant positions only; VCF output is redirected to rep_1_varscan.vcf.
!java -jar $VARSCAN_DIR/VarScan.v2.4.0.jar \
    mpileup2snp ../data/week2/rep_1_alignment.mpileup --min-var-freq .001 \
    --variants \
    --output-vcf 1 > ../data/week2/rep_1_varscan.vcf

Only SNPs will be reported
Min coverage:	8
Min reads2:	2
Min var freq:	0.001
Min avg qual:	15
P-value thresh:	0.01
Reading input from ../data/week2/rep_1_alignment.mpileup
1665 bases in pileup file
24 variant positions (21 SNP, 3 indel)
0 were failed by the strand-filter
21 variant positions reported (21 SNP, 0 indel)


In [60]:
# Replicate 2: call SNPs with VarScan (mpileup2snp), minimum variant frequency 0.001,
# variant positions only; VCF output is redirected to rep_2_varscan.vcf.
!java -jar $VARSCAN_DIR/VarScan.v2.4.0.jar \
    mpileup2snp ../data/week2/rep_2_alignment.mpileup --min-var-freq .001 \
    --variants \
    --output-vcf 1 > ../data/week2/rep_2_varscan.vcf

Only SNPs will be reported
Min coverage:	8
Min reads2:	2
Min var freq:	0.001
Min avg qual:	15
P-value thresh:	0.01
Reading input from ../data/week2/rep_2_alignment.mpileup
1665 bases in pileup file
14 variant positions (11 SNP, 3 indel)
0 were failed by the strand-filter
11 variant positions reported (11 SNP, 0 indel)


In [61]:
# Replicate 3: call SNPs with VarScan (mpileup2snp), minimum variant frequency 0.001,
# variant positions only; VCF output is redirected to rep_3_varscan.vcf.
!java -jar $VARSCAN_DIR/VarScan.v2.4.0.jar \
    mpileup2snp ../data/week2/rep_3_alignment.mpileup --min-var-freq .001 \
    --variants \
    --output-vcf 1 > ../data/week2/rep_3_varscan.vcf

Only SNPs will be reported
Min coverage:	8
Min reads2:	2
Min var freq:	0.001
Min avg qual:	15
P-value thresh:	0.01
Reading input from ../data/week2/rep_3_alignment.mpileup
1665 bases in pileup file
23 variant positions (20 SNP, 3 indel)
0 were failed by the strand-filter
20 variant positions reported (20 SNP, 0 indel)


In [85]:
import pandas as pd

In [109]:
def _read_varscan_frequencies(vcf_path):
    """Read the variant calls of a single-sample VarScan VCF into a DataFrame.

    Header lines (starting with ``#``) and blank lines are skipped, so the
    result does not depend on how many header lines the file has. The variant
    frequency is looked up through the ``FREQ`` key of the FORMAT column and
    converted from a percentage string (e.g. ``"0.35%"``) to a fraction.

    Returns a DataFrame with the columns ``position``, ``reference`` and
    ``alternative`` (strings, as written in the file) and ``frequency`` (float).
    """
    records = []
    with open(vcf_path) as vcf:
        for line in vcf:
            if not line.strip() or line.startswith("#"):
                continue
            fields = line.split()
            # fields: CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
            format_keys = fields[8].split(":")
            sample_values = fields[9].split(":")
            freq_percent = sample_values[format_keys.index("FREQ")]
            records.append(
                [fields[1], fields[3], fields[4], float(freq_percent.rstrip("%")) / 100]
            )
    return pd.DataFrame(
        records, columns=["position", "reference", "alternative", "frequency"]
    )


# For each replicate, print the interval mean ± 3 standard deviations of the
# called variant frequencies (pandas' .std() is the sample standard deviation).
for rep in [1, 2, 3]:
    vcf_name = "rep_{}_varscan.vcf".format(rep)
    freqs_df = _read_varscan_frequencies("../data/week2/{}".format(vcf_name))

    freq_mean = freqs_df.frequency.mean()
    freq_std = freqs_df.frequency.std()

    print(freq_mean - 3 * freq_std, freq_mean + 3 * freq_std)

0.001140268019764466 0.004021636742140295
0.0015851829540424241 0.002814817045957576
0.0013639619393568882 0.0032460380606431105


In [110]:
# Print the reference FASTA to inspect its header and sequence.
!cat ../data/week2/KF848938.1.fasta

>KF848938.1 Influenza A virus (A/USA/RVD1_H3/2011(H3N2)) segment 4 hemagglutinin (HA) gene, partial cds
CAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTGCCAAACGGAA
CAATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCCACTGAGCTGGTTCAGAGTTCCTC
AACAGGTGAAATATGCAACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTA
TTGGGAGACCCTCAGTGTGATGGCTTCCAAAACAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCC
ACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCAC
ACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATA
AGGAGATCTAATAATAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCAT
TGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTAC
GGACAAGGACCAAATCTTCCTGTATGCTCAAGCAGCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAA
CAAGCTGTAATTCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAGTAAGCATCTATT
GGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTA
CTTTAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGCA
TGCATCACTCCAAATGGAAGCATTCCCAATG

According to https://www.ncbi.nlm.nih.gov/nuccore/KF848938.1, the first codon is CAA (Gln), and the first chain of HA is 329 residues long.

In [111]:
# Print the VarScan VCF of the first sample again for comparison with the thresholds printed above.
!cat ../data/week2/VarScan_results.vcf

##fileformat=VCFv4.1
##source=VarScan2
##INFO=<ID=ADP,Number=1,Type=Integer,Description="Average per-sample depth of bases with Phred score >= 15">
##INFO=<ID=WT,Number=1,Type=Integer,Description="Number of samples called reference (wild-type)">
##INFO=<ID=HET,Number=1,Type=Integer,Description="Number of samples called heterozygous-variant">
##INFO=<ID=HOM,Number=1,Type=Integer,Description="Number of samples called homozygous-variant">
##INFO=<ID=NC,Number=1,Type=Integer,Description="Number of samples not called">
##FILTER=<ID=str10,Description="Less than 10% or more than 90% of variant supporting reads on one strand">
##FILTER=<ID=indelError,Description="Likely artifact due to indel reads at this position">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=SDP,Number=1,Type=Integer,Description="Raw Read Depth as reported by SAMtools">
##FORMAT=<ID=DP,Number=1,Type=Integer,Descript

Here we can see two mutations whose frequencies are low but still exceed mean + 3σ: nucleotide positions 307 and 1458.

Nucleotide 1458 falls in codon 486 (1458 / 3), which belongs to the second chain, whereas nucleotide 307 falls in codon 103, which is located in epitope D (Munoz et al., 2004).