In [5]:
import io
from collections import defaultdict

import pandas as pd

# Compare SNP calls between 2D-sequenced genomes called by Metrichor + Nanopolish, Albacore 2D caller without Nanopolish, and MiSeq

Our goal here is to validate the old and new MinION base callers (Metrichor and Albacore, respectively). For two samples we also have validation data from the Illumina MiSeq.

All sequences were aligned with MAFFT using the L-INS-i algorithm (`linsi`).


In [23]:
##### functions #####

def assess_matching( reference_seq, metrichor_seq, albacore_seq, miseq_seq = None):
    """Bin alignment columns by which base callers agree on a SNP.

    The sequences are walked column by column with ``zip``, so columns past
    the end of the shortest sequence are never examined. A column is skipped
    when any sequence has a gap (``-``) or an ``n``/``N`` at that position.
    Bases are compared case-insensitively, so an upper-case ``N`` is masked
    and ``A`` versus ``a`` is not reported as a SNP.

    Parameters:
        reference_seq: reference row of the alignment.
        metrichor_seq: Metrichor consensus row.
        albacore_seq: Albacore consensus row.
        miseq_seq: optional MiSeq consensus row. When given (and non-empty)
            the MiSeq-aware categories are used; otherwise the two MinION
            calls are compared against the reference only.

    Returns:
        defaultdict(list) mapping a category name to the list of base tuples
        (characters exactly as they appear in the input) that fell into it.
        Only 'all_platforms_call_same_SNP' stores 4-tuples
        (ref, metrichor, albacore, miseq); every other category stores
        (ref, metrichor, albacore). Columns that match none of the tested
        patterns (e.g. every caller reports a different base) are not
        recorded under any key.
    """
    match_count_dict = defaultdict(list)

    if miseq_seq:
        for column in zip(reference_seq, metrichor_seq, albacore_seq, miseq_seq):
            # Lower-case copies are used for every test; the original
            # characters are what gets stored in the result.
            a, b, c, d = [base.lower() for base in column]
            if '-' in (a, b, c, d) or 'n' in (a, b, c, d):
                continue

            if b == c == d and a != b:
                # All 3 platforms call the same SNP, gold standard agreement
                match_count_dict['all_platforms_call_same_SNP'].append(column)

            elif b == d and c != b:
                # metrichor and miseq match, albacore differs from them
                match_count_dict['metrichor_and_miseq_match,albacore_calls_SNP'].append(column[:3])

            elif c == d and b != c:
                # albacore and miseq match, metrichor differs from them
                match_count_dict['albacore_and_miseq_match,metrichor_calls_SNP'].append(column[:3])

            elif b == c and b != d:
                # albacore and metrichor agree with each other, miseq differs
                match_count_dict['metrichor_and_albacore_call_SNP,miseq_does_not'].append(column[:3])

    else:
        for column in zip(reference_seq, metrichor_seq, albacore_seq):
            a, b, c = [base.lower() for base in column]
            if '-' in (a, b, c) or 'n' in (a, b, c):
                continue

            if a == c and a != b:
                # ref and albacore match, metrichor does not
                match_count_dict['albacore_and_ref_match,metrichor_calls_SNP'].append(column)

            elif a == b and b != c:
                # ref and metrichor match, albacore does not
                match_count_dict['metrichor_and_ref_match,albacore_calls_SNP'].append(column)

            elif c == b and a != c:
                # albacore and metrichor match, base is different from reference
                match_count_dict['metrichor_and_albacore_call_SNP'].append(column)

    return match_count_dict



def assess_nanopolish(reference_seq, polished_seq, unpolished_seq):
    """Bin alignment columns by whether polished/unpolished consensus match the reference.

    The three sequences are walked column by column with ``zip``, so columns
    past the end of the shortest sequence are never examined. A column is
    skipped when any sequence has a gap (``-``) or an ``n``/``N`` there.
    Bases are compared case-insensitively, so an upper-case ``N`` is masked
    and ``A`` versus ``a`` is not reported as a SNP.

    Parameters:
        reference_seq: reference row of the alignment.
        polished_seq: consensus row built from Nanopolish variant calls.
        unpolished_seq: consensus row built without Nanopolish.

    Returns:
        defaultdict(list) mapping a category name to the list of
        (ref, polished, unpolished) base tuples, with characters exactly as
        they appear in the input. Columns where the polished and unpolished
        calls differ from the reference and from each other are not recorded
        under any key.
    """
    match_count_dict = defaultdict(list)

    for column in zip(reference_seq, polished_seq, unpolished_seq):
        # Lower-case copies are used for every test; the original characters
        # are what gets stored in the result.
        a, b, c = [base.lower() for base in column]
        if '-' in (a, b, c) or 'n' in (a, b, c):
            continue

        if a == c and a != b:
            # ref and unpolished match, polished does not
            match_count_dict['ref_and_unpolished_match,polished_calls_SNP'].append(column)

        elif a == b and b != c:
            # ref and polished match, unpolished does not
            match_count_dict['ref_and_polished_match,unpolished_calls_SNP'].append(column)

        elif c == b and a != c:
            # polished and unpolished match, base is different from reference
            match_count_dict['both_polished_and_unpolished_call_SNP'].append(column)

    return match_count_dict



def read_fasta(infile_path):
    """Read a FASTA file into a dict of header -> sequence.

    Sequences wrapped over several lines are concatenated, and blank lines
    are ignored (previously only the last line of a record was kept, and a
    trailing blank line replaced the sequence with an empty string).

    Parameters:
        infile_path: path of the FASTA file to read.

    Returns:
        dict mapping each header (whitespace-stripped, with every '>'
        character removed) to its sequence as a single string. A header with
        no sequence lines maps to ''. If a header occurs more than once, the
        last record wins.

    Raises:
        ValueError: if sequence data appears before the first header line.
    """
    seq_dict = {}
    header = None
    chunks = []

    # io.open reads text with universal newlines on both Python 2 and 3;
    # the old 'rU' mode is rejected by Python 3.11+.
    with io.open(infile_path, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith('>'):
                # A new record starts: store the one collected so far.
                if header is not None:
                    seq_dict[header] = ''.join(chunks)
                header = line.replace('>', '')
                chunks = []
            else:
                if header is None:
                    raise ValueError(
                        'sequence data found before any ">" header in {}'.format(infile_path))
                chunks.append(line)

    # Store the final record, which has no following header to trigger it.
    if header is not None:
        seq_dict[header] = ''.join(chunks)

    return seq_dict



def binary_ref_match( reference_seq, metrichor_seq, albacore_seq, miseq_seq = None):
    """Build a 0/1 mismatch matrix for columns where any call differs from the reference.

    The sequences are walked column by column with ``zip``, so columns past
    the end of the shortest sequence are never examined. A column is skipped
    when any sequence has a gap (``-``) or an ``n``/``N`` there. Bases are
    compared case-insensitively, so an upper-case ``N`` is masked and ``A``
    versus ``a`` is not reported as a mismatch.

    Parameters:
        reference_seq: reference row of the alignment.
        metrichor_seq: second row of the alignment.
        albacore_seq: third row of the alignment.
        miseq_seq: optional fourth row; included when given and non-empty.

    Returns:
        list of lists, one per column in which at least one non-reference
        sequence differs from the reference. Each inner list has one entry
        per sequence in argument order: the first (reference) entry is
        always 0, and every other entry is 1 if that sequence differs from
        the reference at the column, else 0.
    """
    sequences = [reference_seq, metrichor_seq, albacore_seq]
    if miseq_seq:
        sequences.append(miseq_seq)

    binary_columns = []
    for column in zip(*sequences):
        bases = [base.lower() for base in column]
        if '-' in bases or 'n' in bases:
            continue

        ref_base = bases[0]
        # ref is always treated as truth, so its own flag is always 0
        binary_column = [0] + [int(base != ref_base) for base in bases[1:]]

        # keep only columns where some sequence disagrees with ref
        if any(binary_column):
            binary_columns.append(binary_column)

    return binary_columns

### VI 1 Validation

In [9]:
# Load the VI1 alignment (reference + Metrichor + Albacore + MiSeq rows).
VI1_dict = read_fasta('/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI1-seq-comparisons-aligned.fasta')

VI1_ref = VI1_dict['KJ776791.2_reference']
VI1_met = VI1_dict['VI1_metrichor']
VI1_alba = VI1_dict['VI1_albacore']
VI1_msq = VI1_dict['VI1_miseq']

VI1_comp_dict = assess_matching(VI1_ref, VI1_met, VI1_alba, miseq_seq = VI1_msq)

# stats: number of alignment columns in each agreement category.
# print(...) with a single argument emits the same text on Python 2 and 3.
for key in VI1_comp_dict:
    print(key + ': {} sites'.format(len(VI1_comp_dict[key])))


metrichor_and_miseq_match,albacore_calls_SNP: 4 sites
all_platforms_call_same_SNP: 31 sites


In [5]:
# Encode VI1 mismatch columns as 0/1 per platform and export them as a TSV.
VI1_binary = binary_ref_match(VI1_ref, VI1_met, VI1_alba, miseq_seq=VI1_msq)
VI1_df = pd.DataFrame(
    VI1_binary,
    columns=['Reference', 'Metrichor', 'Albacore','MiSeq'],
)
VI1_df.to_csv(
    '/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI1_SNPcall_comparisons.tsv',
    sep='\t',
    index=False,
)

### VI2 Validation

In [10]:
# Load the VI2 alignment (reference + Metrichor + Albacore rows; no MiSeq data).
VI2_dict = read_fasta('/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI2-seq-comparisons-aligned.fasta')

VI2_ref = VI2_dict['KJ776791.2_reference']
VI2_met = VI2_dict['VI2_metrichor']
VI2_alba = VI2_dict['VI2_albacore']

VI2_comp_dict = assess_matching(VI2_ref, VI2_met, VI2_alba)

# stats: number of alignment columns in each agreement category.
# print(...) with a single argument emits the same text on Python 2 and 3.
for key in VI2_comp_dict:
    print(key + ': {} sites'.format(len(VI2_comp_dict[key])))


albacore_and_ref_match,metrichor_calls_SNP: 3 sites
metrichor_and_albacore_call_SNP: 23 sites
metrichor_and_ref_match,albacore_calls_SNP: 1 sites


In [8]:
# Encode VI2 mismatch columns as 0/1 per base caller and export them as a TSV.
VI2_binary = binary_ref_match(VI2_ref, VI2_met, VI2_alba)
VI2_df = pd.DataFrame(
    VI2_binary,
    columns=['Reference', 'Metrichor', 'Albacore'],
)
VI2_df.to_csv(
    '/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI2_SNPcall_comparisons.tsv',
    sep='\t',
    index=False,
)

### VI3 Validation (this genome was of partial quality) 

In [11]:
# Load the VI3 alignment (reference + Metrichor + Albacore rows; no MiSeq data).
VI3_dict = read_fasta('/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI3-seq-comparisons-aligned.fasta')

VI3_ref = VI3_dict['KJ776791.2_reference']
VI3_met = VI3_dict['VI3_metrichor']
VI3_alba = VI3_dict['VI3_albacore']

VI3_comp_dict = assess_matching(VI3_ref, VI3_met, VI3_alba)

# stats: number of alignment columns in each agreement category.
# print(...) with a single argument emits the same text on Python 2 and 3.
for key in VI3_comp_dict:
    print(key + ': {} sites'.format(len(VI3_comp_dict[key])))

albacore_and_ref_match,metrichor_calls_SNP: 1 sites
metrichor_and_albacore_call_SNP: 20 sites
metrichor_and_ref_match,albacore_calls_SNP: 1 sites


In [10]:
# Encode VI3 mismatch columns as 0/1 per base caller and export them as a TSV.
VI3_binary = binary_ref_match(VI3_ref, VI3_met, VI3_alba)
VI3_df = pd.DataFrame(
    VI3_binary,
    columns=['Reference', 'Metrichor', 'Albacore'],
)
VI3_df.to_csv(
    '/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI3_SNPcall_comparisons.tsv',
    sep='\t',
    index=False,
)

### VI4 Validation

In [12]:
# Load the VI4 alignment (reference + Metrichor + Albacore + MiSeq rows).
VI4_dict = read_fasta('/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI4-seq-comparisons-aligned.fasta')

VI4_ref = VI4_dict['KJ776791.2_reference']
VI4_met = VI4_dict['VI4_metrichor']
VI4_alba = VI4_dict['VI4_albacore']
VI4_msq = VI4_dict['VI4_miseq']

VI4_comp_dict = assess_matching(VI4_ref, VI4_met, VI4_alba, miseq_seq = VI4_msq)

# stats: number of alignment columns in each agreement category.
# print(...) with a single argument emits the same text on Python 2 and 3.
for key in VI4_comp_dict:
    print(key + ': {} sites'.format(len(VI4_comp_dict[key])))

metrichor_and_miseq_match,albacore_calls_SNP: 8 sites
albacore_and_miseq_match,metrichor_calls_SNP: 1 sites
all_platforms_call_same_SNP: 30 sites


In [12]:
# Encode VI4 mismatch columns as 0/1 per platform and export them as a TSV.
VI4_binary = binary_ref_match(VI4_ref, VI4_met, VI4_alba, miseq_seq=VI4_msq)
VI4_df = pd.DataFrame(
    VI4_binary,
    columns=['Reference', 'Metrichor', 'Albacore','MiSeq'],
)
VI4_df.to_csv(
    '/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI4_SNPcall_comparisons.tsv',
    sep='\t',
    index=False,
)

### VI5 Validation

In [13]:
# Load the VI5 alignment (reference + Metrichor + Albacore rows; no MiSeq data).
VI5_dict = read_fasta('/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI5-seq-comparisons-aligned.fasta')

VI5_ref = VI5_dict['KJ776791.2_reference']
VI5_met = VI5_dict['VI5_metrichor']
VI5_alba = VI5_dict['VI5_albacore']

VI5_comp_dict = assess_matching(VI5_ref, VI5_met, VI5_alba)

# stats: number of alignment columns in each agreement category.
# print(...) with a single argument emits the same text on Python 2 and 3.
for key in VI5_comp_dict:
    print(key + ': {} sites'.format(len(VI5_comp_dict[key])))

albacore_and_ref_match,metrichor_calls_SNP: 3 sites
metrichor_and_albacore_call_SNP: 35 sites
metrichor_and_ref_match,albacore_calls_SNP: 1 sites


In [14]:
# Encode VI5 mismatch columns as 0/1 per base caller and export them as a TSV.
VI5_binary = binary_ref_match(VI5_ref, VI5_met, VI5_alba)
VI5_df = pd.DataFrame(
    VI5_binary,
    columns=['Reference', 'Metrichor', 'Albacore'],
)
VI5_df.to_csv(
    '/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI5_SNPcall_comparisons.tsv',
    sep='\t',
    index=False,
)

### VI19 Validation

In [15]:
# Load the VI19 alignment (reference + Metrichor + Albacore rows; no MiSeq data).
VI19_dict = read_fasta('/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI19-seq-comparisons-aligned.fasta')

VI19_ref = VI19_dict['KJ776791.2_reference']
VI19_met = VI19_dict['VI19_metrichor']
VI19_alba = VI19_dict['VI19_albacore']

VI19_comp_dict = assess_matching(VI19_ref, VI19_met, VI19_alba)

# stats: number of alignment columns in each agreement category.
# print(...) with a single argument emits the same text on Python 2 and 3.
for key in VI19_comp_dict:
    print(key + ': {} sites'.format(len(VI19_comp_dict[key])))

metrichor_and_albacore_call_SNP: 40 sites


In [16]:
# Encode VI19 mismatch columns as 0/1 per base caller and export them as a TSV.
VI19_binary = binary_ref_match(VI19_ref, VI19_met, VI19_alba)
VI19_df = pd.DataFrame(
    VI19_binary,
    columns=['Reference', 'Metrichor', 'Albacore'],
)
VI19_df.to_csv(
    '/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI19_SNPcall_comparisons.tsv',
    sep='\t',
    index=False,
)

# Comparing nanopolished to non-nanopolished 1D sequenced samples

Given the depth we were getting from 1D libraries, we thought that the sheer number of reads might overcome the high error rate of any single read. Here, we compare SNP calls between two consensus sequences for a MinION 1D-sequenced genome with good coverage: one generated by majority call in Geneious, and one called with our Python script (`margin_cons.py`) using variant calls made by Nanopolish.

### VI41 Validation

In [24]:
# Load the VI41 alignment (reference + Nanopolish-polished + unpolished consensus rows).
VI41_dict = read_fasta('/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI41-ref-nanopolish-comp-aligned.fasta')

VI41_ref = VI41_dict['KJ776791.2_reference']
VI41_polished = VI41_dict['VI41_albacore_polished']
VI41_unpolished = VI41_dict['VI41_not_polished']

VI41_comp_dict = assess_nanopolish(VI41_ref, VI41_polished, VI41_unpolished)

# stats: number of alignment columns in each agreement category.
# print(...) with a single argument emits the same text on Python 2 and 3.
for key in VI41_comp_dict:
    print(key + ': {} sites'.format(len(VI41_comp_dict[key])))

ref_and_polished_match,unpolished_calls_SNP: 2 sites
both_polished_and_unpolished_call_SNP: 38 sites


In [28]:
# Encode VI41 mismatch columns as 0/1 for the polished and unpolished consensus
# and export them as a TSV.
VI41_binary = binary_ref_match(VI41_ref, VI41_polished, VI41_unpolished)
VI41_df = pd.DataFrame(
    VI41_binary,
    columns=['Reference', 'Nanopolished', 'Not_Nanopolished'],
)
VI41_df.to_csv(
    '/Users/alliblk/Desktop/gitrepos/zika-seq/seq-validation/VI41_SNPcall_comparisons.tsv',
    sep='\t',
    index=False,
)