In [1]:
from collections import defaultdict

Our goal here is to perform some validation of the old and new MinION base callers (Metrichor and Albacore respectively). For two samples we also have validation data from the Illumina MiSeq.

We assess the following possibilities, counting how many instances of each possibility occur. If a possibility does not occur, it will not be output.

For samples that have genomes generated with Metrichor, Albacore, and were also sequenced on the MiSeq:

* All platforms call the same SNP: `all_platforms_call_same_SNP`
* Metrichor calls a SNP, MiSeq and reference match: `metrichor_calls_SNP,miseq_and_ref_match`
* Metrichor calls a SNP, MiSeq, Metrichor, and reference match: `metrichor_calls_SNP,albacore_msq_and_ref_match`
* Albacore calls a SNP, MiSeq and reference match: `albacore_calls_SNP,miseq_and_ref_match`
* Albacore calls a SNP, MiSeq, Metrichor, and reference match: `albacore_calls_SNP,metrichor_miseq_and_ref_match`               
* Albacore and Metrichor call the same SNP, MiSeq matches reference: `albacore_metrichor_call_SNP,miseq_matches_ref`
* MiSeq calls a SNP, Metrichor, Albacore, and reference match: `miseq_calls_SNP,metrichor_albacore_ref_match` 
* MiSeq and reference agree, Metrichor and Albacore different SNPs from each other: `miseq_and_ref_agree,albacore_metrichor_each_call_diff_SNPs`


For samples that only have genomes generated with Metrichor or with Albacore, the options are:

* Metrichor calls a SNP, Albacore and reference match: `albacore_and_ref_match,metrichor_calls_SNP`
* Albacore calls a SNP, Metrichor and reference match : `metrichor_and_ref_match,albacore_calls_SNP`
* Metrichor and Albacore agree on SNP: `metrichor_and_albacore_call_SNP`


In [88]:
def assess_matching( reference_seq, metrichor_seq, albacore_seq, miseq_seq = None):
    match_count_dict = defaultdict(list)
    
    if miseq_seq:
        for (a,b,c,d) in zip(reference_seq, metrichor_seq, albacore_seq, miseq_seq):
            if '-' in (a,b,c,d):
                continue
            elif 'n' in (a,b,c,d):
                continue
            else:
                if b == c == d and a!= b: #all platforms call the same SNP
                    match_count_dict['all_platforms_call_same_SNP'].append((a,b,c,d))
                
                elif b != d and a == d: #miseq and ref match, metrichor calls SNP
                    match_count_dict['metrichor_calls_SNP,miseq_and_ref_match'].append((a,b,c,d))
                
                elif c != d and a == d: #miseq and ref match, albacore calls SNP
                    match_count_dict['albacore_calls_SNP,miseq_and_ref_match'].append((a,b,c,d))
                    
                elif a == b == d and a != c: #albacore calls SNP, metri,ref, and miseq agree 
                    match_count_dict['albacore_calls_SNP,metrichor_miseq_and_ref_match'].append((a,b,c,d))
                    
                elif a == c == d and a!= b and a!=d: #metrichor calls SNP, albacore and msq agree with ref
                    match_count_dict['metrichor_calls_SNP,albacore_msq_and_ref_match'].append((a,b,c,d))
                    
                elif b == c and a == d and a != b: #metri and alba call SNP, ref and miseq do not
                    match_count_dict['albacore_metrichor_call_SNP,miseq_matches_ref'].append((a,b,c,d))
                    
                elif a == b == c and a != d: #miseq calls SNP, metrichor, albacore, and reference agree
                    match_count_dict['miseq_calls_SNP,metrichor_albacore_ref_match'].append((a,b,c,d))
                
                elif a == d and a != b and a != c and b != c:
                    match_count_dict['miseq_and_ref_agree,albacore_metrichor_each_call_diff_SNPs'].append((a,b,c,d))
                                        
    else:
        for (a,b,c) in zip(reference_seq, metrichor_seq, albacore_seq):
            if '-' in (a,b,c):
                continue
            elif 'n' in (a,b,c):
                continue
            else:
                if a == c and a!= b: #ref and albacore match, metrichor does not
                    match_count_dict['albacore_and_ref_match,metrichor_calls_SNP'].append((a,b,c))
                    
                elif a == b and b!= c: #ref and metrichor match, albacore does not
                    match_count_dict['metrichor_and_ref_match,albacore_calls_SNP'].append((a,b,c))
                    
                elif c == b and a!= c: #albacore and metrichor match, base is different from reference
                    match_count_dict['metrichor_and_albacore_call_SNP'].append((a,b,c))
   

    return match_count_dict


def read_fasta(infile_path):
    seq_dict = {}
    with open(infile_path,'rU') as file:
        for line in file:
            if line.startswith('>'):
                header = line.strip().replace('>','')
            else:
                seq_dict[header] = line.strip()
    return seq_dict


## VI 1 Validation

In [89]:
VI1_dict = read_fasta('/Users/alliblk/Desktop/draft-genome-trial/seq-validation/VI1-seq-comparisons-aligned.fasta')

VI1_ref = VI1_dict['KJ776791.2_reference']
VI1_met = VI1_dict['VI1_metrichor']
VI1_alba = VI1_dict['VI1_albacore']
VI1_msq = VI1_dict['VI1_miseq']

VI1_comp_dict = assess_matching(VI1_ref, VI1_met, VI1_alba, miseq_seq = VI1_msq)
#print VI1_comp_dict.keys()

#stats
for key in VI1_comp_dict.keys():
    print key + ': {} sites'.format(len(VI1_comp_dict[key]))


all_platforms_call_same_SNP: 31 sites


## VI2 Validation

In [90]:
VI2_dict = read_fasta('/Users/alliblk/Desktop/draft-genome-trial/seq-validation/VI2-seq-comparisons-aligned.fasta')

VI2_ref = VI2_dict['KJ776791.2_reference']
VI2_met = VI2_dict['VI2_metrichor']
VI2_alba = VI2_dict['VI2_albacore']

VI2_comp_dict = assess_matching(VI2_ref, VI2_met, VI2_alba)

#stats
for key in VI2_comp_dict.keys():
    print key + ': {} sites'.format(len(VI2_comp_dict[key]))


albacore_and_ref_match,metrichor_calls_SNP: 3 sites
metrichor_and_albacore_call_SNP: 23 sites
metrichor_and_ref_match,albacore_calls_SNP: 1 sites


## VI3 Validation (this genome was of partial quality) 

In [91]:
VI3_dict = read_fasta('/Users/alliblk/Desktop/draft-genome-trial/seq-validation/VI3-seq-comparisons-aligned.fasta')

VI3_ref = VI3_dict['KJ776791.2_reference']
VI3_met = VI3_dict['VI3_metrichor']
VI3_alba = VI3_dict['VI3_albacore']

VI3_comp_dict = assess_matching(VI3_ref, VI3_met, VI3_alba)

#stats
for key in VI3_comp_dict.keys():
    print key + ': {} sites'.format(len(VI3_comp_dict[key]))

albacore_and_ref_match,metrichor_calls_SNP: 1 sites
metrichor_and_albacore_call_SNP: 20 sites
metrichor_and_ref_match,albacore_calls_SNP: 1 sites


## VI4 Validation

In [92]:
VI4_dict = read_fasta('/Users/alliblk/Desktop/draft-genome-trial/seq-validation/VI4-seq-comparisons-aligned.fasta')

VI4_ref = VI4_dict['KJ776791.2_reference']
VI4_met = VI4_dict['VI4_metrichor']
VI4_alba = VI4_dict['VI4_albacore']
VI4_msq = VI4_dict['VI4_miseq']

VI4_comp_dict = assess_matching(VI4_ref, VI4_met, VI4_alba, miseq_seq = VI4_msq)

#stats
for key in VI4_comp_dict.keys():
    print key + ': {} sites'.format(len(VI4_comp_dict[key]))

albacore_calls_SNP,miseq_and_ref_match: 1 sites
all_platforms_call_same_SNP: 30 sites


## VI5 Validation

In [94]:
VI5_dict = read_fasta('/Users/alliblk/Desktop/draft-genome-trial/seq-validation/VI5-seq-comparisons-aligned.fasta')

VI5_ref = VI5_dict['KJ776791.2_reference']
VI5_met = VI5_dict['VI5_metrichor']
VI5_alba = VI5_dict['VI5_albacore']

VI5_comp_dict = assess_matching(VI5_ref, VI5_met, VI5_alba)

#stats
for key in VI5_comp_dict.keys():
    print key + ': {} sites'.format(len(VI5_comp_dict[key]))

albacore_and_ref_match,metrichor_calls_SNP: 3 sites
metrichor_and_albacore_call_SNP: 35 sites
metrichor_and_ref_match,albacore_calls_SNP: 1 sites


## VI19 Validation

In [95]:
VI19_dict = read_fasta('/Users/alliblk/Desktop/draft-genome-trial/seq-validation/VI19-seq-comparisons-aligned.fasta')

VI19_ref = VI19_dict['KJ776791.2_reference']
VI19_met = VI19_dict['VI19_metrichor']
VI19_alba = VI19_dict['VI19_albacore']

VI19_comp_dict = assess_matching(VI19_ref, VI19_met, VI19_alba)

#stats
for key in VI19_comp_dict.keys():
    print key + ': {} sites'.format(len(VI19_comp_dict[key]))

metrichor_and_albacore_call_SNP: 40 sites
