**Table of Contents**
<div id="toc"></div>

# IMPORTS

In [618]:
from ete3 import Tree
import collections
from collections import Counter
from Bio import AlignIO
from Bio import SeqIO
import numpy as np

# FUNCTION DEFINITIONS

In [619]:
# FUNCTION DEFINITIONS #

def check_and_count(tree, old_count, snp, major_variant, minor_variant, minor_freq):
    """Classify one variant of a SNP and fold the outcome into a running tally.

    The arguments other than ``old_count`` are forwarded unchanged to
    ``minor_major_monophyly_check``, which returns a 4-tuple of counts.

    Returns a new tuple holding the element-wise sum of ``old_count`` and
    that 4-tuple; ``old_count`` itself is not modified.
    """
    outcome = minor_major_monophyly_check(tree, snp, major_variant, minor_variant, minor_freq)
    return tuple(previous + added for previous, added in zip(old_count, outcome))

def minor_major_monophyly_check(tree, snp, major_variant, minor_variant, minor_freq):
    """Classify one minor variant of a SNP and record it in ``snp_classifier``.

    ``tree`` must provide ``check_monophyly(values=..., target_attr=...)``
    whose result has a truthy first element when the leaves carrying the
    given ``snp`` attribute value are monophyletic.

    Exactly one entry is appended to the module-level ``snp_classifier``
    and a one-hot tuple is returned, in the order
    (only occurs once, minor monophyly, major monophyly, not monophyletic).
    The categories are tried in that order; the first that applies wins.
    """
    # A variant seen on a single tip is recorded without consulting the tree.
    if minor_freq == 1:
        snp_classifier['only_occurs_once'].append((snp, minor_variant))
        return (1, 0, 0, 0)

    minor_is_monophyletic = tree.check_monophyly(values=[minor_variant], target_attr="snp")[0]
    if minor_is_monophyletic:
        snp_classifier['minor_monophyly'].append((snp, minor_variant))
        return (0, 1, 0, 0)

    # The major variant is only tested when the minor one is not monophyletic.
    major_is_monophyletic = tree.check_monophyly(values=[major_variant], target_attr="snp")[0]
    if major_is_monophyletic:
        snp_classifier['major_monophyly'].append((snp, major_variant))
        return (0, 0, 1, 0)

    snp_classifier['not_monophyly'].append((snp, major_variant + ':' + minor_variant))
    return (0, 0, 0, 1)

# CREATING THE SNP DICT 


In [620]:
# CREATING THE SNP DICT #
# snp_dict maps 'snp_<1-based column number>' to the list of characters found
# in that alignment column (one per record, in alignment order), but only for
# columns that count as variable under the rules below.
snp_dict = {}

a = AlignIO.read("EBOV_Reference_Set_with_marginalAncestralStates.fasta","fasta")
aa = np.array([list(rec) for rec in a], np.character, order="F")

# Walk the alignment column by column (rows of the transposed array).
for i, column in enumerate(aa.T):
    bases = list(column)

    # min_distinct is the number of distinct characters a column must EXCEED
    # to be kept as a SNP.
    if i < 100 or i > 18880:
        # Near either end of the alignment, each of 'N', '?' and '-' that is
        # present in the column is not counted as a variant.
        min_distinct = 1 + sum(1 for ambig in ['N','?','-'] if ambig in bases)
    elif 'N' in bases:
        # Elsewhere only 'N' is discounted.
        min_distinct = 2
    else:
        min_distinct = 1

    if len(set(bases)) > min_distinct:
        snp_dict['snp_'+str(i+1)] = bases


# CREATING THE INDEX DICT

In [617]:
# CREATING THE INDEX DICT #
# Maps each alignment record id to its 1-based row number in the alignment,
# i.e. record id -> (index into a snp_dict value) + 1.
index_dict = {}
for row_number, record in enumerate(a, 1):
    index_dict[record.id] = row_number

# COUNTING AND CLASSIFYING SNPS

In [621]:
# DEFINING COUNTERS #
simple_cases_counter = 0   # SNPs whose tips carry exactly two distinct characters
complex_cases_counter = 0  # every other SNP
early_gap = 0              # '-' minor variants at positions < 100
late_gap = 0               # '-' minor variants at positions > 18900
ambiguous_base = 0         # 'N' / '?' minor variants in complex cases

# Running tallies, in the order
# (only occurs once, minor monophyly, major monophyly, not monophyletic).
SIMPLE = (0,0,0,0)
COMPLEX = (0,0,0,0)

TREE_FILE = "raxml_results/RAxML_nodeLabelledRootedTree.RECON_with_OG_BRANCH_TREE"

# Taxa removed from the tree before testing variants at positions > 7565.
# NOTE(review): presumably these sequences are unreliable or missing beyond
# that position -- confirm with the data.
EXCLUDED_TAXA = frozenset([
    'EBOV|JQ352763|Kikwit|Kikwit_DRC|1995-05-04',
    'EBOV|HQ613403|M-M|DRC|2007-08-31',
    'EBOV|HQ613402|034-KS|DRC|2008-12-31',
])
# Loop-invariant, so it is built once: every alignment id except the above.
kept_ids = [name for name in index_dict if name not in EXCLUDED_TAXA]


def _snp_position(snp_name):
    """Return the 1-based alignment position encoded in a 'snp_<n>' key."""
    return int(snp_name.split('_')[1])


# CLASSIFYING THE SNPS #

snp_classifier = collections.defaultdict(list)

for snp in sorted(snp_dict, key=_snp_position):
    position = _snp_position(snp)  # parsed once per SNP
    bases = snp_dict[snp]

    # A fresh tree is loaded for every SNP: the tips are annotated with this
    # SNP's bases and the tree may be pruned in place further down.
    t = Tree(TREE_FILE, format=8)
    snp_count = Counter()
    for node in t.traverse("preorder"):
        if node.is_leaf():
            base = bases[index_dict[node.name]-1]  # index_dict is 1-based
            node.add_features(snp=base)            # add snp annotation to the tip
            snp_count[base] += 1                   # count the leaf snp frequency

    # List of (character, frequency) pairs, most frequent first.
    snp_count = snp_count.most_common()

    major_variant = snp_count[0][0]

    # SIMPLE CASE #
    if len(snp_count) == 2:
        # e.g. snp_189 [('G', 68), ('A', 4)]
        simple_cases_counter += 1

        minor_variant, minor_freq = snp_count[1]

        # An ambiguous minor character is neither classified nor tallied here.
        if minor_variant not in ('N', '?'):
            SIMPLE = check_and_count(t, SIMPLE, snp, major_variant, minor_variant, minor_freq)
    else:
        # COMPLEX CASE #
        # eg. snp_18913 [('G', 68), ('N', 3), ('A', 2), ('R', 2), ('-', 2)] #
        complex_cases_counter += 1

        pruned = False  # the tree only needs pruning once per SNP
        for minor_variant, minor_freq in snp_count[1:]:
            if position < 100 and minor_variant == '-':
                early_gap += 1
            elif minor_variant in ('N', '?'):
                ambiguous_base += 1
            elif position > 18900 and minor_variant == '-':
                late_gap += 1
            else:
                if position > 7565 and not pruned:
                    # Note: minor_freq was counted on the unpruned tree.
                    t.prune(kept_ids)
                    pruned = True
                COMPLEX = check_and_count(t, COMPLEX, snp, major_variant, minor_variant, minor_freq)



# RESULT PRINT OUT

In [623]:
# RESULT PRINT OUT #
print '##### RESULTS #####'
print 'Simple, Complex, Total: ',simple_cases_counter, '+', complex_cases_counter,'=', simple_cases_counter+complex_cases_counter
print '\n'
print '##### SNP Classifier Dict #####'
for k in snp_classifier:
    key_dict = collections.defaultdict(list)
    for i in snp_classifier[k]:
        key_dict[i[0]].append(i[1])
    print k,'(All:', len(snp_classifier[k]), ', Unique:', len(key_dict), ')'
print '\n'    
print '##### Simple cases #####'
print '(only a single snp, minor monophyly, major monophyly, not monophyletic)'
print SIMPLE
print '\n'        
print '##### Not quite as simple cases #####'
print 'minor N or ? ', ambiguous_base
print 'early - ', early_gap
print 'late - ', late_gap
print '(only a single snp, minor monophyly, major monophyly, not monophyletic)'
print COMPLEX
print '\n'   
print '##### Tally of the two #####'
print '(', SIMPLE[0]+COMPLEX[0], SIMPLE[1]+COMPLEX[1], SIMPLE[2]+COMPLEX[2], SIMPLE[3]+COMPLEX[3], ')'

##### RESULTS #####
Simple, Complex, Total:  641 + 901 = 1542


##### SNP Classifier Dict #####
only_occurs_once (All: 303 , Unique: 301 )
not_monophyly (All: 101 , Unique: 100 )
major_monophyly (All: 103 , Unique: 103 )
minor_monophyly (All: 1065 , Unique: 1054 )


##### Simple cases #####
(only a single snp, minor monophyly, major monophyly, not monophyletic)
(121, 441, 38, 41)


##### Not quite as simple cases #####
minor N or ?  882
early -  5
late -  3
(only a single snp, minor monophyly, major monophyly, not monophyletic)
(182, 624, 65, 60)


##### Tally of the two #####
( 303 1065 103 101 )
