In [1]:
# determine_strain_mismatch.ipynb
#
# The goal of this notebook is to determine the mismatch between an 
# aligned FASTA and a Newick tree built from that same file but with
# some number of strains removed during treebuilding.

In [2]:
from Bio import Phylo, SeqIO

In [3]:
# Input paths, relative to the notebook's working directory.
# fasta_file: the aligned FASTA whose record ids are compared against the tree.
fasta_file = "aligned_fastas/aligned-nuc_cdc_h3n2_ha_2y_cell_hi.fasta"
# newick_file: the Newick tree built from that alignment (some strains may
# have been dropped during tree building).
newick_file = "newick_trees/tree-raw_cdc_h3n2_ha_2y_cell_hi.nwk"

In [4]:
# Collect the id of every record in the aligned FASTA into the set `s`.
# Plain "r" is used instead of "rU": the "U" flag is deprecated and is
# rejected with a ValueError on Python 3.11+, while "r" works on every version.
with open(fasta_file, "r") as handle:
    s = {record.id for record in SeqIO.parse(handle, "fasta")}

In [5]:
# Parse the Newick file into a tree object using Bio.Phylo.
tree = Phylo.read(newick_file, 'newick')

In [6]:
# Names of all terminal (leaf) nodes of the tree, gathered into the set `t`.
t = {leaf.name for leaf in tree.get_terminals()}

In [7]:
# Show both set sizes, one per line: tree leaf names first, FASTA ids second.
print("%d\n%d" % (len(t), len(s)))

890
891


In [8]:
# Ids that occur in the FASTA but are absent from the tree's leaf names.
diff = s.difference(t)
print(diff)

set(['A/Fujian-Licheng/1729/2017'])


In [9]:
# Re-read the alignment and keep only the records whose id is not in `diff`,
# i.e. the records that also appear as leaves in the tree.
#
# The records are gathered in a list rather than a set: a list keeps them in
# their original file order (so the output is reproducible) and does not
# depend on SeqRecord objects being hashable.
#
# Plain "r" is used instead of "rU": the "U" flag is deprecated and is
# rejected with a ValueError on Python 3.11+.
with open(fasta_file, "r") as handle:
    sequences = [seq for seq in SeqIO.parse(handle, "fasta") if seq.id not in diff]

# The output file is opened only after the input was read successfully, so a
# failed read cannot leave behind an empty or truncated trimmed.fasta.
with open("aligned_fastas/trimmed.fasta", "w") as output_handle:
    SeqIO.write(sequences, output_handle, "fasta")
print(len(sequences))

890
