In [41]:
import re
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import AlignIO

In [80]:
def change_gaps(gene):
    """Mask gaps and ambiguity codes in a gene's alignment with ``N``, in place.

    Reads ``../../seasonal-flu/results/aligned_cdc_h3n2_<gene>_12y_cell_hi.fasta``,
    replaces every ``-`` and every uppercase ambiguity code listed in
    ``ambiguous_bases`` with ``N``, and overwrites the same file. Record ids
    and descriptions are carried over unchanged. Running it again on its own
    output changes nothing.

    Parameters
    ----------
    gene
        Segment name spliced into the alignment path (e.g. ``'ha'``).
    """
    gene_alignment = '../../seasonal-flu/results/aligned_cdc_h3n2_'+str(gene)+'_12y_cell_hi.fasta'

    # Built once instead of once per record. Only the uppercase characters
    # listed here are masked; lowercase codes pass through untouched.
    ambiguous_bases = '-RYKMWSBDHV'
    mask_table = str.maketrans(ambiguous_bases, 'N' * len(ambiguous_bases))

    # Every record is held in memory before writing because the output path
    # is the same file that is being read.
    with open(gene_alignment, "r") as aligned_handle:
        n_gaps_records = [
            SeqRecord(seq=Seq(str(virus.seq).translate(mask_table)),
                      id=virus.id, description=virus.description)
            for virus in SeqIO.parse(aligned_handle, "fasta")
        ]

    with open(gene_alignment, 'w') as output_handle:
        SeqIO.write(n_gaps_records, output_handle, "fasta")

In [90]:
def truncate_to_coding_seq_only(gene):
    """Trim every sequence in a gene's alignment to its coding region, in place.

    Reads ``../../seasonal-flu/results/aligned_cdc_h3n2_<gene>_12y_cell_hi.fasta``,
    cuts each record down to the coordinates stored in ``coding_pos`` and
    overwrites the same file. Record ids and descriptions are kept.

    Because the file is overwritten, the function may be called again on an
    already-trimmed alignment (e.g. when the notebook cell is re-run). Records
    that are already exactly coding-region length are passed through
    unchanged instead of being sliced a second time.

    Parameters
    ----------
    gene
        One of the keys of ``coding_pos``: ``'pb1'``, ``'pb2'``, ``'pa'``,
        ``'na'`` or ``'ha'``.

    Raises
    ------
    KeyError
        If ``gene`` has no entry in ``coding_pos``. Raised before any file is
        opened.
    ValueError
        If a record is too short to contain the coding region and is not
        already exactly coding-region length. The file is left untouched.
    """
    gene_alignment = '../../seasonal-flu/results/aligned_cdc_h3n2_'+str(gene)+'_12y_cell_hi.fasta'

    # Coding-region coordinates (0-based start, exclusive end) within each
    # segment's alignment.
    coding_pos = {'pb1': FeatureLocation(9, 2283),
                  'pb2': FeatureLocation(12, 2292),
                  'pa': FeatureLocation(10, 2161),
                  'na': FeatureLocation(3, 1410),
                  'ha': FeatureLocation(0, 1701)}

    # Looked up once, so an unsupported gene fails before any file access.
    coding_region = coding_pos[gene]
    region_end = int(coding_region.end)
    coding_length = len(coding_region)

    coding_records = []

    with open(gene_alignment, "r") as aligned_handle:
        for virus in SeqIO.parse(aligned_handle, "fasta"):
            seq_length = len(virus.seq)
            if seq_length >= region_end:
                # Full-length alignment row: slice out the coding region.
                coding_seq = coding_region.extract(virus.seq)
            elif seq_length == coding_length:
                # Shorter than the region end but exactly coding length: this
                # row was trimmed by a previous run, so slicing again would
                # silently drop real coding sequence.
                coding_seq = virus.seq
            else:
                raise ValueError(
                    "Sequence %r in %s has length %d; expected at least %d "
                    "(untrimmed) or exactly %d (already trimmed)"
                    % (virus.id, gene_alignment, seq_length, region_end, coding_length)
                )
            coding_records.append(SeqRecord(seq=coding_seq, id=virus.id,
                                            description=virus.description))

    # Written only after every record was validated, because the output path
    # is the same file that was read.
    with open(gene_alignment, 'w') as output_handle:
        SeqIO.write(coding_records, output_handle, "fasta")

In [65]:
def truncate_meta_file(gene):
    """Rewrite a gene's metadata TSV so it follows the strains in its alignment.

    Collects the record ids from
    ``../../seasonal-flu/results/aligned_cdc_h3n2_<gene>_12y_cell_hi.fasta``,
    right-joins ``../../seasonal-flu/results/metadata_h3n2_<gene>.tsv`` onto
    them using the ``strain`` column, and overwrites the metadata file with
    the result.

    The right join keeps every strain found in the alignment. Strains with no
    metadata row end up with empty fields, and metadata rows whose strain is
    not in the alignment are dropped.

    Parameters
    ----------
    gene
        Segment name spliced into both file paths (e.g. ``'ha'``).
    """
    gene_alignment = '../../seasonal-flu/results/aligned_cdc_h3n2_'+str(gene)+'_12y_cell_hi.fasta'
    metafile = '../../seasonal-flu/results/metadata_h3n2_'+str(gene)+'.tsv'

    with open(gene_alignment, "r") as aligned_handle:
        aligned_ids = [virus.id for virus in SeqIO.parse(aligned_handle, "fasta")]

    aligned_ids_df = pd.DataFrame(aligned_ids, columns=['strain'])

    # Read every column as plain text and disable NA sniffing so values
    # round-trip unchanged. With the defaults, rows added by the right join
    # would turn integer columns into floats ("34" -> "34.0"), and literal
    # strings such as "NA" would be blanked on write.
    meta = pd.read_csv(metafile, sep='\t', dtype=str, keep_default_na=False)

    truncate_meta = meta.merge(aligned_ids_df, how='right', on='strain')

    # Reuse the path that was read, rather than rebuilding it with a slightly
    # different expression, so input and output can never diverge.
    truncate_meta.to_csv(metafile, index=False, sep='\t')
    
    

In [92]:
# Segments whose alignment and metadata files are cleaned in place.
# Note: 'ha1' / 'ha2' cannot be used here, because
# truncate_to_coding_seq_only has no coordinates for them.
genes = ['ha', 'na', 'pa', 'pb1', 'pb2']

# Order matters: mask gaps/ambiguity codes, then trim to the coding region,
# then rewrite the metadata against the resulting alignment.
cleaning_steps = (change_gaps, truncate_to_coding_seq_only, truncate_meta_file)

for gene in genes:
    for clean in cleaning_steps:
        clean(gene)

In [83]:
# Split the HA alignment into separate HA1 and HA2 alignments, with matching
# metadata files, using the CDS coordinates annotated in the HA reference.
ha_reference = '../../seasonal-flu/config/reference_h3n2_ha.gb'
ha_alignment = '../../seasonal-flu/results/aligned_cdc_h3n2_ha_12y_cell_hi.fasta'
ha_metafile = '../../seasonal-flu/results/metadata_h3n2_ha.tsv'

# A sequence is kept for a subunit only if it has more than this many
# positions that are not 'N' within that subunit.
HA1_MIN_CALLED_BASES = 900
HA2_MIN_CALLED_BASES = 600


def count_called_bases(seq):
    """Return the number of positions in ``seq`` that are not ``'N'``.

    Gives the same number as ``len(seq.ungap("N"))`` without relying on
    ``Seq.ungap``, which is deprecated in Biopython.
    """
    return len(seq) - seq.count("N")


# Locate the HA1 and HA2 coding regions in the reference annotation.
ha1_pos = None
ha2_pos = None
for seq_record in SeqIO.parse(ha_reference, "genbank"):
    for feature in seq_record.features:
        if feature.type != 'CDS':
            continue
        # A CDS without a /product qualifier is skipped rather than raising.
        product = feature.qualifiers.get('product', [None])[0]
        if product == 'HA1 protein':
            ha1_pos = feature.location
        elif product == 'HA2 protein':
            ha2_pos = feature.location

# Fail here with a clear message instead of an AttributeError in the loop below.
if ha1_pos is None or ha2_pos is None:
    raise ValueError(
        "Could not find both 'HA1 protein' and 'HA2 protein' CDS features in "
        + ha_reference
    )

ha1_records = []
ha1_ids = []
ha2_records = []
ha2_ids = []

# Collect HA1 and HA2 alignment records for sequences that cover these genes.
with open(ha_alignment, "r") as aligned_handle:
    for virus in SeqIO.parse(aligned_handle, "fasta"):
        ha1_record = SeqRecord(seq=ha1_pos.extract(virus.seq),
                               id=virus.id, description='HA1')
        if count_called_bases(ha1_record.seq) > HA1_MIN_CALLED_BASES:
            ha1_records.append(ha1_record)
            ha1_ids.append(ha1_record.id)

        ha2_record = SeqRecord(seq=ha2_pos.extract(virus.seq),
                               id=virus.id, description='HA2')
        if count_called_bases(ha2_record.seq) > HA2_MIN_CALLED_BASES:
            ha2_records.append(ha2_record)
            ha2_ids.append(ha2_record.id)

# Write metadata files restricted to the strains kept for each subunit.
# merge() with no arguments is an inner join on the shared 'strain' column.
ha_meta = pd.read_csv(ha_metafile, sep='\t')

ha1_strains = pd.DataFrame(ha1_ids, columns=['strain'])
ha1_meta = ha_meta.merge(ha1_strains)
ha1_meta.to_csv('../../seasonal-flu/results/metadata_h3n2_ha1.tsv', index=False, sep='\t')

ha2_strains = pd.DataFrame(ha2_ids, columns=['strain'])
ha2_meta = ha_meta.merge(ha2_strains)
ha2_meta.to_csv('../../seasonal-flu/results/metadata_h3n2_ha2.tsv', index=False, sep='\t')

# Write the HA1 and HA2 alignments.
with open('../../seasonal-flu/results/aligned_cdc_h3n2_ha1_12y_cell_hi.fasta', "w") as output_handle:
    SeqIO.write(ha1_records, output_handle, "fasta")

with open('../../seasonal-flu/results/aligned_cdc_h3n2_ha2_12y_cell_hi.fasta', "w") as output_handle2:
    SeqIO.write(ha2_records, output_handle2, "fasta")