In [68]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from dateutil.parser import parse
from collections import Counter
import datetime
import re

In [66]:
#fasta header downloaded as gb_accession|strain_name|segment|date|host|location|subtype|species
#Put date in 'yyyy-mm-dd' format
#group sequences by segment and genotype

#map segment number to segment 1-letter abbreviation
segment_num_mapper = {'1':'R', '2':'C', '3':'M', '4':'P', '5':'A', '6':'I', 
                      '7':'T', '8':'N', '9':'G', '10':'E', '11':'H'}

#length of reference sequence for each segment
#require sequence to have at least 75% of the segment unambiguously sequenced
segment_lengths = {'G':1062, 'P':2362, 'I':1356, 'R':3302, 'C':2693, 'M':2591, 
                   'A':1614, 'N':1059, 'T':1105, 'E':751, 'H':667}

#keep track of sequences by segment
seq_records = {'G':[], 'P':[], 'I':[], 'R':[], 'C':[], 'M':[], 'A':[], 'N':[], 'T':[], 'E':[], 'H':[]}

for record in SeqIO.parse(open(f"rotavirusA_vipr.fasta","r"), "fasta"):

    accession=record.description.split('|')[0]
    date=record.description.split('|')[3]
    strain_name=record.description.split('|')[1]
    segment=record.description.split('|')[2]
    location=record.description.split('|')[5]
    subtype=record.description.split('|')[6]
    
    
    #some dates are given as e.g. 2012/2014 in the date column, but the strain_name lists the actual date
    if len(date)==9 and '/' in date:
        #the strain names that have dates in them are in the format Hu_wt/ITA/PA525/14/2014/G12P8
        #date is always second to last, then GP-genotype
        if len(strain_name.split('/'))>=4:
            date_from_name = strain_name.split('/')[-2]
            
            #if just year is given
            if len(date_from_name) ==4:
                #assign this as the date of the sequence
                date = date_from_name
            #otherwise, dates are given in the format ddmmyy (241018)
            else:
                #a couple of them have an underscore at the beginning for some reason
                if date_from_name[0] == '_':
                    date_from_name = date_from_name[1:]
                #all are from the 2000s:
                date = f'20{date_from_name[-2:]}-{date_from_name[2:4]}-{date_from_name[:2]}'

    #weird labeling of segment 6 in some sequences    
    if segment == 'RNA6':
        segment = '6'
    
    #only take sequences where the segment is specified
    if segment in segment_num_mapper.keys():
        segment_name = segment_num_mapper[segment]
        
        
        #samples labeled as "mixed_genotype" have multiple genotypes present for one of the segments 
        #(likely due to coinfection)
        #if the segment that was sequenced has multiple genotypes, and it's not clear which 
        #genotype the sequence is from, exclude it. However, if the sequenced segment is unambiguous, keep it
        genotypes_by_segment = subtype.split('_')
        if 'mixed' in genotypes_by_segment:
            genotypes_by_segment.remove('mixed')
            genotypes_by_segment.remove('genotype')

        #find genotype OR genotypes of the segment that was sequenced
        genotype_this_segment_list = []
        for sg in genotypes_by_segment:
            if sg[0] == segment_name:
                genotype_this_segment_list.append(sg)
                
        if len(genotype_this_segment_list)==1:
            genotype_this_segment = genotype_this_segment_list[0]
        
        #for a couple sequences, genotype is completely unlabeled
        if len(genotype_this_segment_list)==0:
            genotype_this_segment = None

        #sometimes which of the multiple genotypes was sequenced is in the strain name
        if len(genotype_this_segment_list)>1:
            genotype_from_strain_name = strain_name.split('/')[-1]
            possible_genotypes_in_name = []
            for possible_genotype in genotype_this_segment_list:
                if possible_genotype in genotype_from_strain_name:
                    possible_genotypes_in_name.append(possible_genotype)
                    
            if len(possible_genotypes_in_name) == 1:
                genotype_this_segment = possible_genotypes_in_name[0]
            #if there are no genotypes labeled for this segment, or if there are multiple
            else:
                genotype_this_segment = None

        
        
        #only take sequences with an assigned genotype
        if genotype_this_segment!=None:
        
            #only take sequences where at least 75% of the segment has been sequenced
            if len(record.seq)>= 0.75*(segment_lengths[segment_name]):

                #only want sequences with dates
                #there are a few sequences with date of 2017/2018, and the genbank file doesn't give any more info
                if date!='NA' and date!='' and date!= '2017/2018':
                    # if date only has year, add -XX-XX for month and day
                    if len(date)==4:
                        formatted_date = date+'-XX-XX'
                    else:
                        formatted_date = date.replace('_', '-')

                        # if date only has month, add -XX for day
                        if len(formatted_date)<10:
                            formatted_date = formatted_date+'-XX'

                        list_of_info = [accession, strain_name, formatted_date, segment, segment_name, location, subtype, genotype_this_segment]
                        new_record_info = '|'.join(list_of_info)
                        seq_records[segment_name].append(SeqRecord(record.seq, id=new_record_info, description=new_record_info))  


#write sequence file for each segment, with all genotypes included
for seg, seqs in seq_records.items():
    SeqIO.write(seqs, f'rotavirusA_{seg}_all.fasta', "fasta")
    


In [91]:
#write a fasta file for the predominant genotype groups for each segment

for seg, seqs in seq_records.items():
    #find the most common genotypes
    genotypes_observed = []
    for record in seqs:
        genotypes_observed.append(record.description.split('|')[-1])
    genotype_counts = Counter(genotypes_observed)
    most_common = genotype_counts.most_common(2)[0][0]
    second_most_common = genotype_counts.most_common(2)[1][0]

    #write fasta files for each genotype
    seg_most_common_records = []
    seg_second_most_common_records = []
    
    for record in seqs:
        if record.description.split('|')[-1] == most_common:
            seg_most_common_records.append(record)
        elif record.description.split('|')[-1] == second_most_common:
            seg_second_most_common_records.append(record)
            
            
    SeqIO.write(seg_most_common_records, f'rotavirusA_{seg}_{most_common}.fasta', "fasta")
    SeqIO.write(seg_second_most_common_records, f'rotavirusA_{seg}_{second_most_common}.fasta', "fasta")
    
            

In [95]:
#find the most prevalent genotype constallation 
#'G1_P[8]_I1_R1_C1_M1_A1_N1_T1_E1_H1' is the most prevalent genotype constellation
#for now I will hardcode this genotype as THE rotavirusA genotype for adaptation analysis
genotype_constellations = {}

for seg, seqs in seq_records.items():

    genotypes_observed = []
    for record in seqs:
        genotypes_observed.append(record.description.split('|')[-2])
        
    genotype_constellations[seg] = Counter(genotypes_observed).most_common(3)
    
for seg, c in genotype_constellations.items():
    print(seg)
    print(c)

G
[('G1_P[8]_I1_R1_C1_M1_A1_N1_T1_E1_H1', 497), ('G9_P[x]_Ix_Rx_Cx_Mx_Ax_Nx_Tx_Ex_Hx', 423), ('G1_P[8]_Ix_Rx_Cx_Mx_Ax_Nx_Tx_Ex_Hx', 335)]
P
[('G1_P[8]_I1_R1_C1_M1_A1_N1_T1_E1_H1', 497), ('Gx_P[8]_Ix_Rx_Cx_Mx_Ax_Nx_Tx_Ex_Hx', 296), ('G1_P[8]_I2_R2_C2_M2_A2_N2_T2_E2_H2', 192)]
I
[('G1_P[8]_I1_R1_C1_M1_A1_N1_T1_E1_H1', 498), ('G1_P[8]_I2_R2_C2_M2_A2_N2_T2_E2_H2', 192), ('G2_P[4]_I2_R2_C2_M2_A2_N2_T2_E2_H2', 182)]
R
[('G1_P[8]_I1_R1_C1_M1_A1_N1_T1_E1_H1', 496), ('G1_P[8]_I2_R2_C2_M2_A2_N2_T2_E2_H2', 192), ('G2_P[4]_I2_R2_C2_M2_A2_N2_T2_E2_H2', 184)]
C
[('G1_P[8]_I1_R1_C1_M1_A1_N1_T1_E1_H1', 498), ('G1_P[8]_I2_R2_C2_M2_A2_N2_T2_E2_H2', 192), ('G2_P[4]_I2_R2_C2_M2_A2_N2_T2_E2_H2', 182)]
M
[('G1_P[8]_I1_R1_C1_M1_A1_N1_T1_E1_H1', 498), ('G1_P[8]_I2_R2_C2_M2_A2_N2_T2_E2_H2', 192), ('G2_P[4]_I2_R2_C2_M2_A2_N2_T2_E2_H2', 178)]
A
[('G1_P[8]_I1_R1_C1_M1_A1_N1_T1_E1_H1', 495), ('G1_P[8]_I2_R2_C2_M2_A2_N2_T2_E2_H2', 192), ('G2_P[4]_I2_R2_C2_M2_A2_N2_T2_E2_H2', 183)]
N
[('G1_P[8]_I1_R1_C1_M1_A1_N1_T1_