In [41]:
import pandas as pd
from Bio.Seq import Seq
import matplotlib.pyplot as plt
import seaborn as sns

In [111]:
def translate_linked_codons(codons, muts_to_look_for, wildtype_genotype):
    """Print which amino-acid genotypes co-occur at two linked codon positions.

    Reads ``linkage_<codons[0]>-<codons[1]>.tsv`` from the working directory:
    a tab-separated table with one header row and two columns, the linked
    nucleotide codons written as ``codon1_codon2`` and the number of
    sequences observed with that pair. Rows where either codon contains an
    ambiguous base or a gap are dropped, the remaining codons are translated,
    counts of codon pairs that give the same amino-acid pair are summed, and
    a plain-text summary is printed.

    Parameters
    ----------
    codons : sequence
        The two codon positions, e.g. ``[453, 501]``. Used to build the file
        name and to label the report.
    muts_to_look_for : sequence of str
        One amino acid per position. For each, every amino acid observed at
        the *other* position alongside it is listed with its sequence count.
    wildtype_genotype : sequence of str
        Currently unused; accepted so existing calls keep working.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    pos1, pos2 = str(codons[0]), str(codons[1])
    pair_col = f'{pos1}_{pos2}'

    # Counts of every nucleotide genotype observed at these two positions in
    # the entire aligned GISAID file; file format is codon1_codon2 | count.
    tsv_file = f'linkage_{codons[0]}-{codons[1]}.tsv'
    linkage_df = pd.read_csv(tsv_file, sep='\t', names=[pair_col, 'count'], header=0)

    # IUPAC nucleotide ambiguity codes plus the alignment gap character.
    ambiguous_bases = set('N-WSVMKYRHDB')

    translated = {}  # codon -> amino acid; each distinct codon is translated once
    records = []
    for genotype, count in zip(linkage_df[pair_col], linkage_df['count']):
        # Upper-case first so lowercase ambiguity codes are caught as well.
        codon1, codon2 = str(genotype).upper().split('_')
        # Only keep definitive sequences, with no ambiguous bases.
        if ambiguous_bases.intersection(codon1 + codon2):
            continue
        for codon in (codon1, codon2):
            if codon not in translated:
                translated[codon] = str(Seq(codon).translate())
        records.append({pos1: translated[codon1], pos2: translated[codon2], 'count': count})

    if records:
        # Aggregate identical amino-acid genotypes produced by different codons.
        aa_df = (
            pd.DataFrame(records)
            .groupby([pos1, pos2])['count']
            .sum()
            .reset_index()
        )
        seq_total = aa_df['count'].sum()
    else:
        # No unambiguous rows: keep the expected columns so the report below
        # still prints its headers and a total of zero instead of raising.
        aa_df = pd.DataFrame(columns=[pos1, pos2, 'count'])
        seq_total = 0

    # Summary: one section per mutation of interest.
    for focus_pos, focus_mut, partner_pos in (
        (pos1, muts_to_look_for[0], pos2),
        (pos2, muts_to_look_for[1], pos1),
    ):
        print(f'ALL OBSERVED SEQUENCES WITH {focus_pos}{focus_mut}:')
        carriers = aa_df[aa_df[focus_pos] == focus_mut]
        # After aggregation each amino-acid pair is exactly one row, so walk
        # the rows directly rather than calling int() on a one-row Series,
        # which recent pandas deprecates.
        for partner_aa, n_seqs in zip(carriers[partner_pos], carriers['count']):
            print(f'{focus_pos}{focus_mut} observed with {partner_pos}{partner_aa}({int(n_seqs)})')
        print('\n')

    print(f'TOTAL NUMBER OF SEQUENCES THAT ARE UNAMBIGUOUS AT {pos1} and {pos2}: {seq_total}')
    
    
    
    
    

In [112]:
# Report the genotypes observed together with 453F and with 501T.
translate_linked_codons(
    codons=[453, 501],
    muts_to_look_for=['F', 'T'],
    wildtype_genotype=['Y', 'N'],
)

ALL OBSERVED SEQUENCES WITH 453F:
453F observed with 501N(2069)
453F observed with 501Y(20)


ALL OBSERVED SEQUENCES WITH 501T:
501T observed with 453Y(5835)


TOTAL NUMBER OF SEQUENCES THAT ARE UNAMBIGUOUS AT 453 and 501: 8410875


In [113]:
# Report the genotypes observed together with 453F and with 486L.
translate_linked_codons(
    codons=[453, 486],
    muts_to_look_for=['F', 'L'],
    wildtype_genotype=['Y', 'F'],
)

ALL OBSERVED SEQUENCES WITH 453F:
453F observed with 486F(2092)
453F observed with 486S(1)


ALL OBSERVED SEQUENCES WITH 486L:
486L observed with 453Y(425)


TOTAL NUMBER OF SEQUENCES THAT ARE UNAMBIGUOUS AT 453 and 486: 8545522
