In [355]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [356]:
import sys
sys.path.append('../')

In [357]:
import pandas as pd
from src.mutations import *
import src.bjorn_support as bs
from src.data import GENE2POS

In [358]:
# Input files for the comparison: the aligned FASTA plus the "old" and "new" mutation tables.
alignment_filepath, mutations_1_filepath, mutations_2_filepath = (
    "/Users/al/Documents/scripps/analysis/bjorn/issue22/gisaid_b16172_2021_07_17_01.aligned.fasta",
    "/Users/al/Documents/scripps/analysis/bjorn/issue22/gisaid_samples_2021_07_17_01.mutations_old.csv",
    "/Users/al/Documents/scripps/analysis/bjorn/issue22/gisaid_samples_2021_07_17_01.mutations_new.csv",
)

In [359]:
# Load the "old" mutation table and drop rows whose gene is 'Non-coding region'.
m1 = pd.read_csv(mutations_1_filepath)
m1 = m1[m1['gene'].ne('Non-coding region')]
m1.columns

Index(['idx', 'seq_len', 'replacements', 'pos', 'gene', 'gene_start_pos',
       'codon_num', 'codon_start', 'ref_codon', 'alt_codon', 'ref_aa',
       'alt_aa', 'mutation', 'type', 'del_positions', 'del_len',
       'relative_coords', 'absolute_coords', 'del_seq', 'prev_5nts',
       'next_5nts', 'pos_in_codon', 'deletion_codon_coords', 'is_frameshift',
       'is_synonymous'],
      dtype='object')

In [360]:
# drop redundant amino acid mutations per sample due to multi-nucleotide point mutations
# (reassign rather than inplace=True: m1 is itself the result of a row filter, and
# reassigning avoids mutating a slice while keeping the cell safe to re-run)
m1 = m1.drop_duplicates(subset=['idx', 'mutation'])

In [361]:
# Number of distinct samples (idx) carrying each (gene, mutation) pair in the old table.
m1_aggregated = (
    m1.groupby(['gene', 'mutation'])['idx']
      .nunique()
      .rename('num_samples')
      .reset_index()
)

In [362]:
# Load the "new" mutation table and drop rows whose gene is 'Non-coding region'.
m2 = pd.read_csv(mutations_2_filepath)
m2 = m2[m2['gene'].ne('Non-coding region')]
m2.columns

Index(['idx', 'seq_len', 'replacements', 'pos', 'gene', 'gene_start_pos',
       'codon_num', 'codon_start', 'ref_codon', 'alt_codon', 'ref_aa',
       'alt_aa', 'mutation', 'type', 'del_positions', 'del_len',
       'relative_coords', 'absolute_coords', 'del_seq', 'prev_5nts',
       'next_5nts', 'pos_in_codon', 'deletion_start_position',
       'deletion_start_codon', 'deletion_end_codon', 'deletion_name',
       'deletion_codon_coords', 'is_frameshift', 'oof_backshift_signal',
       'is_synonymous'],
      dtype='object')

In [363]:
# Deletion display names found in m1 (the "main" table)
m1.loc[m1['type'].eq('deletion'), 'mutation'].unique()

array(['ORF1a:DEL3675/3677', 'S:DEL69/70', 'S:DEL144/145', 'S:DEL157/158',
       'ORF8:DEL119/120', 'S:DEL247/253', 'ORF7a:DEL70/77',
       'ORF7a:DEL97/100'], dtype=object)

In [365]:
# Deletion display names found in m2 (the "issue22_fix" table)
m2.loc[m2['type'].eq('deletion'), 'mutation'].unique()

array(['ORF1a:DEL3676/3678', 'S:DEL69/70', 'S:DEL144/145', 'S:DEL157/158',
       'ORF8:DEL120/121', 'S:DEL247/253', 'ORF7a:DEL70/78',
       'ORF7a:DEL97/101'], dtype=object)

In [366]:
# Fractional-codon deletion coordinates (deletion_name column) in m2 (the "issue22_fix" table)
m2.loc[m2['type'].eq('deletion'), 'deletion_name'].unique()

array(['ORF1a:DEL3675.3/3678.3', 'S:DEL68.7/70.7', 'S:DEL144.0/145.0',
       'S:DEL156.7/158.7', 'ORF8:DEL119.3/121.3', 'S:DEL246.7/253.7',
       'ORF7a:DEL69.7/78.0', 'ORF7a:DEL96.7/101.3'], dtype=object)

In [367]:
# Mutations that appear in m2 but not in m1, with the count and list of samples carrying each.
prev_muts = m1['mutation'].unique()
is_new_mutation = ~m2['mutation'].isin(prev_muts)
ans = (
    m2[is_new_mutation]
    .groupby(['mutation'])
    .agg(num_samples=('idx', 'nunique'), samples=('idx', 'unique'))
    .reset_index()
)
ans['mutation'].unique()

array(['ORF1a:DEL3676/3678', 'ORF1a:S3675K', 'ORF7a:DEL70/78',
       'ORF7a:DEL97/101', 'ORF8:D119I', 'ORF8:DEL120/121', 'S:E156G',
       'S:I68I', 'S:R246N'], dtype=object)

In [336]:
# Inspect the oof_backshift_signal column of m2 (the output shows NaN on the leading rows).
m2['oof_backshift_signal']

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
      ... 
838    2.0
839    2.0
840    1.0
841    1.0
842    1.0
Name: oof_backshift_signal, Length: 825, dtype: float64

In [368]:
# Re-check the distinct deletion_name values among m2's deletion rows.
m2.loc[m2['type'].eq('deletion'), 'deletion_name'].unique()

array(['ORF1a:DEL3675.3/3678.3', 'S:DEL68.7/70.7', 'S:DEL144.0/145.0',
       'S:DEL156.7/158.7', 'ORF8:DEL119.3/121.3', 'S:DEL246.7/253.7',
       'ORF7a:DEL69.7/78.0', 'ORF7a:DEL96.7/101.3'], dtype=object)

In [369]:
# Distinct-sample counts per (gene, mutation) in m2, then the rows for gene 'S'.
m2_aggregated = (
    m2.groupby(['gene', 'mutation'])['idx']
      .nunique()
      .rename('num_samples')
      .reset_index()
)
m2_aggregated.loc[m2_aggregated['gene'].eq('S')]

Unnamed: 0,gene,mutation,num_samples
185,S,S:A570D,6
186,S,S:A575A,1
187,S,S:D1118H,5
188,S,S:D614G,19
189,S,S:D950N,6
190,S,S:DEL144/145,6
191,S,S:DEL157/158,6
192,S,S:DEL247/253,5
193,S,S:DEL69/70,6
194,S,S:E156G,5


In [370]:
# drop redundant amino acid mutations per sample due to multi-nucleotide point mutations
# (reassign rather than inplace=True: m2 is itself the result of a row filter, and
# reassigning avoids mutating a slice while keeping the cell safe to re-run)
m2 = m2.drop_duplicates(subset=['idx', 'mutation'])

In [371]:
# Deletion display name next to its fractional start/end codon coordinates.
deletion_columns = ['mutation', 'deletion_start_codon', 'deletion_end_codon']
m2.loc[m2['type'].eq('deletion'), deletion_columns]

Unnamed: 0,mutation,deletion_start_codon,deletion_end_codon
684,ORF1a:DEL3676/3678,3675.3,3678.3
685,S:DEL69/70,68.7,70.7
686,S:DEL144/145,144.0,145.0
688,ORF1a:DEL3676/3678,3675.3,3678.3
689,S:DEL69/70,68.7,70.7
690,S:DEL144/145,144.0,145.0
691,ORF1a:DEL3676/3678,3675.3,3678.3
692,S:DEL69/70,68.7,70.7
693,S:DEL144/145,144.0,145.0
694,ORF1a:DEL3676/3678,3675.3,3678.3


In [217]:
# Rebuild the per-(gene, mutation) sample counts for m2 and collect the mutation names as a set.
m2_aggregated = (
    m2.groupby(['gene', 'mutation'])['idx']
      .nunique()
      .rename('num_samples')
      .reset_index()
)
m2_mutations = {*m2_aggregated['mutation'].unique()}

In [218]:
# Display the set of mutation names collected from m2_aggregated.
m2_mutations

{'5UTR:R81C',
 '5UTR:V70V',
 'M:I82T',
 'N:D377Y',
 'N:D63G',
 'N:G215C',
 'N:L139F',
 'N:R203M',
 'N:R385K',
 'N:T205I',
 'Non-coding region:D9853Y',
 'Non-coding region:DEL0',
 'ORF1a:A1306S',
 'ORF1a:A3209V',
 'ORF1a:A498V',
 'ORF1a:D2907D',
 'ORF1a:D3936D',
 'ORF1a:D827D',
 'ORF1a:D828D',
 'ORF1a:DEL3675/3677',
 'ORF1a:E1196V',
 'ORF1a:F924F',
 'ORF1a:G334G',
 'ORF1a:H2092Y',
 'ORF1a:I1232I',
 'ORF1a:I1274I',
 'ORF1a:I431M',
 'ORF1a:I671I',
 'ORF1a:K261N',
 'ORF1a:L3201P',
 'ORF1a:M3934I',
 'ORF1a:N1709N',
 'ORF1a:P1640L',
 'ORF1a:P2046L',
 'ORF1a:P2287S',
 'ORF1a:P309L',
 'ORF1a:S2500S',
 'ORF1a:S3675K',
 'ORF1a:S3949N',
 'ORF1a:T265I',
 'ORF1a:T3255I',
 'ORF1a:T3646A',
 'ORF1a:T3750I',
 'ORF1a:V2930L',
 'ORF1a:V3420V',
 'ORF1a:V3689V',
 'ORF1a:V3718A',
 'ORF1a:V665I',
 'ORF1a:Y4227Y',
 'ORF1b:A1918V',
 'ORF1b:D1926D',
 'ORF1b:G662S',
 'ORF1b:H2285Y',
 'ORF1b:H590H',
 'ORF1b:K2310N',
 'ORF1b:L2265L',
 'ORF1b:M115I',
 'ORF1b:M2269I',
 'ORF1b:M809V',
 'ORF1b:P1000L',
 'ORF1b:P314L',

In [219]:
# Display the set of mutation names from the old table.
# NOTE(review): in the visible notebook m1_mutations is first assigned in the following
# cell (In [220]); on a fresh kernel this cell needs that assignment to run first.
m1_mutations

{'5UTR:R81C',
 '5UTR:V70V',
 'M:I82T',
 'N:D377Y',
 'N:D63G',
 'N:G215C',
 'N:L139F',
 'N:R203M',
 'N:R385K',
 'N:T205I',
 'Non-coding region:D9853Y',
 'Non-coding region:DEL0',
 'ORF1a:A1306S',
 'ORF1a:A3209V',
 'ORF1a:A498V',
 'ORF1a:D2907D',
 'ORF1a:D3936D',
 'ORF1a:D827D',
 'ORF1a:D828D',
 'ORF1a:DEL3675/3677',
 'ORF1a:E1196V',
 'ORF1a:F924F',
 'ORF1a:G334G',
 'ORF1a:H2092Y',
 'ORF1a:I1232I',
 'ORF1a:I1274I',
 'ORF1a:I431M',
 'ORF1a:I671I',
 'ORF1a:K261N',
 'ORF1a:L3201P',
 'ORF1a:M3934I',
 'ORF1a:N1709N',
 'ORF1a:P1640L',
 'ORF1a:P2046L',
 'ORF1a:P2287S',
 'ORF1a:P309L',
 'ORF1a:S2500S',
 'ORF1a:S3949N',
 'ORF1a:T265I',
 'ORF1a:T3255I',
 'ORF1a:T3646A',
 'ORF1a:T3750I',
 'ORF1a:V2930L',
 'ORF1a:V3420V',
 'ORF1a:V3689V',
 'ORF1a:V3718A',
 'ORF1a:V665I',
 'ORF1a:Y4227Y',
 'ORF1b:A1918V',
 'ORF1b:D1926D',
 'ORF1b:G662S',
 'ORF1b:H2285Y',
 'ORF1b:H590H',
 'ORF1b:K2310N',
 'ORF1b:L2265L',
 'ORF1b:M115I',
 'ORF1b:M2269I',
 'ORF1b:M809V',
 'ORF1b:P1000L',
 'ORF1b:P314L',
 'ORF1b:R1078C',

In [220]:
# Mutation names in m1, followed by the names that occur only in m2.
m1_mutations = {*m1_aggregated['mutation'].unique()}
m2_mutations.difference(m1_mutations)

{'ORF1a:S3675K', 'ORF8:D119I', 'S:DEL144', 'S:E156G'}

## DEV

In [165]:
# Parameters used by the DEV cells below.
patient_zero = "NC_045512.2"        # id passed to process_cns_seqs
test = False                        # when True, the DEV cells sample 100 sequences
gene2pos = GENE2POS
start_pos, end_pos = 265, 29674     # bounds passed to process_cns_seqs
min_del_len, max_del_len = 1, 500   # deletion-length filter bounds
min_seq_len = 20000                 # sequences must be longer than this to be kept
max_num_subs = 5000                 # sequences must have fewer replacements than this
cns = bs.load_fasta(alignment_filepath, is_aligned=True, is_gzip=False)

In [173]:
# DEV walk-through of deletion calling: build one row per (sample, deletion), annotate it
# with gene/codon information, name it, then append the out-of-frame replacements.
# Relies on names defined in other cells or imported modules (cns, patient_zero, start_pos,
# end_pos, test, gene2pos, the min_*/max_* thresholds, gc and the src.mutations helpers).
seqs, ref_seq = process_cns_seqs(cns, patient_zero, start_pos, end_pos)
print(f"Initial cleaning...")
# load into dataframe: one row per sequence, keyed by 'idx'
seqsdf = (pd.DataFrame(index=seqs.keys(), data=seqs.values(), 
                       columns=['sequence'])
            .reset_index().rename(columns={'index': 'idx'}))
if test:
        seqsdf = seqsdf.sample(100)
# compute length of each sequence
seqsdf['seq_len'] = seqsdf['sequence'].str.len()
# keep only sequences longer than min_seq_len
seqsdf = seqsdf[seqsdf['seq_len']>min_seq_len]
print(f"Identifying deletions...")
# identify deletion positions
seqsdf['del_positions'] = seqsdf['sequence'].apply(find_deletions)
# dump sequences to save mem, boost speed
seqsdf.drop(columns=['sequence'], inplace=True)
gc.collect();
# sequences with one or more deletions
seqsdf = seqsdf.loc[seqsdf['del_positions'].str.len() > 0]
# sequences with fewer than max_del_len deletions
# (note: the same threshold is reused below as the per-deletion length cap)
seqsdf = seqsdf.loc[seqsdf['del_positions'].str.len() < max_del_len]
# one row per deletion
seqsdf = seqsdf.explode('del_positions')
# compute length of each deletion
seqsdf['del_len'] = seqsdf['del_positions'].apply(len)
# only consider deletions of at least min_del_len nts
seqsdf = seqsdf[seqsdf['del_len'] >= min_del_len]
# only consider deletions shorter than max_del_len nts
seqsdf = seqsdf[seqsdf['del_len'] < max_del_len]
# fetch coordinates of each deletion
seqsdf['relative_coords'] = seqsdf['del_positions'].apply(get_indel_coords)
seqsdf['type'] = 'deletion'
# adjust coordinates to account for the nts trimmed from beginning e.g. 265nts
seqsdf['absolute_coords'] = seqsdf['relative_coords'].apply(adjust_coords, args=(start_pos+1,))
# provisional 'pos' = deletion start + 1, used for gene mapping and codon numbering;
# it is overwritten further down with the start coordinate itself (no +1)
seqsdf['pos'] = seqsdf['absolute_coords'].apply(lambda x: int(x.split(':')[0])+1)
print(f"Mapping Genes to mutations...")
# approximate the gene where each deletion was identified
seqsdf['gene'] = seqsdf['pos'].apply(map_gene_to_pos)
seqsdf.loc[seqsdf['gene'].isna(), 'gene'] = 'Non-coding region'
# also relabel deletions whose gene came back as the string 'nan'
seqsdf.loc[seqsdf['gene']=='nan', 'gene'] = 'Non-coding region'
print(f"Computing codon numbers...")
seqsdf['codon_num'] = seqsdf.apply(compute_codon_num, args=(gene2pos,), axis=1)
# NOTE(review): the next two progress messages are printed but no reference-codon or
# amino-acid computation happens in this cell at that point
print(f"Fetching reference codon...")
print(f"Mapping amino acids...")
# record the deletion subsequence
seqsdf['del_seq'] = seqsdf['absolute_coords'].apply(get_deletion, args=(ref_seq,))
# record the 5 nts before each deletion (based on reference seq)
seqsdf['prev_5nts'] = seqsdf['absolute_coords'].apply(lambda x: ref_seq[int(x.split(':')[0])-5:int(x.split(':')[0])])
# record the 5 nts after each deletion (based on reference seq)
seqsdf['next_5nts'] = seqsdf['absolute_coords'].apply(lambda x: ref_seq[int(x.split(':')[1])+1:int(x.split(':')[1])+6])
print("Naming deletions")
# final 'pos' = start coordinate of the deletion (replaces the provisional value above)
seqsdf['pos'] = seqsdf['absolute_coords'].apply(lambda x: int(x.split(':')[0]))
# for deletions the 'ref_codon' column holds the deleted subsequence
seqsdf['ref_codon'] = seqsdf['del_seq'].copy()
# gene start from gene2pos; 0 when the gene has no entry
seqsdf['gene_start_pos'] = seqsdf['gene'].apply(lambda x: gene2pos.get(x, {}).get('start', 0))
# offset of the deletion start relative to the gene start, modulo 3
seqsdf['pos_in_codon'] = (seqsdf['pos'] - seqsdf['gene_start_pos']) % 3
seqsdf['mutation'] = seqsdf[['pos_in_codon', 'gene', 'codon_num', 'del_len']].apply(assign_deletion_v2, axis=1)
seqsdf['deletion_start_position'] = seqsdf['absolute_coords'].apply(lambda x: int(x.split(':')[0]))
seqsdf['deletion_start_codon'] = seqsdf[['pos_in_codon', 'codon_num', 'del_len']].apply(assign_deletion_start_number, axis=1)
seqsdf['deletion_codon_coords'] = seqsdf[['pos_in_codon', 'gene', 'codon_num', 'del_len']].apply(assign_deletion_codon_coords, axis=1)
seqsdf['is_frameshift'] = seqsdf['del_len'].apply(is_frameshift)
# a copy is passed because identify_oof_replacements_per_sample assigns a column on its argument
oof_mutations = identify_oof_replacements_per_sample(seqsdf.copy(), cns)
# append the out-of-frame replacement rows to the deletion rows
seqsdf = pd.concat([seqsdf, oof_mutations])

Initial cleaning...
Identifying deletions...
Mapping Genes to mutations...
Computing codon numbers...
Fetching reference codon...
Mapping amino acids...
Naming deletions
Mapping Genes to mutations...
Computing codon numbers...
Fetching reference codon...
Fetching alternative codon...
Mapping amino acids...
Naming substitutions


In [174]:
# Spot-check the deletion names against codon number, absolute coordinates and gene start.
seqsdf[['mutation', 'codon_num', 'absolute_coords', 'gene_start_pos']]

Unnamed: 0,mutation,codon_num,absolute_coords,gene_start_pos
1,ORF1a:DEL3675/3677,3675,11288:11296,265
1,S:DEL144,144,21991:21993,21562
1,Non-coding region:DEL0,0,28271:28271,0
2,Non-coding region:DEL0,0,28271:28271,0
3,S:DEL157/158,157,22029:22034,21562
3,ORF8:DEL119/120,119,28248:28253,27893
3,Non-coding region:DEL0,0,28271:28271,0
4,S:DEL157/158,157,22029:22034,21562
4,ORF8:DEL119/120,119,28248:28253,27893
4,Non-coding region:DEL0,0,28271:28271,0


In [124]:
# DEV walk-through of substitution calling over positions 0..29674.
seqs, ref_seq = process_cns_seqs(cns, patient_zero, start_pos=0, end_pos=29674)
# one row per sequence, keyed by 'idx'
seqsdf = pd.DataFrame(index=seqs.keys(), data=seqs.values(), columns=['sequence'])
seqsdf = seqsdf.reset_index().rename(columns={'index': 'idx'})
if test:
    seqsdf = seqsdf.sample(100)
# keep only sequences longer than min_seq_len
seqsdf['seq_len'] = seqsdf['sequence'].str.len()
seqsdf = seqsdf[seqsdf['seq_len'] > min_seq_len]
print(f"Identifying mutations...")
# per sample, the list of substitutions (position:alt) relative to ref_seq
seqsdf['replacements'] = seqsdf['sequence'].apply(find_replacements, args=(ref_seq,))
# keep samples with at least one and fewer than max_num_subs substitutions
num_replacements = seqsdf['replacements'].str.len()
seqsdf = seqsdf.loc[(num_replacements > 0) & (num_replacements < max_num_subs)]
# keep the surviving sequences in a dict, then drop them from the frame to save memory
seqs = dict(zip(seqsdf['idx'], seqsdf['sequence']))
seqsdf = seqsdf.drop(columns=['sequence'])
seqsdf = compute_replacement_information(seqsdf, seqs)

Identifying mutations...
Mapping Genes to mutations...
Computing codon numbers...
Fetching reference codon...
Fetching alternative codon...
Mapping amino acids...
Naming substitutions


In [125]:
# Keep an independent copy of the current seqsdf under a separate name, so later cells
# that reassign seqsdf do not affect it.
seqsdf_subs = seqsdf.copy()

In [140]:
# Rows of the substitution table named S:D614G.
seqsdf_subs.loc[seqsdf_subs['mutation'].eq('S:D614G')]

Unnamed: 0,idx,seq_len,replacements,pos,gene,gene_start_pos,codon_num,codon_start,ref_codon,alt_codon,ref_aa,alt_aa,mutation,type
1,hCoV-19/USA/NY-PRL-2021_03_15_00K10/2021|EPI_I...,29674,23402:G,23402,S,21562,614,23401,GAT,GGT,D,G,S:D614G,substitution
2,hCoV-19/USA/NY-PRL-2021_03_12_00C01/2021|EPI_I...,29674,23402:G,23402,S,21562,614,23401,GAT,GGT,D,G,S:D614G,substitution
3,hCoV-19/England/CAMC-142DAEA/2021|EPI_ISL_1454...,29674,23402:G,23402,S,21562,614,23401,GAT,GGT,D,G,S:D614G,substitution
4,hCoV-19/Australia/WA658/2021|EPI_ISL_1508996|2...,29674,23402:G,23402,S,21562,614,23401,GAT,GGT,D,G,S:D614G,substitution
5,hCoV-19/France/IDF-HMN-21072080202/2021|EPI_IS...,29674,23402:G,23402,S,21562,614,23401,GAT,GGT,D,G,S:D614G,substitution
6,hCoV-19/France/IDF-HMN-21072080424/2021|EPI_IS...,29674,23402:G,23402,S,21562,614,23401,GAT,GGT,D,G,S:D614G,substitution
7,hCoV-19/France/IDF-HMN-21072090063/2021|EPI_IS...,29674,23402:G,23402,S,21562,614,23401,GAT,GGT,D,G,S:D614G,substitution
8,hCoV-19/France/IDF-HMN-21072080265/2021|EPI_IS...,29674,23402:G,23402,S,21562,614,23401,GAT,GGT,D,G,S:D614G,substitution


In [149]:
def compute_out_of_frame_backshift(x):
    """Map the fractional part of a codon coordinate to a backshift signal.

    The fractional part of ``x`` is rounded to one decimal place; a value of
    0.3 yields 1, a value of 0.7 yields 2 and anything else yields 0.
    """
    fractional_part, _whole_part = np.modf(x)
    signal = np.round(fractional_part, 1)
    return 1 if signal == 0.3 else (2 if signal == 0.7 else 0)

In [150]:
def find_oof_replacements(x, seqs):
    """List the nucleotides found at the codon where an out-of-frame deletion begins.

    Parameters
    ----------
    x : mapping-like row
        Must provide 'idx', 'deletion_start_position' and 'oof_backshift_signal'.
    seqs : mapping
        Maps each 'idx' to its sequence string.

    Returns
    -------
    list of str
        'position:nucleotide' strings for the first three non-gap characters found
        from ``deletion_start_position - oof_backshift_signal`` onwards, labelled with
        consecutive positions starting there. Characters equal to 'n' or 'N' are omitted.
    """
    # Both values can arrive as floats (columns become float64 once NaN is present);
    # slicing needs an int and the label must not render as e.g. '11287.0:A'.
    start_pos = int(x['deletion_start_position']) - int(x['oof_backshift_signal'])
    # skip the gap characters of the deletion and read the next three bases
    sub_seq = seqs[x['idx']][start_pos:].replace('-', '')[:3]
    # case-insensitive so that ambiguous calls are skipped in either casing
    return [f'{start_pos + i}:{n}' for i, n in enumerate(sub_seq) if n.lower() != 'n']

In [172]:
def identify_oof_replacements_per_sample(dels_df: pd.DataFrame, cns, 
                                         patient_zero: str="NC_045512.2") -> pd.DataFrame:
    """Derive replacement rows for non-frameshift deletions that start out of frame.

    Parameters
    ----------
    dels_df : pd.DataFrame
        Deletion table providing 'idx', 'deletion_start_position',
        'deletion_start_codon' and 'is_frameshift'. An 'oof_backshift_signal'
        column is written onto this frame in place; pass a copy to avoid that.
    cns
        Alignment object handed to ``process_cns_seqs``.
    patient_zero : str
        Id handed to ``process_cns_seqs``.

    Returns
    -------
    pd.DataFrame
        The output of ``compute_replacement_information`` for the selected deletions.
        When no deletion qualifies, an empty frame holding only the columns
        'idx', 'deletion_start_position' and 'oof_backshift_signal'.
    """
    # only the sequences are needed here; the second return value is not used
    seqs, _ = process_cns_seqs(cns, patient_zero,
                               start_pos=0, end_pos=29674)
    cols = ['idx', 'deletion_start_position', 'oof_backshift_signal']
    dels_df['oof_backshift_signal'] = dels_df['deletion_start_codon'].apply(compute_out_of_frame_backshift)
    # in-frame-length deletions (not frameshifts) whose start is shifted within the codon
    oof_filter = (dels_df['is_frameshift'] == False) & (dels_df['oof_backshift_signal'] > 0)
    oof_df = dels_df.loc[oof_filter, cols].copy()
    if oof_df.empty:
        # A row-wise apply on an empty frame returns a DataFrame, which cannot be
        # assigned to the single 'replacements' column below; nothing to compute anyway.
        return oof_df
    # compute length of each sequence
    oof_df['seq_len'] = oof_df['idx'].apply(lambda x: len(seqs[x]))
    oof_df['replacements'] = oof_df.apply(find_oof_replacements, args=(seqs,), axis=1)
    return compute_replacement_information(oof_df, seqs, mutation_type='out-of-frame')

In [152]:
# Run the out-of-frame replacement detection on seqsdf and show the result.
# Note: seqsdf is passed directly (no .copy()), so the function's assignment to
# dels_df['oof_backshift_signal'] also adds that column to seqsdf.
oof = identify_oof_replacements_per_sample(seqsdf, cns)
oof

Mapping Genes to mutations...
Computing codon numbers...
Fetching reference codon...
Fetching alternative codon...
Mapping amino acids...
Naming substitutions


Unnamed: 0,idx,deletion_start_position,oof_backshift_signal,seq_len,replacements,pos,gene,gene_start_pos,codon_num,codon_start,ref_codon,alt_codon,ref_aa,alt_aa,mutation,type
1,hCoV-19/USA/NY-PRL-2021_03_15_00K10/2021|EPI_I...,11288,1,29674,11287:A,11287,ORF1a,265,3675,11287,TCT,AAG,S,K,ORF1a:S3675K,substitution
1,hCoV-19/USA/NY-PRL-2021_03_15_00K10/2021|EPI_I...,11288,1,29674,11288:A,11288,ORF1a,265,3675,11287,TCT,AAG,S,K,ORF1a:S3675K,substitution
1,hCoV-19/USA/NY-PRL-2021_03_15_00K10/2021|EPI_I...,11288,1,29674,11289:G,11289,ORF1a,265,3675,11287,TCT,AAG,S,K,ORF1a:S3675K,substitution
3,hCoV-19/England/CAMC-142DAEA/2021|EPI_ISL_1454...,22029,2,29674,22027:G,22027,S,21562,156,22027,GAG,GGA,E,G,S:E156G,substitution
3,hCoV-19/England/CAMC-142DAEA/2021|EPI_ISL_1454...,22029,2,29674,22028:G,22028,S,21562,156,22027,GAG,GGA,E,G,S:E156G,substitution
3,hCoV-19/England/CAMC-142DAEA/2021|EPI_ISL_1454...,22029,2,29674,22029:A,22029,S,21562,156,22027,GAG,GGA,E,G,S:E156G,substitution
3,hCoV-19/England/CAMC-142DAEA/2021|EPI_ISL_1454...,28248,1,29674,28247:A,28247,ORF8,27893,119,28247,GAT,ATC,D,I,ORF8:D119I,substitution
3,hCoV-19/England/CAMC-142DAEA/2021|EPI_ISL_1454...,28248,1,29674,28248:T,28248,ORF8,27893,119,28247,GAT,ATC,D,I,ORF8:D119I,substitution
3,hCoV-19/England/CAMC-142DAEA/2021|EPI_ISL_1454...,28248,1,29674,28249:C,28249,ORF8,27893,119,28247,GAT,ATC,D,I,ORF8:D119I,substitution
4,hCoV-19/Australia/WA658/2021|EPI_ISL_1508996|2...,22029,2,29674,22027:G,22027,S,21562,156,22027,GAG,GGA,E,G,S:E156G,substitution


In [146]:
def compute_replacement_information(seqsdf: pd.DataFrame, seqs,
                                    mutation_type: str="") -> pd.DataFrame:
    """Expand per-sample replacement lists into one annotated row per replacement.

    Parameters
    ----------
    seqsdf : pd.DataFrame
        Must provide 'idx' and a 'replacements' column holding lists of
        'position:nucleotide' strings.
    seqs
        Per-sample sequences, forwarded to the alternative-codon helper.
    mutation_type : str
        'out-of-frame' selects ``get_alt_oof_codon``; any other value selects
        ``get_alt_codon``.

    Returns
    -------
    pd.DataFrame
        One row per replacement with the added columns 'pos', 'gene',
        'gene_start_pos', 'codon_num', 'codon_start', 'ref_codon', 'alt_codon',
        'ref_aa', 'alt_aa', 'mutation' and 'type' (always 'substitution').
        Rows whose alternative amino acid is the string 'nan' are dropped.

    Notes
    -----
    Reads the module-level names ``gene2pos`` and ``ref_seq`` and calls
    ``map_gene_to_pos``, ``get_alt_codon``, ``get_alt_oof_codon`` and ``get_aa``,
    none of which are defined in this function.
    NOTE(review): ``ref_seq`` is indexed directly with 'codon_start', so it has to be
    in the same coordinates as 'pos' -- confirm which cell last assigned it.
    """
    # wide-to-long data manipulation
    seqsdf = seqsdf.explode('replacements')
    # rows without a replacement carry no substitution; copy so that the column
    # assignments below write to an independent frame rather than a slice
    seqsdf = seqsdf.loc[seqsdf['replacements'].notna()].copy()
    # position is the part of 'position:nucleotide' before the colon
    seqsdf['pos'] = seqsdf['replacements'].str.split(':', n=1).str[0].astype(int)
    print("Mapping Genes to mutations...")
    # identify gene of each substitution; unmapped positions become 'Non-coding region'
    seqsdf['gene'] = seqsdf['pos'].apply(map_gene_to_pos)
    unmapped = seqsdf['gene'].isna() | (seqsdf['gene'] == 'nan')
    seqsdf.loc[unmapped, 'gene'] = 'Non-coding region'
    print("Computing codon numbers...")
    # compute codon number of each substitution (gene start defaults to 0 when unknown)
    seqsdf['gene_start_pos'] = seqsdf['gene'].apply(lambda x: gene2pos.get(x, {}).get('start', 0))
    seqsdf['codon_num'] = np.ceil((seqsdf['pos'] - seqsdf['gene_start_pos'] + 1) / 3).astype(int)
    print("Fetching reference codon...")
    # fetch the reference codon for each substitution
    seqsdf['codon_start'] = seqsdf['gene_start_pos'] + (3*(seqsdf['codon_num'] - 1))
    seqsdf['ref_codon'] = seqsdf['codon_start'].apply(lambda x: ref_seq[x:x+3].upper())
    print("Fetching alternative codon...")
    # fetch the alternative codon for each substitution
    fetch_alt_codon = get_alt_oof_codon if mutation_type == 'out-of-frame' else get_alt_codon
    seqsdf['alt_codon'] = seqsdf[['idx', 'codon_start']].apply(fetch_alt_codon, args=(seqs,), axis=1)
    print("Mapping amino acids...")
    # fetch the reference and alternative amino acids
    seqsdf['ref_aa'] = seqsdf['ref_codon'].apply(get_aa)
    seqsdf['alt_aa'] = seqsdf['alt_codon'].apply(get_aa)
    # filter out substitutions with non-amino acid alternates (bad consensus calls)
    seqsdf = seqsdf.loc[seqsdf['alt_aa'] != 'nan'].copy()
    print("Naming substitutions")
    seqsdf['mutation'] = seqsdf['gene'] + ':' + seqsdf['ref_aa'] + seqsdf['codon_num'].astype(str) + seqsdf['alt_aa']
    seqsdf['type'] = 'substitution'
    return seqsdf

In [100]:
# Inspect deletion names next to the values they are derived from
# (codon number, coordinates, offset within the codon, start codon and length).
seqsdf[['mutation', 'codon_num', 'absolute_coords', 'pos_in_codon', 'deletion_start_codon', 'del_len']]

Unnamed: 0,mutation,codon_num,absolute_coords,pos_in_codon,deletion_start_codon,del_len
1,ORF1a:DEL3675/3677,3675,11288:11296,1,3675.3,9
1,S:DEL144,144,21991:21993,0,144.0,3
1,Non-coding region:DEL0,0,28271:28271,2,0.7,1
2,Non-coding region:DEL0,0,28271:28271,2,0.7,1
3,S:DEL157/158,157,22029:22034,2,157.7,6
3,ORF8:DEL119/120,119,28248:28253,1,119.3,6
3,Non-coding region:DEL0,0,28271:28271,2,0.7,1
4,S:DEL157/158,157,22029:22034,2,157.7,6
4,ORF8:DEL119/120,119,28248:28253,1,119.3,6
4,Non-coding region:DEL0,0,28271:28271,2,0.7,1


In [57]:
# The recorded output is a 270-row Series named deletion_codon_coords, not a list, so the
# frame being indexed was lost from this cell.
# NOTE(review): m1 is assumed because its rows carry these exact values -- confirm.
m1['deletion_codon_coords']

0                                                NaN
1                                                NaN
2                                                NaN
3                                                NaN
4                                                NaN
                           ...                      
265    ORF8:DEL119.66666666666667/121.66666666666667
266          Non-coding region:DEL0.6666666666666666
267                                 S:DEL157.0/159.0
268    ORF8:DEL119.66666666666667/121.66666666666667
269          Non-coding region:DEL0.6666666666666666
Name: deletion_codon_coords, Length: 270, dtype: object

In [30]:
# Display the m1 table (the notebook renders a truncated preview).
m1

Unnamed: 0,idx,seq_len,replacements,pos,gene,gene_start_pos,codon_num,codon_start,ref_codon,alt_codon,...,del_len,relative_coords,absolute_coords,del_seq,prev_5nts,next_5nts,pos_in_codon,deletion_codon_coords,is_frameshift,is_synonymous
0,hCoV-19/USA/NY-PRL-2021_03_15_00K10/2021|EPI_I...,29674,240:T,240,5UTR,0,81,240.0,CGT,TGT,...,,,,,,,,,,False
1,hCoV-19/USA/NY-PRL-2021_03_15_00K10/2021|EPI_I...,29674,3036:T,3036,ORF1a,265,924,3034.0,TTC,TTT,...,,,,,,,,,,True
2,hCoV-19/USA/NY-PRL-2021_03_15_00K10/2021|EPI_I...,29674,3851:T,3851,ORF1a,265,1196,3850.0,GAA,GTA,...,,,,,,,,,,False
3,hCoV-19/USA/NY-PRL-2021_03_15_00K10/2021|EPI_I...,29674,3960:T,3960,ORF1a,265,1232,3958.0,ATC,ATT,...,,,,,,,,,,True
4,hCoV-19/USA/NY-PRL-2021_03_15_00K10/2021|EPI_I...,29674,9866:C,9866,ORF1a,265,3201,9865.0,CTT,CCT,...,,,,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,hCoV-19/France/IDF-HMN-21072090063/2021|EPI_IS...,29409,,28248,ORF8,27895,119,,TTTCAT,,...,6.0,27982:27987,28248:28253,TTTCAT,TTTAG,TCTAA,2.0,ORF8:DEL119.66666666666667/121.66666666666667,False,False
266,hCoV-19/France/IDF-HMN-21072090063/2021|EPI_IS...,29409,,28271,Non-coding region,0,0,,A,,...,1.0,28005:28005,28271:28271,A,AACTA,AATGT,2.0,Non-coding region:DEL0.6666666666666666,True,False
267,hCoV-19/France/IDF-HMN-21072080265/2021|EPI_IS...,29409,,22029,S,21564,157,,TTCAGA,,...,6.0,21763:21768,22029:22034,TTCAGA,AGTGA,AGTTT,0.0,S:DEL157.0/159.0,False,False
268,hCoV-19/France/IDF-HMN-21072080265/2021|EPI_IS...,29409,,28248,ORF8,27895,119,,TTTCAT,,...,6.0,27982:27987,28248:28253,TTTCAT,TTTAG,TCTAA,2.0,ORF8:DEL119.66666666666667/121.66666666666667,False,False


In [54]:
# All mutation names that m2 reports for a single sample.
sample_idx = 'hCoV-19/England/CAMC-142DAEA/2021|EPI_ISL_1454606|2021-03-18'
m2_s = {*m2.loc[m2['idx'].eq(sample_idx), 'mutation'].unique()}

In [55]:
# Display the mutation names that m2 reports for the selected sample.
m2_s

{'5UTR:*9R',
 '5UTR:C18R',
 '5UTR:G3R',
 '5UTR:I16R',
 '5UTR:I1R',
 '5UTR:K2R',
 '5UTR:L14R',
 '5UTR:L4R',
 '5UTR:L6R',
 '5UTR:N12R',
 '5UTR:P7R',
 '5UTR:Q10R',
 '5UTR:Q13R',
 '5UTR:R81C',
 '5UTR:R8R',
 '5UTR:S15R',
 '5UTR:S17R',
 '5UTR:T11R',
 '5UTR:V70V',
 '5UTR:Y5R',
 'M:I82T',
 'N:D377Y',
 'N:D63G',
 'N:G215C',
 'N:R203M',
 'Non-coding region:*9424*',
 'Non-coding region:DEL0',
 'ORF1a:A1306S',
 'ORF1a:D2907D',
 'ORF1a:F924F',
 'ORF1a:I671I',
 'ORF1a:M3934I',
 'ORF1a:P2046L',
 'ORF1a:P2287S',
 'ORF1a:T3255I',
 'ORF1a:T3646A',
 'ORF1a:V2930L',
 'ORF1a:V3689V',
 'ORF1b:A1918V',
 'ORF1b:G662S',
 'ORF1b:P1000L',
 'ORF1b:P314L',
 'ORF1b:R1078C',
 'ORF1b:R1413S',
 'ORF3a:S26L',
 'ORF7a:T120I',
 'ORF7a:V82A',
 'ORF7b:T40I',
 'ORF8:D119I',
 'ORF8:DEL119/120',
 'ORF8:F120I',
 'S:D614G',
 'S:D950N',
 'S:DEL157/158',
 'S:E156G',
 'S:F157E',
 'S:G142D',
 'S:L452R',
 'S:P681R',
 'S:R158E',
 'S:T19R',
 'S:T478K'}

In [48]:
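# Pasted reference list of mutations, kept as a comment so that this cell no longer
# raises a SyntaxError (presumably for manual comparison with m2_s -- source not recorded):
#   Spike D614G,
#   Spike D950N,
#   Spike E156G,
#   Spike F157del,
#   Spike G142D,
#   Spike L452R,
#   Spike P681R,
#   Spike R158del,
#   Spike T19R,
#   Spike T478K,
#   M I82T, N D63G, N D377Y, N R203M, N R385K, NS3 S26L, NS7a L116F, NS7a T120I, NS7a V82A, NSP2 P129L, NSP3 H1274Y, NSP3 H1880L, NSP3 P822L, NSP4 A446V, NSP6 V149A, NSP12 G671S, NSP12 P323L, NSP13 P77L, NSP15 H234Y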

SyntaxError: invalid syntax (<ipython-input-48-0d20241870fa>, line 1)

In [43]:
# Mutation names present in the old table but missing from the new one.
m1_mutations.difference(m2_mutations)

set()

In [44]:
# Display the set of mutation names from the old table.
m1_mutations

{'5UTR:R81C',
 '5UTR:V70V',
 'M:I82T',
 'N:D377Y',
 'N:D63G',
 'N:G215C',
 'N:L139F',
 'N:R203M',
 'N:R385K',
 'N:T205I',
 'Non-coding region:D9853Y',
 'Non-coding region:DEL0',
 'ORF1a:A1306S',
 'ORF1a:A3209V',
 'ORF1a:A498V',
 'ORF1a:D2907D',
 'ORF1a:D3936D',
 'ORF1a:D827D',
 'ORF1a:D828D',
 'ORF1a:DEL3675/3677',
 'ORF1a:E1196V',
 'ORF1a:F924F',
 'ORF1a:G334G',
 'ORF1a:H2092Y',
 'ORF1a:I1232I',
 'ORF1a:I1274I',
 'ORF1a:I431M',
 'ORF1a:I671I',
 'ORF1a:K261N',
 'ORF1a:L3201P',
 'ORF1a:M3934I',
 'ORF1a:N1709N',
 'ORF1a:P1640L',
 'ORF1a:P2046L',
 'ORF1a:P2287S',
 'ORF1a:P309L',
 'ORF1a:S2500S',
 'ORF1a:S3949N',
 'ORF1a:T265I',
 'ORF1a:T3255I',
 'ORF1a:T3646A',
 'ORF1a:T3750I',
 'ORF1a:V2930L',
 'ORF1a:V3420V',
 'ORF1a:V3689V',
 'ORF1a:V3718A',
 'ORF1a:V665I',
 'ORF1a:Y4227Y',
 'ORF1b:A1918V',
 'ORF1b:D1926D',
 'ORF1b:G662S',
 'ORF1b:H2285Y',
 'ORF1b:H590H',
 'ORF1b:K2310N',
 'ORF1b:L2265L',
 'ORF1b:M115I',
 'ORF1b:M2269I',
 'ORF1b:M809V',
 'ORF1b:P1000L',
 'ORF1b:P314L',
 'ORF1b:R1078C',