Requires data generated from `analyzing_clusters.ipynb` and `analyzing_clusters.py` as well as `snapgene.fasta`.

In [None]:
import numpy as np
import plotly.express as px
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import pairwise2

# Feature table (with subgroup columns, per its file name) loaded once here;
# it is the source of the `lots_of_pis` selection further down.
df_list_full = pd.read_csv("../out_data/2022-08-03_full_plasmid_features_w_subgroups.csv")


In [12]:

# Author-group table (original note: generated from pairwise_idf.py).
# A row is flagged `is_para` when its auth_size is greater than 1.
final = pd.read_csv("../out_data/FINAL_idf_author_groups.csv")
final['is_para'] = final['auth_size'].gt(1)

# Columns kept for the `para` selection (this list includes auth_size).
cols = ['sseqid','Type','Description','subgroup_id','auth_size','num_of_occurances_in_subgroup','num_of_unique_pis','sequence', 'is_silent_mutation', 'is_most_common']
para = (
    final.loc[final['is_para'], cols]
    .drop_duplicates()
    .sort_values(by=['auth_size'], ascending=False)
)

# Same columns minus auth_size, used for the variants seen with more than
# 20 unique PIs that are not the canonical sequence.
cols = ['sseqid','Type','Description','subgroup_id','num_of_occurances_in_subgroup','num_of_unique_pis','sequence', 'is_silent_mutation', 'is_most_common']
lots_of_pis = df_list_full.loc[
    (df_list_full['num_of_unique_pis'] > 20) & (df_list_full['is_canon_seq'] == False),
    cols,
].drop_duplicates()

In [23]:
# Reference records from the SnapGene FASTA, materialised as a list so they
# can be scanned repeatedly when looking up a canonical sequence by id.
sg = [record for record in SeqIO.parse("../in_data/snapgene.fasta", "fasta")]

# An earlier approach (append + drop_duplicates on 'sequence') lost the
# auth_size info on the variants that have more than 20 PIs:
#variants_of_interest = lots_of_pis.append(para).drop_duplicates().drop_duplicates(subset=['sequence'])

# The outer merge keeps auth_size where it is known. Rows are then
# de-duplicated on every `para` column except Description and sseqid
# (original note: "this also drops the SiriusGFP / Superfolder_GFP
# duplicates" -- presumably referring to this step).
variants_of_interest = (
    para
    .merge(lots_of_pis, how='outer')
    .drop_duplicates(subset=para.columns.difference(['Description', 'sseqid']))
)

# Rows without an auth_size after the merge are filled with 1 ...
variants_of_interest['auth_size'] = variants_of_interest['auth_size'].fillna(value=1)

# ... and then reset to NaN when the subgroup is larger than
# MAX_SUBGROUP_SIZE and auth_size is 1, i.e. (per the original note) where
# the computation was not actually done.
MAX_SUBGROUP_SIZE = 1205
variants_of_interest['auth_size'] = np.where(
    variants_of_interest['num_of_occurances_in_subgroup'].gt(MAX_SUBGROUP_SIZE)
    & variants_of_interest['auth_size'].eq(1),
    np.nan,
    variants_of_interest['auth_size'],
)


# Sum of num_of_unique_pis over all rows that share the same sseqid.
variants_of_interest['tot_pis_sseqid'] = (
    variants_of_interest
    .groupby('sseqid')['num_of_unique_pis']
    .transform('sum')
)

def align(row):
    """Align a variant to its reference and summarise the differences.

    The variant (``row['sequence']``) is globally aligned against the record
    in the module-level ``sg`` list whose ``id`` equals ``row['sseqid']``.
    Both sequences are lower-cased first.

    Returns a string with one character per alignment column:
    ``"."`` where variant and reference agree, the upper-cased variant
    character where the reference has a gap, and otherwise the variant
    character as aligned (which is ``"-"`` where the variant has a gap).
    Returns ``None`` when no record in ``sg`` matches the sseqid.
    """
    variant_seq = row['sequence'].lower()

    reference = next((rec for rec in sg if rec.id == row['sseqid']), None)
    if reference is None:
        return None
    canon_seq = str(reference.seq).lower()

    # globalms arguments after the sequences: match, mismatch, gap-open and
    # gap-extend scores. Only the first reported alignment is used.
    best = pairwise2.align.globalms(variant_seq, canon_seq, 4, -1, -5, -1)[0]

    symbols = []
    for var_char, ref_char in zip(best.seqA, best.seqB):
        if var_char == ref_char:
            symbols.append(".")
        elif ref_char == "-":
            symbols.append(var_char.upper())
        else:
            symbols.append(var_char)
    return "".join(symbols)


variants_of_interest['alignment'] = variants_of_interest.apply(align, axis=1)

def is_5_prime_del(alignment):
    """Return True when the alignment string starts with a gap ('-').

    Parameters
    ----------
    alignment : str or None
        Alignment summary string; ``None`` (or any other non-subscriptable
        value such as NaN) is accepted.

    Returns
    -------
    bool
        ``True`` only if the first character is ``'-'``. ``False`` for
        non-subscriptable values and for an empty string.
    """
    try:
        return (alignment[0] == '-')
    except (TypeError, IndexError):
        # TypeError: value is not subscriptable (e.g. None / NaN).
        # IndexError: empty string has no first character.
        return False
def is_3_prime_del(alignment):
    """Return True when the alignment string ends with a gap ('-').

    Parameters
    ----------
    alignment : str or None
        Alignment summary string; ``None`` (or any other non-subscriptable
        value such as NaN) is accepted.

    Returns
    -------
    bool
        ``True`` only if the last character is ``'-'``. ``False`` for
        non-subscriptable values and for an empty string.
    """
    try:
        return (alignment[-1] == '-')
    except (TypeError, IndexError):
        # TypeError: value is not subscriptable (e.g. None / NaN).
        # IndexError: empty string has no last character.
        return False
# Flag alignments that start / end with a gap character.
variants_of_interest['is_5_prime_del'] = variants_of_interest['alignment'].apply(is_5_prime_del)
variants_of_interest['is_3_prime_del'] = variants_of_interest['alignment'].apply(is_3_prime_del)

# A CDS with a gap at either end of its alignment is marked as a possible
# fusion protein.
variants_of_interest['possible_fusion_protein'] = (
    variants_of_interest['Type'].eq("CDS")
    & (
        variants_of_interest['is_5_prime_del'].eq(True)
        | variants_of_interest['is_3_prime_del'].eq(True)
    )
)

def translate_CDS(row):
    """Translate a CDS row's nucleotide sequence into an amino-acid string.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``'Type'`` and ``'sequence'``.

    Returns
    -------
    str or None
        The translation of ``row['sequence']`` with ``'-'`` characters
        removed, or ``None`` when ``row['Type']`` is not ``"CDS"``.
    """
    if row['Type'] != "CDS":
        return None

    # Strip gap characters on the plain string before building the Seq.
    nucleotides = row['sequence'].replace("-", "")

    # Biopython warns ("Partial codon ... may become an error in future")
    # when the length is not a multiple of three. Trim the trailing partial
    # codon explicitly; the translated result is the same.
    usable_length = len(nucleotides) - (len(nucleotides) % 3)
    return str(Seq(nucleotides[:usable_length]).translate())

# Translate CDS rows (non-CDS rows get None).
variants_of_interest['translation'] = variants_of_interest.apply(translate_CDS, axis = 1)

# A '*' anywhere except the final position is an internal stop codon.
# The pattern is matched literally (regex=False) rather than via the string
# "\*", which is an invalid escape sequence in a non-raw Python string.
variants_of_interest["has_internal_stop_codon"] = (
    variants_of_interest['translation'].str[:-1].str.contains("*", regex=False)
)

variants_of_interest = variants_of_interest.sort_values(by = ['possible_fusion_protein','tot_pis_sseqid','num_of_unique_pis'], ascending = [True,False,False])

# filters out CDSs that have gaps on 3' or 5' end
# these are likely to be fusion proteins, which we dont care about here
variants_of_interest = variants_of_interest.query('((is_5_prime_del == False) & (is_3_prime_del == False)) | (Type != "CDS")').copy(deep=True)


Partial codon, len(sequence) not a multiple of three. Explicitly trim the sequence or add trailing N before translation. This may become an error in future.



In [24]:
# Drop variants whose hit is a SnapGene CDS reference that translates with an
# internal stop codon (i.e. the reference is not a contiguous CDS).
# Original note: this removes 2 edge cases where the variant does not align
# to the correct hit, and that hit is problematic anyway.
snapgene = list(SeqIO.parse("../in_data/snapgene.fasta", "fasta"))
# NOTE(review): read from the `master` branch at run time, so the content is
# not pinned to a specific version and requires network access.
sg_desc = pd.read_csv("https://raw.githubusercontent.com/barricklab/pLannotate/master/plannotate/data/data/snapgene.csv")
# NOTE(review): `sg` is rebound here to a DataFrame. The `align` function
# defined earlier reads the global `sg` as a list of records, so it must not
# be called after this cell without re-running the cell that defines it.
sg = pd.DataFrame([(_.id, str(_.seq)) for _ in snapgene], columns=['sseqid','seq'])
sg = sg.merge(sg_desc, on='sseqid', how='left')
sg_cds = sg.loc[sg["Type"] == 'CDS',:].copy(deep=True)
sg_cds['translation'] = sg_cds['seq'].apply(lambda x: str(Seq(x).translate()))
sg_cds['final_aa_pos'] = sg_cds['translation'].str[-1:]
sg_cds['has_stop_codon'] = sg_cds['final_aa_pos'] == '*'
# Literal match for '*' (regex=False) instead of the invalid escape '\*'.
sg_cds['has_internal_stop_codon'] = sg_cds['translation'].str[:-1].str.contains('*', regex=False)

# Group references that share an identical translation and record group size.
sg_cds['trans_subgroup_id'] = sg_cds.groupby(['translation']).ngroup()
sg_cds["trans_subgroup_id_size"] = sg_cds.groupby(['trans_subgroup_id'])['trans_subgroup_id'].transform('size')

problem_sseqids = sg_cds.query("has_internal_stop_codon == True")['sseqid']

# One row per distinct translation. Copied so that later column assignments
# operate on an independent frame (avoids SettingWithCopyWarning).
sg_cds_dedup = (
    sg_cds
    .sort_values(by=["trans_subgroup_id_size","trans_subgroup_id"], ascending=False)
    .drop_duplicates(subset=['translation'])
    .copy(deep=True)
)

# Copied for the same reason: a later cell adds a column to this frame.
variants_of_interest = variants_of_interest.loc[~variants_of_interest['sseqid'].isin(problem_sseqids),:].copy(deep=True)

In [25]:
# this uses the snapgene library to find silent mutations that were miscategorized as something else
# eg: a dCas9 that was very similar in nucleotide sequence to WT Cas9 so it came up as a Cas9 hit
# basically, this removes silent mutations that were miscategorized since the nuc. seq. is more similar
# to a different sequence while the AA sequence is the same

# Stop codons ('*') are stripped so translations can be compared regardless
# of whether a stop codon is present. `.assign` returns a new frame, so no
# SettingWithCopyWarning is raised when the inputs are slices of other frames.
variants_of_interest = variants_of_interest.assign(
    translation_no_stop=variants_of_interest['translation'].str.replace("*", "", regex=False)
)
sg_cds_dedup = sg_cds_dedup.assign(
    translation_no_stop=sg_cds_dedup['translation'].str.replace("*", "", regex=False)
)


# Non-silent CDS features from the filtered feature table, translated with
# gap characters removed.
filtered = pd.read_csv("../out_data/2022-08-03_full_plasmid_features_w_subgroups_FILTERED.csv")
filtered = filtered.query("is_silent_mutation == False").copy(deep=True)
filtered_cds = filtered.query("Type == 'CDS'").copy(deep=True)
filtered_cds['translation'] = filtered_cds['sequence'].apply(lambda x: str(Seq(x.replace("-","")).translate()))
filtered_cds['final_aa_pos'] = filtered_cds['translation'].str[-1:]
filtered_cds['has_stop_codon'] = filtered_cds['final_aa_pos'] == '*'
filtered_cds['has_internal_stop_codon'] = filtered_cds['translation'].str[:-1].str.contains('*', regex=False)
# (this assignment was previously written twice; once is sufficient)
filtered_cds['translation_no_stop'] = filtered_cds['translation'].str.replace("*","", regex=False)

# Subgroups whose stop-less translation exactly equals that of some SnapGene
# CDS reference.
other_silient_mut_subgroups = filtered_cds.merge(sg_cds_dedup, on='translation_no_stop', how='inner')['subgroup_id'].unique()

variants_of_interest = variants_of_interest.loc[~variants_of_interest['subgroup_id'].isin(other_silient_mut_subgroups),:]


In [26]:
# Keep only the rows whose is_silent_mutation flag is False.
variants_of_interest = variants_of_interest.loc[variants_of_interest['is_silent_mutation'] == False]

In [27]:
# Display the resulting table. The CSV export is left commented out, so
# running this cell does not write a file.
variants_of_interest#.to_csv("../out_data/variants_of_interest_2022-08-18.csv", index=False)

Unnamed: 0,sseqid,Type,Description,subgroup_id,auth_size,num_of_occurances_in_subgroup,num_of_unique_pis,sequence,is_silent_mutation,is_most_common,tot_pis_sseqid,alignment,is_5_prime_del,is_3_prime_del,possible_fusion_protein,translation,has_internal_stop_codon,translation_no_stop
240,ori,rep_origin,high-copy-number ColE1/pMB1/pBR322/pUC origin ...,19505,,38693,2955,TTGAGATCCTTTTTTTCTGCGCGTAATCTGCTGCTTGCAAACAAAA...,False,True,3288,.................................................,False,False,False,,,
13,ori,rep_origin,high-copy-number ColE1/pMB1/pBR322/pUC origin ...,19366,4.0,154,64,TTGAGATCCTTTTTTTCTGCGCGTAATCTGCTGCTTGCAAACAAAA...,False,False,3288,.................................................,False,False,False,,,
286,ori,rep_origin,high-copy-number ColE1/pMB1/pBR322/pUC origin ...,19406,1.0,221,39,TTGAGATCCTTTTTTTCTGCGCGTAATCTGCTGCTTGCAAACAAAA...,False,False,3288,.................................................,False,False,False,,,
252,ori,rep_origin,high-copy-number ColE1/pMB1/pBR322/pUC origin ...,19507,1.0,439,35,TTGAGATCCTTTTTTTCTGCGCGTAATCTGCTGCTTGCAAACAAAA...,False,False,3288,.................................................,False,False,False,,,
287,ori,rep_origin,high-copy-number ColE1/pMB1/pBR322/pUC origin ...,19390,1.0,168,22,TTGAGATCCTTTTTTTCTGCGCGTAATCTGCTGCTTGCAAACAAAA...,False,False,3288,.................................................,False,False,False,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,WPRE_(2),misc_feature,woodchuck hepatitis virus posttranscriptional ...,486,2.0,8,2,AATCAACCTCTGGATTACAAAATTTGTGAAAGATTGACTGGTATTC...,False,False,2,.................................................,False,False,False,,,
172,attB1,protein_bind,mutant version of attB; recombination site for...,571,2.0,18,2,ACAACTTTGTACAAAAAAGCAGGCT,False,False,2,....c....................,False,False,False,,,
178,UbC_promoter_(2),promoter,human ubiquitin C promoter,17179,2.0,4,2,GGCCTCCGCGCCGGGTTTTGGCGCCTCCCGCGGGCGCCCCCCTCCT...,False,False,2,.................................................,False,False,False,,,
181,IRES_(3),misc_feature,internal ribosome entry site (IRES) of the enc...,870,2.0,2,2,ACGTTACTGGCCGAAGCCGCTTGGAATAAGGCCGGTGTGCGTTTGT...,False,False,2,.................................................,False,False,False,,,


Manual annotations (including a header) were added to the CSV file shown above. The annotated file can be found in the supplementary data.

In [4]:
import pandas as pd
import plotly.express as px

voi = pd.read_csv("../out_data/variants_of_interest_2023-02-14_annotated.csv")

# Label each variant by how it was selected:
#   "convergent": fewer than 20 unique PIs and auth_size > 1
#   "frequent":   20 or more unique PIs
# The two conditions are mutually exclusive, so concatenating the label
# strings yields at most one label per row ("" when neither applies).
# Earlier alternatives (a separate "both" label, and splitting "frequent" by
# whether auth_size was computed) are no longer used.
convergent = ((voi['num_of_unique_pis'] < 20)  & (voi['auth_size'] > 1)).replace({True: "convergent", False: ""})
frequent = (voi['num_of_unique_pis'] >= 20).replace({True: "frequent", False: ""})


voi['categorization_method'] = convergent + frequent
# Only has an effect if a "both" label is produced; kept for compatibility.
voi['categorization_method'] = voi['categorization_method'].replace({"both": "convergent"})

# Order rows so that the largest categories come first.
voi['category_size'] = voi.groupby(['category'])['category'].transform('size')
voi = voi.sort_values(by=['category_size'], ascending=[False])

fig = px.histogram(voi, 
             y="category", 
             color='categorization_method',
             template='plotly_white',
             hover_data=['category_size'],
             color_discrete_sequence=['#fcba03','#7795FF','#AEFF91'], #'#FDD35D'
             barmode='stack',
             facet_col='organism_type',
             labels={'category': 'Category', 
                     'categorization_method': 'Categorization Method', 
                     },
             width=1000,
             height=375,
             )

# Facet titles arrive as "organism_type=<value>"; keep only the value, in bold.
fig.for_each_annotation(lambda a: a.update(text=f'<b>{a.text.split("=")[1]}</b>'))
fig.update_annotations(font=dict( size=16))

# Title every facet's x axis, however many facets there are (previously the
# axes xaxis, xaxis2 and xaxis3 were addressed by name).
x_axis_label = 'Number of Variants'
fig.update_xaxes(title_text=x_axis_label)

# fig.write_image("../out_data/figures/voi_organism_method.svg")
fig

In [204]:
# NOTE: this overwrites voi['category_size'] with the number of rows per
# sseqid (it previously held the number of rows per category).
voi['category_size'] = voi.groupby(["sseqid"])["sseqid"].transform("count")#.sort_values(ascending=False)
#voi['category_size_method'] = voi.groupby(["sseqid","categorization_method"])["categorization_method"].transform("count")#.sort_values(ascending=False)

cols = ["sseqid", "Type",'category', 'category_size'] # 'category_size_method' 'categorization_method', 
plot = voi[cols].drop_duplicates().sort_values(by=['category_size'], ascending=[False])

# Build a display name from sseqid: underscores become spaces, then a
# trailing " (N)"-style suffix is removed. The regex is a raw string because
# "\d" is an invalid escape sequence in a normal string literal; `regex=` is
# explicit on every call so the result does not depend on pandas' default.
plot['sseqid_clean'] = (
    plot['sseqid']
    .str.replace("___", " ", regex=False)
    .str.replace("_", " ", regex=False)
    .str.replace(r" .(\d+.)", "", regex=True)
)
plot['sseqid_clean'] = plot['sseqid_clean'].replace({'ori':'ColE1 origin of replication'})


fig = px.histogram(plot,
       x="sseqid_clean",
       y="category_size",
       # facet_col='category',
       # facet_col_wrap=3,
       color="category",
       log_y=True,
       height=515,
       width=1450,
       template='plotly_white',
       labels={'sseqid_clean': 'Part name',
               'category_size': 'Number of interesting variants'},
       range_y=[.5,65]
       #hover_data=['categorization_method'],
)

# Remove the "sum of" prefix from the y-axis title.
fig.for_each_yaxis(lambda a: a.update(title_text=a.title.text.replace("sum of", "")))
fig.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = [1, 2, 4, 8, 16, 32, 64],
    )
)

# fig.write_image("../out_data/figures/voi_all.svg")

fig
