In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as pyo
import seaborn as sns
from natsort import index_natsorted
from scipy.stats import norm
from __future__ import print_function
import ipywidgets as widgets
from ipywidgets import fixed, interact, interact_manual, interactive
# Load the line_profiler IPython extension.
%load_ext line_profiler
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Put plotly's offline module into notebook mode.
pyo.init_notebook_mode(connected=True)

In [2]:
# Load the combined RGI output table (tab-separated).
df = pd.read_csv("./all_rgiout_2022-06-21", sep="\t")

# Order rows by accession number using natural sort order: the key maps the
# column to each value's rank in natsort's ordering.
df = df.sort_values(
    by=["Accession_Number"],
    ascending=True,
    key=lambda col: np.argsort(index_natsorted(col)),
    ignore_index=True,
)

# One row per accession, used to build an accession -> culture-year lookup.
single_acc_df = df.drop_duplicates(subset=["Accession_Number"])
year_dict = single_acc_df.set_index("Accession_Number")["Year_Cultured"].to_dict()

# Split "Strain" at the first "subsp." into species / subspecies.
# `n` is passed by keyword because pandas >= 2.0 no longer accepts it
# positionally. The second column only exists when at least one strain
# contains "subsp.", so fall back to NaN instead of failing on assignment.
strain_parts = df["Strain"].str.split("subsp.", n=1, expand=True)
df["Species"] = strain_parts[0]
df["Subspecies"] = strain_parts[1] if 1 in strain_parts.columns else np.nan

# Replace missing values with the literal string "Null" across the frame.
# NOTE(review): this also applies to numeric columns such as Year_Cultured,
# which would become mixed-type if they contain NaN -- confirm this is intended.
df = df.replace(np.nan, "Null", regex=True)

# RGI hits overlapping integrons. The column names are taken from df, which at
# this point already includes "Species" and "Subspecies".
# NOTE(review): presumably this file has no header row -- confirm.
rgiintegron = pd.read_csv("./rgi_subset_integronoverlap", sep="\t", names=df.columns.values)
# Keep only the part before "subsp."; selecting column 0 avoids assigning a
# multi-column DataFrame to a single column, which raises in pandas.
rgiintegron["Species"] = rgiintegron["Strain"].str.split("subsp.", n=1, expand=True)[0]
rgiintegron["Subspecies"] = "Null"

# Accession number alongside a contig identifier: the text before the first
# "_" in "Contig", then the last "|"-separated field of that.
contig_ids = df["Contig"].str.split("_", expand=True)[0].str.split("|").str[-1]
contigdf = pd.concat([df["Accession_Number"], contig_ids], axis=1).copy()

sns.set(rc={"figure.figsize": (20, 12)})  # width=20, height=12

FileNotFoundError: [Errno 2] No such file or directory: './all_rgiout_2022-06-21'

In [None]:
# Hits whose AMR gene family mentions "beta-lactamase"; copied so that columns
# added to this subset later do not write back into df.
is_blactamase = df["AMR Gene Family"].str.contains("beta-lactamase")
blactamases = df[is_blactamase].copy()

# Hits whose best ARO match mentions "PBP".
is_pbp = df["Best_Hit_ARO"].str.contains("PBP")
pbp = df[is_pbp].copy()

In [None]:
# Number of PBP hits per strain.
pbp_strain_counts = pbp["Strain"].value_counts()
pbp_strain_counts

In [None]:
# Relative share of each PBP hit over culture year, restricted to strains whose
# name starts with "Escherichia coli".
ecoli_pbp = pbp[pbp["Strain"].str.match("Escherichia coli")]
fig = sns.kdeplot(
    data=ecoli_pbp,
    x="Year_Cultured",
    hue="Best_Hit_ARO",
    multiple="fill",
    cut=0,
)
fig.set_xlabel("Year Cultured")
# plt.savefig('./all-blactamases.png', dpi=200)

In [None]:
# Beta-lactamase hits ordered by culture year, reduced to the identifying columns.
report_columns = ["Accession_Number", "Year_Cultured", "Strain", "Best_Hit_ARO"]
blactamases.sort_values("Year_Cultured")[report_columns]

In [None]:

import matplotlib.pyplot as plt
import numpy as np
from Bio import SeqIO
from dna_features_viewer import BiopythonTranslator

# Parse the GenBank annotation for NCTC232 and index its records by id so a
# single contig can be selected.
record_dict = SeqIO.to_dict(SeqIO.parse("/n/scratch3/users/a/ak586/microtrawler/1_sequences/NCTC/ena/NCTC232/GCA_900453135/PROKKA_04192022/PROKKA_04192022.gbk", "genbank"))

# Window of contig UGQG01000001.1 to draw (alternative windows kept for reference).
# sub_record = record_dict['UGNE01000001.1'][1405899:1417399]
sub_record = record_dict['UGQG01000001.1'][748436:769575]
# sub_record = record_dict['UGQG01000001.1'][748436:775000]

# Tidy the labels that will be shown on the plot.
for features in sub_record.features:
    if 'gene' in features.qualifiers:
        gname = features.qualifiers['gene'][0]
        if '_' in gname:
            # Keep only the text before the first underscore
            # (presumably an annotation-added paralog suffix -- confirm).
            features.qualifiers['gene'] = [gname.split('_')[0]]
        if gname == 'ampC':
            # Label the ampC gene in this window as DHA-1.
            features.qualifiers['gene'] = ['DHA-1']
    else:
        # Features with neither a gene nor a product qualifier are left
        # untouched instead of raising KeyError.
        product_values = features.qualifiers.get('product')
        if product_values:
            prodName = product_values[0]
            if 'NCBIFAM' in prodName:
                # Keep the text after the first ': '; skip the rewrite when the
                # separator is absent rather than raising IndexError.
                name_parts = prodName.split(': ')
                if len(name_parts) > 1:
                    NewprodName = name_parts[1]
                    features.qualifiers['product'] = [NewprodName]

graphic_record = BiopythonTranslator().translate_record(sub_record)
ax, _ = graphic_record.plot(figure_width=15, with_ruler=False, strand_in_label_threshold=7)
# ax.figure.savefig('NCTC232-DHA1-neigh.png', bbox_inches='tight', dpi=200)

In [None]:
# Read the GenBank annotation for NCTC235. SeqIO.read returns one record, so
# despite its name `record_dict` is a single record here and is sliced directly.
record_dict = SeqIO.read("/n/scratch3/users/a/ak586/microtrawler/1_sequences/NCTC/ena/NCTC235/GCA_900635025/PROKKA_04192022/PROKKA_04192022.gbk", "genbank")

# Window of the record to draw.
sub_record = record_dict[771870:793009]

# Tidy the labels that will be shown on the plot.
for features in sub_record.features:
    if 'gene' in features.qualifiers:
        gname = features.qualifiers['gene'][0]
        if '_' in gname:
            # Keep only the text before the first underscore
            # (presumably an annotation-added paralog suffix -- confirm).
            features.qualifiers['gene'] = [gname.split('_')[0]]
        if gname == 'ampC':
            # Label the ampC gene in this window as DHA-22.
            features.qualifiers['gene'] = ['DHA-22']
    else:
        # Features with neither a gene nor a product qualifier are left
        # untouched instead of raising KeyError.
        product_values = features.qualifiers.get('product')
        if product_values:
            prodName = product_values[0]
            if 'NCBIFAM' in prodName:
                # Keep the text after the first ': '; skip the rewrite when the
                # separator is absent rather than raising IndexError.
                name_parts = prodName.split(': ')
                if len(name_parts) > 1:
                    NewprodName = name_parts[1]
                    features.qualifiers['product'] = [NewprodName]

graphic_record = BiopythonTranslator().translate_record(sub_record)
ax, _ = graphic_record.plot(figure_width=15, with_ruler=False, strand_in_label_threshold=1)
ax.figure.savefig('NCTC235-DHA22-neigh.png', bbox_inches='tight', dpi=200)

In [None]:
# Figure size (width, height) applied through seaborn's rc settings.
figure_rc = {"figure.figsize": (15, 8)}
sns.set(rc=figure_rc)

In [None]:
# Collapse AMR gene families with fewer than 10 hits into a single "Other"
# category, stored in a new "Gene Family" column.
# The counts are filtered on the value_counts() Series itself, so this does not
# depend on the column names produced by value_counts().reset_index(), which
# differ between pandas 1.x and 2.x.
family_counts = blactamases['AMR Gene Family'].value_counts()
other = family_counts[family_counts < 10].index.tolist()
blactamases['Gene Family'] = blactamases['AMR Gene Family']
blactamases.loc[blactamases['AMR Gene Family'].isin(other), 'Gene Family'] = 'Other'

In [None]:
# Relative share of each (lumped) gene family over culture year, saved to disk.
fig = sns.kdeplot(
    data=blactamases,
    x="Year_Cultured",
    hue="Gene Family",
    multiple="fill",
    cut=0,
    bw_adjust=0.9,
)
fig.set_xlabel("Year Cultured")
sns.move_legend(fig, loc="upper left")
plt.savefig("./all-blactamases.png", dpi=200)

In [None]:
# Beta-lactamase hits from strains labelled exactly "Escherichia coli",
# plotted as the relative share of each AMR gene family over culture year.
is_ecoli = blactamases["Strain"].eq("Escherichia coli")
ecoli_blactamases = blactamases[is_ecoli]
fig = sns.kdeplot(
    data=ecoli_blactamases,
    x="Year_Cultured",
    hue="AMR Gene Family",
    multiple="fill",
    cut=0,
)
fig.set_xlabel("Year Cultured")
sns.move_legend(fig, loc="upper left")
plt.savefig("./ecoli-all-blactamases.png", dpi=200)

In [None]:
# E. coli hits in the "ampC-type beta-lactamase" family, split by best ARO hit.
is_ampc = ecoli_blactamases["AMR Gene Family"].eq("ampC-type beta-lactamase")
ampc_ecoli_blactamases = ecoli_blactamases[is_ampc]
sns.kdeplot(
    data=ampc_ecoli_blactamases,
    x="Year_Cultured",
    hue="Best_Hit_ARO",
    multiple="fill",
    cut=0,
)
plt.xlabel("Year Cultured")
plt.savefig("./ecoli-ampC-blactamases.png", dpi=200)

In [None]:
# E. coli hits in the "EC beta-lactamase" family, split by best ARO hit.
is_ec = ecoli_blactamases["AMR Gene Family"].eq("EC beta-lactamase")
ec_ecoli_blactamases = ecoli_blactamases[is_ec]
sns.kdeplot(
    data=ec_ecoli_blactamases,
    x="Year_Cultured",
    hue="Best_Hit_ARO",
    multiple="fill",
    cut=0,
    bw_adjust=0.5,
)
plt.xlabel("Year Cultured")
plt.savefig("./ecoli-ec-blactamases.png", dpi=200)

In [None]:
# E. coli hits in the "TEM beta-lactamase" family, split by best ARO hit.
is_tem = ecoli_blactamases["AMR Gene Family"].eq("TEM beta-lactamase")
tem_ecoli_blactamases = ecoli_blactamases[is_tem]
sns.kdeplot(
    data=tem_ecoli_blactamases,
    x="Year_Cultured",
    hue="Best_Hit_ARO",
    multiple="fill",
    bw_adjust=0.7,
)

In [None]:
# E. coli hits in the "CMY beta-lactamase" family, split by best ARO hit.
is_cmy = ecoli_blactamases["AMR Gene Family"].eq("CMY beta-lactamase")
cmy_ecoli_blactamases = ecoli_blactamases[is_cmy]
sns.kdeplot(
    data=cmy_ecoli_blactamases,
    x="Year_Cultured",
    hue="Best_Hit_ARO",
    multiple="fill",
    bw_adjust=0.7,
)

In [None]:
# Strain-name fragments to keep, joined into one regex alternation.
critical_name_pattern = "|".join(
    [
        "Acinetobacter baumannii",
        "Pseudomonas aeruginosa",
        "Klebsiella pneumonia",
        "Escherichia coli",
        "Enterobacter",
        "Serratia",
        "Proteus",
        "Providencia",
        "Morganella",
    ]
)
criticalcarbapenem_strains = df[df["Strain"].str.contains(critical_name_pattern)]

# Keep hits whose drug class mentions "carbapenem" and whose gene family
# mentions "lactamase".
hits_carbapenem = criticalcarbapenem_strains["Drug Class"].str.contains("carbapenem")
is_lactamase = criticalcarbapenem_strains["AMR Gene Family"].str.contains("lactamase")
carbapenemases = criticalcarbapenem_strains[hits_carbapenem & is_lactamase]
sns.kdeplot(
    data=carbapenemases,
    x="Year_Cultured",
    hue="AMR Gene Family",
    multiple="fill",
)

In [None]:
# Same selection as the cell above, but without "Pseudomonas aeruginosa" in the
# strain-name pattern; this overwrites both variables.
critical_name_pattern = "|".join(
    [
        "Acinetobacter baumannii",
        "Klebsiella pneumonia",
        "Escherichia coli",
        "Enterobacter",
        "Serratia",
        "Proteus",
        "Providencia",
        "Morganella",
    ]
)
criticalcarbapenem_strains = df[df["Strain"].str.contains(critical_name_pattern)]

# Keep hits whose drug class mentions "carbapenem" and whose gene family
# mentions "lactamase".
hits_carbapenem = criticalcarbapenem_strains["Drug Class"].str.contains("carbapenem")
is_lactamase = criticalcarbapenem_strains["AMR Gene Family"].str.contains("lactamase")
carbapenemases = criticalcarbapenem_strains[hits_carbapenem & is_lactamase]
sns.kdeplot(
    data=carbapenemases,
    x="Year_Cultured",
    hue="AMR Gene Family",
    multiple="fill",
)

In [None]:
# Carbapenemase hits in the SHV beta-lactamase family, split by best ARO hit.
is_shv = carbapenemases["AMR Gene Family"].str.contains("SHV beta-lactamase")
shv_carbapenemases = carbapenemases[is_shv]
sns.kdeplot(
    data=shv_carbapenemases,
    x="Year_Cultured",
    hue="Best_Hit_ARO",
    multiple="fill",
    bw_adjust=0.1,
)

In [None]:
# Number of SHV hits per strain.
shv_strain_counts = shv_carbapenemases["Strain"].value_counts()
shv_strain_counts

In [None]:
# Collapse AMR gene families with fewer than 10 hits into a single "Other"
# category, stored in the "Gene Family" column.
# The counts are filtered on the value_counts() Series itself, so this does not
# depend on the column names produced by value_counts().reset_index(), which
# differ between pandas 1.x and 2.x.
family_counts = blactamases['AMR Gene Family'].value_counts()
other = family_counts[family_counts < 10].index.tolist()
blactamases['Gene Family'] = blactamases['AMR Gene Family']
blactamases.loc[blactamases['AMR Gene Family'].isin(other), 'Gene Family'] = 'Other'

In [None]:
# Beta-lactamase hits from strains whose name contains "Kleb"; copied so the
# "Gene" column added below does not write back into blactamases.
kleb_carbapenemases = blactamases.loc[blactamases['Strain'].str.contains('Kleb')].copy()

# Collapse genes with fewer than 3 hits into "Other". Filtering the
# value_counts() Series directly avoids relying on the column names of
# value_counts().reset_index(), which differ between pandas 1.x and 2.x.
gene_counts = kleb_carbapenemases['Best_Hit_ARO'].value_counts()
other = gene_counts[gene_counts < 3].index.tolist()
kleb_carbapenemases['Gene'] = kleb_carbapenemases['Best_Hit_ARO']
kleb_carbapenemases.loc[kleb_carbapenemases['Best_Hit_ARO'].isin(other), 'Gene'] = 'Other'
sns.kdeplot(data=kleb_carbapenemases, x="Year_Cultured", hue="Gene", multiple="fill")

In [None]:
# All hits whose best ARO match mentions "CTX", reduced to strain, year and gene.
is_ctx = df["Best_Hit_ARO"].str.contains("CTX")
df[is_ctx][["Strain", "Year_Cultured", "Best_Hit_ARO"]]

In [None]:
# Klebsiella hits whose gene family mentions "SHV", split by best ARO hit.
is_shv_family = kleb_carbapenemases["AMR Gene Family"].str.contains("SHV")
shv_kleb_carbapenemases = kleb_carbapenemases[is_shv_family]
sns.kdeplot(
    data=shv_kleb_carbapenemases,
    x="Year_Cultured",
    hue="Best_Hit_ARO",
    multiple="fill",
)

In [None]:
# Beta-lactamase hits from strains whose name contains "Pseud", split by best ARO hit.
is_pseud = blactamases["Strain"].str.contains("Pseud")
pseud_blactamases = blactamases[is_pseud]
sns.kdeplot(
    data=pseud_blactamases,
    x="Year_Cultured",
    hue="Best_Hit_ARO",
    multiple="fill",
)