In [1]:
from __future__ import print_function
import pandas as pd
import plotly
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go
from natsort import index_natsorted
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import numpy as np

In [2]:
# %matplotlib inline
# Initialise plotly's offline notebook mode so that figures created in later
# cells can be rendered in the notebook output.
pyo.init_notebook_mode(connected=True)

In [3]:
# Load the NCTC table (tab-separated; the first column becomes the index).
full_nctcdf = pd.read_csv("./2022-10-06_nctcdb.tsv", sep='\t', index_col=0)

# Order rows by accession using natural sort order (digit runs compare as
# numbers rather than character by character) and renumber the index.
full_nctcdf = full_nctcdf.sort_values(
    by="NCTC_Number",
    ascending=True,
    key=lambda col: np.argsort(index_natsorted(col)),
    ignore_index=True,
)

# Split the name at the first "subsp." into a species part and a subspecies
# part. `n` has to be passed by keyword: passing it positionally is deprecated
# and is rejected once pandas makes the argument keyword-only.
full_nctcdf[['Species', 'Subspecies']] = full_nctcdf['Current_Name'].str.split('subsp.', n=1, expand=True)

# Missing values are represented by the literal string 'Null' from here on.
full_nctcdf = full_nctcdf.replace(np.nan, 'Null', regex=True)

# Keep only strains that have an assembly accession and whose Year_Cultured is
# not -1 (presumably the "unknown year" sentinel -- confirm against the data).
genomeandyear_nctcdf = full_nctcdf.loc[(full_nctcdf['Year_Cultured']!=-1) & (full_nctcdf['Assembly_Accessions']!='Null')].copy()


In a future version of pandas all arguments of StringMethods.split except for the argument 'pat' will be keyword-only.



In [4]:
# Build an abbreviated label from the first letter of the first word of
# 'Species' and its second word, e.g. "Escherichia coli" -> "E. coli".
species_words = genomeandyear_nctcdf['Species'].str.split(' ', expand=True)
family_name = species_words[0].str[0]
species_name = species_words[1]
genomeandyear_nctcdf['Specie_Name'] = family_name + ". " + species_name

# The five most frequent labels keep their own name; every other row is
# bucketed as 'Other'.
specie_counts = genomeandyear_nctcdf['Specie_Name'].value_counts()
topx = specie_counts.head(5).index.tolist()
genomeandyear_nctcdf['TopSpecies'] = genomeandyear_nctcdf['Specie_Name'].where(
    genomeandyear_nctcdf['Specie_Name'].isin(topx),
    'Other',
)

# 'Other' goes first in the category list.
topx.insert(0, 'Other')

In [5]:
# Distribution of Year_Cultured over all strains that have a genome assembly,
# with a box-plot marginal. As the cell's last expression, the figure is
# displayed directly.
px.histogram(
    data_frame=genomeandyear_nctcdf,
    x="Year_Cultured",
    marginal="box",
    title="Histogram of Year Cultured information from strains w/ genome assemblies in NCTC",
    labels={"Year_Cultured": "Year Cultured"},
)

In [6]:
def plot_givenspecies(df, species):
    """Show a histogram of culture years for the strains of one species.

    Prints how many rows matched, then displays a plotly histogram of
    ``Year_Cultured`` with a rug marginal and per-strain hover details.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Species``, ``Year_Cultured``,
        ``NCTC_Number``, ``Isolated_From`` and ``Assembly_Accessions``.
    species : str
        Text to look for in ``df['Species']``. It is matched as a literal
        substring, so names containing regex metacharacters (``[``, ``(``,
        ``.``, ...) select exactly the rows they appear in.

    Returns
    -------
    None
        The figure is displayed through ``show()`` rather than returned.
    """
    # regex=False: `species` is a plain name, not a pattern.
    # na=False: rows with a missing Species never match, keeping the mask boolean.
    is_match = df['Species'].str.contains(species, regex=False, na=False)
    selected = df.loc[is_match]

    print("\nFound %s assemblies..." % selected.shape[0])

    fig = px.histogram(
        selected,
        x="Year_Cultured",
        labels={
            'Year_Cultured': 'Year Cultured',
            'NCTC_Number': 'NCTC Accession Number',
            'Isolated_From': "Isolated from",
            'Assembly_Accessions': "ENA Assembly Accession",
        },
        title="Histogram of Year Cultured information from " + species,
        marginal="rug",
        hover_data=["NCTC_Number", "Year_Cultured", "Isolated_From", 'Assembly_Accessions'],
    )
    fig.show()
    
# Species dropdown driving plot_givenspecies; options follow value_counts()
# order. The trailing ';' suppresses the cell's return-value repr.
interact(
    plot_givenspecies,
    df=fixed(genomeandyear_nctcdf),
    species=genomeandyear_nctcdf['Species'].value_counts().index.tolist(),
);

interactive(children=(Dropdown(description='species', options=('Escherichia coli', 'Salmonella enterica ', 'St…