In [1]:
import pandas as pd
import plotly.express as px
import os 

def read_FP_tables(inputdir):
    """Collect every ``TableOfEstimates.csv`` found below ``inputdir`` into one DataFrame.

    Parameters
    ----------
    inputdir : str or path-like
        Root directory, searched recursively with ``os.walk``.

    Returns
    -------
    pandas.DataFrame
        Rows of all tables found, each with an extra ``folder`` column holding the
        name of the directory the file was read from. The per-file row index is
        kept as-is (it is not reset). An empty DataFrame is returned when no
        table is found.
    """
    frames = []
    for dirpath, _dirnames, filenames in os.walk(inputdir):
        if 'TableOfEstimates.csv' not in filenames:
            continue
        df = pd.read_csv(os.path.join(dirpath, 'TableOfEstimates.csv'))
        # normpath strips a trailing separator so the folder name is never empty
        df['folder'] = os.path.basename(os.path.normpath(dirpath))
        frames.append(df)

    if not frames:
        return pd.DataFrame()
    # Newest-first order: each newly found table goes in front of the earlier ones.
    return pd.concat(frames[::-1])
# Fixed colour per strain label, taken in order from plotly's D3 qualitative
# palette (note: there is no 'ST3' entry, so ST4 gets the third colour).
strain_color_dict = {
    strain: px.colors.qualitative.D3[palette_idx]
    for palette_idx, strain in enumerate(
        ['ST1', 'ST2', 'ST4', 'ST5', 'ST6', 'ST7', 'ST8', 'ST9']
    )
}

In [48]:
# Load all per-strain estimate tables and derive sample annotations from the
# sample name, which is split on "_" (first field -> project, second field up
# to the first "." -> sample_id).
# NOTE(review): a name is classified as 'sample' when it contains a lowercase
# 's' anywhere; this works for the names shown here but is fragile -- confirm.
df_FP = (
    read_FP_tables('/Volumes/sd/faith/MTCSB/projects/P4-barcoding_strains/20241029_analysis/rerun/P4C2T4T5/FP')
    .rename(columns={'Unnamed: 0': 'sample_name', 'folder': 'strain'})
    .assign(
        sample_id=lambda tbl: tbl['sample_name'].apply(
            lambda name: str(name).split("_")[1].split('.')[0]
        ),
        project=lambda tbl: tbl['sample_name'].apply(
            lambda name: str(name).split("_")[0]
        ),
        seq_sample_type=lambda tbl: tbl['sample_name'].apply(
            lambda name: 'sample' if 's' in str(name) else str(name).split('_')[0]
        ),
    )
)
df_FP


Unnamed: 0,sample_name,TotalReads,Number of barcodes,Ns_MinCutoff,Nb,Ns,AverageFrequency,CFU,Log10Ns,Log10CFU,CFU/Ns,strain,sample_id,project,seq_sample_type
0,P4C2T4_s1.A1_stool.DNA,9432,214,208.500000,39.970035,255.117647,312.727973,0,2.406741,-inf,0,ST6,s1,P4C2T4,sample
1,P4C2T4_s1.A6_stool.DNA,9957,195,214.333333,33.516052,226.882353,247.122033,0,2.355801,-inf,0,ST6,s1,P4C2T4,sample
2,P4C2T4_s11.F1_stool.DNA,13241,201,181.000000,82.555483,233.941176,345.310447,0,2.369107,-inf,0,ST6,s11,P4C2T4,sample
3,P4C2T4_s11.F6_stool.DNA,3522,161,183.666667,158.171979,208.500000,248.821111,0,2.319106,-inf,0,ST6,s11,P4C2T4,sample
4,P4C2T4_s13.G1_stool.DNA,21541,608,192.111111,113.285514,693.941176,661.952610,0,2.841323,-inf,0,ST6,s13,P4C2T4,sample
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,NTC_G5_NTC,32254,220,455.285714,4.484329,486.714286,238.795714,0,2.687274,-inf,0,ST1,G5,NTC,NTC
133,NTC_H10_NTC,9676,25,35.545455,0.386335,44.636364,39.069849,0,1.649689,-inf,0,ST1,H10,NTC,NTC
134,NTC_H5_NTC,51524,1352,4757.666667,26.814889,5241.000000,1484.064300,0,3.719414,-inf,0,ST1,H5,NTC,NTC
135,WATER_A2_WATER,4348,43,46.454545,0.685682,77.363636,129.635761,0,1.888537,-inf,0,ST1,A2,WATER,WATER


In [49]:
# Long format: one row per (strain, sample_id, seq_sample_type) and estimate,
# with the estimate name in 'type' and its number in 'value'.
df_plot = (
    df_FP
    .set_index(['strain', 'sample_id', 'seq_sample_type'])[['Nb', 'Ns', 'Ns_MinCutoff']]
    .stack()
    .reset_index(name='value')
    .rename(columns={'level_3': 'type'})
)
display(df_plot)

# First the pooled box plot, then the same plot with one facet row per strain.
for facet_row, height in ((None, 400), ('strain', 800)):
    fig = px.box(df_plot,
                 x='seq_sample_type',
                 y='value',
                 color='type',
                 facet_row=facet_row,
                 points='all',
                 log_y=True,
                 template='simple_white',
                 width=800,
                 height=height)
    fig.show()

Unnamed: 0,strain,sample_id,seq_sample_type,type,value
0,ST6,s1,sample,Nb,39.970035
1,ST6,s1,sample,Ns,255.117647
2,ST6,s1,sample,Ns_MinCutoff,208.500000
3,ST6,s1,sample,Nb,33.516052
4,ST6,s1,sample,Ns,226.882353
...,...,...,...,...,...
1639,ST1,A2,WATER,Ns,77.363636
1640,ST1,A2,WATER,Ns_MinCutoff,46.454545
1641,ST1,A7,WATER,Nb,0.607935
1642,ST1,A7,WATER,Ns,44.636364


# FP time course

In [50]:
# Tab-separated metadata sheets for P4C2T4 and P4C2T5.
df_meta1 = pd.read_table('/Users/ruprec01/Documents/Faith_lab/Git/population_bottlenecks/input_files/metadata/P4C2T4_cr_mouse_sample - P4C2T4.tsv')
df_meta2 = pd.read_table('/Users/ruprec01/Documents/Faith_lab/Git/population_bottlenecks/input_files/metadata/P4C2T5_cr_mouse_sample - P4C2T5.tsv')

# Stack both sheets row-wise into a single metadata table.
df_meta = pd.concat([df_meta1, df_meta2])

# Left join: every row of df_FP is kept, with NaN metadata where
# (sample_id, project) has no match in the metadata sheets.
df_results = df_FP.merge(df_meta, how='left', on=['sample_id', 'project'])
df_results


Unnamed: 0,sample_name,TotalReads,Number of barcodes,Ns_MinCutoff,Nb,Ns,AverageFrequency,CFU,Log10Ns,Log10CFU,...,strain_bcs,strains,tube_type,tube_weight_empty,tube_weight_full,stool_weight_mg,cfu_g_stool,cfu_incubation_time,dna_conc_ng_ul,dna_conc_ng_ul.1
0,P4C2T4_s1.A1_stool.DNA,9432,214,208.500000,39.970035,255.117647,312.727973,0,2.406741,-inf,...,ST1a|ST2d|ST4a|ST5a|ST6a,E. coli|E.coli|B.uniformis|B.longum|P.distasonis,2ml_screwcap,1651.8,1693.0,41.2,,,-0.040486,0.398801
1,P4C2T4_s1.A6_stool.DNA,9957,195,214.333333,33.516052,226.882353,247.122033,0,2.355801,-inf,...,ST1a|ST2d|ST4a|ST5a|ST6a,E. coli|E.coli|B.uniformis|B.longum|P.distasonis,2ml_screwcap,1651.8,1693.0,41.2,,,-0.040486,0.398801
2,P4C2T4_s11.F1_stool.DNA,13241,201,181.000000,82.555483,233.941176,345.310447,0,2.369107,-inf,...,ST1a|ST2d|ST4a|ST5a|ST6a,E. coli|E.coli|B.uniformis|B.longum|P.distasonis,2ml_screwcap,1694.6,1736.8,42.2,,,4.990206,0.521678
3,P4C2T4_s11.F6_stool.DNA,3522,161,183.666667,158.171979,208.500000,248.821111,0,2.319106,-inf,...,ST1a|ST2d|ST4a|ST5a|ST6a,E. coli|E.coli|B.uniformis|B.longum|P.distasonis,2ml_screwcap,1694.6,1736.8,42.2,,,4.990206,0.521678
4,P4C2T4_s13.G1_stool.DNA,21541,608,192.111111,113.285514,693.941176,661.952610,0,2.841323,-inf,...,ST1a|ST2d|ST4a|ST5a|ST6a,E. coli|E.coli|B.uniformis|B.longum|P.distasonis,2ml_screwcap,1651.3,1696.2,44.9,,,5.564207,0.629570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
543,NTC_G5_NTC,32254,220,455.285714,4.484329,486.714286,238.795714,0,2.687274,-inf,...,,,,,,,,,,
544,NTC_H10_NTC,9676,25,35.545455,0.386335,44.636364,39.069849,0,1.649689,-inf,...,,,,,,,,,,
545,NTC_H5_NTC,51524,1352,4757.666667,26.814889,5241.000000,1484.064300,0,3.719414,-inf,...,,,,,,,,,,
546,WATER_A2_WATER,4348,43,46.454545,0.685682,77.363636,129.635761,0,1.888537,-inf,...,,,,,,,,,,


In [51]:
# Show all column names of the merged table (estimate columns plus metadata fields).
df_results.columns

Index(['sample_name', 'TotalReads', 'Number of barcodes', 'Ns_MinCutoff', 'Nb',
       'Ns', 'AverageFrequency', 'CFU', 'Log10Ns', 'Log10CFU', 'CFU/Ns',
       'strain', 'sample_id', 'project', 'seq_sample_type', 'sample_type',
       'date_collected', 'time_point_h', 'cage_id', 'mouse_type',
       'pretreatment', 'mouse_id', 'gender', 'gavage_dose_200ul', 'strain_bcs',
       'strains', 'tube_type', 'tube_weight_empty', 'tube_weight_full',
       'stool_weight_mg', 'cfu_g_stool', 'cfu_incubation_time',
       'dna_conc_ng_ul', 'dna_conc_ng_ul.1'],
      dtype='object')

In [None]:
from pandas.api.types import CategoricalDtype

# Box plot of the mean Nb per (x_coord, strain, sample_id) for 24 h stool
# samples, shown next to the non-sample sequencing controls.
df_plot = df_results.copy()

# Rows that are not real samples (e.g. NTC / WATER) get their sequencing sample
# type as 'mouse_type'; real samples keep the mouse_type from the metadata.
is_sample = df_plot['seq_sample_type'] == 'sample'
df_plot['mouse_type'] = df_plot['mouse_type'].where(is_sample, df_plot['seq_sample_type'])

# Keep (stool AND 24 h) OR any control row. na=False makes rows without a
# sample_type (no metadata match) explicitly fail the stool test.
is_stool_24h = (
    df_plot['sample_type'].str.contains('stool', na=False)
    & (df_plot['time_point_h'] == '24 h')
)
df_plot = df_plot.loc[is_stool_24h | ~is_sample].copy()

# Grouping label: mouse type plus pretreatment ('NA' when no pretreatment is recorded).
df_plot['x_coord'] = df_plot['mouse_type'] + "|" + df_plot['pretreatment'].fillna('NA')

# Average Nb over rows sharing the same group, strain and sample.
df_plot = df_plot.groupby(['x_coord', 'strain', 'sample_id'])['Nb'].mean().reset_index()
display(df_plot['x_coord'].unique())

# Fixed display order of the groups.
li_order = ['NTC|NA','WATER|NA', 'SPF|NA', 'SPF|antibiotic_vancomycin',  'germ_free|donor_BSD2780','germ_free|NA']
# Casting to the categorical dtype turns any label missing from li_order into
# NaN without an error, so report such labels instead of losing them silently.
unexpected = sorted(set(df_plot['x_coord']) - set(li_order))
if unexpected:
    print(f"Warning: x_coord values not in li_order will become NaN: {unexpected}")
cat_type = CategoricalDtype(categories=li_order, ordered=True)

df_plot['x_coord'] = df_plot['x_coord'].astype(cat_type)
df_plot = df_plot.sort_values('x_coord')
fig = px.box(df_plot,
             x = 'strain',
             y = 'Nb',
             color = 'x_coord',
             template = 'simple_white',
             points = 'all',
             log_y = True,
             width = 1000,
             height = 600
             )
fig.show()
fig.write_image('P4C2T4T5-FP.png', scale = 1)

array(['NTC|NA', 'SPF|NA', 'SPF|antibiotic_vancomycin', 'WATER|NA',
       'germ_free|NA', 'germ_free|donor_BSD2780'], dtype=object)