In [37]:
import pandas as pd
import plotly.express as px
import os 

def read_FP_tables(inputdir):
    """Gather every ``TableOfEstimates.csv`` found below ``inputdir`` into one DataFrame.

    The directory tree is walked recursively. Each table gets an extra
    ``folder`` column holding the name of the directory that contains it.

    Parameters
    ----------
    inputdir : str or path-like
        Root directory to search.

    Returns
    -------
    pandas.DataFrame
        All tables stacked row-wise, with the most recently visited directory
        first. The per-file row index is kept as read (it is not reset).
        An empty DataFrame is returned when no table is found.
    """
    tables = []
    for dirpath, _dirnames, filenames in os.walk(inputdir):
        if 'TableOfEstimates.csv' not in filenames:
            continue
        df = pd.read_csv(os.path.join(dirpath, 'TableOfEstimates.csv'))
        # normpath drops a trailing separator so the label is never empty,
        # and basename uses the platform's separator instead of a literal "/".
        df['folder'] = os.path.basename(os.path.normpath(dirpath))
        tables.append(df)

    if not tables:
        # pd.concat raises on an empty list; keep the "nothing found" result a frame.
        return pd.DataFrame()

    # Concatenate once instead of on every iteration; the reversal keeps the
    # newest-first row order callers have been getting so far.
    return pd.concat(tables[::-1])
# Fixed strain -> colour lookup so every figure colours a strain the same way.
# Strains are paired, in order, with the first eight D3 palette entries;
# there is no 'ST3' key.
strain_color_dict = {
    f'ST{strain_no}': px.colors.qualitative.D3[palette_idx]
    for palette_idx, strain_no in enumerate((1, 2, 4, 5, 6, 7, 8, 9))
}

In [2]:
# Load all FP tables, then derive the per-row labels in one chain.
# Folder names are split on "_" into cage (first part) and strain (second part);
# the sample id is the part of the sample name before the first "_".
df_FP = (
    read_FP_tables('/Volumes/sd/faith/MTCSB/projects/P4-barcoding_strains/20241029_analysis/population-bottlenecks/experiments/P4C1T8/FP')
    .assign(
        cage=lambda d: [str(folder).split("_")[0] for folder in d['folder']],
        strain=lambda d: [str(folder).split("_")[1] for folder in d['folder']],
    )
    # The unnamed first CSV column carries the sample names.
    .rename(columns={'Unnamed: 0': 'sample_name'})
    .assign(
        sample_id=lambda d: [str(name).split("_")[0] for name in d['sample_name']],
        # Any id containing a lowercase "s" is labelled 'sample';
        # other ids are kept as their own type.
        sample_type=lambda d: ['sample' if 's' in str(sid) else sid for sid in d['sample_id']],
    )
)
df_FP

Unnamed: 0,sample_name,TotalReads,Number of barcodes,Ns_MinCutoff,Nb,Ns,AverageFrequency,CFU,Log10Ns,Log10CFU,CFU/Ns,folder,cage,strain,sample_id,sample_type
0,s107_SI.DNA,272153976,30843,17309.000000,199275.123474,1.073467e+06,647358.137318,0,6.030789,-inf,0,c5_ST4,c5,ST4,s107,sample
1,s108_SI.DNA,13767059,33191,22586.000000,253160.424163,1.298768e+06,586216.751585,0,6.113532,-inf,0,c5_ST4,c5,ST4,s108,sample
2,s117_Cecum.DNA,916213852,38349,23841.000000,265666.399773,1.651211e+06,791628.592118,0,6.217803,-inf,0,c5_ST4,c5,ST4,s117,sample
3,s118_Cecum.DNA,443324713,35013,22166.000000,236899.873212,1.396209e+06,609438.905726,0,6.144950,-inf,0,c5_ST4,c5,ST4,s118,sample
4,s127_Pr..Colon.DNA,1079846256,38083,24567.666667,280752.369659,1.632911e+06,783783.101241,0,6.212963,-inf,0,c5_ST4,c5,ST4,s127,sample
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,NTC_H3_NTC,220639,8746,6270.411765,14.071304,1.002350e+04,36020.615573,0,4.001019,-inf,0,c1_ST1,c1,ST1,NTC,NTC
15,WATER_B3_WATER,14439,42,110.411765,10.150313,5.100000e+01,54.549464,0,1.707570,-inf,0,c1_ST1,c1,ST1,WATER,WATER
16,WATER_C6_WATER,134191,1226,804.333333,8.641978,1.319947e+03,5637.197143,0,3.120557,-inf,0,c1_ST1,c1,ST1,WATER,WATER
17,WATER_D2_WATER,250482,15145,4338.777778,19.385610,1.861638e+04,93435.803121,0,4.269895,-inf,0,c1_ST1,c1,ST1,WATER,WATER


In [19]:
# Reshape to long form: one row per (sample, estimate type) with its value.
df_plot = (
    df_FP[['cage', 'strain', 'sample_id', 'sample_type', 'Nb', 'Ns', 'Ns_MinCutoff']]
    .set_index(['cage', 'strain', 'sample_id', 'sample_type'])
    .stack()
    .reset_index()
    # After reset_index the stacked column labels land in 'level_4' and the values in 0.
    .rename(columns={'level_4': 'type', 0: 'value'})
)
display(df_plot)

# Settings shared by both box plots.
box_kwargs = dict(
    x='sample_type',
    y='value',
    color='type',
    points='all',
    template='simple_white',
    width=800,
    log_y=True,
)

# All strains pooled.
fig = px.box(df_plot, height=400, **box_kwargs)
fig.show()

# One facet row per strain.
fig = px.box(df_plot, facet_row='strain', height=800, **box_kwargs)
fig.show()

Unnamed: 0,cage,strain,sample_id,sample_type,type,value
0,c5,ST4,s107,sample,Nb,1.992751e+05
1,c5,ST4,s107,sample,Ns,1.073467e+06
2,c5,ST4,s107,sample,Ns_MinCutoff,1.730900e+04
3,c5,ST4,s108,sample,Nb,2.531604e+05
4,c5,ST4,s108,sample,Ns,1.298768e+06
...,...,...,...,...,...,...
622,c1,ST1,WATER,WATER,Ns,1.861638e+04
623,c1,ST1,WATER,WATER,Ns_MinCutoff,4.338778e+03
624,c1,ST1,WATER,WATER,Nb,1.301382e+01
625,c1,ST1,WATER,WATER,Ns,9.629412e+01


# FP time course

In [None]:
# Sample-sheet metadata (tab-separated).
df_meta = pd.read_csv(
    '/Users/ruprec01/Documents/Faith_lab/Git/population_bottlenecks/input_files/metadata/P4C1T8-sample_sheets - P4C1T8.tsv',
    sep="\t",
)
# Left join on sample_id keeps every FP row; rows without a metadata match get NaN metadata.
df_results = df_FP.merge(df_meta, how='left', on='sample_id')
df_results

Unnamed: 0,sample_name,TotalReads,Number of barcodes,Ns_MinCutoff,Nb,Ns,AverageFrequency,CFU,Log10Ns,Log10CFU,...,strains,tube_type,tube_weight_empty,tube_weight_full,stool_weight_mg,cfu_g_stool,dna_conc_ng_ul,cfu_incubation_time,cfu_g_stool_spec,cfu_g_stool_cmr
0,s107_SI.DNA,272153976,30843,17309.000000,199275.123474,1.073467e+06,647358.137318,0,6.030789,-inf,...,E. coli|E.coli|B.uniformis,2ml_screwcap,1651.32,1674.9,23.58,,0.74,,,
1,s108_SI.DNA,13767059,33191,22586.000000,253160.424163,1.298768e+06,586216.751585,0,6.113532,-inf,...,E. coli|E.coli|B.uniformis,2ml_screwcap,1679.87,1701.5,21.63,,12.88,,,
2,s117_Cecum.DNA,916213852,38349,23841.000000,265666.399773,1.651211e+06,791628.592118,0,6.217803,-inf,...,E. coli|E.coli|B.uniformis,2ml_screwcap,1650.13,1684.7,34.57,,57.78,,,
3,s118_Cecum.DNA,443324713,35013,22166.000000,236899.873212,1.396209e+06,609438.905726,0,6.144950,-inf,...,E. coli|E.coli|B.uniformis,2ml_screwcap,1658.60,1747.3,88.70,,50.31,,,
4,s127_Pr..Colon.DNA,1079846256,38083,24567.666667,280752.369659,1.632911e+06,783783.101241,0,6.212963,-inf,...,E. coli|E.coli|B.uniformis,2ml_screwcap,1651.19,1670.4,19.21,,22.12,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,NTC_H3_NTC,220639,8746,6270.411765,14.071304,1.002350e+04,36020.615573,0,4.001019,-inf,...,,,,,,,,,,
205,WATER_B3_WATER,14439,42,110.411765,10.150313,5.100000e+01,54.549464,0,1.707570,-inf,...,,,,,,,,,,
206,WATER_C6_WATER,134191,1226,804.333333,8.641978,1.319947e+03,5637.197143,0,3.120557,-inf,...,,,,,,,,,,
207,WATER_D2_WATER,250482,15145,4338.777778,19.385610,1.861638e+04,93435.803121,0,4.269895,-inf,...,,,,,,,,,,


In [23]:
# Inspect the merged column names. 'sample_type' shows up as sample_type_x / sample_type_y
# in the listing, i.e. the merge suffixed a column present in both inputs.
df_results.columns

Index(['sample_name', 'TotalReads', 'Number of barcodes', 'Ns_MinCutoff', 'Nb',
       'Ns', 'AverageFrequency', 'CFU', 'Log10Ns', 'Log10CFU', 'CFU/Ns',
       'folder', 'cage', 'strain', 'sample_id', 'sample_type_x', 'project',
       'sample_type_y', 'date_collected', 'time_point_h', 'cage_id',
       'mouse_id', 'gender', 'gavage_dose_200ul', 'strain_bcs', 'strains',
       'tube_type', 'tube_weight_empty', 'tube_weight_full', 'stool_weight_mg',
       'cfu_g_stool', 'dna_conc_ng_ul', 'cfu_incubation_time',
       'cfu_g_stool_spec', 'cfu_g_stool_cmr'],
      dtype='object')

In [52]:
# Keep rows that have a cage id, are stool samples, and belong to strains matching ST1/ST5/ST4.
df_plot = df_results.dropna(subset='cage_id')
is_stool = df_plot['sample_type_y'].str.contains('stool')
is_selected_strain = df_plot['strain'].str.contains("ST1|ST5|ST4")
df_plot = df_plot.loc[is_stool & is_selected_strain].copy()

# Numeric time: the text before the first 'h' in time_point_h, cast to int.
df_plot['time'] = df_plot['time_point_h'].map(lambda tp: str(tp).split('h')[0]).astype(int)
df_plot = df_plot.sort_values(['cage_id', 'time', 'strain'], ascending=True)

# One faceted scatter per estimate (cage in columns, mouse in rows); only the Nb figure is saved.
# TODO: create a mean box with standard deviation for negative controls and plot.
for metric in ('Nb', 'Ns', 'Ns_MinCutoff'):
    fig = px.scatter(
        df_plot,
        x='time',
        y=metric,
        color='strain',
        color_discrete_map=strain_color_dict,
        facet_col='cage_id',
        facet_row='mouse_id',
        template='simple_white',
        log_y=True,
        width=800,
        height=600,
    )
    if metric == 'Nb':
        fig.write_image('P4C1T8-FP_timecourse_Nb.png', scale=2)
    fig.show()



# Cage 1 as gavage

In [59]:
# Load the FP tables from the 'c1gavageFP' rerun and derive the per-row labels.
# Folder names are split on "_" into cage (first part) and strain (second part);
# the sample id is the part of the sample name before the first "_".
df_FP = (
    read_FP_tables('/Volumes/sd/faith/MTCSB/projects/P4-barcoding_strains/20241029_analysis/rerun/P4C1T8/c1gavageFP')
    .assign(
        cage=lambda d: [str(folder).split("_")[0] for folder in d['folder']],
        strain=lambda d: [str(folder).split("_")[1] for folder in d['folder']],
    )
    # The unnamed first CSV column carries the sample names.
    .rename(columns={'Unnamed: 0': 'sample_name'})
    .assign(
        sample_id=lambda d: [str(name).split("_")[0] for name in d['sample_name']],
        # Any id containing a lowercase "s" is labelled 'sample';
        # other ids are kept as their own type.
        sample_type=lambda d: ['sample' if 's' in str(sid) else sid for sid in d['sample_id']],
    )
)

# Attach the sample-sheet metadata; the left join keeps every FP row.
df_meta = pd.read_csv(
    '/Users/ruprec01/Documents/Faith_lab/Git/population_bottlenecks/input_files/metadata/P4C1T8-sample_sheets - P4C1T8.tsv',
    sep="\t",
)
df_results = df_FP.merge(df_meta, how='left', on='sample_id')
df_results

Unnamed: 0,sample_name,TotalReads,Number of barcodes,Ns_MinCutoff,Nb,Ns,AverageFrequency,CFU,Log10Ns,Log10CFU,...,strains,tube_type,tube_weight_empty,tube_weight_full,stool_weight_mg,cfu_g_stool,dna_conc_ng_ul,cfu_incubation_time,cfu_g_stool_spec,cfu_g_stool_cmr
0,s107_SI.DNA,272153976,30843,17237.666667,199275.123474,1.078987e+06,647358.137318,0,6.033016,-inf,...,E. coli|E.coli|B.uniformis,2ml_screwcap,1651.32,1674.9,23.58,,0.74,,,
1,s108_SI.DNA,13767059,33191,22541.000000,253160.424163,1.306909e+06,586216.751585,0,6.116245,-inf,...,E. coli|E.coli|B.uniformis,2ml_screwcap,1679.87,1701.5,21.63,,12.88,,,
2,s117_Cecum.DNA,916213852,38349,24037.000000,265666.399773,1.654711e+06,791628.592118,0,6.218722,-inf,...,E. coli|E.coli|B.uniformis,2ml_screwcap,1650.13,1684.7,34.57,,57.78,,,
3,s118_Cecum.DNA,443324713,35013,22186.000000,236899.873212,1.385529e+06,609438.905726,0,6.141616,-inf,...,E. coli|E.coli|B.uniformis,2ml_screwcap,1658.60,1747.3,88.70,,50.31,,,
4,s127_Pr..Colon.DNA,1079846256,38083,24681.000000,280752.369659,1.637511e+06,783783.101241,0,6.214184,-inf,...,E. coli|E.coli|B.uniformis,2ml_screwcap,1651.19,1670.4,19.21,,22.12,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,NTC_H3_NTC,220639,8746,6295.117647,14.071176,1.003433e+04,36020.615573,0,4.001489,-inf,...,,,,,,,,,,
241,WATER_B3_WATER,14439,42,109.750000,10.150653,5.100000e+01,54.515747,0,1.707570,-inf,...,,,,,,,,,,
242,WATER_C6_WATER,134191,1231,801.000000,8.644377,1.322053e+03,5697.801405,0,3.121249,-inf,...,,,,,,,,,,
243,WATER_D2_WATER,250482,15145,4334.333333,19.385610,1.862814e+04,93435.803121,0,4.270170,-inf,...,,,,,,,,,,


In [60]:
# Reshape to long form: one row per (sample, estimate type) with its value.
df_plot = (
    df_FP[['cage', 'strain', 'sample_id', 'sample_type', 'Nb', 'Ns', 'Ns_MinCutoff']]
    .set_index(['cage', 'strain', 'sample_id', 'sample_type'])
    .stack()
    .reset_index()
    # After reset_index the stacked column labels land in 'level_4' and the values in 0.
    .rename(columns={'level_4': 'type', 0: 'value'})
)
display(df_plot)

# Settings shared by both box plots.
box_kwargs = dict(
    x='sample_type',
    y='value',
    color='type',
    points='all',
    template='simple_white',
    width=800,
    log_y=True,
)

# All strains pooled.
fig = px.box(df_plot, height=400, **box_kwargs)
fig.show()

# One facet row per strain.
fig = px.box(df_plot, facet_row='strain', height=800, **box_kwargs)
fig.show()

Unnamed: 0,cage,strain,sample_id,sample_type,type,value
0,c5,ST4,s107,sample,Nb,1.992751e+05
1,c5,ST4,s107,sample,Ns,1.078987e+06
2,c5,ST4,s107,sample,Ns_MinCutoff,1.723767e+04
3,c5,ST4,s108,sample,Nb,2.531604e+05
4,c5,ST4,s108,sample,Ns,1.306909e+06
...,...,...,...,...,...,...
730,c1,ST1,WATER,WATER,Ns,1.862814e+04
731,c1,ST1,WATER,WATER,Ns_MinCutoff,4.334333e+03
732,c1,ST1,WATER,WATER,Nb,1.300778e+01
733,c1,ST1,WATER,WATER,Ns,9.511765e+01


In [61]:
# Keep rows that have a cage id, are stool samples, and belong to strains matching ST1/ST5/ST4.
df_plot = df_results.dropna(subset='cage_id')
is_stool = df_plot['sample_type_y'].str.contains('stool')
is_selected_strain = df_plot['strain'].str.contains("ST1|ST5|ST4")
df_plot = df_plot.loc[is_stool & is_selected_strain].copy()

# Numeric time: the text before the first 'h' in time_point_h, cast to int.
df_plot['time'] = df_plot['time_point_h'].map(lambda tp: str(tp).split('h')[0]).astype(int)
df_plot = df_plot.sort_values(['cage_id', 'time', 'strain'], ascending=True)

# One faceted scatter per estimate (cage in columns, mouse in rows); only the Nb figure is saved.
# TODO: create a mean box with standard deviation for negative controls and plot.
for metric in ('Nb', 'Ns', 'Ns_MinCutoff'):
    fig = px.scatter(
        df_plot,
        x='time',
        y=metric,
        color='strain',
        color_discrete_map=strain_color_dict,
        facet_col='cage_id',
        facet_row='mouse_id',
        template='simple_white',
        log_y=True,
        width=800,
        height=600,
    )
    if metric == 'Nb':
        fig.write_image('P4C1T8-c1_gavage_FP_timecourse_Nb.png', scale=2)
    fig.show()

