In [54]:
import pandas as pd
import plotly.express as px
import os

In [None]:
def get_readtables(path):
    """
    Read and merge molecule-table CSV files from a directory.

    Files are split into two groups by file name:
    - files ending with 'onlygavage_moleculestable.csv'
    - all other files ending with 'moleculestable.csv'
    Files whose name starts with '._' are ignored.

    Within each group the files are merged by ST number, i.e. the
    underscore-separated token of the form 'ST<digits>' in the file name
    (matched case-insensitively, so 'stool' or 'moleculestable.csv' are never
    mistaken for an ST number):
    - Same ST number -> joined horizontally on umi_seq (adds sample columns).
      A column name already supplied by an earlier file of the same ST number
      is dropped from the later files, so each column appears once.
    - Different ST numbers -> stacked vertically (adds rows).

    Files without an ST token are skipped and reported with ``print``.
    Every file that is read must contain a 'umi_seq' column; it becomes the
    index of the result.

    Parameters:
    -----------
    path : str
        Directory path containing the CSV files

    Returns:
    --------
    df_total : pd.DataFrame
        Merged dataframe of all moleculestable files (non-gavage)
    df_gavageonly : pd.DataFrame
        Merged dataframe of onlygavage moleculestable files
        Either frame is an empty DataFrame when no matching file exists.
    """
    import re
    from collections import defaultdict

    table_suffix = 'moleculestable.csv'
    gavage_suffix = 'onlygavage_moleculestable.csv'
    st_token = re.compile(r'ST\d+', re.IGNORECASE)

    # os.listdir returns names in arbitrary order; sort so that the column
    # layout of the merged tables is reproducible between runs.
    csv_files = sorted(
        name for name in os.listdir(path)
        if name.endswith('.csv') and not name.startswith('._')
    )
    gavage_files = [name for name in csv_files if name.endswith(gavage_suffix)]
    total_files = [
        name for name in csv_files
        if name.endswith(table_suffix) and 'onlygavage' not in name
    ]

    def st_number(file_name):
        """Return the upper-cased 'ST<digits>' token of file_name, or None."""
        for part in file_name.split('_'):
            if st_token.fullmatch(part):
                return part.upper()
        return None

    def merge_tables(file_list, directory):
        """Merge one group of files: columns within an ST number, rows across them."""
        st_groups = defaultdict(list)
        skipped = []
        for file in file_list:
            st_num = st_number(file)
            if st_num is None:
                skipped.append(file)
            else:
                st_groups[st_num].append(file)

        if skipped:
            print('Skipped (no ST<number> token in file name):', skipped)
        if not st_groups:
            return pd.DataFrame()

        strain_frames = []
        # Numeric sort so that e.g. ST10 comes after ST2.
        for st_num in sorted(st_groups, key=lambda token: int(token[2:])):
            files = st_groups[st_num]
            print(st_num, files)

            sample_frames = []
            seen_columns = set()
            for file in files:
                df = pd.read_csv(os.path.join(directory, file)).set_index('umi_seq')
                # Shared columns (e.g. water / gavage controls) are kept from
                # the first file that provides them only.
                repeated = [col for col in df.columns if col in seen_columns]
                df = df.drop(columns=repeated)
                seen_columns.update(df.columns)
                sample_frames.append(df)

            if len(sample_frames) == 1:
                df_strain = sample_frames[0]
            else:
                # Same ST number: align on umi_seq and add columns. The list is
                # reversed so that the most recently read file comes first.
                df_strain = pd.concat(sample_frames[::-1], axis=1)
            strain_frames.append(df_strain)

        # Different ST numbers: add rows. Concatenating once avoids repeatedly
        # copying the growing table; the highest ST number comes first.
        return pd.concat(strain_frames[::-1], axis=0)

    # Merge both groups
    df_total = merge_tables(total_files, path)
    df_gavageonly = merge_tables(gavage_files, path)

    return df_total, df_gavageonly

# P4C1T8

In [35]:
# Load the P4C1T8 molecule tables: all samples, and the gavage-only subset.
df_total, df_gavageonly = get_readtables(
    path='/Volumes/sd/faith/MTCSB/projects/P4-barcoding_strains/r2_data/population-bottlenecks/fp_inputs/P4C1T8'
)


defaultdict(<class 'list'>, {'ST1': ['c1_ST1_moleculestable.csv', 'c2_ST1_moleculestable.csv', 'c3_ST1_moleculestable.csv', 'c4_ST1_moleculestable.csv', 'c5_ST1_moleculestable.csv'], 'ST2': ['c1_ST2_moleculestable.csv', 'c2_ST2_moleculestable.csv', 'c3_ST2_moleculestable.csv', 'c4_ST2_moleculestable.csv', 'c5_ST2_moleculestable.csv'], 'ST5': ['c1_ST5_moleculestable.csv', 'c2_ST5_moleculestable.csv', 'c3_ST5_moleculestable.csv', 'c4_ST5_moleculestable.csv'], 'ST6': ['c1_ST6_moleculestable.csv', 'c2_ST6_moleculestable.csv', 'c3_ST6_moleculestable.csv', 'c4_ST6_moleculestable.csv'], 'ST4': ['c5_ST4_moleculestable.csv']})
ST1 ['c1_ST1_moleculestable.csv', 'c2_ST1_moleculestable.csv', 'c3_ST1_moleculestable.csv', 'c4_ST1_moleculestable.csv', 'c5_ST1_moleculestable.csv']
ST2 ['c1_ST2_moleculestable.csv', 'c2_ST2_moleculestable.csv', 'c3_ST2_moleculestable.csv', 'c4_ST2_moleculestable.csv', 'c5_ST2_moleculestable.csv']
ST4 ['c5_ST4_moleculestable.csv']
ST5 ['c1_ST5_moleculestable.csv', 'c2_ST

In [84]:
# Reshape the merged table (index = umi_seq, one column per sample) into long
# form: one row per (umi_seq, sample) pair, keeping only non-zero counts.
df_total_stack = df_total.stack().reset_index()
df_total_stack.columns = ['umi_seq', 'sample', 'counts']
df_total_stack = df_total_stack.loc[df_total_stack['counts'] > 0].copy()

# Classify every sample from its name. 'gavage' is matched case-insensitively
# so the same rule works for names spelled 'gavage' or 'Gavage'; 'WATER' and
# 'NTC' mark negative controls. Gavage takes precedence over negative control.
sample_names = df_total_stack['sample'].astype(str)
is_gavage = sample_names.str.lower().str.contains('gavage', regex=False)
is_negative_control = (
    sample_names.str.contains('WATER', regex=False)
    | sample_names.str.contains('NTC', regex=False)
)
df_total_stack['sample_type'] = 'sample'
df_total_stack.loc[is_negative_control, 'sample_type'] = 'negative_control'
df_total_stack.loc[is_gavage, 'sample_type'] = 'gavage'

# The strain is the part of umi_seq before the first '-' (e.g. 'ST6-AAAC...' -> 'ST6').
df_total_stack['strain'] = df_total_stack['umi_seq'].astype(str).str.split('-', n=1).str[0]
df_total_stack

Unnamed: 0,umi_seq,sample,counts,sample_type,strain
0,ST6-AAACAAACCAACCTGGAA,s8_gavage-DNA,14.0,gavage,ST6
30,ST6-AAACAAACCAACCTGGAA,s4_gavage-DNA,2.0,gavage,ST6
45,ST6-AAACAAACCAACCTGGAA,s2_gavage-DNA,375.0,gavage,ST6
53,ST6-AAACAAACCAACCTGGAA,s29_stool-DNA,40.0,sample,ST6
84,ST6-AAACAAAGCACCACGAGG,s8_gavage-DNA,7.0,gavage,ST6
...,...,...,...,...,...
590724670,ST1-TTTTTTACCAAAGCAGAT,s100_SI-DNA,1.0,sample,ST1
590724757,ST1-TTTTTTACCCATGCCACA,s119_Pr.-Colon-DNA,7.0,sample,ST1
590724845,ST1-TTTTTTGCTCACGTGGAG,s29_stool-DNA,20.0,sample,ST1
590724923,ST1-TTTTTTTCTCCAATAAAG,s109_Cecum-DNA,42.0,sample,ST1


In [124]:
# Named aggregations: summed read counts and number of distinct UMIs.
agg_dict = dict(
    total_count=('counts', 'sum'),
    total_unique_umi=('umi_seq', 'nunique'),
)

# Per-sample summary in long form: one row per (sample_type, sample, metric).
df_total_gpd = (
    df_total_stack
    .groupby(['sample_type', 'sample'])
    .agg(**agg_dict)
    .stack()
    .reset_index()
    .rename(columns={'level_2': 'type', 0: 'value'})
)

# Keep only the metrics whose name does not contain 'ST'.
keep_metric = ~df_total_gpd['type'].str.contains('ST')
df_plot = df_total_gpd.loc[keep_metric]

sample_type_order = sorted(df_plot['sample_type'].unique())
fig = px.box(
    df_plot,
    x='sample_type',
    y='value',
    color='type',
    points='all',
    hover_data=['sample'],
    log_y=True,
    template='simple_white',
    category_orders={'sample_type': sample_type_order},
)

fig.update_xaxes(matches=None, categoryorder='trace', showticklabels=True)
fig.update_yaxes(matches=None, showticklabels=True)
# Axis ranges, applied per facet column.
fig.update_yaxes(col=1, range=[3, 10])
fig.update_yaxes(col=2, range=[2, 7])
fig.show()

In [131]:
# Per-sample, per-strain summary in long form (metric name in the last index level).
df_strains_gpd = (
    df_total_stack
    .groupby(['sample_type', 'sample', 'strain'])
    .agg(**agg_dict)
    .stack()
    .reset_index()
)
# Show the table before its generic column names are replaced.
display(df_strains_gpd)
df_strains_gpd = df_strains_gpd.rename(columns={'level_3': 'type', 0: 'value'})

# One facet row per metric, one box per strain within each sample type.
fig = px.box(
    df_strains_gpd,
    x='sample_type',
    y='value',
    color='strain',
    facet_row='type',
    points='all',
    hover_data=['sample'],
    log_y=True,
    template='simple_white',
)
fig.update_yaxes(row=1, col=1, range=[3, 10])
fig.update_yaxes(row=2, col=1, range=[1, 7])
fig.show()


Unnamed: 0,sample_type,sample,strain,level_3,0
0,gavage,s1_gavage-DNA,ST1,total_count,48588736.0
1,gavage,s1_gavage-DNA,ST1,total_unique_umi,298059.0
2,gavage,s1_gavage-DNA,ST2,total_count,40707080.0
3,gavage,s1_gavage-DNA,ST2,total_unique_umi,214452.0
4,gavage,s2_gavage-DNA,ST5,total_count,10611011.0
...,...,...,...,...,...
611,sample,s99_SI-DNA,ST2,total_unique_umi,15.0
612,sample,s99_SI-DNA,ST5,total_count,22163753.0
613,sample,s99_SI-DNA,ST5,total_unique_umi,7205.0
614,sample,s99_SI-DNA,ST6,total_count,2933937.0


In [144]:

# Reshape the gavage-only table (index = umi_seq, one column per sample) into
# long form: one row per (umi_seq, sample) pair, keeping only non-zero counts.
df_gavage_stack = df_gavageonly.stack().reset_index()
df_gavage_stack.columns = ['umi_seq', 'sample', 'counts']
df_gavage_stack = df_gavage_stack.loc[df_gavage_stack['counts'] > 0].copy()

# Classify every sample from its name. 'gavage' is matched case-insensitively
# so the same rule works for names spelled 'gavage' or 'Gavage'; 'WATER' and
# 'NTC' mark negative controls. Gavage takes precedence over negative control.
sample_names = df_gavage_stack['sample'].astype(str)
is_gavage = sample_names.str.lower().str.contains('gavage', regex=False)
is_negative_control = (
    sample_names.str.contains('WATER', regex=False)
    | sample_names.str.contains('NTC', regex=False)
)
df_gavage_stack['sample_type'] = 'sample'
df_gavage_stack.loc[is_negative_control, 'sample_type'] = 'negative_control'
df_gavage_stack.loc[is_gavage, 'sample_type'] = 'gavage'

# The strain is the part of umi_seq before the first '-'.
df_gavage_stack['strain'] = df_gavage_stack['umi_seq'].astype(str).str.split('-', n=1).str[0]

# Per-sample summary in long form: one row per (sample_type, sample, metric).
# The groupby result is already indexed by (sample_type, sample), so it can be
# stacked directly.
df_gavage_gpd = (
    df_gavage_stack
    .groupby(['sample_type', 'sample'])
    .agg(**{'total_count': ('counts', 'sum'),
            'total_unique_umi': ('umi_seq', 'nunique')})
    .stack()
    .reset_index()
    .rename(columns={'level_2': 'type', 0: 'value'})
)
df_gavage_gpd

Unnamed: 0,sample_type,sample,type,value
0,gavage,s1_gavage-DNA,total_count,8.929582e+07
1,gavage,s1_gavage-DNA,total_unique_umi,5.125110e+05
2,gavage,s2_gavage-DNA,total_count,3.908464e+07
3,gavage,s2_gavage-DNA,total_unique_umi,5.073150e+05
4,gavage,s3_gavage-DNA,total_count,7.030600e+04
...,...,...,...,...
163,sample,s57_stool-DNA,total_unique_umi,6.347600e+04
164,sample,s58_stool-DNA,total_count,1.112938e+09
165,sample,s58_stool-DNA,total_unique_umi,6.724400e+04
166,sample,s99_SI-DNA,total_count,2.352472e+07


In [145]:
# Box plot of the per-sample metrics of the gavage-only table, one colour per metric.
fig = px.box(
    df_gavage_gpd,
    x='sample_type',
    y='value',
    color='type',
    points='all',
    hover_data=['sample'],
    log_y=True,
    template='simple_white',
    width=800,
)
fig.update_yaxes(range=[2, 10])
fig.show()

In [152]:
# Per-sample, per-strain summary of the gavage-only table in long form.
df_gavage_strains = (
    df_gavage_stack
    .groupby(['sample_type', 'sample', 'strain'])
    .agg(**agg_dict)
    .stack()
    .reset_index()
)
# Show the table before its generic column names are replaced.
display(df_gavage_strains)
df_gavage_strains = df_gavage_strains.rename(columns={'level_3': 'type', 0: 'value'})

# One facet row per metric, one box per strain within each sample type.
fig = px.box(
    df_gavage_strains,
    x='sample_type',
    y='value',
    color='strain',
    facet_row='type',
    points='all',
    hover_data=['sample'],
    log_y=True,
    template='simple_white',
    width=800,
)

fig.show()


Unnamed: 0,sample_type,sample,strain,level_3,0
0,gavage,s1_gavage-DNA,ST1,total_count,48588736.0
1,gavage,s1_gavage-DNA,ST1,total_unique_umi,298059.0
2,gavage,s1_gavage-DNA,ST2,total_count,40707080.0
3,gavage,s1_gavage-DNA,ST2,total_unique_umi,214452.0
4,gavage,s2_gavage-DNA,ST5,total_count,10611011.0
...,...,...,...,...,...
585,sample,s99_SI-DNA,ST2,total_unique_umi,13.0
586,sample,s99_SI-DNA,ST5,total_count,5280812.0
587,sample,s99_SI-DNA,ST5,total_unique_umi,2871.0
588,sample,s99_SI-DNA,ST6,total_count,4107.0


# P4C2T4T5

In [155]:
# Load the P4C2T4T5 molecule tables: all samples, and the gavage-only subset.
df_total, df_gavageonly = get_readtables(
    path='/Volumes/sd/faith/MTCSB/projects/P4-barcoding_strains/r2_data/population-bottlenecks/fp_inputs/P4C2T4T5'
)


defaultdict(<class 'list'>, {'ST1': ['ST1_moleculestable.csv'], 'ST2': ['ST2_moleculestable.csv'], 'ST4': ['ST4_moleculestable.csv'], 'ST5': ['ST5_moleculestable.csv'], 'ST6': ['ST6_moleculestable.csv']})
ST1 ['ST1_moleculestable.csv']
ST2 ['ST2_moleculestable.csv']
ST4 ['ST4_moleculestable.csv']
ST5 ['ST5_moleculestable.csv']
ST6 ['ST6_moleculestable.csv']
defaultdict(<class 'list'>, {'ST1': ['ST1_onlygavage_moleculestable.csv'], 'ST2': ['ST2_onlygavage_moleculestable.csv'], 'ST4': ['ST4_onlygavage_moleculestable.csv'], 'ST5': ['ST5_onlygavage_moleculestable.csv'], 'ST6': ['ST6_onlygavage_moleculestable.csv']})
ST1 ['ST1_onlygavage_moleculestable.csv']
ST2 ['ST2_onlygavage_moleculestable.csv']
ST4 ['ST4_onlygavage_moleculestable.csv']
ST5 ['ST5_onlygavage_moleculestable.csv']
ST6 ['ST6_onlygavage_moleculestable.csv']


In [158]:
# Reshape the merged table (index = umi_seq, one column per sample) into long
# form: one row per (umi_seq, sample) pair, keeping only non-zero counts.
df_total_stack = df_total.stack().reset_index()
df_total_stack.columns = ['umi_seq', 'sample', 'counts']
df_total_stack = df_total_stack.loc[df_total_stack['counts'] > 0].copy()

# Classify every sample from its name. 'gavage' is matched case-insensitively
# so the same rule works for names spelled 'gavage' or 'Gavage'; 'WATER' and
# 'NTC' mark negative controls. Gavage takes precedence over negative control.
sample_names = df_total_stack['sample'].astype(str)
is_gavage = sample_names.str.lower().str.contains('gavage', regex=False)
is_negative_control = (
    sample_names.str.contains('WATER', regex=False)
    | sample_names.str.contains('NTC', regex=False)
)
df_total_stack['sample_type'] = 'sample'
df_total_stack.loc[is_negative_control, 'sample_type'] = 'negative_control'
df_total_stack.loc[is_gavage, 'sample_type'] = 'gavage'

# The strain is the part of umi_seq before the first '-' (e.g. 'ST6-AAAA...' -> 'ST6').
df_total_stack['strain'] = df_total_stack['umi_seq'].astype(str).str.split('-', n=1).str[0]
df_total_stack

Unnamed: 0,umi_seq,sample,counts,sample_type,strain
3,ST6-AAAAAAACCATTTCGGGG,P4C2T4_s82-A5_Gavage-DNA,1.0,gavage,ST6
144,ST6-AAAAAAACCCAAATATCT,P4C2T4_s81-B8_Gavage-DNA,1.0,gavage,ST6
290,ST6-AAAAAAACTAAACCGAAA,P4C2T5_s121-F8_Gavage-DNA,3.0,gavage,ST6
430,ST6-AAAAAAAGTAAATTGACA,P4C2T4_s81-B8_Gavage-DNA,1.0,gavage,ST6
575,ST6-AAAAAACCCGCCGCCATA,P4C2T4_s82-A5_Gavage-DNA,1.0,gavage,ST6
...,...,...,...,...,...
345360581,ST1-TTGTCAACCATGTCACAG,NTC_G10_NTC,21.0,negative_control,ST1
345360727,ST1-TTTCAAACTAATGCGAAC,NTC_H5_NTC,31.0,negative_control,ST1
345360867,ST1-TTTCAACCTGCGACATAT,NTC_G10_NTC,168.0,negative_control,ST1
345361013,ST1-TTTCATATCGCGTTAGCA,NTC_H5_NTC,1.0,negative_control,ST1


In [160]:
# Named aggregations: summed read counts and number of distinct UMIs.
agg_dict = dict(
    total_count=('counts', 'sum'),
    total_unique_umi=('umi_seq', 'nunique'),
)

# Per-sample summary in long form: one row per (sample_type, sample, metric).
df_total_gpd = (
    df_total_stack
    .groupby(['sample_type', 'sample'])
    .agg(**agg_dict)
    .stack()
    .reset_index()
    .rename(columns={'level_2': 'type', 0: 'value'})
)

# Keep only the metrics whose name does not contain 'ST'.
keep_metric = ~df_total_gpd['type'].str.contains('ST')
df_plot = df_total_gpd.loc[keep_metric]

sample_type_order = sorted(df_plot['sample_type'].unique())
fig = px.box(
    df_plot,
    x='sample_type',
    y='value',
    color='type',
    points='all',
    hover_data=['sample'],
    log_y=True,
    template='simple_white',
    category_orders={'sample_type': sample_type_order},
)

fig.update_xaxes(matches=None, categoryorder='trace', showticklabels=True)
fig.update_yaxes(matches=None, showticklabels=True)
fig.update_yaxes(col=1, range=[2, 10])
fig.show()

In [162]:
# Per-sample, per-strain summary in long form (metric name in the last index level).
df_strains_gpd = (
    df_total_stack
    .groupby(['sample_type', 'sample', 'strain'])
    .agg(**agg_dict)
    .stack()
    .reset_index()
)
# Show the table before its generic column names are replaced.
display(df_strains_gpd)
df_strains_gpd = df_strains_gpd.rename(columns={'level_3': 'type', 0: 'value'})

# One facet row per metric, one box per strain within each sample type.
fig = px.box(
    df_strains_gpd,
    x='sample_type',
    y='value',
    color='strain',
    facet_row='type',
    points='all',
    hover_data=['sample'],
    log_y=True,
    template='simple_white',
)
fig.update_xaxes(matches=None, categoryorder='trace', showticklabels=True)
fig.update_yaxes(matches=None, showticklabels=True)
# NOTE(review): the figure is faceted by row, so it presumably has a single
# column; the col=2 call below may select no axis, and row=1 / row=2 may have
# been intended. Confirm against the rendered figure.
fig.update_yaxes(col=1, range=[2, 10])
fig.update_yaxes(col=2, range=[1, 7])

fig.show()


Unnamed: 0,sample_type,sample,strain,level_3,0
0,gavage,P4C2T4_s81-B3_Gavage-DNA,ST1,total_count,61718.0
1,gavage,P4C2T4_s81-B3_Gavage-DNA,ST1,total_unique_umi,4942.0
2,gavage,P4C2T4_s81-B3_Gavage-DNA,ST2,total_count,12077.0
3,gavage,P4C2T4_s81-B3_Gavage-DNA,ST2,total_unique_umi,975.0
4,gavage,P4C2T4_s81-B3_Gavage-DNA,ST4,total_count,3045434.0
...,...,...,...,...,...
1281,sample,P4C2T5_s99-C7_Pr.-Colon-DNA,ST4,total_unique_umi,1696.0
1282,sample,P4C2T5_s99-C7_Pr.-Colon-DNA,ST5,total_count,100681.0
1283,sample,P4C2T5_s99-C7_Pr.-Colon-DNA,ST5,total_unique_umi,2329.0
1284,sample,P4C2T5_s99-C7_Pr.-Colon-DNA,ST6,total_count,96.0


In [165]:
# Reshape the gavage-only table (index = umi_seq, one column per sample) into
# long form: one row per (umi_seq, sample) pair, keeping only non-zero counts.
df_gavage_stack = df_gavageonly.stack().reset_index()
df_gavage_stack.columns = ['umi_seq', 'sample', 'counts']
df_gavage_stack = df_gavage_stack.loc[df_gavage_stack['counts'] > 0].copy()

# Classify every sample from its name. 'gavage' is matched case-insensitively
# so the same rule works for names spelled 'gavage' or 'Gavage'; 'WATER' and
# 'NTC' mark negative controls. Gavage takes precedence over negative control.
sample_names = df_gavage_stack['sample'].astype(str)
is_gavage = sample_names.str.lower().str.contains('gavage', regex=False)
is_negative_control = (
    sample_names.str.contains('WATER', regex=False)
    | sample_names.str.contains('NTC', regex=False)
)
df_gavage_stack['sample_type'] = 'sample'
df_gavage_stack.loc[is_negative_control, 'sample_type'] = 'negative_control'
df_gavage_stack.loc[is_gavage, 'sample_type'] = 'gavage'

# The strain is the part of umi_seq before the first '-'.
df_gavage_stack['strain'] = df_gavage_stack['umi_seq'].astype(str).str.split('-', n=1).str[0]

# Per-sample summary in long form: one row per (sample_type, sample, metric).
# The groupby result is already indexed by (sample_type, sample), so it can be
# stacked directly.
df_gavage_gpd = (
    df_gavage_stack
    .groupby(['sample_type', 'sample'])
    .agg(**{'total_count': ('counts', 'sum'),
            'total_unique_umi': ('umi_seq', 'nunique')})
    .stack()
    .reset_index()
    .rename(columns={'level_2': 'type', 0: 'value'})
)

# Box plot of the per-sample metrics, one colour per metric.
fig = px.box(
    df_gavage_gpd,
    x='sample_type',
    y='value',
    color='type',
    points='all',
    hover_data=['sample'],
    log_y=True,
    template='simple_white',
    width=800,
)
fig.update_yaxes(range=[2, 10])
fig.show()

In [166]:
# Per-sample, per-strain summary of the gavage-only table in long form.
df_gavage_strains = (
    df_gavage_stack
    .groupby(['sample_type', 'sample', 'strain'])
    .agg(**agg_dict)
    .stack()
    .reset_index()
)
# Show the table before its generic column names are replaced.
display(df_gavage_strains)
df_gavage_strains = df_gavage_strains.rename(columns={'level_3': 'type', 0: 'value'})

# One facet row per metric, one box per strain within each sample type.
fig = px.box(
    df_gavage_strains,
    x='sample_type',
    y='value',
    color='strain',
    facet_row='type',
    points='all',
    hover_data=['sample'],
    log_y=True,
    template='simple_white',
    width=800,
)

fig.show()

Unnamed: 0,sample_type,sample,strain,level_3,0
0,gavage,P4C2T4_s81-B3_Gavage-DNA,ST1,total_count,61718.0
1,gavage,P4C2T4_s81-B3_Gavage-DNA,ST1,total_unique_umi,4942.0
2,gavage,P4C2T4_s81-B3_Gavage-DNA,ST2,total_count,12077.0
3,gavage,P4C2T4_s81-B3_Gavage-DNA,ST2,total_unique_umi,975.0
4,gavage,P4C2T4_s81-B3_Gavage-DNA,ST4,total_count,3045434.0
...,...,...,...,...,...
1233,sample,P4C2T5_s99-C7_Pr.-Colon-DNA,ST4,total_unique_umi,1678.0
1234,sample,P4C2T5_s99-C7_Pr.-Colon-DNA,ST5,total_count,100642.0
1235,sample,P4C2T5_s99-C7_Pr.-Colon-DNA,ST5,total_unique_umi,2290.0
1236,sample,P4C2T5_s99-C7_Pr.-Colon-DNA,ST6,total_count,89.0
