### Testing dropping taxa that are all-zero within a metadata group (for Qadabra)

In [1]:
import pandas as pd

In [19]:
def filter_taxonomy_by_metadata(taxonomy_df, metadata_df, metadata_column,
                                sample_column='sample_name'):
    """
    Drop taxonomy rows that are zero in every sample of any metadata group.

    The samples listed in ``metadata_df`` are grouped by their value in
    ``metadata_column``. A taxonomy row is removed as soon as there is one
    group in which the row equals 0 for every sample of that group. Rows
    that are non-zero in at least one sample of every group are kept.

    Parameters:
    - taxonomy_df (pd.DataFrame): Table with one column per sample (named
      after the sample) plus any other columns; other columns are ignored
      by the filter and passed through unchanged.
    - metadata_df (pd.DataFrame): Metadata table containing
      ``sample_column`` and ``metadata_column``.
    - metadata_column (str): Name of the metadata column used to group
      the samples.
    - sample_column (str): Name of the metadata column holding the sample
      names. Defaults to 'sample_name'.

    Returns:
    - pd.DataFrame: The rows of ``taxonomy_df`` that survive the filter.
      The input DataFrame itself is not modified.

    Raises:
    - KeyError: If a required metadata column is missing, or if a sample
      named in the metadata has no matching column in ``taxonomy_df``.
    """
    # Missing metadata values are skipped: NaN never compares equal to
    # anything, so it would select zero samples, and `.all(axis=1)` over
    # zero columns is True for every row -- which would drop the whole table.
    group_values = metadata_df[metadata_column].dropna().unique()

    for group_value in group_values:
        in_group = metadata_df[metadata_column] == group_value
        group_samples = metadata_df.loc[in_group, sample_column].tolist()

        # Same vacuous-truth guard as above, for any value that ends up
        # matching no rows.
        if not group_samples:
            continue

        # True for rows whose count is 0 in every sample of this group.
        absent_in_group = (taxonomy_df[group_samples] == 0).all(axis=1)
        taxonomy_df = taxonomy_df[~absent_in_group]

    return taxonomy_df

In [46]:
def prep_filter(fn, metric,
                base_dir='/panfs/cguccion/22_06_22_HCC_CRC_Amir/HCC-microbialDNA'):
    """
    Filter one dataset's table by metadata group and write the result as TSV.

    Reads the metadata TSV and the table TSV for the dataset ``fn``, passes
    them to ``filter_taxonomy_by_metadata`` with ``metric`` as the grouping
    column, writes the filtered table next to the input as
    ``<fn>_filtered.tsv``, and prints the ``biom convert`` command that
    turns that TSV into an HDF5 BIOM file. The command is only printed,
    not executed.

    Parameters:
    - fn (str): Dataset name; used both as the sub-directory under
      ``qadabra/`` and as the file-name stem.
    - metric (str): Metadata column name handed to
      ``filter_taxonomy_by_metadata``.
    - base_dir (str): Project root containing the ``processed_data/`` and
      ``qadabra/`` directories. Defaults to the original hard-coded path.

    Returns:
    - None
    """
    # Build every path once so the read, write and printed locations
    # cannot drift apart.
    root = base_dir.rstrip('/')
    metadata_path = f'{root}/processed_data/metadata/metadata_{fn}.tsv'
    table_stem = f'{root}/qadabra/{fn}/{fn}'
    filtered_tsv = f'{table_stem}_filtered.tsv'
    filtered_biom = f'{table_stem}_filtered.biom'

    meta = pd.read_csv(metadata_path, sep='\t')
    biom = pd.read_csv(f'{table_stem}.tsv', sep='\t')

    filtered_taxonomy = filter_taxonomy_by_metadata(biom, meta, metric)

    filtered_taxonomy.to_csv(filtered_tsv, sep='\t', index=False)

    print('Run the following:')
    print('biom convert -i', filtered_tsv, '-o', filtered_biom, '--to-hdf5')
           

In [47]:
# Filter the 'adj_HCC_v_CRC' table, grouping samples by the 'tumor_type' metadata column.
prep_filter('adj_HCC_v_CRC', 'tumor_type')

Run the following:
biom convert -i /panfs/cguccion/22_06_22_HCC_CRC_Amir/HCC-microbialDNA/qadabra/adj_HCC_v_CRC/adj_HCC_v_CRC_filtered.tsv -o /panfs/cguccion/22_06_22_HCC_CRC_Amir/HCC-microbialDNA/qadabra/adj_HCC_v_CRC/adj_HCC_v_CRC_filtered.biom --to-hdf5
