## Supplemental 1: Prove that we need to Rarefy when doing RPCA / Beta Diversity
**Goal: Show the differences in sequencing depth between groups when RPCA is run without rarefaction**

Note: This currently only works in the comad env due to DEICODE errors

### Imports

In [80]:
import pandas as pd

from qiime2 import Artifact, Metadata, Visualization
from qiime2.plugins.emperor.visualizers import biplot
from scipy.stats import mannwhitneyu

from python_scripts.beta_diversity import *

### Functions

In [52]:
def perform_mannwhitney(metadata_df, group_column, numeric_column):
    """
    Perform a Mann-Whitney U test between two groups defined in a DataFrame.

    The two groups are the first two distinct non-null values of
    ``group_column`` in order of appearance; any further groups are ignored.

    Parameters:
    - metadata_df: DataFrame containing the metadata
    - group_column: Column name in metadata_df specifying the groups
    - numeric_column: Column name in metadata_df with numeric values

    Returns:
    - result: Mann-Whitney U test result object (it is also printed)

    Raises:
    - ValueError: if group_column has fewer than two distinct non-null values
    """
    # Drop missing labels first: unique() would otherwise report NaN as a
    # group, and `== NaN` never matches, which yields an empty sample.
    group_labels = metadata_df[group_column].dropna().unique()
    if len(group_labels) < 2:
        raise ValueError(
            f"Column {group_column!r} must contain at least two groups, "
            f"found {len(group_labels)}"
        )

    first_label, second_label = group_labels[0], group_labels[1]
    group1 = metadata_df.loc[metadata_df[group_column] == first_label, numeric_column]
    group2 = metadata_df.loc[metadata_df[group_column] == second_label, numeric_column]

    # Perform Mann-Whitney U test
    result = mannwhitneyu(group1, group2)

    print(result)
    return result

In [86]:
def call_rpca_save(fn, metric, rarefaction=2000, zebra_level='0.1', decoide_min_feature_count=10):
    """
    Build and save RPCA biplots for the RS210 and WoL2 tables of one
    comparison, then test whether depth and feature count differ by group.

    Parameters:
    - fn: Comparison name; prefix of the metadata, table and output file names
    - metric: Metadata column that defines the groups being compared
    - rarefaction: Value forwarded positionally to all_beta
    - zebra_level: Zebra filter level (string) used in the table file names
    - decoide_min_feature_count: Value forwarded to all_beta under the same keyword

    Returns:
    - None. The first two metadata rows are displayed, two visualizations are
      saved under processed_data/RPCA/non_rare/, and four Mann-Whitney U
      results are printed.
    """
    # File stems shared by the input tables and the saved visualizations
    stem_rs210 = fn + '_qiita15336_prep16181_pangenome_rs210_scrubbed_zebraFilter' + zebra_level
    stem_wol2 = fn + '_qiita15336_prep16181_pangenome_wol2_scrubbed_zebraFilter' + zebra_level

    # Metadata
    metadata = pd.read_csv('processed_data/metadata/metadata_' + fn + '.tsv', sep='\t')

    # biom tables in df format (columns are summed per sample below)
    df_rs210_genome = pd.read_csv('processed_data/pandas_df/' + stem_rs210 + '.tsv', sep='\t', index_col=0)
    df_wol2_genome = pd.read_csv('processed_data/pandas_df/' + stem_wol2 + '.tsv', sep='\t', index_col=0)

    # Per-sample depth (column sum) and feature count, keyed by sample name.
    # NOTE(review): features are counted with `> 1`, so a value of exactly 1
    # is not counted - confirm this threshold is intended (vs. `> 0`).
    metadata['depth_rs210'] = metadata['sample_name'].map(df_rs210_genome.sum().to_dict())
    metadata['features_rs210_genome'] = metadata['sample_name'].map((df_rs210_genome > 1).sum().to_dict())
    metadata['depth_wol2'] = metadata['sample_name'].map(df_wol2_genome.sum().to_dict())
    metadata['features_wol2_genome'] = metadata['sample_name'].map((df_wol2_genome > 1).sum().to_dict())

    # Convert metadata to a qiime2 object; sample_name becomes the index, so
    # there is no 'sample_name' column left to rename afterwards.
    q2_meta = Metadata(metadata.set_index('sample_name'))

    display(metadata[:2])

    # Run only RPCA with non-rare data; return_biplot=True means the two
    # returned objects are passed to emperor's biplot below.
    rs210_biplot_genome, wol2_biplot_genome = all_beta(
        df_rs210_genome,
        df_wol2_genome,
        metadata,
        rarefaction,
        metric,
        numRares=None,
        decoide_min_feature_count=decoide_min_feature_count,
        return_biplot=True)

    # Create the RPCA plots
    rs210_rpca_genome = biplot(biplot=rs210_biplot_genome, sample_metadata=q2_meta)
    wol2_rpca_genome = biplot(biplot=wol2_biplot_genome, sample_metadata=q2_meta)

    # Save visualizations for investigation
    rs210_rpca_genome.visualization.save('processed_data/RPCA/non_rare/' + stem_rs210)
    wol2_rpca_genome.visualization.save('processed_data/RPCA/non_rare/' + stem_wol2)

    # Mann-Whitney U tests of depth and feature count between the groups
    comparisons = [
        ('RS210 Depth', 'depth_rs210'),
        ('Wol2 Depth', 'depth_wol2'),
        ('RS210 Genome Feature Count', 'features_rs210_genome'),
        ('Wol2 Genome Feature Count', 'features_wol2_genome'),
    ]
    for position, (label, column) in enumerate(comparisons):
        if position:
            # Blank line between consecutive results, none after the last
            print()
        print(label)
        perform_mannwhitney(metadata, metric, column)


### Datasets

### Pangenome Host Depletion
Note: The alpha rarefaction depth was estimated using this Qiita analysis: https://qiita.ucsd.edu/analysis/description/60472/

In [27]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

#### HCC Tissue Tumor vs. CRC Tissue Tumor

In [87]:
# Tumor tissue: HCC vs. CRC, grouped by the 'tumor_type' metadata column.
call_rpca_save(
    fn='tumor_HCC_v_CRC',
    metric='tumor_type',
    rarefaction=2000,
    decoide_min_feature_count=1,
)


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id,depth_rs210,features_rs210_genome,depth_wol2,features_wol2_genome
0,15336.CRC.1.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336,266.0,32,185.0,23
1,15336.CRC.10.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336,3745.0,13,2635.0,4


RS210 Depth
MannwhitneyuResult(statistic=59.0, pvalue=0.07980688251326712)

Wol2 Depth
MannwhitneyuResult(statistic=58.0, pvalue=0.07273527863230315)

RS210 Genome Feature Count
MannwhitneyuResult(statistic=71.0, pvalue=0.20302422455236335)

Wol2 Genome Feature Count
MannwhitneyuResult(statistic=69.0, pvalue=0.17172529430042438)


#### CRC Tissue Adj. vs. CRC Tissue Tumor

In [88]:
# CRC: adjacent vs. tumor tissue, grouped by the 'host_sample_type' metadata column.
call_rpca_save(
    fn='CRC_adj_v_tumor',
    metric='host_sample_type',
    rarefaction=2000,
    decoide_min_feature_count=1,
)


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id,depth_rs210,features_rs210_genome,depth_wol2,features_wol2_genome
0,15336.CRC.1.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336,5683.0,19,4035.0,9
1,15336.CRC.1.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336,266.0,32,185.0,23


RS210 Depth
MannwhitneyuResult(statistic=46.0, pvalue=0.17896668494669993)

Wol2 Depth
MannwhitneyuResult(statistic=48.0, pvalue=0.21535416447355094)

RS210 Genome Feature Count
MannwhitneyuResult(statistic=57.0, pvalue=0.4211427305811605)

Wol2 Genome Feature Count
MannwhitneyuResult(statistic=51.5, pvalue=0.28607392006434174)


#### HCC Tissue Adj. vs. HCC Tissue Tumor

In [89]:
# HCC: adjacent vs. tumor tissue, grouped by the 'host_sample_type' metadata column.
call_rpca_save(
    fn='HCC_adj_v_tumor',
    metric='host_sample_type',
    rarefaction=2000,
    decoide_min_feature_count=1,
)


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id,depth_rs210,features_rs210_genome,depth_wol2,features_wol2_genome
0,15336.HCC.1.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336,178.0,10,129.0,4
1,15336.HCC.1.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336,4775.0,13,3371.0,4


RS210 Depth
MannwhitneyuResult(statistic=108.0, pvalue=0.23118987493362808)

Wol2 Depth
MannwhitneyuResult(statistic=108.0, pvalue=0.23118987493362808)

RS210 Genome Feature Count
MannwhitneyuResult(statistic=125.5, pvalue=0.46912396752915336)

Wol2 Genome Feature Count
MannwhitneyuResult(statistic=89.0, pvalue=0.04748003482698928)


#### HCC Tissue Adj. vs. CRC Tissue Adj.

In [90]:
# Adjacent tissue: HCC vs. CRC, grouped by the 'tumor_type' metadata column.
call_rpca_save(
    fn='adj_HCC_v_CRC',
    metric='tumor_type',
    rarefaction=2000,
    decoide_min_feature_count=1,
)


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id,depth_rs210,features_rs210_genome,depth_wol2,features_wol2_genome
0,15336.CRC.1.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336,5683.0,19,4035.0,9
1,15336.CRC.10.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336,743.0,12,518.0,5


RS210 Depth
MannwhitneyuResult(statistic=56.0, pvalue=0.06004321990124941)

Wol2 Depth
MannwhitneyuResult(statistic=57.0, pvalue=0.06615374712432388)

RS210 Genome Feature Count
MannwhitneyuResult(statistic=56.5, pvalue=0.06080649812826315)

Wol2 Genome Feature Count
MannwhitneyuResult(statistic=37.0, pvalue=0.003509062448263346)


#### HCC Blood vs. CRC Blood

In [91]:
# Blood: HCC vs. CRC, grouped by the 'tumor_type' metadata column.
call_rpca_save(
    fn='blood_HCC_v_CRC',
    metric='tumor_type',
    rarefaction=2000,
    decoide_min_feature_count=1,
)


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id,depth_rs210,features_rs210_genome,depth_wol2,features_wol2_genome
0,15336.CRC.1.Plasma,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal secretion (non-saline),USA,29.643946,1504969,54,urban biome,...,UBERON:plasma,plasma,2018,Homo sapiens,University of Florida,15336,2947.0,288,1675.0,166
1,15336.CRC.10.Plasma,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal secretion (non-saline),USA,29.643946,1504969,54,urban biome,...,UBERON:plasma,plasma,2019,Homo sapiens,University of Florida,15336,928.0,133,594.0,82


RS210 Depth
MannwhitneyuResult(statistic=43.0, pvalue=0.033330424727798694)

Wol2 Depth
MannwhitneyuResult(statistic=34.0, pvalue=0.009979828238707647)

RS210 Genome Feature Count
MannwhitneyuResult(statistic=58.5, pvalue=0.16211876318939528)

Wol2 Genome Feature Count
MannwhitneyuResult(statistic=62.5, pvalue=0.22162299026913856)
