
## Step 3: Split Data into Disease Groups
**Goal: Create a dataframe for each disease group for future analysis.**

https: comad

### Imports

In [1]:
from biom import load_table
import pandas as pd
from qiime2 import Artifact
from qiime2.plugins import taxa

In [2]:
# Sample metadata table (tab-separated) downloaded from Qiita
meta = pd.read_csv(
    'qiita_downloads/pese_pangenome_align-RS210-masked/sample_information_from_prep_16181.tsv',
    sep='\t',
)

# Micov pass/fail list: feature tables (.qza) that data_split() subsets and exports
qza_list = [
    'processed_data/TechRep_merged/pese_pangenome_align-RS210_masked_none_scrubbed_micovFilter_T.qza',
]

In [3]:
#Import taxonomies used to collapse genome-level tables to species level

#Path to taxonomic trees
# NOTE(review): absolute, user-specific path -- must be adjusted on other machines
taxonomy_path='/Users/cguccion/Dropbox/Storage/HelpfulLabDocs/taxonomy_trees'


def _import_taxonomy(relative_file):
    """Read a headerless, tab-separated lineage file and wrap it as FeatureData[Taxonomy].

    The first column becomes the index ('Feature ID'); the single remaining
    column is named 'Taxon'. `relative_file` is appended to `taxonomy_path`.
    """
    lineages = pd.read_csv(taxonomy_path + relative_file, sep='\t', index_col=0, header=None)
    lineages.columns = ['Taxon']
    lineages.index.name = 'Feature ID'
    return Artifact.import_data('FeatureData[Taxonomy]', lineages)


#RS210
rs210_tax = _import_taxonomy('/RS210/RS210.txt')

#RS210 - no space
rs210_tax_ns = _import_taxonomy('/RS210/RS210_noSpace.txt')

#WOL2 (shouldn't need, but data_split falls back to it for non-RS210 tables)
wol2_tax = _import_taxonomy('/WOL2/lineages.txt')


### Functions

In [4]:
def data_split(disease_type, tissue_type, fn, species=False):
    """Subset the metadata and feature tables to one disease/tissue group and export them.

    Parameters
    ----------
    disease_type : list
        Values of ``meta.tumor_type`` to keep.
    tissue_type : list
        Values of ``meta.host_sample_type`` to keep.
    fn : str
        Label embedded in every output file name.
    species : bool, optional
        Currently unused -- species-level tables are always written. Kept so
        existing calls that pass it continue to work.

    Side effects
    ------------
    Writes the filtered metadata to ``processed_data/metadata`` and, for every
    path in ``qza_list``, genome-level and species-level tables to
    ``processed_data/pandas_df`` (.tsv), ``processed_data/qza`` (.qza) and
    ``processed_data/biom`` (exported artifact directory). Prints the number of
    retained metadata rows and displays the first three.
    """
    import os  # local import: only needed to create the output folders

    #Make sure every output folder exists so a fresh checkout does not fail on write
    for sub_dir in ('metadata', 'pandas_df', 'qza', 'biom'):
        os.makedirs(os.path.join('processed_data', sub_dir), exist_ok=True)

    #Create metadata
    selected = meta.tumor_type.isin(disease_type) & meta.host_sample_type.isin(tissue_type)
    meta_custom = meta[selected].reset_index(drop = True)

    ##Adjust the metadata for the technical replicates combined##

    sample_names = meta_custom['sample_name']

    #Rows ending with '.2' are technical replicates already merged into their non-.2 duplicate
    is_replicate = sample_names.str.endswith('.2')

    #Rows whose '<name>.2' partner exists; samples without a technical replicate
    #are not comparable to the merged samples and are dropped as well
    has_replicate = (sample_names + '.2').isin(sample_names)

    #Keep only primary samples that have a merged replicate (index labels are preserved)
    meta_custom = meta_custom[~is_replicate & has_replicate]

    #Export metadata
    meta_filename = 'processed_data/metadata/metadata_' + fn + '.tsv'
    meta_custom.to_csv(meta_filename, sep = '\t', index = False)

    #Export taxonomy tables in pandas df format and biom format
    for qza_path in qza_list:

        #File stem of the input table, computed once and reused in every output name
        prefix = fn + '_' + qza_path.split('/')[-1].split('.qza')[0]

        #Import full qza table (transposed so that samples are columns)
        qza = Artifact.load(qza_path).view(pd.DataFrame).T

        #Subset qza table to just the samples kept in the metadata
        df_custom = qza[qza.columns.intersection(meta_custom['sample_name'].tolist())]

        #Save as pandas df
        df_custom.to_csv('processed_data/pandas_df/' + prefix + '.tsv', sep = '\t')

        #Save as qza
        custom_qza = Artifact.import_data("FeatureTable[Frequency]", df_custom.T)
        custom_qza.save('processed_data/qza/' + prefix + '.qza')

        #Save qza as biom table
        custom_qza.export_data('processed_data/biom/' + prefix)

        #Convert qza from genome to species (case-insensitive RS210 check, else WOL2)
        current_taxa = rs210_tax if 'rs210' in qza_path.lower() else wol2_tax
        qza_custom_species = taxa.methods.collapse(table=custom_qza,
                                                   taxonomy=current_taxa,
                                                   level=7).collapsed_table
        df_custom_species = qza_custom_species.view(pd.DataFrame).T

        #Save species as pandas df
        df_custom_species.to_csv('processed_data/pandas_df/' + prefix + '_species.tsv', sep = '\t')

        #Save species as qza
        qza_custom_species.save('processed_data/qza/' + prefix + '_species.qza')

        #Save species as biom
        qza_custom_species.export_data('processed_data/biom/' + prefix + '_species')

    #NOTE: this count is the number of metadata rows, which may exceed the number of
    #samples actually present in the feature tables
    print(len(meta_custom), 'samples')
    display(meta_custom[:3])
    

### Data split based on Aims

#### HCC Tissue Tumor + HCC Tissue Adj.

In [21]:
# HCC samples only: adjacent ('Background') and tumor tissue
data_split(disease_type=['HCC'], tissue_type=['Background', 'tumor'], fn='HCC_adj_v_tumor')

32 samples


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,scientific_name,host_common_name,host_sample_type,host_body_habitat,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id
0,15336.HCC.1.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336
2,15336.HCC.1.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,tumor,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336
4,15336.HCC.10.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336


#### CRC Tissue Tumor + CRC Tissue Adj.

In [22]:
# CRC samples only: adjacent ('Background') and tumor tissue
data_split(disease_type=['CRC'], tissue_type=['Background', 'tumor'], fn='CRC_adj_v_tumor')

22 samples


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,scientific_name,host_common_name,host_sample_type,host_body_habitat,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id
0,15336.CRC.1.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336
2,15336.CRC.1.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,tumor,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336
4,15336.CRC.10.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336


#### HCC Tissue Tumor + CRC Tissue Tumor

In [5]:
# Tumor tissue from both HCC and CRC
data_split(disease_type=['HCC', 'CRC'], tissue_type=['tumor'], fn='tumor_HCC_v_CRC_T')

27 samples


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,scientific_name,host_common_name,host_sample_type,host_body_habitat,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id
0,15336.CRC.1.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,tumor,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336
2,15336.CRC.10.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,tumor,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336
4,15336.CRC.11.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,tumor,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336


#### HCC Blood + CRC Blood

In [6]:
# Plasma (blood) samples from both HCC and CRC
data_split(disease_type=['HCC', 'CRC'], tissue_type=['Plasma'], fn='blood_HCC_v_CRC_T')

25 samples


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,scientific_name,host_common_name,host_sample_type,host_body_habitat,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id
0,15336.CRC.1.Plasma,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal secretion (non-saline),USA,29.643946,1504969,54,urban biome,...,human blood metagenome,human,Plasma,UBERON:blood,UBERON:plasma,plasma,2018,Homo sapiens,University of Florida,15336
2,15336.CRC.10.Plasma,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal secretion (non-saline),USA,29.643946,1504969,54,urban biome,...,human blood metagenome,human,Plasma,UBERON:blood,UBERON:plasma,plasma,2019,Homo sapiens,University of Florida,15336
4,15336.CRC.11.Plasma,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal secretion (non-saline),USA,29.643946,1504969,54,urban biome,...,human blood metagenome,human,Plasma,UBERON:blood,UBERON:plasma,plasma,2019,Homo sapiens,University of Florida,15336


#### HCC Tissue Adj + CRC Tissue Adj

In [7]:
# Adjacent ('Background') tissue from both HCC and CRC
data_split(disease_type=['HCC', 'CRC'], tissue_type=['Background'], fn='adj_HCC_v_CRC_T')

27 samples


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,scientific_name,host_common_name,host_sample_type,host_body_habitat,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id
0,15336.CRC.1.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336
2,15336.CRC.10.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336
4,15336.CRC.11.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336
