
## Step 3: Split Data into Disease Groups
**Goal: Create a dataframe for each disease group for downstream analysis**


### Imports

In [23]:
import os

import pandas as pd
from biom import load_table
from qiime2 import Artifact

In [24]:
# Sample metadata for Qiita study 15336 / prep 16181 (tab-separated)
meta = pd.read_csv(
    'qiita_downloads/qiita15336_prep16181_pangenome/sample_information_from_prep_16181.tsv',
    sep='\t',
)

# Feature tables (.qza) that data_split() subsets for each comparison
qza_list = [
    'processed_data/Zebra_filtered/qiita15336_prep16181_pangenome_wol2_scrubbed_zebraFilter0.1.qza',
    'processed_data/Zebra_filtered/qiita15336_prep16181_pangenome_rs210_scrubbed_zebraFilter0.1.qza',
]

### Functions

In [25]:
def data_split(disease_type, tissue_type, fn):
    """Export the metadata and feature tables for one disease/tissue comparison.

    Filters the module-level ``meta`` table to the requested tumor and sample
    types, keeps one row per technical-replicate pair, and writes the matching
    subset of every table listed in the module-level ``qza_list``.

    Parameters
    ----------
    disease_type : list of str
        Values of ``meta.tumor_type`` to keep.
    tissue_type : list of str
        Values of ``meta.host_sample_type`` to keep.
    fn : str
        Label embedded in the name of every output file.

    Returns
    -------
    None
        Everything is written to disk; the number of retained samples and the
        first three metadata rows are shown as cell output.

    Notes
    -----
    Files written (output directories are created when missing):

    * ``processed_data/metadata/metadata_<fn>.tsv``
    * ``processed_data/pandas_df/<fn>_<table>.tsv`` for each table
    * ``processed_data/biom/<fn>_<table>/`` (QIIME 2 export) for each table

    Samples retained in the metadata but absent from a feature table are
    skipped for that table without a warning.
    """
    # Restrict the metadata to the requested disease(s) and tissue type(s)
    meta_custom = meta[meta.tumor_type.isin(disease_type) &
                       meta.host_sample_type.isin(tissue_type)]
    meta_custom = meta_custom.reset_index(drop = True)

    # Technical replicates: a replicate row is named '<sample_name>.2'.
    # NOTE(review): this was originally labelled "Combine TechReplicates", but
    # nothing is combined -- the '.2' rows are discarded, and so is every
    # sample that has no '.2' partner. Confirm that dropping unpaired samples
    # is intended.
    sample_names = meta_custom['sample_name']
    is_replicate = sample_names.str.endswith('.2')
    has_replicate = (sample_names + '.2').isin(sample_names)
    meta_custom = meta_custom[~is_replicate & has_replicate]

    # Export metadata
    meta_filename = 'processed_data/metadata/metadata_' + fn + '.tsv'
    os.makedirs(os.path.dirname(meta_filename), exist_ok = True)
    meta_custom.to_csv(meta_filename, sep = '\t', index = False)

    # Export each feature table as a pandas TSV and as a QIIME 2 export
    kept_samples = meta_custom['sample_name'].tolist()
    for qza_path in qza_list:
        # '<fn>_<artifact file name without the .qza suffix>'
        table_name = fn + '_' + qza_path.split('/')[-1].split('.qza')[0]

        # Transposed so that samples are columns and features are rows
        qza = Artifact.load(qza_path).view(pd.DataFrame).T
        df_custom = qza[qza.columns.intersection(kept_samples)]

        # Save as pandas df
        df_filename = 'processed_data/pandas_df/' + table_name + '.tsv'
        os.makedirs(os.path.dirname(df_filename), exist_ok = True)
        df_custom.to_csv(df_filename, sep = '\t')

        # Re-import the subset (back in samples x features orientation)
        custom_qza = Artifact.import_data("FeatureTable[Frequency]", df_custom.T)

        # Export the subset artifact's contents into its own directory
        biom_dir = 'processed_data/biom/' + table_name
        os.makedirs(os.path.dirname(biom_dir), exist_ok = True)
        custom_qza.export_data(biom_dir)

    print(len(meta_custom), 'samples')
    display(meta_custom[:3])
    

### Data split based on Aims

#### HCC Tissue Tumor + HCC Tissue Adj.

In [26]:
# data_split() returns nothing, so its result is not bound to a name
data_split(disease_type=['HCC'],
           tissue_type=['Background', 'tumor'],
           fn='HCC_adj_v_tumor')

32 samples


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,scientific_name,host_common_name,host_sample_type,host_body_habitat,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id
0,15336.HCC.1.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336
2,15336.HCC.1.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,tumor,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336
4,15336.HCC.10.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336


#### CRC Tissue Tumor + CRC Tissue Adj.

In [27]:
# CRC only: adjacent (Background) tissue vs. tumor tissue
data_split(disease_type=['CRC'],
           tissue_type=['Background', 'tumor'],
           fn='CRC_adj_v_tumor')

22 samples


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,scientific_name,host_common_name,host_sample_type,host_body_habitat,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id
0,15336.CRC.1.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336
2,15336.CRC.1.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,tumor,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336
4,15336.CRC.10.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336


#### HCC Tissue Tumor + CRC Tissue Tumor

In [28]:
# Tumor tissue only: HCC vs. CRC
data_split(disease_type=['HCC', 'CRC'],
           tissue_type=['tumor'],
           fn='tumor_HCC_v_CRC')

27 samples


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,scientific_name,host_common_name,host_sample_type,host_body_habitat,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id
0,15336.CRC.1.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,tumor,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336
2,15336.CRC.10.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,tumor,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336
4,15336.CRC.11.tumor,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,tumor,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336


#### HCC Blood + CRC Blood

In [29]:
# Plasma samples only: HCC vs. CRC
data_split(disease_type=['HCC', 'CRC'],
           tissue_type=['Plasma'],
           fn='blood_HCC_v_CRC')

25 samples


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,scientific_name,host_common_name,host_sample_type,host_body_habitat,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id
0,15336.CRC.1.Plasma,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal secretion (non-saline),USA,29.643946,1504969,54,urban biome,...,human blood metagenome,human,Plasma,UBERON:blood,UBERON:plasma,plasma,2018,Homo sapiens,University of Florida,15336
2,15336.CRC.10.Plasma,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal secretion (non-saline),USA,29.643946,1504969,54,urban biome,...,human blood metagenome,human,Plasma,UBERON:blood,UBERON:plasma,plasma,2019,Homo sapiens,University of Florida,15336
4,15336.CRC.11.Plasma,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal secretion (non-saline),USA,29.643946,1504969,54,urban biome,...,human blood metagenome,human,Plasma,UBERON:blood,UBERON:plasma,plasma,2019,Homo sapiens,University of Florida,15336


#### HCC Tissue Adj. + CRC Tissue Adj.

In [30]:
# Adjacent (Background) tissue only: HCC vs. CRC
data_split(disease_type=['HCC', 'CRC'],
           tissue_type=['Background'],
           fn='adj_HCC_v_CRC')

27 samples


Unnamed: 0,sample_name,empo_1,empo_2,empo_3,empo_4,country,latitude,taxon_id,elevation,env_biome,...,scientific_name,host_common_name,host_sample_type,host_body_habitat,host_body_product,qiita_sample_type,collection_timestamp,host_scientific_name,physical_specimen_location,qiita_study_id
0,15336.CRC.1.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2018,Homo sapiens,University of Florida,15336
2,15336.CRC.10.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336
4,15336.CRC.11.Background,Host-associated,Host-associated (non-saline),Animal (non-saline),Animal corpus (non-saline),USA,29.643946,1685930,54,urban biome,...,liver metagenome,human,Background,UBERON:hepatobiliary system,UBERON:parenchyma,tumor,2019,Homo sapiens,University of Florida,15336
