## Subsetting matrices

TODO: 
- subset `fill_Hartig_shared_gene_counts.csv.gz` and `jcoffman_007/jcoffman_007.adult_zebrafish_2015_tailfin.design.txt` to only include cols/rows (respectively) for 0, 2, 4 days (separately) in order with cort first and veh second
- subset `fill_sorted_matched_Hou_shared_gene_counts.csv.gz` and `sorted_labeled_GSE137971_cells.csv.gz` to only include columns/rows (respectively) that begin with 0dpa, 2dpa, 4dpa (separately)

In [2]:
import pandas as pd
import os
import numpy as np

# Bulk Design Files

In [3]:
# Load the bulk design table; the file is tab-delimited.
bulk_design = pd.read_csv(
    "jcoffman_007/jcoffman_007.adult_zebrafish_2015_tailfin.design.txt",
    delimiter="\t",
)

In [4]:
# Reorder the rows to match the name-aggregation sorting: cort at 0, 2, 4 dpa
# first, then veh at 0, 2, 4 dpa. The labels are the default row labels of the
# design file as loaded, so they are only valid for that exact file layout.
cort_rows = [3, 4, 5, 9, 10, 11, 15, 16, 17]
veh_rows = [0, 1, 2, 6, 7, 8, 12, 13, 14]
# Renumber the rows 0..n-1 after reordering, discarding the old labels.
bulk_design = bulk_design.reindex(cort_rows + veh_rows).reset_index(drop=True)

In [5]:
# Save the reordered design table (all samples) as a gzip-compressed CSV in the
# working directory; the dataframe index is not written.
bulk_design_file = os.path.join('.', 'all_bulk_design_file.csv.gz')
bulk_design.to_csv(
    path_or_buf=bulk_design_file,
    compression="gzip",
    index=False,
)

In [6]:
# Split the design table on the 'dpa' column into one dataframe per time point.
# Iterating a groupby yields (key, sub-frame) pairs in sorted key order, so the
# three-way unpacking assumes exactly three distinct dpa values (0, 2 and 4).
gb = bulk_design.groupby('dpa')
bulk_design_0dpa, bulk_design_2dpa, bulk_design_4dpa = [frame for _, frame in gb]

In [7]:
# check contents
#print(bulk_design_0dpa)
#print(bulk_design_2dpa)
#print(bulk_design_4dpa)

In [8]:
# Save the 0 dpa bulk design subset as a gzip-compressed CSV (index omitted).
bulk_design_0dpa_file = os.path.join('.', 'bulk_design_0dpa.csv.gz')
bulk_design_0dpa.to_csv(
    path_or_buf=bulk_design_0dpa_file,
    compression="gzip",
    index=False,
)

In [9]:
# Save the 2 dpa bulk design subset as a gzip-compressed CSV (index omitted).
bulk_design_2dpa_file = os.path.join('.', 'bulk_design_2dpa.csv.gz')
bulk_design_2dpa.to_csv(
    path_or_buf=bulk_design_2dpa_file,
    compression="gzip",
    index=False,
)

In [10]:
# Save the 4 dpa bulk design subset as a gzip-compressed CSV (index omitted).
bulk_design_4dpa_file = os.path.join('.', 'bulk_design_4dpa.csv.gz')
bulk_design_4dpa.to_csv(
    path_or_buf=bulk_design_4dpa_file,
    compression="gzip",
    index=False,
)

# Bulk Count Files

In [11]:
# Load the gzip-compressed bulk count table.
bulk_count = pd.read_csv(
    "fill_Hartig_shared_gene_counts.csv.gz",
    compression="gzip",
)

In [12]:
# Reorder the columns to match the name-aggregation sorting (cort 0, 2, 4 then
# veh 0, 2, 4). Position 0 stays first; the remaining 18 positions are the
# sample columns in their new order.
col_order = [0, 4, 5, 6, 10, 11, 12, 16, 17, 18, 1, 2, 3, 7, 8, 9, 13, 14, 15]
all_columns = bulk_count.columns.tolist()
# Translate the positional order into column names, then select by name.
bulk_count_col = [all_columns[pos] for pos in col_order]
bulk_count = bulk_count[bulk_count_col]

In [13]:
# Sanity check: the sample columns of the count table must appear in the same
# order as the 'sample' column of the design table. The first count column is
# skipped because it holds the 'genes' label rather than a sample name.
count = bulk_count.columns.values.tolist()
design = bulk_design['sample'].tolist()
print("match" if count[1:] == design else "no match")

match


In [14]:
# With the ordering check passed, save the reordered count table (all samples)
# as a gzip-compressed CSV; the dataframe index is not written.
bulk_count_file = os.path.join('.', 'all_bulk_count_file.csv.gz')
bulk_count.to_csv(path_or_buf=bulk_count_file, compression="gzip", index=False)

In [15]:
# Per-day reference lists of column names to keep when subsetting the count
# table. 'genes' is placed first so the gene column survives the subsetting.
samples_0dpa = ['genes', *bulk_design_0dpa['sample'].tolist()]
samples_2dpa = ['genes', *bulk_design_2dpa['sample'].tolist()]
samples_4dpa = ['genes', *bulk_design_4dpa['sample'].tolist()]

In [16]:
# Keep only the count columns named in the 0 dpa sample list (plus 'genes'),
# preserving the column order of bulk_count.
keep_0dpa = bulk_count.columns.isin(samples_0dpa)
bulk_count_0dpa = bulk_count.loc[:, keep_0dpa]

In [17]:
# Keep only the count columns named in the 2 dpa sample list (plus 'genes'),
# preserving the column order of bulk_count.
keep_2dpa = bulk_count.columns.isin(samples_2dpa)
bulk_count_2dpa = bulk_count.loc[:, keep_2dpa]

In [18]:
# Keep only the count columns named in the 4 dpa sample list (plus 'genes'),
# preserving the column order of bulk_count.
keep_4dpa = bulk_count.columns.isin(samples_4dpa)
bulk_count_4dpa = bulk_count.loc[:, keep_4dpa]

In [19]:
# Save the 0 dpa bulk count subset as a gzip-compressed CSV (index omitted).
bulk_count_0dpa_file = os.path.join('.', 'bulk_count_0dpa.csv.gz')
bulk_count_0dpa.to_csv(
    path_or_buf=bulk_count_0dpa_file,
    compression="gzip",
    index=False,
)

In [20]:
# Save the 2 dpa bulk count subset as a gzip-compressed CSV (index omitted).
bulk_count_2dpa_file = os.path.join('.', 'bulk_count_2dpa.csv.gz')
bulk_count_2dpa.to_csv(
    path_or_buf=bulk_count_2dpa_file,
    compression="gzip",
    index=False,
)

In [21]:
# Save the 4 dpa bulk count subset as a gzip-compressed CSV (index omitted).
bulk_count_4dpa_file = os.path.join('.', 'bulk_count_4dpa.csv.gz')
bulk_count_4dpa.to_csv(
    path_or_buf=bulk_count_4dpa_file,
    compression="gzip",
    index=False,
)

# Single Count Files

In [22]:
# Load the gzip-compressed single-cell count table.
single_count = pd.read_csv(
    "fill_sorted_matched_Hou_shared_gene_counts.csv.gz",
    compression="gzip",
)

In [23]:
# Keep the 'genes' column plus every column whose name starts with 'samp1' or
# 'samp2' (in this data those prefixes mark the 0dpa samples).
prefixes = ['genes', 'samp1', 'samp2']
filter_col = [name for name in single_count.columns if name.startswith(tuple(prefixes))]
single_count_0dpa = single_count.loc[:, filter_col]

In [24]:
# Keep the 'genes' column plus every column whose name starts with '2dpa'.
prefixes = ['genes', '2dpa']
filter_col = [name for name in single_count.columns if name.startswith(tuple(prefixes))]
single_count_2dpa = single_count.loc[:, filter_col]

In [25]:
# Keep the 'genes' column plus every column whose name starts with '4dpa'.
prefixes = ['genes', '4dpa']
filter_col = [name for name in single_count.columns if name.startswith(tuple(prefixes))]
single_count_4dpa = single_count.loc[:, filter_col]

In [26]:
# Save the 0 dpa single-cell count subset as a gzip-compressed CSV (index omitted).
single_count_0dpa_file = os.path.join('.', 'single_count_0dpa.csv.gz')
single_count_0dpa.to_csv(
    path_or_buf=single_count_0dpa_file,
    compression="gzip",
    index=False,
)

In [27]:
# Save the 2 dpa single-cell count subset as a gzip-compressed CSV (index omitted).
single_count_2dpa_file = os.path.join('.', 'single_count_2dpa.csv.gz')
single_count_2dpa.to_csv(
    path_or_buf=single_count_2dpa_file,
    compression="gzip",
    index=False,
)

In [28]:
# Save the 4 dpa single-cell count subset as a gzip-compressed CSV (index omitted).
single_count_4dpa_file = os.path.join('.', 'single_count_4dpa.csv.gz')
single_count_4dpa.to_csv(
    path_or_buf=single_count_4dpa_file,
    compression="gzip",
    index=False,
)

# Single Design Files

In [29]:
# Load the gzip-compressed single-cell design table.
single_design = pd.read_csv(
    "sorted_labeled_GSE137971_cells.csv.gz",
    compression="gzip",
)

In [30]:
# Keep the design rows for the 0dpa cells, i.e. rows whose 'samples' value
# begins with 'samp1' or 'samp2' (in this data those prefixes mark 0dpa).
# Prefix matching is used instead of substring matching so the rows selected
# here follow the same rule as the count-matrix columns, which are chosen with
# startswith. na=False treats a missing 'samples' value as "no match" rather
# than leaving an NA in the boolean mask.
sample_names = single_design['samples'].str
single_design_0dpa = single_design[
    sample_names.startswith('samp1', na=False)
    | sample_names.startswith('samp2', na=False)
]

In [31]:
# Keep the design rows whose 'samples' value begins with '2dpa'.
# Prefix matching is used instead of substring matching so the rows selected
# here follow the same rule as the count-matrix columns, which are chosen with
# startswith. na=False treats a missing 'samples' value as "no match" rather
# than leaving an NA in the boolean mask.
single_design_2dpa = single_design[
    single_design['samples'].str.startswith('2dpa', na=False)
]

In [32]:
# Keep the design rows whose 'samples' value begins with '4dpa'.
# Prefix matching is used instead of substring matching so the rows selected
# here follow the same rule as the count-matrix columns, which are chosen with
# startswith. na=False treats a missing 'samples' value as "no match" rather
# than leaving an NA in the boolean mask.
single_design_4dpa = single_design[
    single_design['samples'].str.startswith('4dpa', na=False)
]

In [33]:
# Save the 0 dpa single-cell design subset as a gzip-compressed CSV (index omitted).
single_design_0dpa_file = os.path.join('.', 'single_design_0dpa.csv.gz')
single_design_0dpa.to_csv(
    path_or_buf=single_design_0dpa_file,
    compression="gzip",
    index=False,
)

In [34]:
# Save the 2 dpa single-cell design subset as a gzip-compressed CSV (index omitted).
single_design_2dpa_file = os.path.join('.', 'single_design_2dpa.csv.gz')
single_design_2dpa.to_csv(
    path_or_buf=single_design_2dpa_file,
    compression="gzip",
    index=False,
)

In [35]:
# Save the 4 dpa single-cell design subset as a gzip-compressed CSV (index omitted).
single_design_4dpa_file = os.path.join('.', 'single_design_4dpa.csv.gz')
single_design_4dpa.to_csv(
    path_or_buf=single_design_4dpa_file,
    compression="gzip",
    index=False,
)