## Analyzing multiple scATAC-seq datasets for mouse brain using Catactor
* Risa K. Kawaguchi, et al. (to be published).
* 2020.12.xx



## Download and preprocess datasets
* From GEO (Gene Expression Omnibus)
 * GSE100033
 * GSE111586
 * GSE123576
 * GSE126074
 * GSE127257
 * GSE130399
* From BICCN database
 * BICCN (SnapATAC objects)
 * BICCN SMART-seq v2 data

## Assumed data structure
### Raw matrix data
raw_data/
* GSE100033
* GSE111586
* GSE123576
* GSE126074
* GSE127257
* GSE130399
* BICCN
* BICCN_rna

### Processed data
mat_data/
* GSE100033
* GSE111586
* GSE123576
* GSE126074
* GSE127257
* GSE1303990 <- Adult dataset (GSE130399 with the suffix 0)
* GSE1303991 <- Fetal dataset (GSE130399 with the suffix 1; unused)
* BICCN
* BICCN_rna

In [None]:
# Construct matrix data
# Runs data_preprocess.py once per dataset with three arguments: the dataset id,
# the dataset's directory under input_dir, and its target directory under output_dir.
import os
import subprocess
script_dir = "../script/data_processing"
input_dir = "../raw_data"
output_dir = "../mat_data"

# GSE_list: dataset ids (also the sub-directory names under input_dir).
# GSE_dir:  sub-directory names under output_dir; entries pair up with GSE_list by position.
GSE_list = ['GSE100033', 'GSE111586', 'GSE123576', 'GSE126074', 'GSE127257', 'GSE130399', 'BICCN_rna']
GSE_dir = ['GSE100033', 'GSE111586', 'GSE123576', 'GSE126074', 'GSE127257', 'GSE1303990', 'BICCN_rna']
# zip() stops at the shorter list, so a length mismatch would silently skip datasets.
assert len(GSE_list) == len(GSE_dir), "GSE_list and GSE_dir must have the same length"
# The loop variable is named out_name (not dir) so the builtin dir() stays usable.
for gse, out_name in zip(GSE_list, GSE_dir):
    print(gse+' processing...')
    arg_list = ['python', os.path.join(script_dir, 'data_preprocess.py'), gse, os.path.join(input_dir, gse), os.path.join(output_dir, out_name)]
    print(' '.join(arg_list))
    result = subprocess.run(arg_list)
    # Report a failed run instead of moving on to the next dataset unnoticed.
    if result.returncode != 0:
        print(gse+' failed with exit code '+str(result.returncode))


GSE100033 processing...
python ../script/data_processing/data_preprocess.py GSE100033 ../raw_data/GSE100033 ../mat_data/GSE100033


In [9]:
# Preprocessing of the BICCN scATAC-seq data.
# Only the Rscript command line is assembled and shown; launching it is left commented out.
script_dir = "../script/data_processing"
input_dir, output_dir = "../raw_data/BICCN", "../mat_data/BICCN"
# If needed, raise the stack limit beforehand: ulimit -s 100000

arg_list = [
    'Rscript',
    os.path.join(script_dir, 'snap_atac_to_text.R'),
    input_dir,
    output_dir,
]
print(*arg_list)
# subprocess.run(arg_list)

['Rscript', '../script/data_processing/snap_atac_to_text.R', '../raw_data/BICCN', '../mat_data/BICCN']
['python', '../script/data_processing/biccn_data_reading.py', '../mat_data/BICCN', '../mat_data/BICCN']


In [None]:
# Run biccn_data_reading.py, reusing script_dir and output_dir from the BICCN cell above.
# output_dir is passed for both positional arguments
# (presumably input and output location -- confirm against the script).
reader_script = os.path.join(script_dir, 'biccn_data_reading.py')
arg_list = ['python', reader_script] + [output_dir] * 2
print(*arg_list)
subprocess.run(arg_list)

In [None]:
# Add a global id for each bin-size
# For every dataset directory under ../mat_data/, build (and print) the Catactor
# column_annotation command for each matching "*bin*sv" file.
import glob
for dataset in ['GSE100033', 'GSE111586', 'GSE123576', 'GSE126074', 'GSE127257', 'GSE1303990', 'BICCN_rna', 'BICCN']:
    # The wildcard has to be part of the single pathname handed to glob.glob();
    # passing it as a second positional argument raises TypeError.
    for fname in glob.glob(os.path.join("../mat_data/", dataset, "*bin*sv")):
        # Skip files whose path contains 'with_bins' or 'gene.csv'.
        if 'with_bins' in fname or 'gene.csv' in fname:
            continue
        arg_list = ['python', '../src/Catactor', 'column_annotation', fname]
        # Show the full command; joining fname itself would print its characters one by one.
        print(' '.join(arg_list))
        #subprocess.run(arg_list)


In [None]:
# Associate the closest gene ids
# For every dataset directory under ../mat_data/, build (and print) a command
# for each matching "*bin*sv" file.
# NOTE(review): the command below is the same `column_annotation` call used in the
# global-bin-id step; the shell cell at the end of this notebook runs
# `Rscript annotation_metadata.R promoter` on *with_bins.csv files for gene
# annotation instead -- confirm which command and which input files are intended here.
import glob
for dataset in ['GSE100033', 'GSE111586', 'GSE123576', 'GSE126074', 'GSE127257', 'GSE1303990', 'BICCN_rna', 'BICCN']:
    # The wildcard has to be part of the single pathname handed to glob.glob();
    # passing it as a second positional argument raises TypeError.
    for fname in glob.glob(os.path.join("../mat_data/", dataset, "*bin*sv")):
        # Skip files whose path contains 'with_bins' or 'gene.csv'.
        if 'with_bins' in fname or 'gene.csv' in fname:
            continue
        arg_list = ['python', '../src/Catactor', 'column_annotation', fname]
        # Show the full command; joining fname itself would print its characters one by one.
        print(' '.join(arg_list))
        #subprocess.run(arg_list)


## Data preprocessing to produce sparse matrix and csv matrices for Catactor
Place the original data in a specific directory and apply preprocessing to produce matrices in the same format.


In [None]:
## Annotation
* Add genomic bin ids
* Add closest gene ids


In [None]:
# List the *bin_ng.csv files, drop names containing "with_bins", and feed the result to
# `Catactor column_annotation`; whatever that command prints is piped into bash.
# NOTE(review): unlike the next line, `{}` here is not introduced by `xargs -I{}`, so
# nothing substitutes a filename for it -- confirm the intended pipeline.
ls *bin_ng.csv | grep -v with_bins | Catactor column_annotation {} | bash
# Run annotation_metadata.R with the `promoter` argument once per *with_bins.csv file.
ls *with_bins.csv | xargs -I{} Rscript annotation_metadata.R promoter {}