## Data preprocessing

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np

In [2]:
def make_anndata(adata, chrom, start, end, path):
    """Annotate features with genomic coordinates, filter them, and save the result.

    Parameters
    ----------
    adata
        Object accepted by ``sc.pp.filter_cells`` / ``sc.pp.filter_genes``
        (an AnnData in this notebook). NOTE: it is modified in place --
        ``adata.var`` gains the columns 'chr', 'start' and 'end', and the two
        scanpy calls record their per-cell / per-feature counts on it.
    chrom, start, end
        Per-feature values stored as ``adata.var['chr']``, ``adata.var['start']``
        and ``adata.var['end']``.
    path
        Destination handed to ``adata.write``. Its parent directory is created
        if it does not exist yet.

    Returns
    -------
    The filtered object as an independent copy (not a view of the input), so
    callers can modify it without triggering an implicit copy.
    """
    import os

    adata.var['chr'] = chrom
    adata.var['start'] = start
    adata.var['end'] = end

    # Thresholds of 0 are not expected to drop anything; the calls are made so
    # that the count columns ('n_genes' in obs, 'n_cells' in var) get recorded.
    sc.pp.filter_cells(adata, min_genes=0)
    sc.pp.filter_genes(adata, min_cells=0)

    # Keep features observed in strictly more than 1% of the cells ...
    thres = int(adata.shape[0]*0.01)
    in_enough_cells = (adata.var['n_cells'] > thres).to_numpy()

    # ... that also sit on one of these chromosome names.
    # NOTE(review): the list goes up to chr22, which matches human numbering;
    # for a mouse genome the extra names presumably just never match -- confirm.
    chrs = ['chr'+str(i) for i in range(1,23)] + ['chrX', 'chrY']
    on_listed_chrom = adata.var['chr'].isin(chrs).to_numpy()

    # Apply both filters in one subsetting step and materialise the result so
    # that what is written and returned is a real object rather than a view.
    adata = adata[:, in_enough_cells & on_listed_chrom].copy()

    # Make sure the destination directory exists before writing.
    out_dir = os.path.dirname(os.fspath(path))
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    print(adata)
    adata.write(path)
    return adata

In [3]:
# Load the raw dataset from a path relative to the notebook's working directory.
adata = sc.read_h5ad("data/raw_mouse_brain.h5ad")
# Show the distinct labels in the "Batch" column of obs; the later cells pick
# two of these labels to build the reference/query pair.
np.unique(adata.obs["Batch"])

array(['BoneMarrow_62016', 'BoneMarrow_62216', 'Cerebellum_62216',
       'HeartA_62816', 'Kidney_62016', 'LargeIntestineA_62816',
       'LargeIntestineB_62816', 'Liver_62016', 'Lung1_62216',
       'Lung2_62216', 'PreFrontalCortex_62216', 'SmallIntestine_62816',
       'Spleen_62016', 'Testes_62016', 'Thymus_62016',
       'WholeBrainA_62216', 'WholeBrainA_62816'], dtype=object)

In [4]:
# Slice out the two batches to be paired, selecting cells by their "Batch" label.
adata_BoneMarrowB = adata[adata.obs["Batch"] == "BoneMarrow_62216", :]
adata_Liver = adata[adata.obs["Batch"] == "Liver_62016", :]

# Join the two subsets into one object (bound-method spelling of
# AnnData.concatenate) and show what came out.
concat_adata = adata_BoneMarrowB.concatenate(adata_Liver)
print(concat_adata)

# Annotate, filter and save the merged object; the coordinate columns are read
# from the merged object's own var table.
concat_adata = make_anndata(
    adata=concat_adata,
    chrom=concat_adata.var["chrom"],
    start=concat_adata.var["chromStart"],
    end=concat_adata.var["chromEnd"],
    path="preprocessed_data/BoneMarrowB_liver.h5ad",
)


See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html


AnnData object with n_obs × n_vars = 10537 × 436206
    obs: 'cell', 'tissue', 'tissue.replicate', 'cluster', 'subset_cluster', 'tsne_1', 'tsne_2', 'subset_tsne1', 'subset_tsne2', 'id', 'cell_label', 'Batch', 'CellType', 'batch'
    var: 'chrom', 'chromStart', 'chromEnd'
View of AnnData object with n_obs × n_vars = 10537 × 99915
    obs: 'cell', 'tissue', 'tissue.replicate', 'cluster', 'subset_cluster', 'tsne_1', 'tsne_2', 'subset_tsne1', 'subset_tsne2', 'id', 'cell_label', 'Batch', 'CellType', 'batch', 'n_genes'
    var: 'chrom', 'chromStart', 'chromEnd', 'chr', 'start', 'end', 'n_cells'


## Run CACNN

In [1]:
# Switch the kernel's working directory to the CACNN sub-directory.
# The path is relative, so this only works when the current directory contains
# CACNN; re-running the cell after it has succeeded will typically fail.
%cd CACNN

/data/user/luomai/SANGO/CACNN


In [2]:
# Run CACNN's main.py on the preprocessed BoneMarrowB/liver file and write its
# results under ../output/BoneMarrowB_liver.
# All paths are relative to the CACNN directory, so the preceding %cd must have run.
# NOTE(review): --device 3 is a hard-coded device index; adjust it for the machine in use.
!python main.py -i ../preprocessed_data/BoneMarrowB_liver.h5ad \
                -z 64 \
                -g mm9 \
                -o ../output/BoneMarrowB_liver \
                --max_epoch 300 \
                --device 3

INFO(20230926 14:04:31) [main.py:70]:
##time: Tue Sep 26 14:04:31 2023
##cwd: /data/user/luomai/SANGO/CACNN
##cmd: main.py -i ../preprocessed_data/BoneMarrowB_liver.h5ad -z 64 -g mm9 -o ../output/BoneMarrowB_liver --max_epoch 300 --device 3
##args: Namespace(alpha=0.0, batch_size=128, data='../preprocessed_data/BoneMarrowB_liver.h5ad', device=3, g='mm9', lr=0.01, max_epoch=300, num_workers=32, outdir='../output/BoneMarrowB_liver', seed=2020, seq_len=1344, use_reg_cell=False, w=False, z=64)
INFO(20230926 14:04:35) [main.py:107]:CACNN(
  (pre_conv): Sequential(
    (0): Conv1d(4, 288, kernel_size=(17,), stride=(1,), padding=(8,))
    (1): BatchNorm1d(288, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (3): ReLU()
  )
  (conv_towers): Sequential(
    (0): ConvTower(
      (conv1): Conv1d(288, 64, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
      (bn1): BatchNorm1d(64, eps=

## Run GraphTransformer

In [7]:
# Move from the current directory to its sibling GraphTransformer directory.
# The path is relative, so the result depends on where the kernel currently is.
%cd ../GraphTransformer

/data/user/luomai/SANGO/GraphTransformer


In [8]:
# Run the GraphTransformer step on the CACNN output, with BoneMarrow_62216 as
# the training batch and Liver_62016 as the test batch.
# NOTE(review): the recorded output of this cell is
# "python: can't open file 'main.py'", i.e. no main.py was found in the
# GraphTransformer directory when it was run -- confirm the entry-point script
# name and working directory before relying on this cell.
# NOTE(review): --data_dir is presumably the file written by the CACNN run
# above -- verify that CACNN_output.h5ad exists at this path.
!python main.py --use_bn \
                --use_residual \
                --use_gumbel \
                --data_dir ../output/BoneMarrowB_liver/CACNN_output.h5ad \
                --train_name_list BoneMarrow_62216 --test_name Liver_62016 \
                --save_path ../output \
                --save_name BoneMarrowB_liver \
                --device 0

python: can't open file 'main.py': [Errno 2] No such file or directory
