In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from os.path import join as pjoin
from scipy.io import mmread
import scanpy as sc
import anndata
import squidpy as sq
from gpsa import VariationalGPSA, rbf_kernel
import torch
from IPython.display import display, clear_output
import os
import subprocess
# import sys
# sys.path.append("../../../../paste")
# from src.paste import PASTE, visualization
# import scanpy as sc




In [2]:
# Input directory: the gene list, the gzipped per-slice matrices, and the
# per-slice metadata/barcode files are all read from here.
DATA_DIR = "../../../data/mouse_brain_slideseq/12_allMTXs_CCF/"
# Output directory: one single-gene CSV per slice is written here.
SAVE_DIR = "../../../data/mouse_brain_slideseq/pcp4_only/"

In [3]:
# Read the header-less gene list and flatten it to a plain array; a gene's
# position in this array is later used as its row index in the count matrix.
gene_list = (
    pd.read_table(pjoin(DATA_DIR, "01_Gene_List.txt"), header=None)
    .to_numpy()
    .squeeze()
)

In [4]:
# Gene to extract from every slice; it is matched against gene_list and also
# used as the column name and file-name prefix of the saved CSVs.
gene_name = "Pcp4"

In [5]:
# List the gzipped matrix files in DATA_DIR and derive the slice numbers from
# their names: the second "_"-separated token with its first character dropped
# (e.g. "d12" -> 12). The slice numbers are returned in ascending order.
fnames = os.listdir(DATA_DIR)
mtx_fnames = np.array(list(filter(lambda name: name.endswith(".mtx.gz"), fnames)))
slice_nums = np.array([int(name.split("_")[1][1:]) for name in mtx_fnames])
slice_nums.sort()

In [7]:
# Export the expression of `gene_name` for every slice: one CSV per slice in
# SAVE_DIR, holding the gene's counts joined with the per-barcode metadata and
# restricted to rows whose `isOutsideCCF` flag is false.

# Make sure the output directory exists before the first write.
os.makedirs(SAVE_DIR, exist_ok=True)

# The gene's row index does not depend on the slice, so look it up once and
# fail with a clear message instead of an IndexError if the gene is absent.
gene_matches = np.where(gene_list == gene_name)[0]
if len(gene_matches) == 0:
    raise ValueError("Gene {} not found in the gene list".format(gene_name))
gene_idx = gene_matches[0]

for ii, curr_slice_num in enumerate(slice_nums):

    print("Saving slice {} / {}".format(ii + 1, len(slice_nums)))

    # Skip slices that were already exported by an earlier run.
    save_fname = pjoin(SAVE_DIR, "{}_data_slice_{}.csv".format(gene_name, str(curr_slice_num)))
    if os.path.isfile(save_fname):
        continue

    curr_fname = "MBASS_d{}_matrix.mtx".format(str(curr_slice_num))

    # Load data. mmread opens ".gz" sources directly, so there is no need to
    # shell out to gunzip, keep a decompressed copy on disk, and delete it.
    data_sparse = mmread(pjoin(DATA_DIR, curr_fname + ".gz"))
    print(data_sparse.shape)

    # Load barcodes (one per matrix column) and metadata (indexed by its first column).
    barcodes = pd.read_table(
        pjoin(DATA_DIR, "MBASS_d{}_barcodes.txt".format(str(curr_slice_num))),
        header=None,
    ).values.squeeze()
    metadata = pd.read_table(
        pjoin(DATA_DIR, "MBASS_d{}_metadata.tsv".format(str(curr_slice_num))),
        index_col=0,
    )

    # Get one gene: pull its row out of the matrix as a single-column frame
    # indexed by barcode.
    curr_gene_data = pd.DataFrame(
        data_sparse.tocsr()[gene_idx].toarray().T,
        index=barcodes,
        columns=[gene_name],
    )

    # Merge on the barcode index and drop rows flagged as outside the CCF.
    curr_gene_data = pd.merge(curr_gene_data, metadata, left_index=True, right_index=True)
    curr_gene_data = curr_gene_data[~curr_gene_data.isOutsideCCF]

    # Write to a temporary name and rename afterwards, so an interrupted run
    # never leaves a truncated CSV that the isfile() check above would skip.
    tmp_fname = save_fname + ".tmp"
    curr_gene_data.to_csv(tmp_fname)
    os.replace(tmp_fname, save_fname)

Saving slice 1 / 101
Saving slice 2 / 101
Saving slice 3 / 101
(35936, 117761)
Saving slice 4 / 101
(35936, 122129)
Saving slice 5 / 101
(35936, 110160)
Saving slice 6 / 101
(35936, 100322)
Saving slice 7 / 101
(35936, 127757)
Saving slice 8 / 101
(35936, 104778)
Saving slice 9 / 101
(35936, 124788)
Saving slice 10 / 101
(35936, 83067)
Saving slice 11 / 101
(35936, 112952)
Saving slice 12 / 101
(35936, 118311)
Saving slice 13 / 101
(35936, 117099)
Saving slice 14 / 101
(35936, 126083)
Saving slice 15 / 101
(35936, 122257)
Saving slice 16 / 101
(35936, 189564)
Saving slice 17 / 101
(35936, 90374)
Saving slice 18 / 101
(35936, 94596)
Saving slice 19 / 101
(35936, 88155)
Saving slice 20 / 101
(35936, 124111)
Saving slice 21 / 101
(35936, 138872)
Saving slice 22 / 101
(35936, 121603)
Saving slice 23 / 101
(35936, 107946)
Saving slice 24 / 101
(35936, 151488)
Saving slice 25 / 101
(35936, 157859)
Saving slice 26 / 101
(35936, 147155)
Saving slice 27 / 101
(35936, 168498)
Saving slice 28 / 1