### Load the data

In [1]:
import einops
import scipy.sparse as sparse
import scipy.io as sio
import scipy.stats as stats
import numpy as np
import pandas as pd
import tensorly as tl
import matplotlib.pyplot as plt

In [2]:
# Load the expression matrix from a Matrix Market file.
# NOTE(review): scipy's mmread returns a sparse COO matrix for "coordinate"
# files and a dense ndarray for "array" files. Later cells row-index and
# reshape `tenx`, which presumably needs a dense array -- confirm the format.
tenx = sio.mmread("../data/tenx.mtx")

In [3]:
# Load the metadata table; the first CSV column becomes the DataFrame index.
tenx_obs = pd.read_csv("../data/tenx_obs.csv", index_col=0)

### Drop every sample with 1000 or fewer rows

In [4]:
# Count the rows belonging to each sample, then build one boolean per row of
# tenx_obs: True when that row's sample has strictly more than 1000 rows.
sample_names, sample_sizes = np.unique(
    tenx_obs["sample_name"].values, return_counts=True
)
large_samples = set(sample_names[sample_sizes > 1000])
to_keep = [name in large_samples for name in tenx_obs["sample_name"]]

In [5]:
# (rows, columns) of the metadata table before any filtering.
tenx_obs.shape

(41580, 13)

In [43]:
# Preview the metadata table (pandas truncates the middle rows in the display).
# NOTE(review): this cell's execution count (43) is out of sequence with its
# neighbours; re-run the notebook top to bottom before trusting the output.
tenx_obs

Unnamed: 0,sample_name,cell_barcode,cluster_color,cluster,cluster_id,cell_types,sex_label,batch_indices,cell_counts,n_genes,percent_mito,pass_count_filter,pass_mito_filter
0,10x_VMH_Female_Control_1,4_AAACCTGAGCGCCTCA,#9162FF,Nr5a1_4,24.0,undefined,F,0,1427.0,926,13.594954,True,False
1,10x_VMH_Female_Control_1,4_AAACCTGCACAGTCGC,#96FF2E,Tsix_Esr1_1,3.0,undefined,F,0,4508.0,2310,9.272405,True,False
2,10x_VMH_Female_Control_1,4_AAACCTGCAGATTGCT,#66ABC2,Dlk1_3,16.0,undefined,F,0,3447.0,1995,5.483029,True,False
3,10x_VMH_Female_Control_1,4_AAACCTGCATACTCTT,#96FF2E,Tsix_Esr1_1,3.0,undefined,F,0,7004.0,3286,4.483152,True,True
4,10x_VMH_Female_Control_1,4_AAACCTGGTCCGTTAA,#24EFCD,Scgn,20.0,undefined,F,0,2490.0,1492,9.277108,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41575,10x_VMH_Male_Social_Fear_Singly-housed_2,23_TTTGCGCTCTAACTGG,#3184F2,Satb2_3,12.0,undefined,M,25,14472.0,4853,5.983969,True,False
41576,10x_VMH_Male_Social_Fear_Singly-housed_2,23_TTTGGTTGTAGCGCAA,#9162FF,Nr5a1_4,24.0,undefined,M,25,2541.0,1673,3.187721,True,True
41577,10x_VMH_Male_Social_Fear_Singly-housed_2,23_TTTGGTTTCTGCTTGC,#5472EB,Satb2_2,11.0,undefined,M,25,4949.0,2421,9.860579,True,False
41578,10x_VMH_Male_Social_Fear_Singly-housed_2,23_TTTGTCACACGGCTAC,#BDDFFF,Dlk1_1,14.0,undefined,M,25,13784.0,4886,3.801509,True,True


In [6]:
# Keep only the rows flagged True in the boolean list `to_keep`.
# NOTE: this overwrites `tenx_obs`, and `to_keep` is rebound by later cells,
# so the cell cannot be re-run without reloading the CSV and rebuilding the mask.
tenx_obs = tenx_obs[to_keep]

In [7]:
# (rows, columns) of the metadata table after dropping the small samples.
tenx_obs.shape

(39728, 13)

### Take the first 1000 rows for each sample

In [8]:
# Walk the samples in groupby order and collect, for each one, the index
# labels of its first 1000 rows together with a run of 1000 copies of its name.
# NOTE: the name run is always 1000 long, so the two lists only line up when
# every remaining sample has at least 1000 rows.
to_keep = []
to_keep_sn = []
for sample_name, sample_rows in tenx_obs.groupby("sample_name"):
    to_keep += list(sample_rows.index.values[:1000])
    to_keep_sn += [sample_name] * 1000

In [9]:
# Pair each selected row label ("n") with its sample name ("sn").
# NOTE: `to_keep` is rebound here from a list to a DataFrame.
to_keep = pd.DataFrame({"n": to_keep, "sn": to_keep_sn})

In [10]:
# Distinct sample names, in order of first appearance in `to_keep`.
sns = to_keep["sn"].unique().tolist()

In [11]:
# Tag every row with the position of its sample name within `sns`.
sample_position = {name: pos for pos, name in enumerate(sns)}
to_keep["sample_axis"] = to_keep["sn"].map(sample_position)

In [12]:
# Shape of the full matrix before row selection.
tenx.shape

(41580, 1999)

In [13]:
# Keep only the selected rows of the matrix, in the order listed in `to_keep`.
# NOTE(review): the values in "n" are index labels of `tenx_obs`, used here as
# row positions of `tenx`; presumably the CSV index is 0..N-1 in matrix row
# order -- confirm.
# NOTE: this overwrites `tenx`, so the cell must not be re-run without
# reloading the matrix.
rows = to_keep["n"].values
if sparse.issparse(tenx):
    # mmread returns a COO matrix for coordinate-format files. COO cannot be
    # row-indexed, and the later reshape/save need a dense array, so select
    # the rows through CSR and densify only the kept subset.
    tenx = tenx.tocsr()[rows, :].toarray()
else:
    tenx = tenx[rows, :]

### Reshape to (sample, cell, gene)

In [38]:
# Split the row axis into (sample, cell), relying on each sample occupying a
# consecutive run of 1000 rows. The sample count is taken from `sns` instead
# of a hard-coded 23, so the cell follows the data while einops still rejects
# a row count that is not len(sns) * 1000.
# NOTE: this overwrites `tenx`; re-running the cell on the 3-D result fails.
tenx = einops.rearrange(tenx, '(s c) g -> s c g', s=len(sns), c=1000)

In [39]:
# Shape after the reshape.
tenx.shape

(23, 1000, 1999)

In [41]:
# Inspect the 2-D slice at position 0 of the first axis.
tenx[0,:,:]

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        5.40639958],
       [0.        , 0.        , 0.        , ..., 0.        , 5.67369204,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 6.65517673,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 6.65208231,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [42]:
# Write the reshaped array to "tenx.npy" in the current working directory
# (the inputs were read from ../data).
np.save("tenx.npy", tenx)