<a href="https://colab.research.google.com/github/cellatlas/cellatlas/blob/main/docs/PREPROCESS_MAT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---
title: Preprocess Matrix
date: 2024-07-07
authors:
  - name: A. Sina Booeshaghi
---

Short description of the steps we are taking (insert image from cell atlas paper)

1. Filter matrix
2. Normalize matrix
3. Assign celltypes or cell categories

In [1]:
!pip install --quiet git+https://github.com/cellatlas/ec.git
!pip install --quiet git+https://github.com/cellatlas/mx.git

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ec (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.4/124.4 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m87.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# https://www.10xgenomics.com/datasets/human-pbmc-from-a-healthy-donor-1-k-cells-v-2-2-standard-4-0-0
!wget --quiet --show-progress https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v2_hs_PBMC_1k/sc5p_v2_hs_PBMC_1k_raw_feature_bc_matrix.tar.gz
!tar -xzf sc5p_v2_hs_PBMC_1k_raw_feature_bc_matrix.tar.gz
!gunzip raw_feature_bc_matrix/*



In [3]:
!cut -f2 raw_feature_bc_matrix/features.tsv > raw_feature_bc_matrix/genes.txt
!cut -f1 -d'-' raw_feature_bc_matrix/barcodes.tsv > raw_feature_bc_matrix/barcodes.txt

In [4]:
from scipy.io import mmread, mmwrite; mmwrite("matrix.mtx", mmread("raw_feature_bc_matrix/matrix.mtx").T.tocsr())

In [5]:
!ln -s raw_feature_bc_matrix/genes.txt .
!ln -s raw_feature_bc_matrix/barcodes.txt .

# Filter matrix

## Command line

In [7]:
!mx filter -c 2 2 -bi barcodes.txt -bo barcodes.filt.txt -o matrix.filt.mtx matrix.mtx

Filtered to 1,034 cells with at least 367 UMIs.


## Python

In [50]:
from mx.mx_filter import mx_filter
from scipy.io import mmread
import pandas as pd

mtx = mmread("matrix.mtx").tocsr()
bcs = pd.read_csv("barcodes.txt", index_col=0, header=None)
fbcs, fmtx = mx_filter(mtx.copy(), bcs.index.values, sum_axis=1, comps=[2,2], select_axis=None)

Filtered to 1,034 cells with at least 367 UMIs.


# Normalize counts

## Comand line

In [8]:
!mx normalize -m log1pPF -o matrix.norm.filt.mtx matrix.filt.mtx

## Python

In [53]:
from scipy.io import mmread, mmwrite
from mx.mx_normalize import mx_normalize

mtx = mmread("matrix.filt.mtx").tocsr()
nmtx = mx_normalize(mtx.copy(), "log1pPF")

# Assign cell types

In [9]:
!wget --quiet --show-progress https://raw.githubusercontent.com/cellatlas/human/main/markers/blood/markers.txt



## Command line

In [10]:
!ec clean -o markers.txt markers.txt

In [11]:
# these genes are not in the index so we remove it
!ec filter -bt <(printf "TM4SF19-TCTEX1D2\nFCGR2C\nCORO7-PAM16") -o markers.filt.txt markers.txt

In [12]:
# verify they are not in the file
!grep "TM4SF19-TCTEX1D2" markers.filt.txt
!grep "FCGR2C" markers.filt.txt
!grep "CORO7-PAM16" markers.filt.txt

In [13]:
!ec index -g groups.txt -t targets.txt -e markers.ec.txt markers.filt.txt

In [14]:
!mx extract \
-t targets.txt \
-gi genes.txt -go genes.extract.txt \
-o matrix.extract.norm.filt.mtx \
matrix.norm.filt.mtx

In [16]:
!mx clean --bad \
-gi genes.extract.txt -go genes.clean.extract.txt \
-bi barcodes.filt.txt -bo barcodes.clean.filt.txt \
-o matrix.clean.extract.norm.filt.mtx \
matrix.extract.norm.filt.mtx

Dropping 7 cells
Dropping 70 genes


In [23]:
!ec filter -bt genes.clean.extract.txt.bad -o markers.filt.txt markers.txt
!ec filter -bt <(printf "TM4SF19-TCTEX1D2\nFCGR2C\nCORO7-PAM16") -o markers.filt.txt markers.filt.txt

In [24]:
!ec index -g groups.txt -t targets.txt -e markers.ec.txt markers.filt.txt

In [25]:
!mx extract \
-t targets.txt \
-gi genes.txt -go genes.extract.txt \
-o matrix.extract.norm.filt.mtx \
matrix.norm.filt.mtx

In [26]:
!mx clean --bad \
-gi genes.extract.txt -go genes.clean.extract.txt \
-bi barcodes.filt.txt -bo barcodes.clean.filt.txt \
-o matrix.clean.extract.norm.filt.mtx \
matrix.extract.norm.filt.mtx

Dropping 7 cells
Dropping 0 genes


In [27]:
!mx normalize -m rank -o rank.mtx matrix.clean.extract.norm.filt.mtx

In [28]:
!mx assign -g groups.txt -gi genes.clean.extract.txt -bi barcodes.clean.filt.txt -e markers.ec.txt -o assignments.txt rank.mtx

Initialization 0
  Iteration 10	 time lapse 35.79607s	 ll change 0.00000
Initialization converged: True	 time lapse 35.79615s	 ll -2430205244024.92920


In [29]:
!head assignments.txt

barcodes	label_id	label	ent	CD300LB	ATF5	MED12L	GPR146	PIM2	KLF12	CX3CR1	BAIAP2	CDKN1A	TBC1D9	CTSW	ALDH2	COQ7	MGLL	NDRG1	VASH1	DPYSL2	DAB2	PLAAT3	CXCR4	FCER2	APLP2	IGHM	IL4R	PLPP5	TCL1A	IL1A	CLEC4A	CD209	CD163	CD36	IL1B	TNFSF10	CD14	CD37	MS4A1	TFRC	CD19	CD52	CD38	CD27	CD79B	CD24	CD74	BLNK	HSP90B1	PTPRC	CD79A	BANK1	CD40LG	MZB1	CD5	TNFSF13B	CDKN1C	C1QB	IFITM2	ITGAL	CSF3R	FCN1	CSF1R	MS4A7	LYN	C1QA	ITGAM	IFITM3	FCGR3B	HLA-DPA1	HLA-DPB1	IFITM1	HLA-DRB1	RHOC	PLBD1	S100A12	VCAN	TYROBP	RNASE2	CST3	LGALS2	CLEC4E	CRIP1	S100A8	CLU	RETN	MNDA	DPP4	CD93	ADAM28	DUSP2	DUSP10	VAV3	PPM1M	CSRP1	ZNF366	BTLA	CYB5R3	SLC24A4	NET1	LACC1	CLNK	CYP2E1	DNASE1L3	PRELID2	DENND1B	IL1RN	CLEC10A	CES1	NLRP3	FPR1	FCER1A	HBEGF	ANXA1	AOAH	MTMR11	IL13RA1	MPP7	HNMT	CSTA	EREG	F13A1	PDLIM5	LILRB1	TSPAN32	VAMP5	ARL4A	NAMPT	PLXNC1	RAB10	TCIRG1	IFI30	LILRB2	SIRPB1	DRAP1	CAMKK2	TXNIP	CDC42EP3	CD6	FOXP3	RTKN2	IL2RA	CD4	GBP2	ICA1	ARID5B	CORO1B	FANK1	CDH2	PROM1	MYC	IL1RAP	CD44	MAL	IFI44L	OAS1	TCF7	NOSIP	LEF1	TSHZ2	SELL	IFIT3	SOCS3	C

In [30]:
! tail -n +2 assignments.txt | cut -f 3 | sort | uniq -c | sort -nr

    249 CD14-positive monocyte
    217 Naive thymus-derived CD8-positive, alpha-beta T cell
    159 Naive thymus-derived CD4-positive, alpha-beta T cell
    114 T cell
     92 B cell
     74 Natural killer cell
     30 Myeloid cell
     24 Circulating tumor cell
     14 Platelet
     10 CD1C-, CD141- dendritic cell
     10 CD1C+ B dendritic cell
      9 Effector memory CD4-positive, alpha-beta T cell
      7 Plasmacytoid dendritic cell
      4 Tissue resident memory T cell
      3 Dendritic cell
      2 Effector memory CD8-positive, alpha-beta T cell
      1 Neoplastic cell
      1 Naive T cell
      1 Monocyte
      1 Erythrocyte
      1 Circulating fetal cell
      1 Central memory CD4-positive, alpha-beta T cell
      1 CD4-positive, CD25-positive, alpha-beta regulatory T cell
      1 CD14-low, CD16-positive monocyte
      1 Alpha-beta T cell


## Python