In [8]:
import pandas as pd
from scipy.io import mmread
import anndata as ad
import scanpy as sc

In [9]:
# SETTING UP VARIABLES TO IMPORT INFO -> DATAFRAME
# All three input files live in the same directory; name it once so the
# location only has to be changed in a single place.
RAW_DIR = "mappingResultsSolo.out/Gene/raw"

# Read the gene-count matrix (Matrix Market format) and convert it to CSR.
# The file is laid out gene x cell, so it is transposed below to cell x gene,
# which is the orientation AnnData expects (rows = obs, columns = var).
X = mmread(f"{RAW_DIR}/matrix.mtx").tocsr()
cellGeneMatrix = X.T.tocsr()

# Cell barcodes: the file has no header row; taking column 0 yields a pandas
# Series (not a DataFrame) with one barcode per cell.
cells = pd.read_csv(f"{RAW_DIR}/barcodes.tsv", header=None)[0]

# Gene names/identifiers: tab-separated table without a header row, kept as a
# DataFrame with one row per gene.
features = pd.read_csv(f"{RAW_DIR}/features.tsv", sep="\t", header=None)
# Column 1 of the features table holds the gene names; keep them as a plain list.
genes = features[1].tolist()

# Fail early, with a readable message, if the three files do not describe the
# same matrix.
assert cellGeneMatrix.shape == (len(cells), len(genes)), (
    f"matrix is {cellGeneMatrix.shape} but there are "
    f"{len(cells)} barcodes and {len(genes)} genes"
)


# CREATE ANNDATA MATRIX
# 1. obs table: one (empty) row per cell, indexed by barcode. A plain list is
#    passed, as is done for the genes, so that the index does not inherit the
#    Series' integer name `0`.
cellBarcodes = pd.DataFrame(index=cells.tolist())
# 2. var table: one (empty) row per gene, indexed by gene name.
geneNames = pd.DataFrame(index=genes)
# 3. Assemble the AnnData object from the count matrix and the two tables.
adata = ad.AnnData(X=cellGeneMatrix, obs=cellBarcodes, var=geneNames)
# Gene names are not unique, so the constructor emits a duplicate-"var"-names
# warning; make the names unique in place immediately afterwards.
adata.var_names_make_unique()
print(adata)

AnnData object with n_obs × n_vars = 884736 × 3005


  utils.warn_names_duplicates("var")


In [None]:
# QUALITY CONTROL: flag tRNA and mRNA genes and compute QC metrics for them.
# Nothing is removed from `adata` in this cell; it only adds annotations that a
# later filtering step can use.

# Boolean per-gene masks based on the (case-sensitive) gene-name prefix.
adata.var["tRNA"] = adata.var_names.str.startswith("tRNA")
adata.var["mRNA"] = adata.var_names.str.startswith("mRNA")

# `qc_vars` must name boolean columns that exist in `adata.var`; use the two
# masks defined above ("mt", "ribo" and "hb" are never created in this notebook).
# With inplace=True the metrics are written into `adata` rather than returned.
sc.pp.calculate_qc_metrics(adata, qc_vars=["tRNA", "mRNA"], inplace=True, log1p=True)