In [2]:
import os 
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import numpy as np
import anndata as ad
import pandas as pd
from scipy.sparse import csr_matrix


In [3]:
# Location of the input AnnData file (relative to the notebook's working directory).
file_path = "normed_genecode.h5ad"

# Read the whole .h5ad file into memory as an AnnData object.
adata = ad.read_h5ad(filename=file_path)


# Data exploration

In [4]:
# Display the AnnData summary: dimensions (n_obs x n_vars) plus the names of
# the obs/var annotation columns and the available layers.
adata

AnnData object with n_obs × n_vars = 26748 × 164607
    obs: 'compartment', 'organ_tissue', 'cell_ontology_class', 'free_annotation'
    var: 'gene_ID', 'gene_name', 'length', 'class', 'uniprot'
    layers: 'log1p', 'norm'

In [5]:
# Per-observation (sample) annotation table: one row per entry of
# adata.obs_names, with the metadata columns listed in the AnnData summary.
adata.obs

Unnamed: 0_level_0,compartment,organ_tissue,cell_ontology_class,free_annotation
cell_list,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TSP2_Blood_NA_SS2_B114658_B133046_LinPos_A10_S10,immune,Blood,"cd4-positive, alpha-beta t cell",CD4 t cell
TSP2_Blood_NA_SS2_B114658_B133046_LinPos_A11_S11,immune,Blood,"cd4-positive, alpha-beta t cell",CD4 t cell
TSP2_Blood_NA_SS2_B114658_B133046_LinPos_A12_S12,immune,Blood,"cd4-positive, alpha-beta t cell",CD4 t cell
TSP2_Blood_NA_SS2_B114658_B133046_LinPos_A13_S13,immune,Blood,"cd4-positive, alpha-beta t cell",CD4 t cell
TSP2_Blood_NA_SS2_B114658_B133046_LinPos_A14_S14,immune,Blood,"cd4-positive, alpha-beta t cell",CD4 t cell
...,...,...,...,...
TSP3_Eye_noCornea_SS2_B114669_B133703_Epithelial_F8_L001,epithelial,Eye,conjunctival epithelial cell,conjunctiva - epithelial cells
TSP3_Eye_noCornea_SS2_B114669_B133703_Epithelial_G5_L001,epithelial,Eye,eye photoreceptor cell,retina - photoreceptor cells
TSP3_Eye_noCornea_SS2_B114669_B133703_Epithelial_H13_L001,epithelial,Eye,conjunctival epithelial cell,conjunctiva - epithelial cells
TSP3_Eye_noCornea_SS2_B114669_B133703_Epithelial_H6_L001,epithelial,Eye,corneal keratocyte,cornea - mesenchymal cells - stromal keratinoc...


In [6]:
# Distinct tissue labels found in the `organ_tissue` annotation column,
# in order of first appearance.
adata.obs["organ_tissue"].unique().tolist()

['Blood',
 'Bone_Marrow',
 'Muscle',
 'Large_Intestine',
 'Thymus',
 'Lymph_Node',
 'Small_Intestine',
 'Lung',
 'Bladder',
 'Vasculature',
 'Trachea',
 'Mammary',
 'Uterus',
 'Eye',
 'Spleen',
 'Salivary_Gland',
 'Tongue',
 'Prostate',
 'Pancreas',
 'Fat',
 'Skin',
 'Liver',
 'Kidney',
 'Heart']

In [7]:
# Distinct cell-type labels found in the `cell_ontology_class` annotation
# column, in order of first appearance.
adata.obs["cell_ontology_class"].unique().tolist()

['cd4-positive, alpha-beta t cell',
 'naive b cell',
 'cd8-positive, alpha-beta t cell',
 'nk cell',
 'memory b cell',
 'monocyte',
 'macrophage',
 'hematopoietic stem cell',
 'granulocyte',
 'plasma cell',
 'erythroid progenitor',
 'cd24 neutrophil',
 'plasmablast',
 'capillary endothelial cell',
 'endothelial cell of vascular tree',
 'skeletal muscle satellite stem cell',
 'endothelial cell of lymphatic vessel',
 'mature enterocyte',
 'transit amplifying cell of large intestine',
 'goblet cell',
 'immature enterocyte',
 'intestinal tuft cell',
 'intestinal crypt stem cell',
 'pericyte cell',
 'mesenchymal stem cell',
 't cell',
 'fibroblast',
 'vascular associated smooth muscle cell',
 'fast muscle cell',
 'mesothelial cell',
 'medullary thymic epithelial cell',
 'erythrocyte',
 'mast cell',
 'cd8-positive, alpha-beta memory t cell',
 'naive thymus-derived cd4-positive, alpha-beta t cell',
 'cd1c-positive myeloid dendritic cell',
 'innate lymphoid cell',
 'b cell',
 'neutrophil',
 'm

In [11]:
# TODO: decide whether the data needs to be subset, and on what criterion
# (e.g. a maximum number of samples per tissue).

# Number of observations per tissue, most frequent tissue first.
organ_tissue_counts = adata.obs["organ_tissue"].value_counts()
organ_tissue_counts


Muscle             5043
Lymph_Node         2383
Bone_Marrow        2290
Spleen             2148
Blood              1832
Lung               1534
Tongue             1391
Bladder            1161
Pancreas            895
Vasculature         847
Skin                844
Salivary_Gland      832
Fat                 651
Prostate            633
Small_Intestine     606
Thymus              599
Mammary             580
Trachea             474
Large_Intestine     442
Eye                 417
Kidney              370
Uterus              286
Heart               277
Liver               213
Name: organ_tissue, dtype: int64

In [9]:
# Main data matrix (rows = observations, columns = variables); displaying it
# shows its storage format, dtype and number of stored elements.
# NOTE(review): the dense view of this matrix contains non-integer values
# (e.g. 957.368), so X may hold normalized values rather than raw counts - confirm.
adata.X

<26748x164607 sparse matrix of type '<class 'numpy.float32'>'
	with 120778929 stored elements in Compressed Sparse Row format>

In [45]:
# Make sure X is stored in CSR format, then materialize a dense copy of it.
#
# WARNING: the dense array holds n_obs * n_vars float32 values. For this
# dataset that is 26748 * 164607 * 4 bytes, i.e. roughly 17.6 GB of RAM, even
# though only ~120 million entries of X are non-zero. Prefer working on the
# sparse `adata.X` directly whenever a dense array is not strictly required.
adata.X = csr_matrix(adata.X)
count_matrix = adata.X.toarray()

In [46]:
# Preview the dense matrix; NumPy abbreviates the repr of large arrays, so
# only the corner rows/columns are printed.
count_matrix

array([[   0.   ,    0.   ,    0.   , ...,    0.   ,    0.   ,    0.   ],
       [   0.   ,    0.   ,    0.   , ...,    0.   ,    0.   ,    0.   ],
       [   0.   ,    0.   ,    0.   , ...,    0.   ,    0.   ,    0.   ],
       ...,
       [   0.   ,    0.   ,    0.   , ...,  957.368,    0.   ,    0.   ],
       [   0.   ,    0.   ,    0.   , ..., 1548.   ,    0.   ,    0.   ],
       [   0.   ,    0.   ,    0.   , ...,    0.   ,    0.   ,    0.   ]],
      dtype=float32)

In [11]:
# Variable (column) labels of the matrix: an index named `transcript_id`
# whose entries are versioned ENST transcript identifiers, one per isoform.
adata.var_names

Index(['ENST00000641515.2', 'ENST00000426406.4', 'ENST00000332831.5',
       'ENST00000618779.5', 'ENST00000466827.1', 'ENST00000618323.5',
       'ENST00000478729.1', 'ENST00000474461.1', 'ENST00000341065.8',
       'ENST00000342066.8',
       ...
       'ENST00000616638.1', 'ENST00000621028.1', 'ENST00000611339.1',
       'ENST00000613216.4', 'ENST00000611746.1', 'ENST00000619792.1',
       'ENST00000617983.1', 'ENST00000613204.1', 'ENST00000621424.4',
       'ENST00000615165.1'],
      dtype='object', name='transcript_id', length=164607)

# Data Extraction

In [47]:
# Labels for the two axes of the matrix: rows are samples (obs), columns are
# isoforms (var).
sample_names = adata.obs_names
isoform_names = adata.var_names

# Fail early, before any large write, if the labels do not match the matrix.
if count_matrix.shape != (len(sample_names), len(isoform_names)):
    raise ValueError(
        f"count_matrix has shape {count_matrix.shape}, expected "
        f"({len(sample_names)}, {len(isoform_names)})"
    )

# Write the matrix as CSV with one header entry per COLUMN, i.e. per isoform.
# (The header was previously built from the sample names, which label the
# rows, so it had the wrong length and the wrong meaning.)
#
# `fmt="%.9g"` keeps float32 values round-trippable while writing zeros as a
# single "0". NumPy's default "%.18e" spends ~25 bytes on every value, which
# for 26748 x 164607 entries is on the order of 110 GB and is what exhausted
# the disk. Even in compact form the file is still several GB.
np.savetxt(
    "count_matrix.csv",
    count_matrix,
    delimiter=",",
    header=",".join(isoform_names),
    comments="",
    fmt="%.9g",
)

# The CSV rows carry no labels, so store the sample names (one per line, in
# row order) next to it.
with open("count_matrix_samples.txt", "w") as samples_file:
    samples_file.write("\n".join(sample_names) + "\n")




OSError: [Errno 28] No space left on device