In [1]:
import os
import scanpy as sc
import anndata
from urllib.request import urlretrieve
import matplotlib.pyplot as plt
%matplotlib inline
import tables as tb
import h5py
import pandas as pd
import numpy as np

In [2]:
file_path_dox_plus = "C:\matlab scripts\Yamanaka_Parcial_Raton_Izpisua\GSE144600_liver4F_SC_plusDox_filtered_feature_bc_matrix.h5"

Tenemos un archivo con la matriz en formato .h5. Tenemos que acceder a esa matriz, para ello primero vemos la estructura de los datos. 

In [3]:
f = h5py.File(file_path_dox_plus, 'r')
print(list(f.keys()))

['matrix']


In [7]:
def get_h5_structure(f, level=0):
    """    prints structure of hdf5 file    """
    for key in f.keys():
        if isinstance(f[key], h5py._hl.dataset.Dataset):
            print(f"'  '*level DATASET: f[key].name")
        elif isinstance(f[key], h5py._hl.group.Group):
            print(f"'  '*level GROUP: key, f[key].name")
            level += 1
            get_h5_structure(f[key], level)
            level -= 1

        if f[key].parent.name == "/":
            print("n"*2)

get_h5_structure(f)

In [7]:
import collections
import scipy.sparse as sp_sparse
import tables
 
CountMatrix = collections.namedtuple('CountMatrix', ['feature_ref', 'barcodes', 'matrix'])
 
def get_matrix_from_h5(filename):
    with tables.open_file(filename, 'r') as f:
        mat_group = f.get_node(f.root, 'matrix')
        barcodes = f.get_node(mat_group, 'barcodes').read()
        data = getattr(mat_group, 'data').read()
        indices = getattr(mat_group, 'indices').read()
        indptr = getattr(mat_group, 'indptr').read()
        shape = getattr(mat_group, 'shape').read()
        matrix = sp_sparse.csc_matrix((data, indices, indptr), shape=shape)
         
        feature_ref = {}
        feature_group = f.get_node(mat_group, 'features')
        feature_ids = getattr(feature_group, 'id').read()
        feature_names = getattr(feature_group, 'name').read()
        feature_types = getattr(feature_group, 'feature_type').read()
        feature_ref['id'] = feature_ids
        feature_ref['name'] = feature_names
        feature_ref['feature_type'] = feature_types
        tag_keys = getattr(feature_group, '_all_tag_keys').read()
        for key in tag_keys:
            key = key.decode("utf-8")
            feature_ref[key] = getattr(feature_group, key).read()
         
        return CountMatrix(feature_ref, barcodes, matrix)
 

filtered_feature_bc_matrix = get_matrix_from_h5(file_path_dox_plus)

In [10]:
filtered_feature_bc_matrix.matrix

<27998x7765 sparse matrix of type '<class 'numpy.int32'>'
	with 3372872 stored elements in Compressed Sparse Column format>

In [11]:
np.savetxt("C:\matlab scripts\Yamanaka_Parcial_Raton_Izpisua\X_filtered_dox_plus.txt.gz",filtered_feature_bc_matrix.matrix.transpose().toarray().astype(np.uint8),delimiter = ",")

Ahora extraemos la matriz Dox minus. 

In [14]:
file_path_dox_minus = "C:\matlab scripts\Yamanaka_Parcial_Raton_Izpisua\GSE144600_liver4F_SC_minusDox_filtered_feature_bc_matrix.h5"

CountMatrix = collections.namedtuple('CountMatrix', ['feature_ref', 'barcodes', 'matrix'])
filtered_feature_bc_matrix_minus = get_matrix_from_h5(file_path_dox_minus)
    

In [15]:
filtered_feature_bc_matrix_minus.matrix

<27998x8277 sparse matrix of type '<class 'numpy.int32'>'
	with 4429550 stored elements in Compressed Sparse Column format>

In [16]:
np.savetxt("C:\matlab scripts\Yamanaka_Parcial_Raton_Izpisua\X_filtered_dox_minus.txt.gz",filtered_feature_bc_matrix_minus.matrix.transpose().toarray().astype(np.uint8),delimiter = ",")

# 1. Matrix Dox + (reprogramadas)

Tenemos un archivo .h5 que contiene la matriz (matrix) de conteos. Esta clase tiene 6 grupos: barcodes, data, features, indices, indptr y shape.

In [2]:
filename = "C:\matlab scripts\Yamanaka_Parcial_Raton_Izpisua\GSE144600_liver4F_SC_plusDox_filtered_feature_bc_matrix.h5"

dox_plus = h5py.File(filename, "r")

In [4]:
for key in dox_plus.keys():
    print(key) #Names of the root level object names in HDF5 file - can be groups or datasets.
    print(type(dox_plus[key])) # get the object type: usually group or dataset

matrix
<class 'h5py._hl.group.Group'>


In [5]:
# ES UNA CLASE (matrix) QUE TIENE: Barcodes, data, features, indices, indptr y shape

#Get the HDF5 group; key needs to be a group name from above
matrix_dox_plus = dox_plus["matrix"]

#Checkout what keys are inside that group.
for key in matrix_dox_plus.keys():
    print(key)

barcodes
data
features
indices
indptr
shape


### 1.Barcodes

In [89]:
barcodes_plus = matrix_dox_plus["barcodes"][()]
print(barcodes_plus)

[b'AAACCCAAGACTGTTC-1' b'AAACCCAAGGATACGC-1' b'AAACCCAAGGTCTACT-1' ...
 b'TTTGTTGCAATGACCT-1' b'TTTGTTGCACATACGT-1' b'TTTGTTGTCTCGCAGG-1']


### 2.Features

In [46]:
features = matrix_dox_plus["features"].keys()

In [47]:
print(features)

<KeysViewHDF5 ['_all_tag_keys', 'feature_type', 'genome', 'id', 'name']>


In [97]:
ids_plus = matrix_dox_plus["features"]["id"][()]
tag_keys = matrix_dox_plus["features"]["_all_tag_keys"][()]
feature_type = matrix_dox_plus["features"]["feature_type"][()]
genome = matrix_dox_plus["features"]["genome"][()]
name = matrix_dox_plus["features"]["name"][()]
print(name)

[b'Xkr4' b'Gm1992' b'Gm37381' ... b'DHRSX' b'Vmn2r122' b'CAAA01147332.1']


### 3.Indices

In [64]:
indices = matrix_dox_plus["indices"]
print(indices)

<HDF5 dataset "indices": shape (3372872,), type "<i8">


### 4.Indptr

In [65]:
indptr = matrix_dox_plus["indptr"]
print(indptr)

<HDF5 dataset "indptr": shape (7766,), type "<i8">


# 2. Matrix Dox - (control)

In [71]:
filename = "GSE144600_liver4F_SC_minusDox_filtered_feature_bc_matrix.h5"

dox_minus = h5py.File(filename, "r")

In [76]:
for key in dox_minus.keys():
    print(key) #Names of the root level object names in HDF5 file - can be groups or datasets.
    print(type(f[key])) # get the object type: usually group or dataset

matrix
<class 'h5py._hl.group.Group'>


In [77]:
# ES UNA CLASE (matrix) QUE TIENE: Barcodes, data, features, indices, indptr y shape

#Get the HDF5 group; key needs to be a group name from above
matrix_dox_minus = dox_minus["matrix"]

#Checkout what keys are inside that group.
for key in matrix_dox_minus.keys():
    print(key)

barcodes
data
features
indices
indptr
shape


### 1.Barcodes

In [91]:
barcodes_minus = matrix_dox_minus["barcodes"]
print(barcodes_minus)

<HDF5 dataset "barcodes": shape (8277,), type "|S18">


### 2.Features

In [83]:
features = matrix_dox_minus["features"].keys()
print(features)

<KeysViewHDF5 ['_all_tag_keys', 'feature_type', 'genome', 'id', 'name']>


In [96]:
ids_minus = matrix_dox_minus["features"]["id"]
tag_keys = matrix_dox_minus["features"]["_all_tag_keys"][()]
feature_type = matrix_dox_minus["features"]["feature_type"][()]
genome = matrix_dox_minus["features"]["genome"][()]
name = matrix_dox_minus["features"]["name"][()]
print(ids_minus)
print(tag_keys)
print(feature_type)
print(genome)
print(name)

<HDF5 dataset "id": shape (27998,), type "|S18">
[b'genome']
[b'Gene Expression' b'Gene Expression' b'Gene Expression' ...
 b'Gene Expression' b'Gene Expression' b'Gene Expression']
[b'mm10' b'mm10' b'mm10' ... b'mm10' b'mm10' b'mm10']
[b'Xkr4' b'Gm1992' b'Gm37381' ... b'DHRSX' b'Vmn2r122' b'CAAA01147332.1']


### 3.Indices

In [85]:
indices = matrix_dox_minus["indices"]
print(indices)

<HDF5 dataset "indices": shape (4429550,), type "<i8">


### 4.Indptr

In [None]:
indptr = matrix_dox_minus["indptr"]
print(indptr)