In [1]:
import h5py
import numpy as np
import time
import pandas as pd

  from ._conv import register_converters as _register_converters


In [2]:
# Shell escape: list the pipeline's intermediate-files directory to see which HDF5 inputs exist.
! ls -l /cluster/home/mtuncel/sc-pipe/intermediate_files/

total 983792
-rw-rw---- 1 mtuncel mtuncel-group 136372956 Apr 24 13:15 cell_cycle_removed_melanomaS2.h5
-rw-rw---- 1 mtuncel mtuncel-group       669 Apr 11 12:09 cluster_celltype_confusionmatrix.txt
-rw-rw---- 1 mtuncel mtuncel-group 272188839 Apr 24 13:13 coding_region_only_melanomaS2.h5
-rw-rw---- 1 mtuncel mtuncel-group 136372956 Apr 23 14:52 log_cell_cycle_removed_melanomaS2.h5
-rw-rw---- 1 mtuncel mtuncel-group 177321089 Apr 24 13:12 raw_melanomaS2.h5
-rw-rw---- 1 mtuncel mtuncel-group 272188839 Apr 16 17:49 test.hdf5
-rw-rw---- 1 mtuncel mtuncel-group   8960976 Apr 24 13:15 zheng17_melanomaS2.h5


In [3]:
# HDF5 file read by this notebook. NOTE(review): absolute, user-specific cluster path —
# the notebook will not run elsewhere without editing this line.
input_file = '/cluster/home/mtuncel/sc-pipe/intermediate_files/cell_cycle_removed_melanomaS2.h5'

In [4]:
# Open the input HDF5 file read-only; the handle stays open until h5f.close() is called.
h5f = h5py.File(input_file, mode='r')

In [5]:
# Names of the datasets stored under the 'cell_attrs' group.
[name for name in h5f['cell_attrs'].keys()]

['cell_names', 'cell_phase']

In [6]:
# Peek at the per-cell phase labels (stored as fixed-width byte strings).
# `Dataset.value` is deprecated and was removed in h5py 3.0; `[()]` reads the
# whole dataset into a NumPy array in every h5py version.
h5f['cell_attrs']['cell_phase'][()]

array([b'G1', b'G2M', b'G1', ..., b'G1', b'G1', b'G1'], dtype='|S10')

In [7]:
# Names of the datasets stored under the 'gene_attrs' group.
[name for name in h5f['gene_attrs'].keys()]

['gene_ids', 'gene_names']

In [8]:
# Read the matrix and its row/column annotations fully into memory so they stay
# usable after the file handle is closed.
# `Dataset.value` is deprecated and was removed in h5py 3.0; `[()]` is the
# supported way to read an entire dataset into a NumPy array.
matrix = h5f['matrix'][:]
barcodes = h5f['cell_attrs']['cell_names'][()]
gene_ids = h5f['gene_attrs']['gene_ids'][()]
gene_names = h5f['gene_attrs']['gene_names'][()]
cell_phase = h5f['cell_attrs']['cell_phase'][()]

In [9]:
def decoder(values):
    """Decode an array of UTF-8 byte strings into an array of ``str``.

    Replaces the former ``np.vectorize(lambda t: t.decode('UTF-8'))``, which
    raised ``ValueError`` on size-0 input (no ``otypes`` set) and looped over
    the elements in Python.

    Parameters
    ----------
    values : array-like of bytes
        Byte strings to decode, e.g. a fixed-width ``'S'`` dtype array.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``values`` holding the decoded strings.
    """
    # np.char.decode applies bytes.decode element-wise and also accepts empty arrays.
    return np.char.decode(np.asarray(values), 'UTF-8')

In [10]:
# Convert every byte-string annotation array to str, rebinding the same names.
barcodes, gene_ids, gene_names, cell_phase = [
    decoder(raw) for raw in (barcodes, gene_ids, gene_names, cell_phase)
]

In [11]:
# Dimensions of the dense matrix read from h5f['matrix'].
matrix.shape

(2216, 15324)

In [12]:
# Check the container type of the loaded matrix.
type(matrix)

numpy.ndarray

In [13]:
# Boolean mask over columns: True where the column holds at least one entry
# that is not exactly zero.
detected_genes_index = (matrix != 0).any(axis=0)

In [14]:
# Inspect the per-column boolean mask.
detected_genes_index

array([ True,  True,  True, ...,  True,  True,  True])

In [15]:
# Number of columns flagged True in the mask.
detected_genes_index.sum()

15324

In [16]:
# Length of the gene-name array, for comparison with the matrix's column count.
gene_names.shape

(15324,)

In [17]:
# Number of gene names kept when the mask is applied.
gene_names[detected_genes_index].shape

(15324,)

In [18]:
# Count the columns whose entries are all zero (i.e. no truthy value anywhere).
all_zero_columns = ~matrix.any(axis=0)
int(all_zero_columns.sum())

0

In [19]:
# Re-display the matrix dimensions.
matrix.shape

(2216, 15324)

In [20]:
# Preview the matrix values (NumPy abbreviates large arrays in the repr).
matrix

array([[-4.96326108e-03, -1.25327274e-01, -3.53636523e-03, ...,
        -2.95644259e+00, -5.32092214e-01, -3.07317519e+00],
       [-4.42326861e-03, -1.30447447e-01, -6.06072601e-03, ...,
        -2.20450115e+00, -6.16022348e-01, -2.32758117e+00],
       [ 5.54294288e-01, -1.25789836e-01, -3.74200102e-03, ...,
        -2.75881171e+00, -5.57653606e-01, -3.76119757e+00],
       ...,
       [-4.72440710e-03, -1.25965908e-01, -3.83498566e-03, ...,
        -1.03707099e+00, -5.55587709e-01, -2.61977983e+00],
       [-4.78246063e-03, -1.26354054e-01, -4.03572107e-03, ...,
        -1.55430377e+00,  1.09639645e-01, -2.31915975e+00],
       [-4.96826693e-03, -1.28499746e-01, -5.13262395e-03, ...,
        -3.19688463e+00, -5.58300972e-01, -4.05960131e+00]], dtype=float32)

In [21]:
# Largest value in the matrix.
matrix.max()

115.58208

In [22]:
# Sum of all entries.
# NOTE(review): the recorded total is ~0 and the matrix contains negative values,
# which suggests the data are centred/scaled rather than raw counts — confirm.
# If so, the exact-zero tests elsewhere in this notebook say little about
# whether a gene was detected.
matrix.sum()

0.0004043579

In [23]:
# Per-column mask: True where the column contains no exact-zero entry.
b = ~(matrix == 0).any(axis=0)

In [24]:
# Index of the first True in `b`, or b.size when no entry is True.
# searchsorted() requires its input to be sorted ascending, which an arbitrary
# boolean mask is not; argmax() finds the first True regardless of ordering.
# The b.size fallback matches what searchsorted returns for an all-False array.
int(b.argmax()) if b.any() else b.size

0

In [25]:
# Start the per-cell annotation table from the decoded barcodes (single unnamed column).
df_cell_attrs = pd.DataFrame(data=barcodes)

In [26]:
# Give the single column its name.
df_cell_attrs.columns = pd.Index(['barcodes'])

In [27]:
# Attach the per-cell phase labels as a second column.
# Assumes cell_phase is in the same order as barcodes (both were read from
# h5f['cell_attrs']) — TODO confirm alignment.
df_cell_attrs['cell_phase'] = cell_phase

In [28]:
# Preview the first rows of the cell annotation table.
df_cell_attrs.head()

Unnamed: 0,barcodes,cell_phase
0,AAACCTGAGGGCATGT,G1
1,AAACCTGAGTACTTGC,G2M
2,AAACCTGCATCTCCCA,G1
3,AAACCTGGTAAGGGCT,S
4,AAACCTGGTACCGTTA,G1


In [29]:
# Per-gene annotation table: one row per gene with its id and its name.
df_gene_attrs = pd.DataFrame(
    {'gene_ids': gene_ids, 'gene_names': gene_names},
    columns=['gene_ids', 'gene_names'],
)

In [30]:
# Preview the first rows of the gene annotation table.
df_gene_attrs.head()

Unnamed: 0,gene_ids,gene_names
0,ENSG00000187634,SAMD11
1,ENSG00000188976,NOC2L
2,ENSG00000187961,KLHL17
3,ENSG00000187583,PLEKHN1
4,ENSG00000188290,HES4


In [31]:
# Wrap the matrix in a labelled frame: rows indexed by barcode, columns by gene id.
df = pd.DataFrame(matrix, index=barcodes, columns=gene_ids)

In [32]:
# Swap the axes of the frame, rebinding the same name.
# Re-running this cell swaps them back.
df = df.transpose()

In [33]:
# Preview the first rows of the labelled matrix.
df.head()

Unnamed: 0,AAACCTGAGGGCATGT,AAACCTGAGTACTTGC,AAACCTGCATCTCCCA,AAACCTGGTAAGGGCT,AAACCTGGTACCGTTA,AAACCTGGTCTCCATC,AAACCTGGTTTGGCGC,AAACCTGTCCGAAGAG,AAACGGGAGAGTCGGT,AAAGATGAGATATGGT,...,TTTGGTTGTTCAGTAC,TTTGGTTGTTCCCTTG,TTTGGTTTCAACGGGA,TTTGGTTTCTAGAGTC,TTTGTCAAGAAGGCCT,TTTGTCAAGAGTACAT,TTTGTCAAGGCTCAGA,TTTGTCAAGGTGATAT,TTTGTCACATGAAGTA,TTTGTCACATTCTCAT
ENSG00000187634,-0.004963,-0.004423,0.554294,-0.001536,-0.005146,-0.003868,-0.004748,-0.00518,-0.004597,-0.004937,...,-0.0049,-0.004358,-0.005203,-0.004759,0.272808,-0.003811,-0.00445,-0.004724,-0.004782,-0.004968
ENSG00000188976,-0.125327,-0.130447,-0.12579,0.092678,-0.125018,-0.136631,-0.130622,-0.126498,-0.121224,-0.131385,...,-0.124994,-0.128359,0.80952,-0.123456,-0.127436,-0.122162,-0.127488,-0.125966,-0.126354,-0.1285
ENSG00000187961,-0.003536,-0.006061,-0.003742,0.00123,-0.003398,-0.009119,-0.006179,-0.004146,-0.001438,-0.006581,...,-0.003363,-0.005004,-0.004575,-0.002576,-0.004625,-0.001835,-0.004575,-0.003835,-0.004036,-0.005133
ENSG00000187583,-0.018352,-0.018859,-0.017636,-0.003442,-0.018806,-0.019784,-0.019982,-0.019565,-0.015375,-0.020925,...,-0.018004,-0.017734,-0.020011,-0.016873,-0.020217,-0.013255,-0.017648,-0.017863,-0.018221,-0.01976
ENSG00000188290,0.929565,-0.096414,0.453902,0.176531,-0.110581,-0.082841,-0.09929,0.658525,-0.110399,-0.100079,...,-0.108263,1.010146,-0.108019,-0.10896,0.610388,0.611434,-0.100624,-0.105281,-0.105318,-0.10423


In [45]:
# Dimensions of the labelled matrix that gets exported.
df.shape

(15324, 2216)

In [41]:
# Export the labelled matrix as a tab-separated file (relative to the working directory).
df.to_csv(
    path_or_buf='data/24.04.2018/log_transformed_cell_cycle_removed.tsv',
    sep='\t',
)

In [42]:
# Export the gene and cell annotation tables as tab-separated files.
for attrs_frame, out_path in (
    (df_gene_attrs, 'data/24.04.2018/log_transformed_cell_cycle_removed_gene_attrs.tsv'),
    (df_cell_attrs, 'data/24.04.2018/log_transformed_cell_cycle_removed_cell_attrs.tsv'),
):
    attrs_frame.to_csv(out_path, sep='\t')

In [36]:
# Release the HDF5 file handle; the arrays read earlier are already in memory.
h5f.close()
# Disabled step, kept for reference: log1p transform of the matrix.
#matrix = np.log1p(matrix)