In [1]:
import h5py
import numpy as np
import time
import pandas as pd

  from ._conv import register_converters as _register_converters


In [2]:
# List the pipeline's intermediate files (shell escape) to see which inputs are available.
! ls -l /cluster/home/mtuncel/sc-pipe/intermediate_files/

total 983792
-rw-rw---- 1 mtuncel mtuncel-group 136372956 Apr 26 16:39 cell_cycle_removed_melanomaS2.h5
-rw-rw---- 1 mtuncel mtuncel-group       669 Apr 11 12:09 cluster_celltype_confusionmatrix.txt
-rw-rw---- 1 mtuncel mtuncel-group 272188839 Apr 26 16:37 coding_region_only_melanomaS2.h5
-rw-rw---- 1 mtuncel mtuncel-group 136372956 Apr 23 14:52 log_cell_cycle_removed_melanomaS2.h5
-rw-rw---- 1 mtuncel mtuncel-group 177321089 Apr 26 16:37 raw_melanomaS2.h5
-rw-rw---- 1 mtuncel mtuncel-group 272188839 Apr 16 17:49 test.hdf5
-rw-rw---- 1 mtuncel mtuncel-group   8960976 Apr 26 16:40 zheng17_melanomaS2.h5


In [3]:
# Path of the HDF5 file this notebook reads.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
input_file = '/cluster/home/mtuncel/sc-pipe/intermediate_files/cell_cycle_removed_melanomaS2.h5'

In [4]:
# Open the input HDF5 file read-only; the handle is closed at the end of the notebook.
h5f = h5py.File(input_file, mode='r')

In [5]:
# Names of the datasets stored under the 'cell_attrs' group.
list(h5f['cell_attrs'])

['cell_names', 'cell_phase']

In [6]:
# Preview the per-cell phase labels (stored as fixed-width byte strings).
# `Dataset.value` is deprecated and was removed in h5py 3.0; indexing with an
# empty tuple reads the whole dataset and works on both old and new h5py.
h5f['cell_attrs']['cell_phase'][()]

array([b'G1', b'G1', b'G1', ..., b'G1', b'G1', b'G1'], dtype='|S10')

In [7]:
# Names of the datasets stored under the 'gene_attrs' group.
list(h5f['gene_attrs'])

['gene_ids', 'gene_names']

In [8]:
# Copy everything needed from the HDF5 file into in-memory NumPy arrays so the
# rest of the notebook does not depend on the open file handle.
# `Dataset.value` is deprecated and was removed in h5py 3.0; `dataset[()]` is
# the supported way to read a whole dataset and also works on older h5py.
matrix = h5f['matrix'][:]
barcodes = h5f['cell_attrs']['cell_names'][()]
gene_ids = h5f['gene_attrs']['gene_ids'][()]
gene_names = h5f['gene_attrs']['gene_names'][()]
cell_phase = h5f['cell_attrs']['cell_phase'][()]

In [9]:
# Dimensions of the loaded matrix as (rows, columns).
matrix.shape

(2216, 15324)

In [10]:
def _decode_utf8(item):
    """Return `item` decoded from UTF-8 if it is a byte string, else unchanged.

    Passing already-decoded strings through untouched makes the decoding cell
    safe to re-run; a plain `item.decode(...)` raises AttributeError on `str`.
    """
    if isinstance(item, bytes):
        return item.decode('UTF-8')
    return item


# Element-wise UTF-8 decoder for arrays of byte strings.
# `otypes=[str]` fixes the output dtype up front, so NumPy neither has to probe
# the first element nor rejects empty (size-0) inputs.
decoder = np.vectorize(_decode_utf8, otypes=[str])

In [11]:
# Convert every label array from byte strings to text, in one pass over the four arrays.
barcodes, gene_ids, gene_names, cell_phase = [
    decoder(raw_labels)
    for raw_labels in (barcodes, gene_ids, gene_names, cell_phase)
]

In [12]:
# Re-check the matrix dimensions.
# NOTE(review): repeats the check made right after loading; one of the two can go.
matrix.shape

(2216, 15324)

In [13]:
# Confirm the container type of the loaded matrix.
type(matrix)

numpy.ndarray

In [14]:
# Boolean mask over columns: True where the column holds at least one non-zero entry.
detected_genes_index = np.any(matrix != 0, axis=0)

In [15]:
# Display the per-column "has a non-zero entry" mask.
detected_genes_index

array([ True,  True,  True, ...,  True,  True,  True])

In [16]:
# Number of columns with at least one non-zero entry.
# The array's own reduction replaces the builtin `sum`, which iterates the
# NumPy array element by element in Python.
detected_genes_index.sum()

15324

In [17]:
# Number of gene names, for comparison with the matrix column count.
gene_names.shape

(15324,)

In [18]:
# Number of gene names remaining after applying the non-zero-column mask.
gene_names[detected_genes_index].shape

(15324,)

In [19]:
# Count the columns that contain no non-zero entry at all.
np.count_nonzero(~matrix.any(axis=0))

0

In [20]:
# Matrix dimensions once more.
# NOTE(review): the matrix has not been modified since it was loaded; this check is redundant.
matrix.shape

(2216, 15324)

In [21]:
# Largest value anywhere in the matrix.
matrix.max()

476.52835

In [22]:
# Names of the first 21 genes, used to label the summary-statistics columns.
col_names = gene_names[:21]

In [23]:
# Sanity check: number of selected gene names.
len(col_names)

21

In [24]:
# Summary statistics for the first 21 matrix columns, selected by position.
first_columns = pd.DataFrame(matrix).iloc[:, :21]
desc_df = first_columns.describe()

In [25]:
# Replace the positional column labels of the summary table with the gene names.
desc_df.columns = col_names

In [41]:
# Display the per-gene summary statistics.
desc_df

Unnamed: 0,SAMD11,NOC2L,KLHL17,PLEKHN1,HES4,ISG15,AGRN,C1orf159,TTLL10,TNFRSF18,...,SDF4,B3GALT6,FAM132A,UBE2J2,SCNN1D,ACAP3,PUSL1,CPSF3L,GLTPD1,TAS1R3
count,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,...,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0
mean,0.00742,0.169079,0.006054,0.026391,0.188913,1.687615,0.054694,0.011719,0.000721,0.176678,...,0.352951,0.037969,0.004324,0.152746,0.001301,0.065855,0.051211,0.099535,0.050945,0.011828
std,0.087888,0.392432,0.075056,0.180083,0.615627,4.654374,0.245091,0.106088,0.026397,0.889562,...,0.527565,0.179855,0.062408,0.366156,0.039664,0.261058,0.225211,0.292213,0.238921,0.121741
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.361152,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,1.317552,0.0,0.0,0.0,0.0,...,0.661628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.497957,3.470397,1.581898,2.807633,10.347481,70.388939,2.976759,1.698555,1.164632,18.025391,...,3.84211,2.089513,1.516347,2.880238,1.468939,3.308876,2.804122,2.686365,2.819334,2.361929


In [27]:
# Largest value in the matrix.
# NOTE(review): same check as earlier in the notebook; the matrix is unchanged in between.
matrix.max()

476.52835

In [28]:
# Per-cell annotation table: one row per barcode together with its phase label.
# Built in a single statement so the cell can be re-run safely; assigning
# `.columns` and adding a column in separate cells fails with a length mismatch
# when the renaming step is executed again after the column was added.
df_cell_attrs = pd.DataFrame(
    {'barcodes': barcodes, 'cell_phase': cell_phase},
    columns=['barcodes', 'cell_phase'],
)

In [31]:
# Preview the first rows of the per-cell annotation table.
df_cell_attrs.head()

Unnamed: 0,barcodes,cell_phase
0,AAACCTGAGGGCATGT,G1
1,AAACCTGAGTACTTGC,G1
2,AAACCTGCATCTCCCA,G1
3,AAACCTGGTAAGGGCT,S
4,AAACCTGGTACCGTTA,G1


In [32]:
# Per-gene annotation table: gene identifier alongside the gene name, one row per gene.
df_gene_attrs = pd.DataFrame(
    {'gene_ids': gene_ids, 'gene_names': gene_names},
    columns=['gene_ids', 'gene_names'],
)

In [33]:
# Preview the first rows of the per-gene annotation table.
df_gene_attrs.head()

Unnamed: 0,gene_ids,gene_names
0,ENSG00000187634,SAMD11
1,ENSG00000188976,NOC2L
2,ENSG00000187961,KLHL17
3,ENSG00000187583,PLEKHN1
4,ENSG00000188290,HES4


In [34]:
# Expression table with gene IDs as rows and cell barcodes as columns.
# The frame is built directly in its final orientation: a separate
# `df = df.T` cell is not idempotent and flips the table back every time it is
# re-run, which would silently change what gets exported below.
df = pd.DataFrame(data=matrix.T, index=gene_ids, columns=barcodes)

In [36]:
# Preview the first rows of the gene-by-barcode table.
df.head()

Unnamed: 0,AAACCTGAGGGCATGT,AAACCTGAGTACTTGC,AAACCTGCATCTCCCA,AAACCTGGTAAGGGCT,AAACCTGGTACCGTTA,AAACCTGGTCTCCATC,AAACCTGGTTTGGCGC,AAACCTGTCCGAAGAG,AAACGGGAGAGTCGGT,AAAGATGAGATATGGT,...,TTTGGTTGTTCAGTAC,TTTGGTTGTTCCCTTG,TTTGGTTTCAACGGGA,TTTGGTTTCTAGAGTC,TTTGTCAAGAAGGCCT,TTTGTCAAGAGTACAT,TTTGTCAAGGCTCAGA,TTTGTCAAGGTGATAT,TTTGTCACATGAAGTA,TTTGTCACATTCTCAT
ENSG00000187634,0.0,0.0,0.938745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.394191,0.0,0.0,0.0,0.0,0.0
ENSG00000188976,0.0,0.0,0.0,0.223157,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.420671,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000187961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000187583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000188290,2.210776,0.0,0.835106,0.348006,0.0,0.0,0.0,1.92618,0.0,0.0,...,0.0,2.699112,0.0,0.0,1.740235,0.592851,0.0,0.0,0.0,0.0


In [37]:
# Dimensions of the table that is about to be exported.
df.shape

(15324, 2216)

In [39]:
# Write the gene-by-barcode table as a tab-separated file (row labels included).
# NOTE(review): the relative target directory is assumed to exist already — confirm or create it first.
df.to_csv('data/26.04.2018/cell_cycle_corrected.tsv',sep='\t')

In [40]:
# Write the gene and cell annotation tables next to the matrix, tab-separated.
# The default integer index is written as the first column of each file.
df_gene_attrs.to_csv('data/26.04.2018/cell_cycle_corrected_gene_attrs.tsv', sep='\t')
df_cell_attrs.to_csv('data/26.04.2018/cell_cycle_corrected_cell_attrs.tsv', sep='\t')

In [36]:
# Release the HDF5 file handle; the arrays used above were already read into memory.
h5f.close()
# NOTE(review): disabled log1p transform left over from experimentation — remove it or explain why it is kept.
#matrix = np.log1p(matrix)