# Python Setup

## Imports

In [1]:
import os
import requests
import shutil
import h5py
import numpy as np
import pandas as pd
import time

Set the filepath where you want to write the h5 file (about 8GB)

In [2]:
filepath = "../data/raw/human_matrix.h5"

## Download the File

can take 10-30 min. 

***Note*** on windows, we need to write the file in binary format otherwise the newline characters get converted and the file can no longer be read as H5. If on mac or linux, you can replace the part that says 'wb' to 'w'

In [5]:
out_file = filepath

# Check if gene expression file was already downloaded, if not in current directory download file form repository
if(not os.path.exists(out_file)):
    print("Downloading compressed gene expression matrix.")
    url = "https://s3.amazonaws.com/mssm-seq-matrix/human_matrix.h5"
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        with open(out_file, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    del r 
else:
    print("Local file already exists.")



Downloading compressed gene expression matrix.


## How to use H5PY
for quick start guide: http://docs.h5py.org/en/stable/quick.html

read the file with h5py. I use keyword "with" to make sure the file always closes after running the cell

In [4]:
with h5py.File(filepath, "r") as f:
    # List all groups
    print("Keys: %s" % f.keys())
    a_group_key = list(f.keys())[0]


Keys: <KeysViewHDF5 ['data', 'info', 'meta']>


Or you can open the file like so and remember to call the close() function after you're done

In [5]:
f = h5py.File(filepath, 'r')
print("Keys: %s" % f.keys())
f.close()

Keys: <KeysViewHDF5 ['data', 'info', 'meta']>


Here we see three groups of keys in our file (think of it like a dictionary): 
- Info (lab stuff)
- meta (important gene and sample information)
- data (stores the actual expression matrix)

In [6]:
with h5py.File(filepath, "r") as f:
    print("Info Keys: \n")
    print(list(f['info'].keys()))
    print("\nmeta Keys: \n")
    print(list(f['meta'].keys()))
    print("\ndata Key:\n")
    print(list(f['data'].keys()))

Info Keys: 

['author', 'contact', 'creation-date', 'lab', 'version']

meta Keys: 

['Sample_channel_count', 'Sample_characteristics_ch1', 'Sample_contact_address', 'Sample_contact_city', 'Sample_contact_country', 'Sample_contact_department', 'Sample_contact_email', 'Sample_contact_institute', 'Sample_contact_laboratory', 'Sample_contact_name', 'Sample_contact_phone', 'Sample_contact_zip-postal_code', 'Sample_data_processing', 'Sample_data_row_count', 'Sample_description', 'Sample_extract_protocol_ch1', 'Sample_geo_accession', 'Sample_instrument_model', 'Sample_last_update_date', 'Sample_library_selection', 'Sample_library_source', 'Sample_library_strategy', 'Sample_molecule_ch1', 'Sample_organism_ch1', 'Sample_platform_id', 'Sample_relation', 'Sample_series_id', 'Sample_source_name_ch1', 'Sample_status', 'Sample_submission_date', 'Sample_supplementary_file_1', 'Sample_supplementary_file_2', 'Sample_taxid_ch1', 'Sample_title', 'Sample_type', 'gene_accession', 'gene_chromosome', 'gene_e

In [7]:
with h5py.File(filepath, "r") as f:
    human_matrix = f["data"]["expression"]
    print(human_matrix.shape)

(238522, 35238)


NameError: name 'np' is not defined

our matrix has 238522 rows (samples) and 35238 columns (genes)

You can also access keys using forward slash (/):

In [111]:
with h5py.File(filepath, "r") as f:
    human_matrix = f["data/expression"]
    print(human_matrix.dtype)

int32


Since our data is counts, they are stored as integers

## Exploratory Data Analysis

Here I print out basic info for each list in the meta keys including length of list, the datatype, and the first 10 values in that list.

In [None]:
with h5py.File(filepath, "r") as f:
    human_meta = f["meta/Sample_data_processing"]
    

In [112]:
with h5py.File(filepath, "r") as f:
    human_meta = f["meta"]
    for m in human_meta.keys():
        print(m)
        val = human_meta[m]
        print("\tLength: " + str(len(val)))
        print("\tDtype: " + str(val.dtype))
        #If data type is string, then I decode value from binary to get special characters and to make it easier to read
        if("|S" in str(val.dtype)):
            print("\tFirst 10 Values: " + str([i.decode() for i in val.value[0:10]]))
        else:
            print("\tFirst 10 Values: " + str(val.value[0:10]))
        print("\n\n\n")
        

Sample_channel_count
	Length: 238522
	Dtype: |S100
	First 10 Values: ['1', '1', '1', '1', '1', '1', '1', '1', '1', '1']




Sample_characteristics_ch1
	Length: 238522
	Dtype: |S100
	First 10 Values: ['cell line: HeLaXx-xXknockdown: ELAVL1/HuR siRNA1 (see Lebedeva et. Al 2011)Xx-xXtime post transfect', 'library type: single-endXx-xXread length: 50', 'library type: single-endXx-xXread length: 50', 'library type: single-endXx-xXread length: 50', 'cell line: HeLaXx-xXknockdown: mockXx-xXtime post transfection: 120h', 'library type: single-endXx-xXread length: 50', 'library type: single-endXx-xXread length: 50', 'library type: single-endXx-xXread length: 50', 'library type: single-endXx-xXread length: 50', 'library type: single-endXx-xXread length: 50']




Sample_contact_address
	Length: 238522
	Dtype: |S100
	First 10 Values: ['Robert Rössle Str. 10 (H. 87)', '77 Massachusetts Avenue, 68-223A', '77 Massachusetts Avenue, 68-223A', '77 Massachusetts Avenue, 68-223A', 'Robert Rössle Str. 10 (

	First 10 Values: ['Jun 13 2011', 'Jun 16 2011', 'Jun 16 2011', 'Jun 16 2011', 'Jun 13 2011', 'Jun 16 2011', 'Jun 16 2011', 'Jun 16 2011', 'Jun 16 2011', 'Jun 16 2011']




Sample_supplementary_file_1
	Length: 238522
	Dtype: |S100
	First 10 Values: ['ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM741nnn/GSM741172/suppl/GSM741172_siRNA1_5d.wig.gz', 'ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM742nnn/GSM742939/suppl/GSM742939_human_brain.single.bwtou', 'ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM742nnn/GSM742942/suppl/GSM742942_human_heart.single.bwtou', 'ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM742nnn/GSM742946/suppl/GSM742946_human_lymph_node.single.', 'ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM741nnn/GSM741170/suppl/GSM741170_mock_5d.wig.gz', 'ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM742nnn/GSM742947/suppl/GSM742947_human_ovary.single.bwtou', 'ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM742nnn/GSM742943/suppl/GSM742943_human_kidney.single.bwto', 'ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM742n

Observe that every key in meta that starts with gene contains a value of a list of length 35238 corresponding to the indices of columns of the matrix under data/expression. Every key in meta that starts with "Sample" or "reads" contains a value of a list of length 238522 corresponding to each row of the matrix. 

I do the same thing now for the info keys

In [113]:
with h5py.File(filepath, "r") as f:
    human_info = f["info"]
    for info in human_info.keys():
        print(info)
        val = human_info[info]
        print("\tLength: " + str(len(val)))
        print("\tDtype: " + str(val.dtype))
        #If data type is string, then I decode value from binary to get special characters and to make it easier to read
        if("|S" in str(val.dtype)):
            print("\tValue: " + str([i.decode() for i in val.value[0:10]]))
        else:
            print("\tValue: " + str(val.value[0:10]))
        print("\n\n\n")

author
	Length: 1
	Dtype: |S19
	Value: ['Alexander Lachmann']




contact
	Length: 1
	Dtype: |S28
	Value: ['alexander.lachmann@mssm.edu']




creation-date
	Length: 1
	Dtype: |S11
	Value: ['2020-02-06']




lab
	Length: 1
	Dtype: |S54
	Value: ["Ma'ayan Lab - Icahn School of Medicine at Mount Sinai"]




version
	Length: 1
	Dtype: |S2
	Value: ['8']






# EDA

In [8]:
with h5py.File(filepath, "r") as f:
    blah = f["data"]["expression"]
    maxs = []
    mins = []
    avgs = []
    nonzeros = []
    start = time.perf_counter()
    for i in [1,100,200,300,400,500,600,6000,700,10000,20000,30000]:
        maxs.append(np.max(blah[:,i]))
        mins.append(np.min(blah[:,i]))
        avgs.append(np.mean(blah[:,i]))
        nonzeros.append(np.count_nonzero(blah[:,i]))
    print(time.perf_counter() - start)
    print(maxs)
    print(mins)
    print(avgs)
    print(nonzeros)

23.403152499999976
[62620, 308128, 214, 6519, 1186, 18871, 8854, 120945, 578, 12847, 160112, 596]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[145.99869613704396, 1851.9068345896815, 0.07339364922313246, 8.545371076881796, 2.5514292182691745, 1.0287436798282759, 8.95004234410243, 193.42126512439106, 0.11922589949774025, 11.223916452151164, 978.11925105441, 0.17611792622902708]
[140498, 206950, 4685, 47087, 65687, 12723, 111750, 178873, 4677, 128348, 161095, 8349]


In [9]:
140498/230000

0.6108608695652173