In [1]:
import h5py
import pandas as pd 
import numpy as np 
import torch 
from torch.utils.data import DataLoader,Dataset


In [6]:
# Builds the DataLoader for one TCGA cohort. It is constructed once; the
# training loop then iterates over it to obtain minibatches.
def LoadTCGA(data_root, batch_size=32, split='train', cancer='brca', attr = None,
               shuffle=True, load_first_n = None):
    """Return a ``DataLoader`` over one TCGA dataset stored in ``tcga.h5``.

    Parameters
    ----------
    data_root : str
        Directory that contains ``tcga.h5``. A trailing separator is optional;
        an empty string means the current working directory.
    batch_size : int
        Number of samples per minibatch.
    split : str
        Second component of the HDF5 key (e.g. ``'train'`` or ``'test'``).
    cancer : str
        Third component of the HDF5 key (e.g. ``'brca'``).
    attr : object
        Unused; kept so the signature stays compatible with existing callers.
    shuffle : bool
        Forwarded to ``DataLoader``.
    load_first_n : int or None
        If truthy, only the first ``load_first_n`` rows are loaded.

    Returns
    -------
    torch.utils.data.DataLoader
        Loader built with ``drop_last=True``: a dataset with fewer than
        ``batch_size`` samples therefore yields no batches at all.
    """
    # Local import keeps this cell self-contained.
    import os

    # os.path.join inserts the separator only when it is missing, so both
    # 'data/' and 'data' resolve to 'data/tcga.h5' ('' still gives 'tcga.h5').
    h5_path = os.path.join(data_root, 'tcga.h5')
    key = '/'.join(['tcga', split, cancer])
    print(key)
    tcga_dataset = TCGA(h5_path, key, load_first_n)
    return DataLoader(tcga_dataset, batch_size=batch_size, shuffle=shuffle, drop_last=True)

# Dataset subclass for the TCGA expression data.
# The attribute holding the array is called `expression` (it was originally
# named `data`); consumers go through __getitem__/__len__, so the rename is
# not visible to them.
class TCGA(Dataset):
    """In-memory dataset backed by one array read from an HDF5 file."""

    def __init__(self, root, key, load_first_n = None):
        """Read the dataset stored under ``key`` in the HDF5 file ``root``.

        Parameters
        ----------
        root : str
            Path of the HDF5 file.
        key : str
            Name of the dataset inside the file (e.g. ``'tcga/train/brca'``).
        load_first_n : int or None
            If truthy, keep only the first ``load_first_n`` rows; otherwise
            the full dataset is loaded.
        """
        with h5py.File(root,'r') as f:
            dset = f[key]
            if load_first_n:
                # Slice on the h5py dataset so only the requested rows are
                # read from disk instead of loading everything first.
                data = dset[:load_first_n]
            else:
                data = dset[()]
        self.expression = data

    def __getitem__(self, index):
        """Return the row(s) of the loaded array selected by ``index``."""
        return self.expression[index]

    def __len__(self):
        """Return the number of rows in the loaded array."""
        return len(self.expression)

In [7]:
# Smoke test: with data_root='' the loader looks for 'tcga.h5' relative to the
# current working directory and reads the default key 'tcga/train/brca'.
test_loader = LoadTCGA('')

tcga/train/brca


OSError: Unable to open file (Unable to open file: name = 'tcga.h5', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

In [4]:
# #Creating the tiny TCGA test dataset 

# #load the csvs into pandas dataframes 
# dlbc = pd.read_csv('dlbc.csv', index_col=0)
# gbm = pd.read_csv('gbm.csv', index_col=0)
# brca = pd.read_csv('brca.csv', index_col=0)

### Make the tiny h5 file for testing 
### 10 training samples that are (20501,) and 5 test samples (20501,)
# tcga = h5py.File('tcga.h5', mode='a')
# tcga.create_dataset('tcga/train/dlbc', data=dlbc.values.T[:10])
# tcga.create_dataset('tcga/test/dlbc', data=dlbc.values.T[10:15])
# tcga.create_dataset('tcga/train/gbm', data=gbm.values.T[:10])
# tcga.create_dataset('tcga/test/gbm', data=gbm.values.T[10:15])
# tcga.create_dataset('tcga/train/brca', data=brca.values.T[:10])
# tcga.create_dataset('tcga/test/brca', data=brca.values.T[10:15])

## Try out loading the data

In [5]:
# Visitor callback used to walk an h5 file.
def print_name(name):
    """Print one object path handed over by ``visit`` and return ``None``."""
    print(name)
    return None

# Open the TCGA file read-only and print the path of every group/dataset in it.
# The handle is left open on purpose: the inspection cells below read from it.
tcga = h5py.File('tcga.h5', mode='r')
tcga.visit(print_name)

OSError: Unable to open file (Unable to open file: name = 'tcga.h5', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

In [35]:
list(tcga['tcga/train'])

['brca', 'dlbc', 'gbm']

In [36]:
tcga['tcga/train/brca'][()].shape

(10, 20501)

In [37]:
tcga['tcga/train/dlbc'][()]

array([[3.886780e+02, 0.000000e+00, 0.000000e+00, ..., 5.443902e+02,
        2.731710e+01, 0.000000e+00],
       [1.935208e+02, 0.000000e+00, 0.000000e+00, ..., 6.932067e+02,
        8.692480e+01, 7.305000e-01],
       [2.108252e+02, 0.000000e+00, 0.000000e+00, ..., 4.272727e+02,
        1.083920e+01, 0.000000e+00],
       ...,
       [2.628933e+02, 5.501000e-01, 0.000000e+00, ..., 3.883388e+02,
        1.265130e+01, 0.000000e+00],
       [4.801603e+02, 0.000000e+00, 0.000000e+00, ..., 5.971944e+02,
        4.175020e+01, 6.680000e-01],
       [4.425644e+02, 0.000000e+00, 3.866000e+00, ..., 3.427835e+02,
        1.353090e+01, 0.000000e+00]])

tcga.close()

## Select most variable genes across training data

Pick the `n_genes` genes with the largest median absolute deviation (MAD), averaged over cancer types.

In [101]:
# Re-open the TCGA file read-only; the following cells read from this handle.
tcga = h5py.File('tcga.h5', mode='r')
n_genes = 1000 # Number of most-variable genes to keep

# Cancers to include: every dataset name found under the training group
cancers = list(tcga['tcga/train'])

# Compute MAD for each cancer type
def mad(X, axis=0):
    """Median absolute deviation of ``X`` along ``axis``.

    Parameters
    ----------
    X : array_like
        Input values.
    axis : int or None
        Axis along which the medians are taken (default 0). ``None`` reduces
        over the flattened array.

    Returns
    -------
    numpy.ndarray or scalar
        ``median(|X - median(X)|)`` computed along ``axis``.
    """
    X = np.asarray(X)
    # keepdims=True keeps the reduced axis with length 1 so the centre
    # broadcasts against X for any axis, not only axis=0.
    center = np.median(X, axis=axis, keepdims=True)
    return np.median(np.abs(X - center), axis=axis)

# One per-gene MAD vector for each cancer type, stacked as rows
mad_cancer = np.vstack([mad(tcga['tcga/train/' + name][()]) for name in cancers])

# Mean of the per-cancer MADs for every gene
mad_avg = mad_cancer.mean(axis=0)

# Indices of the n_genes genes with the highest average MAD, returned in
# ascending index order
id_genes_keep = np.sort(np.argsort(mad_avg)[::-1][:n_genes])

In [102]:
# Create dataset
# Write the gene-filtered train/test matrices of every cancer type to
# tcga_mad.h5. The context manager closes the output file even if a write
# fails part-way through.
# NOTE: mode='a' keeps an existing file, so create_dataset raises if this cell
# is re-run against a file that already contains these datasets.
with h5py.File('tcga_mad.h5', mode='a') as tcga_mad:
    for c in cancers:
        for split in ('train', 'test'):
            key = 'tcga/' + split + '/' + c
            # id_genes_keep is sorted ascending, which h5py requires for
            # index-list selections.
            tcga_mad.create_dataset(key, data=tcga[key][:, id_genes_keep])

# The source file was opened in the previous cell; no later cell in this
# notebook reads from it, so release the handle here.
tcga.close()

## Face dataset

In [8]:
def LoadFace(data_root, batch_size=32, split='train', style='photo', attr = None,
               shuffle=True, load_first_n = None):
    """Return a ``DataLoader`` over one face dataset stored in ``face.h5``.

    Parameters
    ----------
    data_root : str
        Directory that contains ``face.h5``. A trailing separator is optional;
        an empty string means the current working directory.
    batch_size : int
        Number of samples per minibatch.
    split : str
        Second component of the HDF5 key (e.g. ``'train'`` or ``'test'``).
    style : str
        Third component of the HDF5 key (e.g. ``'photo'``, ``'paint'``,
        ``'sketch'``).
    attr : object
        Unused; kept so the signature stays compatible with existing callers.
    shuffle : bool
        Forwarded to ``DataLoader``.
    load_first_n : int or None
        If truthy, only the first ``load_first_n`` images are loaded.

    Returns
    -------
    torch.utils.data.DataLoader
        Loader built with ``drop_last=True``: a dataset with fewer than
        ``batch_size`` samples therefore yields no batches at all.
    """
    # Local import keeps this cell self-contained.
    import os

    # os.path.join inserts the separator only when it is missing, so both
    # 'data/' and 'data' resolve to 'data/face.h5' ('' still gives 'face.h5').
    h5_path = os.path.join(data_root, 'face.h5')
    key = '/'.join(['CelebA', split, style])
    celeba_dataset = Face(h5_path, key, load_first_n)
    return DataLoader(celeba_dataset, batch_size=batch_size, shuffle=shuffle, drop_last=True)

class Face(Dataset):
    """In-memory image dataset backed by one array read from an HDF5 file."""

    def __init__(self, root, key, load_first_n = None):
        """Read the images stored under ``key`` in ``root`` and rescale them.

        Parameters
        ----------
        root : str
            Path of the HDF5 file.
        key : str
            Name of the dataset inside the file (e.g. ``'CelebA/train/photo'``).
        load_first_n : int or None
            If truthy, keep only the first ``load_first_n`` images; otherwise
            the full dataset is loaded.
        """
        with h5py.File(root,'r') as f:
            dset = f[key]
            if load_first_n:
                # Slice on the h5py dataset so only the requested images are
                # read from disk instead of loading everything first.
                data = dset[:load_first_n]
            else:
                data = dset[()]
        # Linear rescale x -> x/255*2 - 1 (maps 0..255 onto -1..1, assuming
        # the stored values are 8-bit pixel intensities).
        self.imgs = (data/255.0)*2 -1

    def __getitem__(self, index):
        """Return the rescaled image(s) selected by ``index``."""
        return self.imgs[index]

    def __len__(self):
        """Return the number of loaded images."""
        return len(self.imgs)

In [9]:
# Open the full face dataset read-only and print the path of every
# group/dataset in it.
# NOTE(review): the relative path assumes a sibling ../UFDN checkout - confirm.
face = h5py.File('../UFDN/data/face.h5', mode='r')
face.visit(print_name)

CelebA
CelebA/test
CelebA/test/paint
CelebA/test/photo
CelebA/test/sketch
CelebA/train
CelebA/train/paint
CelebA/train/photo
CelebA/train/sketch


In [10]:
list(face.keys())

['CelebA']

In [11]:
face['CelebA/train/paint'][()].shape

(42319, 3, 64, 64)

In [31]:
face['CelebA/train/photo'][()].shape

(42319, 3, 64, 64)

In [32]:
face['CelebA/train/sketch'][()].shape

(42319, 1, 64, 64)

In [34]:
face['CelebA/test/paint'][()].shape

(71012, 3, 64, 64)

In [None]:
face.close()

In [39]:
# NOTE(review): presumably the conv output-size formula (W - K + 2P)/S + 1
# with W=64, K=4, P=1, S=2 - confirm against the model definition.
(64-4+2)/2+1

32.0

#### Create smaller face dataset

In [14]:
ntrain = 10
ntest = 5

# Create dataset
# The source file is opened here instead of relying on the `face` handle from
# an earlier cell: that handle is already closed by the time this cell runs in
# a top-to-bottom execution. Both files are closed by the context managers,
# even if a write fails.
# NOTE: mode='a' keeps an existing file, so create_dataset raises if this cell
# is re-run against a file that already contains these datasets.
with h5py.File('../UFDN/data/face.h5', mode='r') as face_src:
    # One entry per image domain found under the training group
    domains = list(face_src['CelebA/train'])

    with h5py.File('face_small.h5', mode='a') as face_small:
        for d in domains:
            # Copy only the first ntrain / ntest images of each domain
            face_small.create_dataset('CelebA/train/'+d, data=face_src['CelebA/train/'+d][:ntrain])
            face_small.create_dataset('CelebA/test/'+d, data=face_src['CelebA/test/'+d][:ntest])

In [16]:
face_small = h5py.File('face_small.h5', mode='r')
face_small.visit(print_name)

CelebA
CelebA/test
CelebA/test/paint
CelebA/test/photo
CelebA/test/sketch
CelebA/train
CelebA/train/paint
CelebA/train/photo
CelebA/train/sketch


In [17]:
face_small['CelebA/train/paint'][()].shape

(10, 3, 64, 64)