In [2]:
# need 'tables' package to read h5 files
#!conda install -c ska tables
!pip install tables
!pip install hdf5plugin

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


[0mCollecting hdf5plugin
  Downloading hdf5plugin-3.3.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: hdf5plugin
Successfully installed hdf5plugin-3.3.1
[0m/kaggle/input/open-problems-multimodal/sample_submission.csv
/kaggle/input/open-problems-multimodal/train_cite_targets.h5
/kaggle/input/open-problems-multimodal/metadata_cite_day_2_donor_27678.csv
/kaggle/input/open-problems-multimodal/test_multi_inputs.h5
/kaggle/input/open-problems-multimodal/evaluation_ids.csv
/kaggle/input/open-problems-multimodal/train_cite_inputs.h5
/kaggle/input/open-problems-multimodal/train_multi_targets.h5
/kaggle/input/open-problems-multimodal/train_multi_inputs.h5
/kaggle/input/open-problems-multimodal/metadata.csv
/kaggle/input/open-problems-multimodal/test_cite_inputs_day_2_donor_27678.h5
/kaggle/input/open-problems-

In [3]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import h5py, hdf5plugin

from scipy.sparse import csr_matrix, vstack

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


In [4]:
# Full paths of the competition files: the shared input folder followed by
# the file name.
dir = '../input/open-problems-multimodal/'

metadata = f'{dir}metadata.csv'
train_multi_inputs = f'{dir}train_multi_inputs.h5'
train_multi_targets = f'{dir}train_multi_targets.h5'


In [6]:
def getmeta(directory, metafile, xfile):
    """Return the metadata rows for the samples stored in ``xfile``.

    Inputs: directory - file directory, metafile - metadata csv,
    xfile - input file name.

    The returned dataframe has one row per cell id read from the h5 file,
    in the order the ids are listed there, with ``cell_id`` as a regular
    column and a fresh 0..n-1 index.
    """
    # metadata table keyed by cell id
    by_cell = pd.read_csv(directory + metafile).set_index('cell_id', drop=True)

    # the cell ids are stored under <group>/axis1, where <group> is the file
    # name minus its last three characters (the '.h5' suffix)
    group_name = xfile[:-3]
    with h5py.File(directory + xfile, 'r') as h5:
        cell_ids = h5[group_name]['axis1'][:].astype(str).tolist()

    # keep only the rows of those cells, then turn the index back into a column
    return by_cell.loc[cell_ids].reset_index()


In [7]:
def getmax(directory, xfilename, yfilename, indexes=None, nchunks=100):
    """Find the per-column maximums of the input and target files (for scaling).

    Parameters
    ----------
    directory : str
        Folder prefix that is prepended to both file names.
    xfilename, yfilename : str
        Input and target h5 file names. Values are read from
        ``<file name minus its last three characters>/block0_values``.
    indexes : sequence of int or None
        Row indexes of the subset to scan; ``None`` scans every row.
        Both lists and numpy arrays are accepted.
    nchunks : int
        Number of pieces the rows are split into when reading.

    Returns
    -------
    (inputmax, outputmax) : tuple of 1-D numpy arrays
        Column maximums of the inputs and of the outputs. Any maximum that is
        <= 0 is replaced with 1 to avoid zero division when scaling.

    Raises
    ------
    ValueError
        If there are no rows to scan.
    """
    maxvals = []
    for file in (xfilename, yfilename):
        with h5py.File(directory + file, 'r') as f:
            values = f[file[:-3]]['block0_values']

            # `is None`, not `== None`: comparing a numpy array with `==` is
            # element-wise and cannot be used as an `if` condition
            if indexes is None:
                idxarray = np.arange(values.shape[0])
            else:
                idxarray = np.asarray(indexes)

            # read the rows chunk by chunk and keep only a running column
            # maximum instead of accumulating every chunk before reducing
            colmax = None
            for chunkidxs in np.array_split(idxarray, nchunks):
                if chunkidxs.size == 0:
                    continue  # happens when there are more chunks than rows
                chunkmax = np.asarray(values[chunkidxs]).max(axis=0)
                colmax = chunkmax if colmax is None else np.maximum(colmax, chunkmax)

        if colmax is None:
            raise ValueError(f'no rows to scan in {file}')
        maxvals.append(np.ravel(colmax))

    inputmax, outputmax = maxvals

    # if max value is <= 0, replace with 1 to avoid zero division when scaling
    inputmax = np.where(inputmax <= 0, 1, inputmax)
    outputmax = np.where(outputmax <= 0, 1, outputmax)

    return inputmax, outputmax


In [8]:
class MakeDataset(torch.utils.data.Dataset):
    """Custom dataset that reads one (input, target) pair per item from disk.

    Parameters
    ----------
    directory : str
        File directory, prepended to both file names.
    xfile, yfile : str
        Input and target h5 file names. Values are read from
        ``<file name minus its last three characters>/block0_values``.
    metadata : pandas.DataFrame
        Sample metadata; must have a ``day`` column and be indexable by the
        same row index that is passed to ``__getitem__``.
    xmax, ymax : array
        Max values per column of the input / output data (for scaling).
    indexes : sequence of int or None
        Indexes of the training subset (if any). Only used to pick the
        maximum day for scaling; ``None`` uses every row of ``metadata``.
        Both lists and numpy arrays are accepted.

    Notes
    -----
    The maximum day is computed once at construction time and stored in
    ``self.maxday``.
    """

    def __init__(self, directory, xfile, yfile, metadata, xmax, ymax, indexes=None):
        # initialize data
        self.dir = directory
        self.xfile = xfile
        self.yfile = yfile
        self.metadata = metadata
        self.xmax = xmax
        self.ymax = ymax
        self.indexes = indexes

        # max day for scaling; it is the same for every item, so compute it once.
        # `is None`, not `== None`: comparing a numpy array with `==` is
        # element-wise and cannot be used as an `if` condition
        if indexes is None:
            days = metadata.loc[:, 'day']
        else:
            days = metadata.loc[indexes, 'day']
        self.maxday = days.max()

    def __getitem__(self, index):
        """Return ``(xdata, ydata)`` float tensors for row ``index``.

        ``xdata`` is the input row with the sample's day appended as the last
        feature, divided element-wise by ``xmax`` (and by the max day for the
        day feature). ``ydata`` is the target row divided by ``ymax``.
        NaNs produced by the scaling are replaced with 0.
        """
        # read input and target rows from the files on disk
        data = []
        for file in (self.xfile, self.yfile):
            with h5py.File(self.dir + file, 'r') as f:
                data.append(f[file[:-3]]['block0_values'][index])

        # get day of index from metadata
        day = self.metadata.loc[index, 'day']

        # add day as feature of xdata, scale
        xdata = np.append(data[0], day) / np.append(self.xmax, self.maxday)
        ydata = data[1] / self.ymax

        # replace nan with 0, convert to float tensors
        xdata = torch.from_numpy(np.nan_to_num(xdata)).float()
        ydata = torch.from_numpy(np.nan_to_num(ydata)).float()

        return xdata, ydata

    def __len__(self):
        """Number of samples, i.e. the number of metadata rows."""
        return self.metadata.shape[0]


In [9]:
# Input folder and the names of the files this notebook reads from it.
directory = '../input/open-problems-multimodal/'
metafilename = 'metadata.csv'
xfilename, yfilename = 'train_multi_inputs.h5', 'train_multi_targets.h5'

In [10]:
# metadata restricted to the samples present in the input file
metadf = getmeta(directory=directory, metafile=metafilename, xfile=xfilename)
print(metadf.shape)
metadf.head()

(105942, 5)


Unnamed: 0,cell_id,day,donor,cell_type,technology
0,56390cf1b95e,2,32606,NeuP,multiome
1,fc0c60183c33,2,32606,HSC,multiome
2,9b4a87e22ad0,2,32606,MasP,multiome
3,81cccad8cd81,2,32606,HSC,multiome
4,15cb3d85c232,2,32606,MkP,multiome


In [11]:
# The private test set will be from day 10, and day 10 data is not available
# for training. To simulate that situation, day 7 samples are held out for
# validation and the other days (2, 3, 4) are used for training.
is_day7 = metadf.day == 7
day7 = list(metadf.index[is_day7])
notday7 = list(metadf.index[~is_day7])

In [12]:
# %%time
# # find max values of input and output for scaling data
# trainxmax, trainymax = getmax(directory, xfilename, yfilename, indexes=notday7)
# xmax, ymax = getmax(directory, xfilename, yfilename, indexes=None)

# # print(trainxmax, trainymax)
# # print(xmax, ymax)
# np.save('trainxmax', trainxmax)
# np.save('trainymax', trainymax)
# np.save('xmax', xmax)
# np.save('ymax', ymax)


In [13]:
def _load_or_compute_max(xcache, ycache, indexes=None):
    """Return (input maxima, output maxima), using the .npy cache when present.

    If either cache file is missing, the maxima are computed with ``getmax``
    over the rows in ``indexes`` (``None`` = all rows), saved to ``xcache`` /
    ``ycache``, and returned. This keeps the notebook runnable from a fresh
    working directory, where the cached files do not exist yet.
    """
    if os.path.exists(xcache) and os.path.exists(ycache):
        return np.load(xcache), np.load(ycache)

    xvals, yvals = getmax(directory, xfilename, yfilename, indexes=indexes)
    np.save(xcache, xvals)
    np.save(ycache, yvals)
    return xvals, yvals


# maxima over the training subset (everything except day 7) ...
trainxmax, trainymax = _load_or_compute_max('trainxmax.npy', 'trainymax.npy', indexes=notday7)
# ... and over all samples
xmax, ymax = _load_or_compute_max('xmax.npy', 'ymax.npy')


In [15]:
# Wrap the h5 files in a dataset; scaling uses the training-subset maximums
# and the training indexes.
dataset = MakeDataset(directory, xfilename, yfilename, metadf,
                      xmax=trainxmax, ymax=trainymax, indexes=notday7)

# sanity check: the sizes of the two index lists add up to the dataset size
print((len(notday7) + len(day7)) == len(dataset))

# split by day of sample: everything except day 7 trains, day 7 validates
trainset, valset = (torch.utils.data.Subset(dataset, idxs) for idxs in (notday7, day7))


True


In [16]:
# look at one sample: scaled input tensor first, then scaled target tensor
a, b = dataset[123]
print(a, b, sep='\n')

tensor([0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.5000])
tensor([0.0000, 0.7853, 0.0000,  ..., 0.0000, 0.0000, 0.7100])


In [17]:
# input and output vector lengths, taken from the first sample
x, y = dataset[0]
inputsize, outputsize = len(x), len(y)

print(f'{inputsize} --> {outputsize}')

# hyperparameters
batchsize, hiddensize = 256, 128
epochs, learnrate = 3, 0.001



228943 --> 23418


In [18]:
# batch iterators over the train and validation subsets; both use the same
# batch size and both shuffle
loader_opts = dict(batch_size=batchsize, shuffle=True)
train_dataloader = DataLoader(trainset, **loader_opts)
val_dataloader = DataLoader(valset, **loader_opts)


In [19]:
# run on the gpu when one is available, on the cpu otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'using {device}')

using cpu


In [20]:
class makenn(nn.Module):
    """Fully connected network: input -> hidden -> hidden -> output.

    inputnodes / outputnodes / hidnodes are the widths of the input, output
    and hidden layers. The second hidden layer is followed by dropout
    (p=0.5) and then ReLU; the output layer has no activation.
    """

    def __init__(self, inputnodes, outputnodes, hidnodes):
        super().__init__()
        # layers are created in execution order, which also fixes the order
        # of the parameter names in the state dict
        layers = [
            nn.Linear(inputnodes, hidnodes),
            nn.ReLU(),
            nn.Linear(hidnodes, hidnodes),
            nn.Dropout(p=0.5),
            nn.ReLU(),
            nn.Linear(hidnodes, outputnodes),
        ]
        self.layer_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        out = self.layer_stack(x)
        return out
    


In [21]:
# build the network with the sizes found above and move it to the chosen device
model = makenn(inputnodes=inputsize, outputnodes=outputsize, hidnodes=hiddensize)
model = model.to(device)
print(model)

makenn(
  (layer_stack): Sequential(
    (0): Linear(in_features=228943, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): ReLU()
    (5): Linear(in_features=128, out_features=23418, bias=True)
  )
)


In [22]:
# mean squared error as the loss function, Adam as the optimizer
lossfunc = F.mse_loss
optimizer = torch.optim.Adam(params=model.parameters(), lr=learnrate)


In [None]:
# train loop
# Every epoch runs one optimisation pass over the training batches followed by
# one evaluation pass over the validation batches. trainlosses / vallosses get
# exactly one entry per epoch: the mean of that epoch's batch losses.
trainlosses = []
vallosses = []
for epoch in range(epochs):
    # ---- training pass ----
    model.train()  # train mode once per pass, not once per batch
    te_losses = []
    for batch, (X, y) in enumerate(train_dataloader):
        # send to device
        X = X.to(device)
        y = y.to(device)

        # compute prediction error
        pred = model(X)
        tloss = lossfunc(pred, y)
        # store loss value
        te_losses.append(tloss.item())

        # backpropagation
        optimizer.zero_grad()
        tloss.backward()
        optimizer.step()

        if batch % 50 == 0:
            print(f'epoch {epoch}, train batch {batch}')
            print(tloss)

    trainlosses.append(sum(te_losses)/len(te_losses))

    # ---- validation pass ----
    model.eval()  # evaluation mode once per pass
    ve_losses = []
    with torch.inference_mode():
        for batch, (valX, valy) in enumerate(val_dataloader):
            # send to device
            valX = valX.to(device)
            valy = valy.to(device)

            pred = model(valX)
            vloss = lossfunc(pred, valy)
            # collect the batch loss in the per-epoch list; appending it to
            # vallosses would leave ve_losses empty and break the mean below
            ve_losses.append(vloss.item())

            if batch % 50 == 0:
                print(f'epoch {epoch}, val batch {batch}')
                print(vloss)

    vallosses.append(sum(ve_losses)/len(ve_losses))
        

epoch 0, train batch 0
tensor(0.0900, grad_fn=<MseLossBackward0>)


In [None]:
# plot the train and validation loss curves collected by the training loop
print(len(trainlosses))
for label, losses in (('train', trainlosses), ('val', vallosses)):
    plt.plot(np.arange(len(losses)), losses, label=label)
plt.legend()