In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import scipy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
import torch.optim as optim
import torch.backends.cudnn as cudnn

from torch.utils.data import TensorDataset, DataLoader, Dataset
from torch.autograd import Variable

from tqdm.notebook import tqdm

import collections
from collections import defaultdict
from operator import itemgetter, attrgetter

In [2]:
import scipy.sparse as sp 

In [3]:
# Device string used by the cells below: 'cuda' when torch reports an available GPU, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Model

In [4]:
class Net(nn.Module):
    """Four-layer fully connected network (in -> 512 -> 256 -> 128 -> out).

    Args:
        in_features: width of each input row. Defaults to 228942, the value
            that was previously hard-coded.
        out_features: width of each output row. Defaults to 23418, the value
            that was previously hard-coded.

    The layers live in ``self.fcs`` in the same order as before, so the
    parameter names (``fcs.0.*``, ``fcs.2.*``, ``fcs.4.*``, ``fcs.6.*``) are
    unchanged and existing checkpoints still load with the default sizes.
    """

    def __init__(self, in_features=228942, out_features=23418):
        super().__init__()
        self.fcs = nn.Sequential(
            nn.Linear(in_features, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, out_features),
        )

    def forward(self, input_data):
        """Run ``input_data`` through the stack and return the final layer's output."""
        return self.fcs(input_data)

### DataLoader

In [5]:
# At the time of writing, torch's own CSR tensors could not be moved to the GPU,
# so the three CSR arrays are carried as separate tensors in a lightweight tuple.
# Fields: data, indices, indptr (the CSR arrays) and shape (the matrix shape).
TorchCSR = collections.namedtuple("TorchCSR", ["data", "indices", "indptr", "shape"])

def load_csr_data_to_gpu(train_inputs):
    """Wrap a scipy CSR matrix as a TorchCSR whose arrays live on ``device``.

    The ``data``, ``indices`` and ``indptr`` arrays are converted to tensors and
    transferred to the notebook-level ``device`` one after another; the matrix
    shape is carried over unchanged.
    """
    def _to_device(array):
        return torch.from_numpy(array).to(device)

    return TorchCSR(
        _to_device(train_inputs.data),
        _to_device(train_inputs.indices),
        _to_device(train_inputs.indptr),
        train_inputs.shape,
    )

def make_coo_batch(torch_csr, indx):
    """Build a sparse COO batch from selected rows of a TorchCSR matrix.

    Args:
        torch_csr: TorchCSR (or any 4-tuple ``(data, indices, indptr, shape)``).
        indx: 1-D integer tensor of row numbers to gather; row ``i`` of the
            result is row ``indx[i]`` of the source matrix.

    Returns:
        A ``torch.sparse_coo_tensor`` of size ``[len(indx), shape[1]]``.
    """
    th_data, th_indices, th_indptr, th_shape = torch_csr
    start_pts = th_indptr[indx]
    end_pts = th_indptr[indx + 1]

    # Convert the bounds to Python ints once, instead of turning one 0-d tensor
    # per row into a slice bound inside the comprehensions.
    spans = list(zip(start_pts.tolist(), end_pts.tolist()))
    coo_data = torch.cat([th_data[lo:hi] for lo, hi in spans], dim=0)
    coo_col = torch.cat([th_indices[lo:hi] for lo, hi in spans], dim=0)

    # Row ids are created on the device that already holds the CSR arrays, so
    # the function no longer depends on a notebook-global `device` variable.
    batch_rows = torch.arange(indx.shape[0], device=th_indptr.device)
    coo_row = torch.repeat_interleave(batch_rows, end_pts - start_pts)

    coo_batch = torch.sparse_coo_tensor(
        torch.vstack([coo_row, coo_col]), coo_data, [indx.shape[0], th_shape[1]]
    )
    return coo_batch


def make_coo_batch_slice(torch_csr, start, end):
    """Build a sparse COO batch from the contiguous rows ``[start, end)``.

    Args:
        torch_csr: TorchCSR (or any 4-tuple ``(data, indices, indptr, shape)``).
        start: first row of the slice.
        end: one past the last row; values beyond the number of rows are
            clamped to the number of rows.

    Returns:
        A ``torch.sparse_coo_tensor`` of size ``[end - start, shape[1]]``
        (with ``end`` taken after clamping).
    """
    th_data, th_indices, th_indptr, th_shape = torch_csr
    end = min(end, th_shape[0])

    # The rows are contiguous, so their values and column ids are one slice each.
    lo = int(th_indptr[start])
    hi = int(th_indptr[end])
    coo_data = th_data[lo:hi]
    coo_col = th_indices[lo:hi]

    # Number of stored entries per row, used to expand batch-local row ids.
    row_counts = th_indptr[start + 1:end + 1] - th_indptr[start:end]
    # Row ids are created on the device that already holds the CSR arrays, so
    # the function no longer depends on a notebook-global `device` variable.
    batch_rows = torch.arange(end - start, device=th_indptr.device)
    coo_row = torch.repeat_interleave(batch_rows, row_counts)

    coo_batch = torch.sparse_coo_tensor(
        torch.vstack([coo_row, coo_col]), coo_data, [end - start, th_shape[1]]
    )
    return coo_batch

class DataLoaderCOO:
    """Batch iterator that yields sparse COO batches built from TorchCSR matrices.

    Intended as a stand-in for ``torch.utils.data.DataLoader`` when the data is
    held as TorchCSR tuples.

    Args:
        train_inputs: TorchCSR with the input rows.
        train_targets: TorchCSR with the target rows, or None to yield None
            in place of the target batch.
        train_idx: optional tensor of row numbers to restrict iteration to;
            None means every row of ``train_inputs``.
        batch_size, shuffle, drop_last: same meaning as in
            ``torch.utils.data.DataLoader`` (keyword-only).
    """

    def __init__(self, train_inputs, train_targets, train_idx=None,
                 *,
                 batch_size=512, shuffle=False, drop_last=False):
        self.train_inputs = train_inputs
        self.train_targets = train_targets
        self.train_idx = train_idx

        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last

        if self.train_idx is None:
            self.nb_examples = train_inputs.shape[0]
        else:
            self.nb_examples = len(self.train_idx)

        # A trailing partial batch counts only when it exists and is not dropped.
        full_batches, leftover = divmod(self.nb_examples, batch_size)
        keep_partial = leftover != 0 and not drop_last
        self.nb_batches = full_batches + 1 if keep_partial else full_batches

    def _batch(self, matrix, lo, hi, idx_array):
        """Return rows ``lo:hi`` of ``matrix`` (by slice, or through ``idx_array``)."""
        if matrix is None:
            return None
        if idx_array is None:
            return make_coo_batch_slice(matrix, lo, hi)
        return make_coo_batch(matrix, idx_array[lo:hi])

    def __iter__(self):
        """Yield ``(inp_batch, tgt_batch)`` pairs, one per batch."""
        if self.shuffle:
            order = torch.randperm(self.nb_examples, device=device)
            idx_array = order if self.train_idx is None else self.train_idx[order]
        else:
            # None here means "walk the matrix in contiguous slices".
            idx_array = self.train_idx

        for batch_no in range(self.nb_batches):
            lo = batch_no * self.batch_size
            hi = lo + self.batch_size
            inp_batch = self._batch(self.train_inputs, lo, hi, idx_array)
            tgt_batch = self._batch(self.train_targets, lo, hi, idx_array)
            yield inp_batch, tgt_batch

    def __len__(self):
        """Number of batches produced by one pass."""
        return self.nb_batches

### Predict on the test set

In [6]:
model = Net()
# map_location remaps the checkpoint's tensors onto `device`, so loading also
# works when the checkpoint's original device is not available on this machine.
model.load_state_dict(torch.load("../multi/multi_MSE.pt", map_location=device))
# Follow the `device` chosen above instead of unconditionally calling .cuda(),
# and switch to inference mode since this notebook only predicts.
model = model.to(device).eval()

In [7]:
# Read the sparse test matrix from disk and move its CSR arrays to `device`.
test_inputs = load_csr_data_to_gpu(
    sp.load_npz("../transfered_file/test_multi_inputs_values.sparse.npz")
)

# No targets, no shuffling and no dropped batch: every row is visited once, in file order.
dl_test = DataLoaderCOO(
    test_inputs,
    None,
    train_idx=None,
    batch_size=512,
    shuffle=False,
    drop_last=False,
)

In [8]:
# CPU-side buffer for all predictions: one row per test example, 23418 outputs per row.
# torch.empty leaves the contents uninitialised until the inference loop fills them.
n_test_rows = dl_test.nb_examples
test_pred = torch.empty(n_test_rows, 23418, dtype=torch.float32, device='cpu')

In [9]:
# Fill test_pred batch by batch; `cur` is the next free row of the buffer.
cur = 0
# Inference only: no_grad stops autograd from recording the forward passes.
with torch.no_grad():
    for inp_batch, _tgt_batch in dl_test:
        pred = model(inp_batch).cpu()
        test_pred[cur:cur + pred.shape[0]] = pred
        cur += pred.shape[0]

# test_pred was allocated uninitialised, so make sure every row was written.
assert cur == test_pred.shape[0], f"filled {cur} of {test_pred.shape[0]} prediction rows"


In [10]:
# Display the prediction buffer's shape as a sanity check.
test_pred.shape

torch.Size([55935, 23418])

### Create submission

In [11]:
# Load the evaluation table; the cells below read its cell_id and gene_id columns.
eval_ids = pd.read_parquet("../transfered_file/evaluation.parquet")

In [12]:
# Store both id columns as pandas categoricals.
eval_ids["cell_id"] = eval_ids["cell_id"].astype(pd.CategoricalDtype())
eval_ids["gene_id"] = eval_ids["gene_id"].astype(pd.CategoricalDtype())

In [13]:
# Value-less float32 series (all NaN) with one entry per row of eval_ids,
# indexed by every eval_ids column; predictions are written into it below.
submission_index = pd.MultiIndex.from_frame(eval_ids)
submission = pd.Series(index=submission_index, dtype=np.float32, name='target')

In [14]:
%%time
y_columns = np.load("../transfered_file/train_multi_targets_idxcol.npz",
                   allow_pickle=True)["columns"]

test_index = np.load("../transfered_file/test_multi_inputs_idxcol.npz",
                    allow_pickle=True)["index"]

CPU times: total: 46.9 ms
Wall time: 230 ms


In [15]:
# Map each id to its position: cell id -> position in test_index,
# gene id -> position in y_columns. The asserts guard against duplicate ids.
cell_dict = {cell: pos for pos, cell in enumerate(test_index)}
assert len(cell_dict) == len(test_index)

gene_dict = {gene: pos for pos, gene in enumerate(y_columns)}
assert len(gene_dict) == len(y_columns)

# Translate every evaluation row to (cell position, gene position); -1 marks an unknown id.
eval_ids_cell_num = eval_ids.cell_id.apply(lambda cell: cell_dict.get(cell, -1))
eval_ids_gene_num = eval_ids.gene_id.apply(lambda gene: gene_dict.get(gene, -1))

# Rows where both the gene and the cell were found in the lookups above.
valid_multi_rows = (eval_ids_gene_num != -1) & (eval_ids_cell_num != -1)

In [16]:
# np.asarray accepts both the boolean Series and an already-converted ndarray,
# so re-running this cell no longer fails on a missing `.to_numpy` attribute.
valid_multi_rows = np.asarray(valid_multi_rows)
# Peek at the gene positions of the rows selected by the mask.
eval_ids_gene_num[valid_multi_rows].to_numpy()

array([20687, 15183, 17190, ...,  9200,  9012, 20487], dtype=int64)

In [17]:
# For every row selected by the mask, read the prediction at (cell position, gene position)
# and write it into the matching position of the submission series.
cell_rows = eval_ids_cell_num[valid_multi_rows].to_numpy()
gene_cols = eval_ids_gene_num[valid_multi_rows].to_numpy()
submission.iloc[valid_multi_rows] = test_pred[cell_rows, gene_cols].cpu().numpy()

In [18]:
# Preview the series: rows not selected by valid_multi_rows are still NaN at this point.
submission

row_id    cell_id       gene_id        
0         c2150f55becb  CD86                    NaN
1         c2150f55becb  CD274                   NaN
2         c2150f55becb  CD270                   NaN
3         c2150f55becb  CD155                   NaN
4         c2150f55becb  CD112                   NaN
                                             ...   
65744175  2c53aa67933d  ENSG00000134419    6.419917
65744176  2c53aa67933d  ENSG00000186862    0.031934
65744177  2c53aa67933d  ENSG00000170959    0.031995
65744178  2c53aa67933d  ENSG00000107874    1.464669
65744179  2c53aa67933d  ENSG00000166012    4.627115
Name: target, Length: 65744180, dtype: float32

In [19]:
# Replace the multi-level index with a plain 0..n-1 index named 'row_id'.
submission = submission.reset_index(drop=True).rename_axis('row_id')

In [20]:
# Read the existing CITEseq submission and keep its 'target' column, indexed by row_id.
cite_submission = pd.read_csv("./citeseq_submission.csv", index_col="row_id")["target"]

In [21]:
# Entries still NaN after the step above are taken from the CITEseq submission.
missing_rows = submission.isnull()
submission[missing_rows] = cite_submission[missing_rows]

In [22]:
# Preview the series again after filling the NaN entries from cite_submission.
submission

row_id
0           0.094605
1          -0.162362
2          -0.405332
3          -0.302582
4           1.114355
              ...   
65744175    6.419917
65744176    0.031934
65744177    0.031995
65744178    1.464669
65744179    4.627115
Name: target, Length: 65744180, dtype: float32

In [23]:
# Check whether any entry is still missing; False means the series is fully populated.
submission.isnull().any()

False

In [24]:
# Write the series to CSV; the 'row_id' index is written alongside the 'target' values.
submission.to_csv("submission.csv")