### Imports

In [1]:
import argparse
import datetime

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(0)

<torch._C.Generator at 0x12a1110d0>

In [2]:
# Notebook-wide configuration.

# Verbosity level consulted by the training loop.
VERBOSE = 1

# Optimisation hyper-parameters.
# HIDDEN_DIM = 200  (currently unused)
LEARNING_RATE = .01
BATCH_SIZE = 400
NUM_EPOCHS = 200
DROPOUT = 0.2

# Input width placeholder; warn loudly while it is still unset.
INPUT_DIM = 0
if not INPUT_DIM:
    print('NO INPUT DIMENSION DEFINED')

NO INPUT DIMENSION DEFINED


### Classes and functions


#### Classes

In [3]:
class CiteDataset(Dataset):
    """
    A pytorch dataset class that accepts an inputs path, and optionally a targets path.
    This class holds all the data in memory and implements ``__len__`` and
    ``__getitem__`` so it can be consumed by a ``DataLoader``.

    Only csv files are supported; the first csv column is used as the row index.

    Properties:
        pd_csv: bool
            Whether the input data is a csv file (always True after a
            successful construction).
        inputs: pandas.DataFrame
            The input matrix, one row per cell.
        targets: pandas.DataFrame or None
            The target matrix, one row per cell, or None when no targets
            path was given (e.g. for inference).
        num_cells: int
            Number of rows in the Dataset.
        num_features: int
            Number of features (columns) in the input data.
        num_targets: int or None
            Number of columns in the targets, or None without targets.
        cell_ids: list
            Row index labels of the inputs.
        protein_ids: list or None
            Column labels of the targets, or None without targets.
    """
    def __init__(self, inputs_path: str, targets_path: str or None = None):
        """
        Read the content of the inputs file (and of the targets file, if given).

        Args:
            inputs_path: string
                Path to the inputs csv file.
            targets_path: string, default=None
                Path to the targets csv file.

        Raises:
            ValueError: if the inputs file is not a csv file, or if inputs and
                targets do not have the same number of rows.
        """
        file_extension = inputs_path.strip().split('.')[-1]
        self.pd_csv = (file_extension.lower() == "csv")
        if not self.pd_csv:
            # Fail early: no other reader is implemented, and continuing would
            # only surface later as a confusing AttributeError.
            raise ValueError(
                "Unsupported inputs file '{}': only .csv files can be read.".format(inputs_path)
            )

        # Store the values into self.inputs
        self.inputs = pd.read_csv(inputs_path, index_col=0)

        # Values that inform the shape of the input matrix
        self.num_cells = self.inputs.shape[0]
        self.num_features = self.inputs.shape[1]
        # Cell ids come from the inputs so they are available without targets.
        self.cell_ids = list(self.inputs.index)

        # Target-related attributes always exist; they stay None for inference.
        self.targets = None
        self.num_targets = None
        self.protein_ids = None
        if targets_path:  # Init CiteDataset for training
            self.targets = pd.read_csv(targets_path, index_col=0)
            if self.targets.shape[0] != self.num_cells:
                raise ValueError(
                    "inputs have {} rows but targets have {} rows.".format(
                        self.num_cells, self.targets.shape[0]
                    )
                )

            # Values that inform the shape of the targets matrix
            self.num_targets = self.targets.shape[1]
            self.protein_ids = list(self.targets.columns)

    def data_size(self):
        """
        A function to inform the dimensions of the data. The function returns
        a tuple of two values:
            num_features: int
                Number of features in the input data.
            num_targets: int or None
                Number of target outputs (None when no targets were loaded).
        """
        return self.num_features, self.num_targets

    def __len__(self):
        """
        Return the number of instances in the data
        """
        return self.num_cells

    def __getitem__(self, i):
        """
        Return the i-th instance in the format of:
            (inputs, targets), where inputs is a float32 PyTorch tensor and
            targets is a float32 PyTorch tensor, or None when the dataset
            was built without targets.
        """
        # Go through numpy explicitly: building a tensor straight from a
        # label-indexed Series is unreliable, and float32 matches the default
        # dtype of nn.Linear parameters.
        inputs = torch.tensor(self.inputs.iloc[i].to_numpy(dtype=np.float32))
        targets = None
        if self.targets is not None:
            targets = torch.tensor(self.targets.iloc[i].to_numpy(dtype=np.float32))

        return inputs, targets

In [4]:
class FCBlock(nn.Module):
    """
    Fully-connected building block: a linear map from num_features to
    num_hidden units, followed by ReLU and dropout with the given rate.
    """
    def __init__(self, num_features: int, num_hidden: int, dropout: float):
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them as is.
        self.Input_Layer = nn.Linear(num_features, num_hidden)
        self.Dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Apply linear -> ReLU -> dropout and return the result."""
        activated = F.relu(self.Input_Layer(x))
        return self.Dropout(activated)

In [8]:
class Encoder(nn.Module):
    """
    Encoder: maps an input vector of num_features values to a
    30-dimensional embedding through three FCBlocks
    (num_features -> 120 -> 60 -> 30, dropout 0.05 each).
    """
    def __init__(self, num_features: int):
        super().__init__()
        rate = 0.05
        self.l0 = FCBlock(num_features, 120, rate)
        self.l1 = FCBlock(120, 60, rate)
        self.l2 = FCBlock(60, 30, rate)

    def forward(self, x):
        """Run x through l0, l1 and l2 in order and return the embedding."""
        return self.l2(self.l1(self.l0(x)))


In [12]:
class Decoder(nn.Module):
    """
    Decoder: maps a 30-dimensional embedding to num_targets outputs
    through three FCBlocks (30 -> 70 -> 100 -> num_targets, dropout 0.05 each).
    """
    def __init__(self, num_targets: int):
        super().__init__()
        rate = 0.05
        self.l0 = FCBlock(30, 70, rate)
        self.l1 = FCBlock(70, 100, rate)
        # NOTE(review): the output layer is also an FCBlock, so predictions
        # pass through ReLU and dropout — confirm this is intended.
        self.l2 = FCBlock(100, num_targets, rate)

    def forward(self, x):
        """Run x through l0, l1 and l2 in order and return the outputs."""
        return self.l2(self.l1(self.l0(x)))
    
class CiteseqModel(nn.Module):
    """
    Full model: an Encoder followed by a Decoder.
    Takes vectors of num_features values and returns vectors of
    num_targets values.
    """
    def __init__(self, num_features: int, num_targets: int):
        super().__init__()
        self.encoder = Encoder(num_features)
        self.decoder = Decoder(num_targets)

    def forward(self, x):
        """Encode x, then decode the embedding and return the result."""
        return self.decoder(self.encoder(x))

#### Functions

In [10]:
def criterion(outputs, labels):
    """Return the mean squared error between outputs and labels."""
    return F.mse_loss(outputs, labels)

def correlation_score(y_true, y_pred):
    """
    Scores the predictions according to the competition rules.
    It is assumed that the predictions are not constant.

    Args:
        y_true: 2-D array-like or pandas.DataFrame, one row per sample.
        y_pred: 2-D array-like or pandas.DataFrame with the same number of rows.

    Returns:
        The average, over rows, of the Pearson correlation coefficient
        between the matching rows of y_true and y_pred.

    Raises:
        ValueError: if the inputs are empty or have different numbers of rows.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses.
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values

    num_rows = len(y_true)
    if num_rows != len(y_pred):
        raise ValueError(
            "y_true has {} rows but y_pred has {} rows.".format(num_rows, len(y_pred))
        )
    if num_rows == 0:
        raise ValueError("Cannot compute a correlation score on empty input.")

    # corrcoef returns the 2x2 correlation matrix; [1, 0] is the coefficient
    # between the two rows.
    row_scores = [
        np.corrcoef(true_row, pred_row)[1, 0]
        for true_row, pred_row in zip(y_true, y_pred)
    ]
    return sum(row_scores) / num_rows

def collator(batch):
    """
    Receive a list of (inputs, targets) pairs, one per cell, and return a
    pair:
        batch_inputs: a tensor that stacks all the inputs in the mini-batch
            along a new leading batch dimension.
        batch_targets: a tensor that stacks all the targets in the same way,
            or None when the samples carry no targets (inference).
    """
    # batch_inputs tensor dimensions: batch_dim x num_features
    # batch_targets tensor dimensions: batch_dim x num_targets
    inputs, targets = zip(*batch)

    # stack (not cat): each sample is a 1-D vector, and cat would flatten
    # the whole batch into a single 1-D tensor.
    batch_inputs = torch.stack(inputs)

    # `targets` is a non-empty tuple even when every entry is None, so the
    # entries themselves have to be inspected.
    if any(target is None for target in targets):
        batch_targets = None
    else:
        batch_targets = torch.stack(targets)
    return batch_inputs, batch_targets


### Train

In [None]:
def train(model, dataset, batch_size, learning_rate, num_epoch, device='cpu', model_path=None, loss_fn=nn.MSELoss, optim = optim.Adam):
    """
    Train `model` on `dataset` and optionally save a checkpoint.

    Args:
        model: the nn.Module to train (moved to `device` in place).
        dataset: dataset yielding (inputs, targets) pairs; batches are built
            with `collator`.
        batch_size: mini-batch size.
        learning_rate: learning rate passed to the optimizer.
        num_epoch: number of passes over the dataset.
        device: device on which training runs.
        model_path: where to save the checkpoint; nothing is saved when None.
        loss_fn: a loss class (instantiated with no arguments) or an
            already-built callable taking (predictions, targets).
        optim: optimizer class, called as optim(params, lr=learning_rate).
    """
    import os  # local import: only needed to create the checkpoint directory

    data_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collator, shuffle=True)

    # The default is the nn.MSELoss *class*; it has to be instantiated before
    # it can be applied to tensors. Ready-made callables are used as they are.
    criterion = loss_fn() if isinstance(loss_fn, type) else loss_fn
    # Parameters must live on the same device as the batches.
    model.to(device)
    optimizer = optim(model.parameters(), lr=learning_rate)

    start = datetime.datetime.now()
    for epoch in range(num_epoch):
        model.train()
        running_loss = 0.0
        num_steps = 0

        for data in data_loader:
            # data is a tuple of (inputs_tensor, targets_tensor)
            inputs = data[0].to(device)
            targets = data[1].to(device)

            # zero the parameter gradients
            model.zero_grad()

            # forward pass and loss
            y_preds = model(inputs)
            loss_tensor = criterion(y_preds, targets)
            if VERBOSE == 4:
                # Per-step debugging output; never blocks on user input.
                print('    CHECKING LOSS...')
                print(loss_tensor)

            # backward pass and parameter update
            loss_tensor.backward()
            optimizer.step()

            running_loss += loss_tensor.item()
            num_steps += 1

        if VERBOSE == 1:
            # Average over the steps actually run in this epoch.
            mean_loss = running_loss / num_steps if num_steps else float('nan')
            print('[Epoch %d, Step %5d] MSE loss: %.3f' %
                (epoch + 1, num_steps, mean_loss))

    end = datetime.datetime.now()

    if model_path is not None:
        # The checkpoint holds everything needed to resume or run inference.
        checkpoint = {
            'epoch': num_epoch - 1,  # zero-based index of the last epoch run
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # Target column names, when the dataset exposes them.
            'protein_ids': getattr(dataset, 'protein_ids', None),
        }
        parent_dir = os.path.dirname(model_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        torch.save(checkpoint, model_path)
        print('Model saved in ', model_path)

    print('Training finished in {} minutes.'.format((end - start).total_seconds() / 60.0))


In [None]:
# Locations of the inputs and targets files (placeholders to be filled in).
inputs_path, targets_path = '.', '.'

In [None]:
# Load the data and read its dimensions (data_size is a method, so call it).
dataset = CiteDataset(inputs_path, targets_path)
num_features, num_targets = dataset.data_size()

basemodel = CiteseqModel(num_features, num_targets)

# batch_size is a required argument of train(); use the notebook constant.
train(model=basemodel, dataset=dataset, batch_size=BATCH_SIZE, learning_rate=0.05, num_epoch=10, model_path='model/basemodel')

### Test

In [None]:
def test(model, dataset: CiteDataset, device='cpu'):
    """
    Run the model over the whole dataset and collect its predictions.

    Args:
        model: the trained nn.Module (moved to `device` in place).
        dataset: the CiteDataset to predict on; its targets, if any, are ignored.
        device: device on which inference runs.

    Returns:
        pandas.DataFrame with one row per dataset item, indexed by
        dataset.cell_ids and with columns dataset.protein_ids.
    """
    model.to(device)
    model.eval()
    data_loader = DataLoader(dataset, batch_size=20, collate_fn=collator, shuffle=False)

    batch_preds = []
    with torch.no_grad():
        for data in data_loader:
            batch_inputs = data[0].to(device)
            batch_preds.append(model(batch_inputs).cpu())

    if not batch_preds:
        # Nothing to predict on: return an empty, correctly labelled frame.
        return pd.DataFrame(columns=dataset.protein_ids, index=dataset.cell_ids)

    # Concatenate once at the end so every batch is kept, then hand pandas a
    # plain numpy array.
    preds_array = torch.cat(batch_preds, dim=0).numpy()
    preds = pd.DataFrame(preds_array, columns=dataset.protein_ids, index=dataset.cell_ids)
    return preds


In [None]:
# Predict with the base model on the loaded dataset and write the result to csv.
preds = test(model=basemodel, dataset=dataset)
preds.to_csv(path_or_buf='output/base_preds.csv')