In [1]:
import numpy as np
import sys
import time
import h5py
import os 
os.environ["CUDA_VISIBLE_DEVICES"]="1" 
import tensorflow as tf
from tqdm import tqdm

import numpy as np
import re
from math import ceil
from sklearn.metrics import average_precision_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Conv1D, Cropping1D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import add
import tensorflow.keras.backend as kb
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Lambda

from tensorflow.keras.models import load_model 
import pickle
from scipy.sparse import load_npz
from glob import glob
from torch.utils.data import Dataset
import pandas as pd
import torch
from pkg_resources import resource_filename
from src.dataloader import get_GTEX_v8_Data,getDataPointListGTEX,spliceDataset
from src.evaluation_metrics import print_topl_statistics,cross_entropy_2d

def getSpliceProb(input_sequence):
    paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
    models = [load_model(resource_filename('spliceai', x)) for x in paths]
    #x = one_hot_encode('N'*(context//2) + input_sequence + 'N'*(context//2))[None, :]
    x = one_hot_encode(input_sequence)[None, :]
    y = np.mean([models[m].predict(x) for m in range(5)], axis=0)
    acceptor_prob = y[0, :, 1]
    donor_prob = y[0, :, 2]
    return acceptor_prob, donor_prob



2023-11-09 16:55:25.202764: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
!nvidia-smi

Thu Nov  9 16:55:32 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:5E:00.0 Off |                    0 |
| N/A   34C    P0    35W / 250W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000000:86:00.0 Off |                    0 |
| N/A   32C    P0    34W / 250W |      0MiB / 32510MiB |      2%      Defaul

In [3]:

###############################################################################
# Model
###############################################################################

L = 32
N_GPUS = 8

#if int(sys.argv[1]) == 80:
#    W = np.asarray([11, 11, 11, 11])
#    AR = np.asarray([1, 1, 1, 1])
#    BATCH_SIZE = 18*N_GPUS
#elif int(sys.argv[1]) == 400:
#    W = np.asarray([11, 11, 11, 11, 11, 11, 11, 11])
#    AR = np.asarray([1, 1, 1, 1, 4, 4, 4, 4])
#    BATCH_SIZE = 18*N_GPUS
#elif int(sys.argv[1]) == 2000:
#    W = np.asarray([11, 11, 11, 11, 11, 11, 11, 11,
#                    21, 21, 21, 21])
#    AR = np.asarray([1, 1, 1, 1, 4, 4, 4, 4,
#                     10, 10, 10, 10])
#    BATCH_SIZE = 12*N_GPUS
#elif int(sys.argv[1]) == 10000:
W = np.asarray([11, 11, 11, 11, 11, 11, 11, 11,
                21, 21, 21, 21, 41, 41, 41, 41])
AR = np.asarray([1, 1, 1, 1, 4, 4, 4, 4,
                 10, 10, 10, 10, 25, 25, 25, 25])
BATCH_SIZE = 6*N_GPUS
# Hyper-parameters:
# L: Number of convolution kernels
# W: Convolution window size in each residual unit
# AR: Atrous rate in each residual unit

CL = 2 * np.sum(AR*(W-1))

In [4]:
CL_max=10000
# Maximum nucleotide context length (CL_max/2 on either side of the 
# position of interest)
# CL_max should be an even number

SL=5000
# Sequence length of SpliceAIs (SL+CL will be the input length and
# SL will be the output length)

splice_table='/odinn/tmp/benediktj/Data/SplicePrediction/annotation_ensembl_v87_train.txt'
ref_genome='/odinn/tmp/benediktj/SpliceAITrainingCode/genome.fa'
# Input details

data_dir='/odinn/tmp/benediktj/Data/SplicePrediction/'


In [5]:
assert CL_max % 2 == 0

IN_MAP = np.asarray([[0, 0, 0, 0],
                     [1, 0, 0, 0],
                     [0, 1, 0, 0],
                     [0, 0, 1, 0],
                     [0, 0, 0, 1]])
# One-hot encoding of the inputs: 0 is for padding, and 1, 2, 3, 4 correspond
# to A, C, G, T respectively.

OUT_MAP = np.asarray([[1, 0, 0],
                      [0, 1, 0],
                      [0, 0, 1],
                      [0, 0, 0]])
# One-hot encoding of the outputs: 0 is for no splice, 1 is for acceptor,
# 2 is for donor and -1 is for padding.


def ceil_div(x, y):
    return int(ceil(float(x)/y))


def create_datapoints(seq, strand, tx_start, tx_end, jn_start, jn_end):
    # This function first converts the sequence into an integer array, where
    # A, C, G, T, N are mapped to 1, 2, 3, 4, 0 respectively. If the strand is
    # negative, then reverse complementing is done. The splice junctions 
    # are also converted into an array of integers, where 0, 1, 2, -1 
    # correspond to no splicing, acceptor, donor and missing information
    # respectively. It then calls reformat_data and one_hot_encode
    # and returns X, Y which can be used by Keras models.

    seq = 'N'*(CL_max//2) + seq[CL_max//2:-CL_max//2] + 'N'*(CL_max//2)
    # Context being provided on the RNA and not the DNA

    seq = seq.upper().replace('A', '1').replace('C', '2')
    seq = seq.replace('G', '3').replace('T', '4').replace('N', '0')

    tx_start = int(tx_start)
    tx_end = int(tx_end) 

    jn_start = map(lambda x: map(int, re.split(',', x)[:-1]), jn_start)
    jn_end = map(lambda x: map(int, re.split(',', x)[:-1]), jn_end)

    if strand == '+':

        X0 = np.asarray(map(int, list(seq)))
        Y0 = [-np.ones(tx_end-tx_start+1) for t in range(1)]

        for t in range(1):
            
            if len(jn_start[t]) > 0:
                Y0[t] = np.zeros(tx_end-tx_start+1)
                for c in jn_start[t]:
                    if tx_start <= c <= tx_end:
                        Y0[t][c-tx_start] = 2
                for c in jn_end[t]:
                    if tx_start <= c <= tx_end:
                        Y0[t][c-tx_start] = 1
                    # Ignoring junctions outside annotated tx start/end sites
                     
    elif strand == '-':

        X0 = (5-np.asarray(map(int, list(seq[::-1])))) % 5  # Reverse complement
        Y0 = [-np.ones(tx_end-tx_start+1) for t in range(1)]

        for t in range(1):

            if len(jn_start[t]) > 0:
                Y0[t] = np.zeros(tx_end-tx_start+1)
                for c in jn_end[t]:
                    if tx_start <= c <= tx_end:
                        Y0[t][tx_end-c] = 2
                for c in jn_start[t]:
                    if tx_start <= c <= tx_end:
                        Y0[t][tx_end-c] = 1

    Xd, Yd = reformat_data(X0, Y0)
    X, Y = one_hot_encode(Xd, Yd)

    return X, Y


def reformat_data(X0, Y0):
    # This function converts X0, Y0 of the create_datapoints function into
    # blocks such that the data is broken down into data points where the
    # input is a sequence of length SL+CL_max corresponding to SL nucleotides
    # of interest and CL_max context nucleotides, the output is a sequence of
    # length SL corresponding to the splicing information of the nucleotides
    # of interest. The CL_max context nucleotides are such that they are
    # CL_max/2 on either side of the SL nucleotides of interest.

    num_points = ceil_div(len(Y0[0]), SL)

    Xd = np.zeros((num_points, SL+CL_max))
    Yd = [-np.ones((num_points, SL)) for t in range(1)]

    X0 = np.pad(X0, [0, SL], 'constant', constant_values=0)
    Y0 = [np.pad(Y0[t], [0, SL], 'constant', constant_values=-1)
         for t in range(1)]

    for i in range(num_points):
        Xd[i] = X0[SL*i:CL_max+SL*(i+1)]

    for t in range(1):
        for i in range(num_points):
            Yd[t][i] = Y0[t][SL*i:SL*(i+1)]

    return Xd, Yd


def clip_datapoints(X, Y, CL, N_GPUS):
    # This function is necessary to make sure of the following:
    # (i) Each time model_m.fit is called, the number of datapoints is a
    # multiple of N_GPUS. Failure to ensure this often results in crashes.
    # (ii) If the required context length is less than CL_max, then
    # appropriate clipping is done below.
    # Additionally, Y is also converted to a list (the .h5 files store 
    # them as an array).

    rem = X.shape[0]%N_GPUS
    clip = (CL_max-CL)//2

    if rem != 0 and clip != 0:
        return X[:-rem, clip:-clip], [Y[t][:-rem] for t in range(1)]
    elif rem == 0 and clip != 0:
        return X[:, clip:-clip], [Y[t] for t in range(1)]
    elif rem != 0 and clip == 0:
        return X[:-rem], [Y[t][:-rem] for t in range(1)]
    else:
        return X, [Y[t] for t in range(1)]


def one_hot_encode(Xd, Yd):

    return IN_MAP[Xd.astype('int8')], \
           [OUT_MAP[Yd[t].astype('int8')] for t in range(1)]


def print_topl_statistics(y_true, y_pred):
    # Prints the following information: top-kL statistics for k=0.5,1,2,4,
    # auprc, thresholds for k=0.5,1,2,4, number of true splice sites.

    idx_true = np.nonzero(y_true == 1)[0]
    argsorted_y_pred = np.argsort(y_pred)
    sorted_y_pred = np.sort(y_pred)

    topkl_accuracy = []
    threshold = []

    for top_length in [0.5, 1, 2, 4]:

        idx_pred = argsorted_y_pred[-int(top_length*len(idx_true)):]

        topkl_accuracy += [np.size(np.intersect1d(idx_true, idx_pred)) \
                  / float(min(len(idx_pred), len(idx_true)))]
        threshold += [sorted_y_pred[-int(top_length*len(idx_true))]]

    auprc = average_precision_score(y_true, y_pred)

    print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
          np.round(topkl_accuracy[0],4), np.round(topkl_accuracy[1],4), np.round(topkl_accuracy[2],4),
          np.round(topkl_accuracy[3],4), np.round(auprc,4), np.round(threshold[0],4), np.round(threshold[1],4),
          np.round(threshold[2],4), np.round(threshold[3],4), len(idx_true)))


In [6]:
def ResidualUnit(l, w, ar):
    # Residual unit proposed in "Identity mappings in Deep Residual Networks"
    # by He et al.

    def f(input_node):

        bn1 = BatchNormalization()(input_node)
        act1 = Activation('relu')(bn1)
        conv1 = Conv1D(l, [w], dilation_rate=[ar], padding='same')(act1)
        bn2 = BatchNormalization()(conv1)
        act2 = Activation('relu')(bn2)
        conv2 = Conv1D(l, [w], dilation_rate=[ar], padding='same')(act2)
        output_node = add([conv2, input_node])

        return output_node

    return f


def SpliceAI(L, W, AR):
    # L: Number of convolution kernels
    # W: Convolution window size in each residual unit
    # AR: Atrous rate in each residual unit

    assert len(W) == len(AR)

    CL = 2 * np.sum(AR*(W-1))

    input0 = Input(shape=(None, 4))
    conv = Conv1D(L, 1)(input0)
    skip = Conv1D(L, 1)(conv)

    for i in range(len(W)):
        conv = ResidualUnit(L, W[i], AR[i])(conv)
        
        if (((i+1) % 4 == 0) or ((i+1) == len(W))):
            # Skip connections to the output after every 4 residual units
            dense = Conv1D(L, 1)(conv)
            skip = add([skip, dense])

    skip = Cropping1D(int(CL/2))(skip)

    output0 = [[] for t in range(1)]

    for t in range(1):
        output0[t] = Conv1D(3, 1, activation='softmax')(skip)
    
    model = Model(inputs=input0, outputs=output0)

    return model


def categorical_crossentropy_2d(y_true, y_pred):
    # Standard categorical cross entropy for sequence outputs
    weights = [3.33445928e-01, 1.97431150e+03, 1.97432843e+03]
    return - kb.mean(weights[0]*y_true[:, :, 0]*kb.log(y_pred[:, :, 0]+1e-10)
                   + weights[1]*y_true[:, :, 1]*kb.log(y_pred[:, :, 1]+1e-10)
                   + weights[2]*y_true[:, :, 2]*kb.log(y_pred[:, :, 2]+1e-10))


In [7]:
def make_parallel(model, gpu_count):

    def get_slice(data, idx, parts):

        shape = tf.shape(data)
        stride = tf.concat([shape[:1]//parts, shape[1:]*0], 0)
        start = stride * idx

        size = tf.concat([shape[:1]//parts, shape[1:]], 0) 
        # Split the batch into equal parts 

        return tf.slice(data, start, size)

    outputs_all = []
    for i in range(len(model.outputs)):
        outputs_all.append([])

    # Place a copy of the model on each GPU, each getting a slice of the batch
    for i in range(gpu_count):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('tower_%d' % i) as scope:

                inputs = []
                # Slice each input into a piece for processing on this GPU
                for x in model.inputs:
                    input_shape = tuple(x.get_shape().as_list())[1:]
                    slice_n = Lambda(get_slice, output_shape=input_shape,
                                  arguments={'idx': i, 'parts': gpu_count})(x)
                    inputs.append(slice_n)

                outputs = model(inputs)
                
                if not isinstance(outputs, list):
                    outputs = [outputs]
                
                # Save all the outputs for merging back together later
                for l in range(len(outputs)):
                    outputs_all[l].append(outputs[l])

    # Merge outputs on CPU
    with tf.device('/cpu:0'):
        
        merged = []
        for outputs in outputs_all:
            merged.append(concatenate(outputs, axis=0))
            
        return Model(inputs=model.inputs, outputs=merged)

In [8]:


###############################################################################
# Training and validation
###############################################################################
h5f = h5py.File('/odinn/tmp/benediktj/Data/SplicePrediction/dataset_train_.h5', 'r')

In [9]:
num_idx = len(h5f.keys())//2
idx_all = np.random.permutation(num_idx)
idx_train = idx_all[:int(0.9*num_idx)]
idx_valid = idx_all[int(0.9*num_idx):]

EPOCH_NUM = 10*len(idx_train)

n_null = 0
n_acceptor = 0
n_donor = 0
for idx in idx_train:
    y = h5f['Y' + str(idx)][:]
    n_null += np.sum(y[0,:,:,0]==1)
    n_acceptor += np.sum(y[0,:,:,1]==1)
    n_donor += np.sum(y[0,:,:,2]==1)

KeyboardInterrupt: 

In [None]:
weights = (n_null+n_acceptor+n_donor) / (3 * np.array([n_null,n_acceptor,n_donor]))

In [None]:
weights

In [None]:
start_time = time.time()


In [None]:
h5f.close()

In [None]:
from pkg_resources import resource_filename

In [16]:
h5f = h5py.File('/odinn/tmp/benediktj/Data/SplicePrediction/dataset_test_.h5', 'r')


num_idx = len(h5f.keys())//2

#model = load_model('../Results/TF_Models/SpliceAI_{}_c.h5'.format(0))
model = []
n_models = 5

paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
model = [load_model(resource_filename('spliceai', x)) for x in paths]


###############################################################################
# Model testing
###############################################################################

start_time = time.time()

output_class_labels = ['Null', 'Acceptor', 'Donor']
# The three neurons per output correspond to no splicing, splice acceptor (AG)
# and splice donor (GT) respectively.

for output_class in [1, 2]:

    Y_true = [[] for t in range(1)]
    Y_pred = [[] for t in range(1)]

    for idx in range(num_idx):

        X = h5f['X' + str(idx)][:]
        Y = h5f['Y' + str(idx)][:]

        Xc, Yc = clip_datapoints(X, Y, CL, 1)

        Yps = [np.zeros(Yc[0].shape) for t in range(1)]

        for v in range(n_models):
            Yp = model[v].predict(Xc, batch_size=BATCH_SIZE)

            if not isinstance(Yp, list):
                Yp = [Yp]

            for t in range(1):
                Yps[t] += Yp[t]/n_models
        # Ensemble averaging (mean of the ensemble predictions is used)

        for t in range(1):

            is_expr = (Yc[t].sum(axis=(1,2)) >= 1)

            Y_true[t].extend(Yc[t][is_expr, :, output_class].flatten())
            Y_pred[t].extend(Yps[t][is_expr, :, output_class].flatten())

    print("\n\033[1m{}:\033[0m".format(output_class_labels[output_class]))

    Y_true[0] = np.asarray(Y_true[0])
    Y_pred[0] = np.asarray(Y_pred[0])
    
    print_topl_statistics(Y_true[0], Y_pred[0])
    
    #chunkSize = Y_true[0].shape[0]/10
    
    #for idx in range(10):
    #    print_topl_statistics(Y_true[0][int(chunkSize*idx):int(chunkSize*(idx+1))], Y_pred[0][int(chunkSize*idx):int(chunkSize*(idx+1))])


h5f.close()

print("--- {} seconds ---".format(time.time() - start_time))
print("--------------------------------------------------------------")

2021-11-29 11:48:39.947913: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-29 11:48:40.855332: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14639 MB memory:  -> device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0000:5e:00.0, compute capability: 7.0
2021-11-29 11:48:40.861177: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 30989 MB memory:  -> device: 1, name: Tesla V100-PCIE-32GB, pci bus id: 0000:86:00.0, compute capability: 7.0




2021-11-29 11:48:47.530075: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-29 11:48:56.742389: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8204



[1mAcceptor:[0m
0.9763	0.9121	0.9862	0.9939	0.9514	0.9891	0.7128	0.1227	0.026	89712

[1mDonor:[0m
0.98	0.9197	0.9896	0.9959	0.9578	0.991	0.734	0.1187	0.0238	89712
--- 5325.377207040787 seconds ---
--------------------------------------------------------------


For data provided with paper: 

Acceptor PR-AUC = 0.9587

Donor PR-AUC = 0.965

In [22]:
OUT_MAP = np.asarray([[1, 0, 0],
                      [0, 1, 0],
                      [0, 0, 1],
                      [0, 0, 0]])

class spliceDataset(Dataset):
    def __init__(self, annotation, transform=None, target_transform=None):
        #self.data = data
        #self.labelDict = labelDict
        self.annotation = annotation
        self.transform = transform
        self.target_transform = target_transform
        

    def __len__(self):
        return self.annotation.shape[0]
    

    def __getitem__(self, idx):
        transcript,chrom,strand,tx_start,tx_end = self.annotation['transcript'].values[idx],self.annotation['chrom'].values[idx],self.annotation['strand'].values[idx],self.annotation['tx_start'].values[idx],self.annotation['tx_end'].values[idx]
        #gene,chrom,strand = self.annotation['gene'].values[idx],self.annotation['chrom'].values[idx],self.annotation['strand'].values[idx]
        
        length = tx_end-tx_start
        num_points = ceil_div(length, SL)
        
        Xd = np.zeros((num_points, SL+CL_max,4))
        Yd = np.zeros((num_points, SL,3))
        if strand=='+':
            X0 = seqData[chrom][int(tx_start)-1-CL_max//2:int(tx_end)+SL+CL_max//2].toarray()
            X0[:CL_max//2,:] = np.array([0,0,0,0,0])
            X0[-(CL_max//2+SL):,:] = np.array([0,0,0,0,0])
        else:
            X0 = seqData[chrom][int(tx_start)-1-SL-CL_max//2:int(tx_end)+CL_max//2].toarray()
            X0[:(CL_max//2+SL),:] = np.array([0,0,0,0,0])
            X0[-CL_max//2:,:] = np.array([0,0,0,0,0])
            X0 = X0[::-1,:]
            X0[:,[0,1,2,3]] = X0[:,[0,1,2,3]][:,::-1]
            
        label = transcriptToLabel[transcript]
        Y0 = np.zeros((length+SL,3))
        Y0[:length,0] = np.ones(length)
        Y0[label[1],:] = OUT_MAP[np.array(label[0]).astype('int8')]

        for i in range(num_points):
            tmp = X0[SL*i:CL_max+SL*(i+1),:]
            if tmp.shape[0]==0:
                continue
            Xd[i,:,:4] = tmp[:,:4]
            Yd[i,:,:] = Y0[SL*i:SL*(i+1),:]
        
        if self.transform:
            X = self.transform(X)
        if self.target_transform:
            label = self.target_transform(label)
        
        #X = torch.Tensor(X.copy())
        #print(X.shape)
        #X = torch.nn.functional.pad(X.clone(), (0,0,CL_max//2,CL_max//2+X.shape[0]-SL*(X.shape[0]//SL)), "constant", 0)
        #X = torch.transpose(X.unfold(0,SL,CL_max//2),1,2)
        #print(X.shape)
        #Y = torch.transpose(torch.Tensor(Y).unfold(0,SL,CL_max//2),1,2)
        return Xd,Yd
        #if self.include_prob:
        #    return X, [Y[t] for t in range(1)],[Y_prob[t] for t in range(1)]
        #else:
        #    return X, [Y[t] for t in range(1)]

def ceil_div(x, y):
    return int(ceil(float(x)/y))

def collate_fn(data):
    """
       data: is a list of tuples with (example, label)
             where 'example' is a tensor of arbitrary shape
             and label/length are scalars
    """
    #unfold1 = nn.Unfold((SL*3,1),SL,CL_max//2)
    #unfold2 = nn.Unfold((SL,1),SL,0)
    features = []
    labels = []
    for i in range(len(data)):
        features.append(torch.Tensor(data[i][0]))
        labels.append(torch.Tensor(data[i][1]))
        #features.append(tmp.unfold(0,SL*3,CL_max//2))
        #labels.append(torch.Tensor(data[i][1]).unfold(0,SL,CL_max//2))
    return torch.cat(features,dim=0).float(), torch.cat(labels,dim=0).float()

#train_dataset = spliceDataset(annotation_train)
#val_dataset = spliceDataset(annotation_validation)

#train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=100, shuffle=True, num_workers=16,collate_fn=collate_fn, pin_memory=True)
#val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=100, shuffle=False,collate_fn=collate_fn, num_workers=16)

data_dir = '/odinn/tmp/benediktj/Data/SplicePrediction-050422'
setType = 'test'
#with open('{}/sparse_sequence_data_{}.pickle'.format(data_dir,setType), 'rb') as handle:
#    seqData = pickle.load(handle)
    
with open('{}/sparse_discrete_label_data_{}.pickle'.format(data_dir,setType), 'rb') as handle:
    transcriptToLabel = pickle.load(handle)
    
    
CHROM_TEST = ['chr1', 'chr3', 'chr5', 'chr7', 'chr9']

annotation = pd.read_csv(data_dir+'/annotation_ensembl_v87_{}.txt'.format(setType),sep='\t',header=None)[[0,1,2,3,4]]
annotation.columns = ['name','chrom','strand','tx_start','tx_end']
annotation['transcript'] = annotation['name'].apply(lambda x: x.split('---')[-2].split('.')[0]).values
annotation['gene'] = annotation['name'].apply(lambda x: x.split('---')[-3].split('.')[0]).values
#annotation['support'] = annotation['transcript'].apply(lambda x:transcriptToSupport[x])

chrom_paths = glob(data_dir+'/sparse_sequence_data/*')
chromToPath = {}
for path in chrom_paths:
    chromToPath[path.split('/')[-1].split('_')[0]] = path
    
seqData = {}
for chrom in CHROM_TEST:
    seqData[chrom] = load_npz(data_dir+'/sparse_sequence_data/{}_{}.npz'.format(chrom,setType)).tocsr()

    


In [19]:
#!pip install spliceai

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.decode.is/simple
Collecting spliceai
  Downloading https://pypi.decode.is/packages/d6/2b/9dbf72fdd948cd606c21826cc3735a5beea52633dab72d95d9936a9454d4/spliceai-1.3.1-py2.py3-none-any.whl (16.7 MB)
[K     |████████████████████████████████| 16.7 MB 97.5 MB/s eta 0:00:01s eta 0:00:01��██████████████▋             | 9.7 MB 97.5 MB/s eta 0:00:01��████████████████▊       | 12.9 MB 97.5 MB/s eta 0:00:01
Collecting pysam>=0.10.0
  Downloading https://pypi.decode.is/packages/41/0f/4ffdea619a8898c936f6f1b909759486167976d761fc265f931119f0da1e/pysam-0.18.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (16.2 MB)
[K     |████████████████████████████████| 16.2 MB 2.7 MB/s eta 0:00:01��█████████████████████████▊  | 15.0 MB 2.7 MB/s eta 0:00:01
Installing collected packages: pysam, spliceai
Successfully installed pysam-0.18.0 spliceai-1.3.1
You should consider upgrading via the '/us

In [25]:
model = []
n_models = 5

paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
model = [load_model(resource_filename('spliceai', x)) for x in paths]

device = torch.device("cpu")

Y_true_acceptor, Y_pred_acceptor = [],[]
Y_true_donor, Y_pred_donor = [],[]
test_dataset = spliceDataset(annotation)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100, shuffle=False, num_workers=0,collate_fn=collate_fn, pin_memory=False)


for (batch_chunks,target_chunks) in tqdm(test_loader):
    Xc = batch_chunks.to(device).numpy()
    targets = torch.squeeze(target_chunks.to(device),0).numpy()
    outputs = []
    for v in range(n_models):
            Yp = model[v].predict(Xc, batch_size=BATCH_SIZE)
            outputs.append(Yp)
    outputs = (outputs[0]+outputs[1]+outputs[2]+outputs[3]+outputs[4])/n_models
    #outputs = (outputs[0]+outputs[1]+outputs[2])/n_models

    is_expr = (targets.sum(axis=(1,2)) >= 1)
    Y_true_acceptor.extend(targets[is_expr, :, 1].flatten())
    Y_true_donor.extend(targets[is_expr, :, 2].flatten())
    Y_pred_acceptor.extend(outputs[is_expr, :, 1].flatten())
    Y_pred_donor.extend(outputs[is_expr, :, 2].flatten())
    
Y_true_acceptor, Y_pred_acceptor,Y_true_donor, Y_pred_donor = np.array(Y_true_acceptor), np.array(Y_pred_acceptor),np.array(Y_true_donor), np.array(Y_pred_donor)
print("\n\033[1m{}:\033[0m".format('Acceptor'))
acceptor_val_results = print_topl_statistics(Y_true_acceptor, Y_pred_acceptor)
print("\n\033[1m{}:\033[0m".format('Donor'))
donor_val_results =print_topl_statistics(Y_true_donor, Y_pred_donor)



  0%|                                                                                                                                                                                           | 0/90 [00:00<?, ?it/s]2022-04-20 16:02:08.137901: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2022-04-20 16:02:09.888944: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [22:31<00:00, 15.02s/it]



[1mAcceptor:[0m
0.9763	0.9121	0.9862	0.9939	0.9514	0.9890999794006348	0.7127000093460083	0.12269999831914902	0.026000000536441803	89712

[1mDonor:[0m
0.98	0.9197	0.9896	0.9959	0.9578	0.9909999966621399	0.734000027179718	0.11869999766349792	0.023800000548362732	89712


In [26]:
(0.9514+0.9578)/2

0.9546

In [8]:
setType = 'test'
annotation_test, gene_to_label, seqData = get_GTEX_v8_Data('/odinn/tmp/benediktj/Data/SplicePrediction-GTEX-V8', setType,'annotation_GTEX_v8.txt')

In [9]:
#BATCH_SIZE = 32

In [10]:
model = []
n_models = 5

paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
model = [load_model(resource_filename('spliceai', x)) for x in paths]

device = torch.device("cpu")

Y_true_acceptor, Y_pred_acceptor = [],[]
Y_true_donor, Y_pred_donor = [],[]
test_dataset = spliceDataset(getDataPointListGTEX(annotation_test,gene_to_label,SL,CL_max,shift=SL))
test_dataset.seqData = seqData
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100, shuffle=False, num_workers=0, pin_memory=False)

ce_2d = []
for (batch_features ,targets) in tqdm(test_loader):
    Xc = torch.transpose(batch_features.type(torch.FloatTensor).to(device),1,2).numpy()
    targets = torch.transpose(targets.to(device)[:,:,CL_max//2:-CL_max//2],1,2).numpy()
    outputs = []
    for v in range(n_models):
            Yp = model[v].predict(Xc, batch_size=BATCH_SIZE)
            outputs.append(Yp)
    outputs = (outputs[0]+outputs[1]+outputs[2]+outputs[3]+outputs[4])/n_models

    #targets = torch.transpose(targets,1,2).cpu().numpy()
    #outputs = torch.transpose(outputs,1,2).cpu().numpy()
    ce_2d.append(cross_entropy_2d(targets,outputs))

    is_expr = (targets.sum(axis=(1,2)) >= 1)
    Y_true_acceptor.extend(targets[is_expr, :, 1].flatten())
    Y_true_donor.extend(targets[is_expr, :, 2].flatten())
    Y_pred_acceptor.extend(outputs[is_expr, :, 1].flatten())
    Y_pred_donor.extend(outputs[is_expr, :, 2].flatten())

2022-12-15 17:22:05.759506: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-12-15 17:22:07.604101: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1666] Found device 0 with properties: 
name: Tesla V100-PCIE-32GB major: 7 minor: 0 memoryClockRate(GHz): 1.38
pciBusID: 0000:86:00.0
2022-12-15 17:22:07.604147: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2022-12-15 17:22:07.610867: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2022-12-15 17:22:07.613225: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10
2022-12-15 17:22:07.613543: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10
2022-12-15 17:22:07.614054: I tensorflow/stream_executor/platform/def



  0%|                                                                                                                                                                                                          | 0/743 [00:00<?, ?it/s]2022-12-15 17:22:31.480981: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2022-12-15 17:23:07.017227: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 743/743 [22:03<00:00,  1.78s/it]


In [13]:
from src.evaluation_metrics import print_topl_statistics

In [14]:
mean_ce = np.mean(ce_2d)
print('Cross entropy = {}'.format(mean_ce))
Y_true_acceptor, Y_pred_acceptor,Y_true_donor, Y_pred_donor = np.array(Y_true_acceptor), np.array(Y_pred_acceptor),np.array(Y_true_donor), np.array(Y_pred_donor)
print("\n\033[1m{}:\033[0m".format('Acceptor'))
acceptor_val_results = print_topl_statistics(Y_true_acceptor, Y_pred_acceptor)
print("\n\033[1m{}:\033[0m".format('Donor'))
donor_val_results =print_topl_statistics(Y_true_donor, Y_pred_donor)

Cross entropy = 0.0006880524325870373

[1mAcceptor:[0m
0.9954	0.7482	0.8785	0.9518	0.8378	0.8901	0.1673	0.0352	0.0070	67035	89600.0	89600

[1mDonor:[0m
0.9949	0.7459	0.8748	0.9471	0.834	0.8917	0.1578	0.0311	0.0058	68077	91272.0	91272


In [17]:
67035+68077

135112

In [16]:
89600+91272

180872

In [10]:
from src.dataloader import get_GTEX_v8_Data
data_dir = '/odinn/tmp/benediktj/Data/SplicePrediction-rnasplice-blood-070623/'
setType = 'test'
annotation_test, gene_to_label, seqData = get_GTEX_v8_Data(data_dir, setType,'annotation_GTEX_v8.txt')

model = []
n_models = 5

paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
model = [load_model(resource_filename('spliceai', x)) for x in paths]

device = torch.device("cpu")

Y_true_acceptor, Y_pred_acceptor = [],[]
Y_true_donor, Y_pred_donor = [],[]
test_dataset = spliceDataset(getDataPointListGTEX(annotation_test,gene_to_label,SL,CL_max,shift=SL))
test_dataset.seqData = seqData
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100, shuffle=False, num_workers=0, pin_memory=False)

ce_2d = []
for (batch_features ,targets) in tqdm(test_loader):
    Xc = torch.transpose(batch_features.type(torch.FloatTensor).to(device),1,2).numpy()
    targets = torch.transpose(targets.to(device)[:,:,CL_max//2:-CL_max//2],1,2).numpy()
    outputs = []
    for v in range(n_models):
            Yp = model[v].predict(Xc, batch_size=BATCH_SIZE)
            outputs.append(Yp)
    outputs = (outputs[0]+outputs[1]+outputs[2]+outputs[3]+outputs[4])/n_models

    #targets = torch.transpose(targets,1,2).cpu().numpy()
    #outputs = torch.transpose(outputs,1,2).cpu().numpy()
    ce_2d.append(cross_entropy_2d(targets,outputs))

    is_expr = (targets.sum(axis=(1,2)) >= 1)
    Y_true_acceptor.extend(targets[is_expr, :, 1].flatten())
    Y_true_donor.extend(targets[is_expr, :, 2].flatten())
    Y_pred_acceptor.extend(outputs[is_expr, :, 1].flatten())
    Y_pred_donor.extend(outputs[is_expr, :, 2].flatten())

2023-11-09 16:57:09.506806: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-11-09 16:57:11.529890: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1666] Found device 0 with properties: 
name: Tesla V100-PCIE-32GB major: 7 minor: 0 memoryClockRate(GHz): 1.38
pciBusID: 0000:86:00.0
2023-11-09 16:57:11.529940: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2023-11-09 16:57:13.321761: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2023-11-09 16:57:13.861930: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10
2023-11-09 16:57:14.443840: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10
2023-11-09 16:57:14.523518: I tensorflow/stream_executor/platform/def



  0%|                                                                                                                                               | 0/771 [00:00<?, ?it/s]2023-11-09 16:58:12.721384: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2023-11-09 16:58:57.308700: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 771/771 [24:05<00:00,  1.87s/it]


In [9]:
!ls /odinn/tmp/benediktj/Data/SplicePrediction-rnasplice-blood-070623/

ls: cannot access '/odinn/tmp/benediktj/Data/SplicePrediction-rnasplice-blood-070623/': No such file or directory


In [11]:
from src.evaluation_metrics import print_topl_statistics
mean_ce = np.mean(ce_2d)
print('Cross entropy = {}'.format(mean_ce))
Y_true_acceptor, Y_pred_acceptor,Y_true_donor, Y_pred_donor = np.array(Y_true_acceptor), np.array(Y_pred_acceptor),np.array(Y_true_donor), np.array(Y_pred_donor)
print("\n\033[1m{}:\033[0m".format('Acceptor'))
acceptor_val_results = print_topl_statistics(Y_true_acceptor, Y_pred_acceptor)
print("\n\033[1m{}:\033[0m".format('Donor'))
donor_val_results =print_topl_statistics(Y_true_donor, Y_pred_donor)

Cross entropy = 0.000810858211367834

[1mAcceptor:[0m
0.9932	0.7309	0.8653	0.9429	0.8196	0.8205	0.1450	0.0307	0.0061	72266	98870.0	98870

[1mDonor:[0m
0.9932	0.7332	0.8657	0.9397	0.82	0.8302	0.1380	0.0272	0.0050	73400	100114.0	100114


In [12]:
(0.7309+0.7332)/2

0.73205

In [13]:
(0.8196+0.82)/2

0.8198

In [14]:
72266+73400

145666