In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
colnames=['Chrom', 'Database', 'Annotation', 'Start','End','Score','Strand','Phase','Notes'] 
annotation_df = pd.read_csv('/home/amber/multitask_RNA/data/annotation/gencode.v40.annotation.gtf',
                            sep='\t',skiprows=5,names=colnames,header=None)
annotation_df = annotation_df.drop('Phase', 1)
annotation_df = annotation_df.drop('Score', 1)
annotation_df['GeneID'] = annotation_df['Notes'].apply(lambda x: x.split('"')[1])
annotation_df['TranscriptID'] = annotation_df['Notes'].apply(lambda x : x.split('"')[3])

  annotation_df = annotation_df.drop('Phase', 1)
  annotation_df = annotation_df.drop('Score', 1)


In [7]:
trans_id_list = []
gene_list = annotation_df['GeneID'].unique()
all_transcript_df = annotation_df[annotation_df['Annotation'] == 'transcript']
for gene in tqdm(gene_list,total=len(gene_list)):
    transcript_df = all_transcript_df[all_transcript_df['GeneID']==gene]
    trans_length = transcript_df['End'] - transcript_df['Start']
    max_trans_index = np.argmax(trans_length)
    max_trans = transcript_df.iloc[max_trans_index]
    max_id = max_trans['Notes'].split('"')[3]
    trans_id_list.append(max_id)

100%|████████████████████████████████████| 61544/61544 [37:39<00:00, 27.24it/s]


In [8]:
gene_flag = (annotation_df['Annotation']=='gene')
transcript_flag = (annotation_df['TranscriptID'].isin(trans_id_list))
cannonical_flag = np.array(gene_flag) + np.array(transcript_flag)

In [9]:
cannonical_df = annotation_df[cannonical_flag]

In [10]:
len(cannonical_df)

696277

In [12]:

cannonical_df.to_csv('./data/annotation/cannonical_annotation.csv',sep='\t',
                     columns=['Chrom', 'Database', 'Annotation', 'Start','End','Strand','GeneID','TranscriptID'])

## Creating corresponding label from selected cannonical annotations

In [13]:
import pandas as pd 
import numpy as np

In [14]:
annot_df = pd.read_csv('./data/annotation/cannonical_annotation.csv',sep='\t',index_col=0)
annot_df=annot_df[annot_df['Annotation'] != 'gene']

In [15]:
len(annot_df)

634733

In [17]:
#create label for 3'/5' splice sites, UTR, CDS, Exon, or non of the above
#Use cannonical transcript(longest transcript per gene)
def format_list(label_list):
    if len(label_list)==0:
        return 'NA'
    else: return ','.join(list(map(str, label_list)))

table_content=['Gene','Chr','Strand','Start','End',"Donor","Acceptor","UTR",'CDS','Exon']
with open('./data/annotation/annot_label.csv','w') as file:
    for index,row in annot_df.iterrows():
        start = row['Start']
        end = row['End']
        if row['Annotation'] == 'transcript':
            if index != 1:
                table_content=[transcript_info[0],
                             str(transcript_info[1]),
                              str(transcript_info[2]),
                               str(transcript_info[3]),
                               str(transcript_info[4]),
                              format_list(donor_list[:-1]),
                              format_list(acceptor_list[1:]),
                              format_list(utr_list),
                              format_list(cds_list),
                              format_list(exon_list)]
            file.write('\t'.join(table_content)+'\n')
            utr_list = []
            cds_list = []
            acceptor_list = []
            donor_list = []
            exon_list = []
            transcript_info = [row['GeneID'],row['Chrom'],row['Strand'],start,end]
        elif row['Annotation'] == 'UTR':
            utr_list.append("%s-%s" %(start,end))
        elif row['Annotation'] == 'CDS':
            cds_list.append("%s-%s" %(start,end))
        elif row['Annotation'] == 'exon':
            exon_list.append("%s-%s" %(start,end))
            if row['Strand'] == '+':
                acceptor_list.append(start)
                donor_list.append(end)
            elif row['Strand'] == '-':
                acceptor_list.append(end)
                donor_list.append(start)
    file.write('\t'.join(table_content)+'\n')

In [18]:
! cat ./data/annotation/annot_label.csv | awk -v CLl=1 -v CLr=0 '{print $2"\t"($4-CLl)"\t"($5+CLr)}' > ./data/annotation/temp.bed
! sed -i '1d' ./data/annotation/temp.bed 
! grep -v "chrM" ./data/annotation/temp.bed > ./data/annotation/seq.bed
! rm ./data/annotation/temp.bed
!bedtools getfasta -bed ./data/annotation/seq.bed -fi /home/amber/ref/hg38/hg38.fa -fo ./data/annotation/seq.txt -tab

## Create vector lable containing variable tasks list

In [14]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import h5py
import data_preprocess
padding = 80
seq_len = 5000
#Splice donor/acceptor is alwasy included. Other possible lables:
#[UTR,CDS,Exon]
task_list = []
task_count = len(task_list) + 3

In [15]:
label_df = pd.read_csv('./data/annotation/annot_label.csv',sep='\t')
label_df = label_df[label_df['Chr']!='chrM']
seq_file = open('./data/annotation/seq.txt', 'r').readlines()

In [16]:
#determine final sequenc size
train_chrom =['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7',
              'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
              'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 
              'chr22', 'chrX', 'chrY']
valid_chrom = ['chr8']
test_chrom = ['chr9']
train_c,valid_c,test_c=0,0,0
for i in range(len(label_df)):
    row = label_df.iloc[i]
    chrom = row['Chr']
    start = row['Start']
    end = row['End']
    length = end - start + 1
    count = -1 * (-length // seq_len)
    if chrom in train_chrom:
        train_c += count
    elif chrom in valid_chrom:
        valid_c += count
    elif chrom in test_chrom:
        test_c += count

In [17]:
file_name = str(seq_len) + '_'+str(padding)+'.h5'
h5f = h5py.File('./data/annotation/'+file_name, 'w')
x_train = h5f.create_dataset('x_train',shape=(train_c,padding+seq_len,4),dtype=np.int8)
x_valid = h5f.create_dataset('x_valid', shape=(valid_c,padding+seq_len,4),dtype=np.int8)
x_test = h5f.create_dataset('x_test', shape=(test_c,padding+seq_len,4),dtype=np.int8)
y_train = h5f.create_dataset('y_train', shape=(train_c,seq_len,task_count),dtype=np.int8)
y_valid = h5f.create_dataset('y_valid', shape=(valid_c,seq_len,task_count),dtype=np.int8)
y_test = h5f.create_dataset('y_test', shape=(test_c,seq_len,task_count),dtype=np.int8)

In [18]:
train_i = 0
valid_i = 0
test_i = 0
for idx in tqdm(range(len(label_df))):
    seq = seq_file[idx]
    row = label_df.iloc[idx]
    chrom = row['Chr']
    X,Y = data_preprocess.create_datapoints(seq,row,seq_len,padding,task_list)
    X = np.asarray(X,dtype=np.int8)
    Y = np.asarray(Y,dtype=np.int8)
    test_sum = np.sum(Y,axis=2)
    if chrom in train_chrom:
        x_train[train_i:train_i+len(X)]=X
        y_train[train_i:train_i+len(X)]=Y
        train_i = train_i+len(X)
    elif chrom in valid_chrom:
        x_valid[valid_i:valid_i+len(X)]=X
        y_valid[valid_i:valid_i+len(X)]=Y
        valid_i = valid_i+len(X)
    elif chrom in test_chrom:
        x_test[test_i:test_i+len(X)]=X
        y_test[test_i:test_i+len(X)]=Y
        test_i = test_i+len(X)
h5f.close()

100%|███████████████████████████████████| 61507/61507 [05:09<00:00, 198.50it/s]


In [6]:
h5f = h5py.File('./data/annotation/'+file_name, 'r')

In [7]:
test = np.argwhere(np.sum(h5f['y_train'],axis=2) > 1)

In [8]:
test

array([], shape=(0, 2), dtype=int64)

In [9]:
h5f.close()

## Convert SpliceAI data

In [1]:
import h5py
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
h5f = h5py.File('./SpliceAI80/dataset_train_all.h5', 'r')

In [3]:
num_idx = len(h5f.keys())//2
idx_all = np.random.permutation(num_idx)
idx_train = idx_all[:int(0.9*num_idx)]
idx_valid = idx_all[int(0.9*num_idx):]

In [4]:
train_x=[]
train_y=[]
for i in tqdm(idx_train):
    X = h5f['X' + str(i)][:]
    Y = h5f['Y' + str(i)][:]
    train_x.append(X)
    train_y.append(Y[0])

100%|███████████████████████████████████████| 119/119 [00:01<00:00, 106.19it/s]


In [5]:
x_train = np.concatenate(train_x)
y_train = np.concatenate(train_y)

In [6]:
valid_x=[]
valid_y=[]
for i in tqdm(idx_valid):
    X = h5f['X' + str(i)][:]
    Y = h5f['Y' + str(i)][:]
    valid_x.append(X)
    valid_y.append(Y[0])

100%|█████████████████████████████████████████| 14/14 [00:00<00:00, 106.23it/s]


In [7]:
x_valid = np.concatenate(valid_x)
y_valid = np.concatenate(valid_y)

In [9]:
h5f.close()

In [10]:
h5f = h5py.File('./SpliceAI80/dataset_test_0.h5', 'r')
num_idx = len(h5f.keys())//2

In [11]:
test_x=[]
test_y=[]
for i in tqdm(range(num_idx)):
    X = h5f['X' + str(i)][:]
    Y = h5f['Y' + str(i)][:]
    test_x.append(X)
    test_y.append(Y[0])
h5f.close()

100%|█████████████████████████████████████████| 16/16 [00:00<00:00, 119.90it/s]


In [12]:
x_test = np.concatenate(test_x)
y_test = np.concatenate(test_y)
y_test.shape

(16505, 5000, 3)

In [13]:
h5f = h5py.File('./data/annotation/spliceai_5000_80.h5', 'w')
h5f.create_dataset('x_train', data=np.asarray(x_train).astype('int8'))
h5f.create_dataset('x_valid', data=np.asarray(x_valid).astype('int8'))
h5f.create_dataset('x_test', data=np.asarray(x_test).astype('int8'))
h5f.create_dataset('y_train', data=np.asarray(y_train).astype('int8'))
h5f.create_dataset('y_valid', data=np.asarray(y_valid).astype('int8'))
h5f.create_dataset('y_test', data=np.asarray(y_test).astype('int8'))
h5f.close()