## Data generation code (overlapping k-mers)

In [40]:
import pandas as pd
import numpy as np
import math
import random
from tqdm import tqdm
import h5py
length = 510

In [2]:
# Load the annotation table (first column used as the index) and the
# per-transcript table; both are tab-separated with a header row.
annot_df = pd.read_csv('../../data/annotation/cannonical_annotation.csv',
                       sep='\t', header=0, index_col=0)
transcript_df = pd.read_csv('../../data/annotation/annot_label.csv',
                            sep='\t', header=0)

# Span of each transcript; the absolute value keeps it non-negative whichever
# of Start/End is larger.
transcript_df['length'] = np.absolute(transcript_df['Start'] - transcript_df['End'])

# Keep only exon/UTR rows. The explicit copy makes the column assignment below
# act on an independent frame rather than on a slice of the unfiltered table
# (avoids pandas' SettingWithCopyWarning / chained-assignment ambiguity).
annot_df = annot_df[annot_df['Annotation'].isin(['exon', 'UTR'])].copy()

# A categorical whose categories follow transcript_df['Gene'] makes the sort
# below order genes as they appear in transcript_df. Gene IDs that are not
# among those categories become NaN.
annot_df['GeneID'] = pd.Categorical(annot_df['GeneID'], transcript_df['Gene'])
annot_df = annot_df.sort_values(by=['Chrom', 'GeneID', 'Start']).reset_index()

In [3]:
# Tile every transcript into consecutive windows of `length` bases, starting at
# the transcript's Start. One entry is appended to each parallel list per
# window. The final window is not clipped, so its end may lie past End.
chrom, strand, start_l, end_l, gene = [], [], [], [], []
for _, tx in transcript_df.iterrows():
    tx_start = tx['Start']
    tx_end = tx['End']
    # Number of windows needed to cover [Start, End).
    n_windows = int(math.ceil((tx_end - tx_start) / length))
    for k in range(n_windows):
        window_start = round(length * k) + tx_start
        window_end = round(length * (k + 1)) + tx_start
        chrom.append(tx['Chr'])
        strand.append(tx['Strand'])
        start_l.append(window_start)
        end_l.append(window_end)
        gene.append(tx['Gene'])
    

In [4]:
# Assemble the per-window table from the parallel lists.
window_columns = {
    'Chr': chrom,
    'Start': start_l,
    'End': end_l,
    'Strand': strand,
    'Gene': gene,
}
bed_df = pd.DataFrame(window_columns)
# Categorical gene column so that sorting follows the gene order of transcript_df.
bed_df['Gene'] = pd.Categorical(bed_df['Gene'], categories=transcript_df['Gene'])
bed_df = bed_df.sort_values(by=['Chr', 'Gene', 'Start'])
# Number of windows (displayed as the cell result).
len(bed_df)

3834691

In [5]:
# Remove every window located on chrM, then display the remaining row count.
is_chrM = bed_df['Chr'] == 'chrM'
chrM_rows = bed_df.loc[is_chrM].index
bed_df = bed_df.drop(index=chrM_rows)
len(bed_df)

3834633

In [6]:
# Write the transcripts as a headerless, tab-separated six-column BED-style
# file; the name and score positions carry Gene and length.
transcript_bed_columns = ['Chr', 'Start', 'End', 'Gene', 'length', 'Strand']
transcript_df.to_csv(
    '/home/amber/multitask_RNA/data/pre-train/510/transcript.bed',
    sep='\t',
    columns=transcript_bed_columns,
    header=False,
    index=False,
)

In [7]:
# Reorder the window table into six columns. 'Gene' is selected twice on
# purpose so that the fifth column exists as a placeholder in the written file.
bed_df = bed_df.loc[:, ['Chr', 'Start', 'End', 'Gene', 'Gene', 'Strand']]
bed_df.to_csv(
    '/home/amber/multitask_RNA/data/pre-train/510/rna_selection.bed',
    sep='\t',
    header=False,
    index=False,
)

# Same six-column layout for the exon/UTR annotation, written without header.
annot_df = annot_df.loc[:, ['Chrom', 'Start', 'End', 'Annotation', 'GeneID', 'Strand']]
annot_df.to_csv(
    '/home/amber/multitask_RNA/data/pre-train/510/annot.bed',
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Shell step: remove the annot.bed intervals from the transcript.bed intervals
# (-s: only features on the same strand are subtracted) and store what is left
# in intron.bed.
# NOTE(review): the file names are relative to the shell's working directory,
# while the Python cells write to /home/amber/multitask_RNA/data/pre-train/510/
# — presumably this is run from that directory; confirm.
! bedtools subtract -s -a transcript.bed -b annot.bed > intron.bed

In [8]:
# Read intron.bed using the same column names as annot_df. On load, the
# 'Annotation' column holds the value that is copied into 'GeneID' below.
annot_columns = ['Chrom', 'Start', 'End', 'Annotation', 'GeneID', 'Strand']
intron_df = pd.read_csv(
    '../../data/pre-train/510/intron.bed',
    sep='\t',
    header=None,
    names=annot_columns,
    index_col=None,
)
# Move the loaded 'Annotation' values into 'GeneID', then label every row 'intron'.
intron_df = intron_df.assign(GeneID=intron_df['Annotation'], Annotation='intron')

# Stack intron rows on top of the existing annotation rows and overwrite annot.bed.
all_annot_df = pd.concat([intron_df, annot_df])
all_annot_df.to_csv(
    '/home/amber/multitask_RNA/data/pre-train/510/annot.bed',
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Shell step 1: write a coordinate-sorted copy of annot.bed to annot.sorted.bed.
# NOTE(review): the intersect below reads ./annot.bed, not annot.sorted.bed —
# confirm whether the sorted copy is needed.
! bedtools sort -i annot.bed > annot.sorted.bed
# Shell step 2: report every same-strand (-s) overlap between a window in
# rna_selection.bed and a record in annot.bed. -wa -wb writes the full window
# record followed by the full annotation record on each output line of label.bed.
! bedtools intersect -s -a ./rna_selection.bed -b ./annot.bed  -wa -wb > label.bed

In [17]:
# label.bed has no header. Each line is a six-column window record followed by
# a six-column annotation record, so columns 0,1,2,3,5 identify the window
# (chromosome, start, end, gene, strand) and column 9 is the annotation type.
final_df = pd.read_csv('../../data/pre-train/510/label.bed',
                       sep='\t', header=None, index_col=None)

window_keys = ['Chr', 'Start', 'End', 'Gene', 'Strand']

# One comma-joined string per window holding its distinct annotation types in
# first-seen order.
window_labels = (
    final_df.groupby([0, 1, 2, 3, 5], sort=False)[9]
    .agg(lambda annots: ','.join(annots.unique()))
    .reset_index()
)
window_labels.columns = window_keys + ['Annotation']
window_labels['Gene'] = window_labels['Gene'].astype(str)

# bed_df carries a duplicated 'Gene' column at position 4 as a placeholder.
# Rename it by assigning a whole new column list; an Index must not be edited
# through `.columns.values`.
bed_df.columns = ['Chr', 'Start', 'End', 'Gene', 'Annotation', 'Strand']
windows = bed_df[window_keys].copy()
windows['Gene'] = windows['Gene'].astype(str)

# Attach labels by window key, not by row position: groupby orders its groups
# by key while bed_df is ordered by (Chr, Gene category, Start), so a positional
# assignment can give a window the labels of a different window.
bed_df = windows.merge(window_labels, on=window_keys, how='left',
                       validate='many_to_one')
bed_df = bed_df[['Chr', 'Start', 'End', 'Gene', 'Annotation', 'Strand']]

unlabeled = bed_df['Annotation'].isna()
if unlabeled.any():
    raise ValueError(
        f'{int(unlabeled.sum())} windows have no overlapping annotation in label.bed'
    )

In [18]:
# Overwrite rna_selection.bed with the current window table (tab-separated,
# no header row, no index column).
bed_df.to_csv(
    '/home/amber/multitask_RNA/data/pre-train/510/rna_selection.bed',
    sep='\t',
    header=False,
    index=False,
)

## Discard empty (N-rich) sequences

In [None]:
# Shell step 1: coordinate-sort the window file.
! bedtools sort -i ./rna_selection.bed > rna_selection.sorted.bed
# Shell step 2: extract the hg38 sequence of every sorted window into rna_seq.fa.
# NOTE(review): getfasta is called without -s, so presumably the reference
# (forward) strand is returned for every window regardless of its Strand
# column — confirm this is intended.
! bedtools getfasta -fi /home/amber/ref/hg38/hg38.fa -bed ./rna_selection.sorted.bed > rna_seq.fa

In [19]:
# Find FASTA records whose sequence is at least half 'N'.
# Assumes two lines per record (header line, then one sequence line), which is
# what the index arithmetic below relies on.
# The context manager closes the file as soon as its lines have been read.
with open('/home/amber/multitask_RNA/data/pre-train/510/rna_seq.fa') as fa_file:
    fa_lines = fa_file.readlines()

empty_index = []   # 0-based record numbers of the N-rich sequences
empty_region = []  # header lines of those records
for i, line in enumerate(fa_lines):
    if line.startswith('>'):
        continue
    sequence = line.strip().upper()
    # A blank sequence line also satisfies this test (0 >= 0) and is flagged.
    if sequence.count('N') >= 0.5 * len(sequence):
        # The sequence of record k sits on line 2*k + 1; integer division keeps
        # the index an exact int.
        empty_index.append((i - 1) // 2)
        empty_region.append(fa_lines[i - 1])
   

In [20]:
# Number of records flagged as N-rich (displayed as the cell result).
len(empty_index)

1907

In [21]:
# Reload the sorted window file with a fresh 0..n-1 index, so the row labels
# line up with the record numbers collected in empty_index.
sorted_bed_columns = ['Chr', 'Start', 'End', 'Gene', 'Annot', 'Strand']
bed_df = pd.read_csv(
    '../../data/pre-train/510/rna_selection.sorted.bed',
    sep='\t',
    header=None,
    index_col=None,
    names=sorted_bed_columns,
)
# Remove the flagged rows and overwrite the sorted window file.
bed_df = bed_df.drop(index=empty_index)
bed_df.to_csv(
    '/home/amber/multitask_RNA/data/pre-train/510/rna_selection.sorted.bed',
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Shell step: regenerate rna_seq.fa from the filtered window file so the FASTA
# records match the rows of rna_selection.sorted.bed again.
! bedtools getfasta -fi /home/amber/ref/hg38/hg38.fa -bed ./rna_selection.sorted.bed > rna_seq.fa

## Subsample sequences for class balance

In [22]:
# Reload the filtered window file; with no header row the columns are
# numbered 0-5.
bed_df = pd.read_csv(
    '../../data/pre-train/510/rna_selection.sorted.bed',
    header=None,
    index_col=None,
    sep='\t',
)

In [23]:
# Column 4 is a comma-separated label string per window. Windows with more than
# one label, or with a 'UTR' or 'exon' label, are always kept; the remaining
# windows go into the intron pool.
labels = bed_df[4]
selected_id = []
intron_id = []
for row_id, label_str in enumerate(labels):
    label = label_str.split(',')
    keep = len(label) > 1 or 'UTR' in label or 'exon' in label
    target = selected_id if keep else intron_id
    target.append(row_id)
        

In [25]:
# Report the sizes of the two pools before subsampling.
print(len(selected_id))
print(len(intron_id))

# Add intron-pool windows equal to half the number of always-kept windows,
# capped at the pool size because random.sample cannot draw more items than
# the population holds.
sub_size = min(int(0.5 * len(selected_id)), len(intron_id))

# random.sample draws WITHOUT replacement. random.choices draws with
# replacement, which puts duplicate sequences into the dataset, and after the
# later shuffle/split the same sequence can appear in both train and valid.
selected_id.extend(random.sample(intron_id, k=sub_size))
print(len(selected_id))

479938
3352788
719907


In [34]:
# Read the whole FASTA; the context manager closes the file once the lines are
# in memory.
with open('../../data/pre-train/'+str(length)+'/rna_seq.fa', 'r') as fasta:
    lines = fasta.readlines()

seq = []
# Every second line, starting at the second, is expected to be a sequence line
# (two-line records: header, sequence).
for line in tqdm(lines[1::2]):
    if line[0] == '>':
        # A header at a sequence position means the file is not strictly
        # two-line-per-record. Stop with an error, because a truncated `seq`
        # would be indexed by row number in later cells.
        raise ValueError('error in line count')
    # Drop the trailing newline and normalise to upper case.
    seq.append(line.strip().upper())

100%|██████████| 3832726/3832726 [00:02<00:00, 1352633.41it/s]


In [38]:
# Collect the sequences for the chosen row ids, in the order of selected_id.
selected_seq = []
for row_id in selected_id:
    selected_seq.append(seq[row_id])
data_length = len(selected_seq)
print(data_length)

719907


In [39]:
# Shuffle in place, then cut at 90 %: the first part is training data, the rest
# is validation data.
random.shuffle(selected_seq)
split_at = int(data_length * 0.9)
train_data, valid_data = selected_seq[:split_at], selected_seq[split_at:]

In [41]:
# Store both splits in a single HDF5 file, as datasets 'train' and 'valid'.
file_name = '../../data/pre-train/'+str(length)+'/rna_seq.h5'
# The context manager closes the file even if writing a dataset raises, so no
# open handle or half-written file is left behind.
with h5py.File(file_name, 'w') as h5f:
    h5f.create_dataset('train', data=train_data)
    h5f.create_dataset('valid', data=valid_data)