In [15]:
import pandas as pd
import numpy as np
import math
from tqdm import tqdm
import random
import h5py
# Window size (510*6 = 3060) used below to cut each transcript into fixed-size
# chunks; it is added to genomic Start coordinates, so the unit is bases.
# NOTE(review): presumably 510 tokens x 6 bases per token (output dir is
# named "510_6") - confirm.
length = 510*6

In [9]:
# Load the annotation table and the per-transcript table (both tab-separated).
annot_df = pd.read_csv('../../data/annotation/cannonical_annotation.csv', sep='\t', header=0, index_col=0)
transcript_df = pd.read_csv('../../data/annotation/annot_label.csv', sep='\t', header=0)

# Span of every transcript; abs() keeps it non-negative whichever of
# Start/End is the larger coordinate.
transcript_df['length'] = np.absolute(transcript_df['Start'] - transcript_df['End'])

# Keep only exon/UTR rows. The explicit .copy() makes the filtered frame
# independent of the original, so the column assignment below does not
# trigger pandas' chained-assignment (SettingWithCopy) warning.
annot_df = annot_df[annot_df['Annotation'].isin(['exon', 'UTR'])].copy()

# Categorical with categories taken from transcript_df['Gene']: sorting by
# GeneID then follows the gene order of transcript_df instead of alphabetical
# order. GeneIDs that are absent from transcript_df['Gene'] become NaN.
annot_df['GeneID'] = pd.Categorical(annot_df['GeneID'], transcript_df['Gene'])
annot_df = annot_df.sort_values(by=['Chrom', 'GeneID', 'Start']).reset_index()

In [10]:
# Cut every transcript into consecutive windows of `length` bases, starting
# at the transcript's Start. One parallel list per output column.
# The last window of a transcript is not clipped to End, so it may extend
# past the transcript.
chrom, strand, start_l, end_l, gene = [], [], [], [], []
for _, transcript in transcript_df.iterrows():
    tx_start = transcript['Start']
    tx_end = transcript['End']
    # Number of windows needed to cover [Start, End).
    n_windows = int(math.ceil((tx_end - tx_start) / length))
    for k in range(n_windows):
        chrom.append(transcript['Chr'])
        strand.append(transcript['Strand'])
        start_l.append(round(length * k) + tx_start)
        end_l.append(round(length * (k + 1)) + tx_start)
        gene.append(transcript['Gene'])
    

In [11]:
# Assemble the window table, then order it by chromosome, gene (in the gene
# order of transcript_df, via a categorical) and window start.
bed_df = (
    pd.DataFrame({'Chr': chrom, 'Start': start_l, 'End': end_l, 'Strand': strand, 'Gene': gene})
    .assign(Gene=lambda frame: pd.Categorical(frame['Gene'], transcript_df['Gene']))
    .sort_values(by=['Chr', 'Gene', 'Start'])
)

In [12]:
# Write the transcripts as a headerless, tab-separated 6-column BED-style file
# (Chr, Start, End, Gene, length, Strand).
# The path is relative, matching how the other files in this directory are
# read back elsewhere in the notebook, instead of a machine-specific
# absolute home path.
transcript_df.to_csv('../../data/pre-train/510_6/transcript.bed',
                     index=False, header=False, sep='\t',
                     columns=['Chr', 'Start', 'End', 'Gene', 'length', 'Strand'])

In [13]:
# Reorder the window table into 6 BED-style columns. 'Gene' is deliberately
# selected twice: the copy at position 4 is a placeholder that is replaced by
# the annotation label later in the notebook.
bed_df = bed_df[['Chr', 'Start', 'End', 'Gene', 'Gene', 'Strand']]
# Paths are relative (same directory the notebook reads from) rather than a
# machine-specific absolute home path.
bed_df.to_csv('../../data/pre-train/510_6/rna_selection.bed',
              index=False, header=False, sep='\t')

# Exon/UTR annotation in the same 6-column layout.
annot_df = annot_df[['Chrom', 'Start', 'End', 'Annotation', 'GeneID', 'Strand']]
annot_df.to_csv('../../data/pre-train/510_6/annot.bed',
                index=False, header=False, sep='\t')

In [None]:
! bedtools subtract -s -a transcript.bed -b annot.bed > intron.bed

In [14]:
# Load the intron intervals produced by `bedtools subtract`.
# NOTE(review): intron.bed is expected to keep transcript.bed's column layout
# (Chr, Start, End, Gene, length, Strand), so the column read here as
# 'Annotation' actually holds the gene and 'GeneID' holds the length - confirm.
intron_df = pd.read_csv('../../data/pre-train/510_6/intron.bed', sep='\t', header=None,
                        names=['Chrom', 'Start', 'End', 'Annotation', 'GeneID', 'Strand'],
                        index_col=None)
# Move the gene into GeneID, then label every row as an intron.
# Order matters: GeneID must be copied before Annotation is overwritten.
intron_df['GeneID'] = intron_df['Annotation']
intron_df['Annotation'] = 'intron'

# Combined intron + exon/UTR annotation.
# This overwrites annot.bed, which is also the input of the subtract step;
# re-running the subtract afterwards would use this combined file.
# The output path is relative, consistent with the read above.
all_annot_df = pd.concat([intron_df, annot_df])
all_annot_df.to_csv('../../data/pre-train/510_6/annot.bed',
                    index=False, header=False, sep='\t')

In [None]:
! bedtools sort -i annot.bed > annot.sorted.bed
! bedtools intersect -s -a ./rna_selection.bed -b ./annot.sorted.bed  -wa -wb > label.bed

In [15]:
# Attach to every window in bed_df the comma-joined set of annotation labels
# that overlap it, read from the bedtools intersect output.
#
# Labels are matched to windows by key (Chr, Start, End, Gene, Strand), not by
# position. A default groupby sorts its groups by (Chr, Start, End, Gene,
# Strand), whereas bed_df is ordered by (Chr, Gene, Start); assigning the
# grouped labels positionally therefore mislabels windows as soon as two genes
# overlap in coordinates.
final_df = pd.read_csv('../../data/pre-train/510_6/label.bed', sep='\t', header=None, index_col=None)

key_cols = ['Chr', 'Start', 'End', 'Gene', 'Strand']
# Normalise key dtypes on both sides so the merge compares like with like
# (e.g. a chromosome parsed as int in one frame and str in the other).
key_dtypes = {'Chr': str, 'Start': 'int64', 'End': 'int64', 'Gene': str, 'Strand': str}

# label.bed columns 0-3 and 5 are the window's own fields; column 9 is the
# annotation of the overlapping feature.
hits = final_df[[0, 1, 2, 3, 5, 9]].copy()
hits.columns = key_cols + ['Annotation']
hits = hits.astype(key_dtypes)

# One row per window: unique labels in order of first appearance, comma-joined.
window_labels = (
    hits.drop_duplicates()
    .groupby(key_cols, sort=False)['Annotation']
    .agg(','.join)
    .reset_index()
)

# bed_df carries 'Gene' twice; position 4 is the placeholder for the label.
windows = bed_df.iloc[:, [0, 1, 2, 3, 5]].copy()
windows.columns = key_cols
windows = windows.astype(key_dtypes)

# A left merge keeps the row order of `windows`, i.e. of bed_df.
aligned = windows.merge(window_labels, on=key_cols, how='left', validate='many_to_one')
unlabelled = aligned['Annotation'].isna()
if unlabelled.any():
    raise ValueError(
        f"{int(unlabelled.sum())} window(s) in bed_df have no annotation in label.bed"
    )

# Rebuild bed_df with the label at position 4, without mutating
# bed_df.columns.values in place (pandas Index objects are meant to be immutable).
bed_df = bed_df.iloc[:, [0, 1, 2, 3, 5]].copy()
bed_df.columns = key_cols
bed_df.insert(4, 'Annotation', aligned['Annotation'].to_numpy())

In [None]:
# Overwrite rna_selection.bed with the labelled windows (headerless,
# tab-separated). The path is relative so that it is the very file the
# notebook reloads later via '../../data/pre-train/510_6/rna_selection.bed',
# and does not depend on a machine-specific home directory.
bed_df.to_csv('../../data/pre-train/510_6/rna_selection.bed',
              index=False, header=False, sep='\t')

In [8]:
# Read the window sequences from a FASTA file that is expected to have
# exactly two lines per record (header, then sequence).
# The context manager closes the file even if reading fails; the path is
# relative, consistent with the other reads in this notebook.
with open('../../data/pre-train/510_6/rna_seq.fa', 'r') as fasta:
    lines = fasta.readlines()

seq = []
# Every second line, starting from the second, should be a sequence line.
for line in tqdm(lines[1::2]):
    if line[0] == '>':
        # A header in a sequence slot means the two-line layout is broken;
        # stop here. `seq` then holds only the records read so far.
        print('error in line count')
        break
    # Drop the trailing newline and normalise to upper case.
    seq.append(line.strip().upper())


100%|██████████| 672165/672165 [00:01<00:00, 464840.69it/s]


In [17]:
# Reload the labelled windows from disk. The file has no header row, so the
# columns are addressed by integer position (0..5) from here on.
bed_df = pd.read_csv(
    '../../data/pre-train/510_6/rna_selection.bed',
    header=None,
    index_col=None,
    sep='\t',
)

In [18]:
# Partition window row numbers by their label string (column 4):
#   selected_id - windows with more than one label, or labelled UTR / exon
#   intron_id   - everything else
labels = bed_df[4]
selected_id = []
intron_id = []
for row_id, label_str in enumerate(labels):
    parts = label_str.split(',')
    single_other = len(parts) <= 1 and 'UTR' not in parts and 'exon' not in parts
    if single_other:
        intron_id.append(row_id)
    else:
        selected_id.append(row_id)
        

In [19]:
# Sizes of the two groups, one per line: selected windows, then the rest.
print(len(selected_id), len(intron_id), sep='\n')

224547
447623


In [11]:
# 90/10 train/validation split of the sequences.
# A dedicated, seeded RNG makes the split reproducible across kernel restarts.
# Shuffling a copy leaves `seq` in its original order, so re-running this cell
# yields the same split instead of reshuffling an already shuffled list.
SPLIT_SEED = 42
shuffled_seq = list(seq)
random.Random(SPLIT_SEED).shuffle(shuffled_seq)

data_length = len(shuffled_seq)
n_train = int(data_length * 0.9)
train_data = shuffled_seq[:n_train]
valid_data = shuffled_seq[n_train:]

In [16]:
# Store the two splits as datasets 'train' and 'valid' in one HDF5 file.
# The context manager closes the file even if a create_dataset call raises;
# the path is relative, consistent with the reads elsewhere in this notebook.
file_name = '../../data/pre-train/510_6/rna_seq.h5'
with h5py.File(file_name, 'w') as h5f:
    # NOTE(review): the data are Python str lists; h5py documents that NumPy
    # unicode ('U') arrays are not storable. If this raises a TypeError,
    # pass dtype=h5py.string_dtype() - confirm against the installed h5py.
    h5f.create_dataset('train', data=train_data)
    h5f.create_dataset('valid', data=valid_data)