In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Load the per-feature annotation table and the per-gene transcript table
# (both tab-separated).
annot_df = pd.read_csv('../data/annotation/cannonical_annotation.csv', sep='\t', header=0, index_col=0)
transcript_df = pd.read_csv('../data/annotation/annot_label.csv', sep='\t', header=0)
transcript_df['length'] = np.absolute(transcript_df['Start'] - transcript_df['End'])
# How many sequences we get if the whole transcriptome is chopped into 512-long windows.
# Bound to a name so the value is not silently discarded mid-cell.
n_windows_total = np.sum(np.ceil(transcript_df['length'] / 512))
# Keep only exon/UTR rows. The explicit .copy() makes the filtered frame independent,
# so the column assignment below does not trigger pandas' SettingWithCopyWarning.
annot_df = annot_df[annot_df['Annotation'].isin(['exon', 'UTR'])].copy()
# Order genes exactly as they appear in transcript_df so sorting by GeneID follows that order.
annot_df['GeneID'] = pd.Categorical(annot_df['GeneID'], transcript_df['Gene'])
annot_df = annot_df.sort_values(by=['Chrom', 'GeneID', 'Start']).reset_index()

In [3]:
# Cut every transcript into consecutive windows of `length` positions, starting at
# the transcript's Start. The number of windows is rounded up, so the last window
# may extend past End. Each window is recorded with the opposite strand symbol.
reverse_strand = {'+': '-', '-': '+'}
length = 512
chrom, strand, start_l, end_l, gene = [], [], [], [], []
for _, record in transcript_df.iterrows():
    first = record['Start']
    last = record['End']
    n_windows = int(math.ceil((last - first) / length))
    for k in range(n_windows):
        chrom.append(record['Chr'])
        strand.append(reverse_strand[record['Strand']])
        start_l.append(length * k + first)
        end_l.append(length * (k + 1) + first)
        gene.append(record['Gene'])
    

In [None]:
# Assemble the windows into one frame, then order them by chromosome, by gene
# (in transcript_df order, via the categorical) and by window start.
window_columns = {
    'Chr': chrom,
    'Start': start_l,
    'End': end_l,
    'Strand': strand,
    'Gene': gene,
}
bed_df = pd.DataFrame(window_columns)
bed_df = (
    bed_df
    .assign(Gene=pd.Categorical(bed_df['Gene'], transcript_df['Gene']))
    .sort_values(by=['Chr', 'Gene', 'Start'])
)

In [None]:
# Label every window in bed_df with the annotation types it overlaps, by sweeping
# the sorted windows and the sorted annotations in lock-step (two pointers).
# A window lying entirely inside an annotation gets that annotation's type; a window
# that only partially overlaps one additionally gets the 'junction' label.
# Windows that never overlap an annotation keep an empty set.
#
# NOTE(review): both frames are sorted by (chromosome, gene, start), but the sweep
# compares chromosome and coordinates only. If genes on one chromosome overlap or are
# not in coordinate order, the pointers may advance past rows that should have been
# paired — confirm against the data.
bed_pointer = 0
annot_pointer = 0
n_bed = len(bed_df)
n_annot = len(annot_df)
label = np.empty(n_bed, dtype=object)
label[...] = [set() for _ in range(n_bed)]

# Pull the columns out once: building a row Series with .iloc on every iteration is
# very slow for millions of windows, and the positional order is unchanged.
bed_chr = bed_df['Chr'].to_numpy()
bed_start = bed_df['Start'].to_numpy()
bed_end = bed_df['End'].to_numpy()
annot_chr = annot_df['Chrom'].to_numpy()
annot_start = annot_df['Start'].to_numpy()
annot_end = annot_df['End'].to_numpy()
annot_kind = annot_df['Annotation'].to_numpy()

# Stop as soon as EITHER list is exhausted. The previous condition used `or` with
# `len - 1`, which kept looping after one pointer had run off the end (IndexError
# from .iloc) and could also leave the last pair unexamined.
while bed_pointer < n_bed and annot_pointer < n_annot:
    b_chr, b_start, b_end = bed_chr[bed_pointer], bed_start[bed_pointer], bed_end[bed_pointer]
    a_chr, a_start, a_end = annot_chr[annot_pointer], annot_start[annot_pointer], annot_end[annot_pointer]
    # chromosome check: advance whichever side is on the earlier chromosome
    if b_chr < a_chr:
        bed_pointer += 1
        continue
    if b_chr > a_chr:
        annot_pointer += 1
        continue
    # not overlapping, move pointer
    if b_end <= a_start:
        bed_pointer += 1
        continue
    if b_start >= a_end:
        annot_pointer += 1
        continue
    # overlapping
    if b_start >= a_start and b_end <= a_end:
        # complete overlap: window lies inside the annotation
        label[bed_pointer].add(annot_kind[annot_pointer])
        bed_pointer += 1
    elif b_start <= a_start:
        # partial overlap: window starts at/before the annotation start
        label[bed_pointer].update(['junction', annot_kind[annot_pointer]])
        bed_pointer += 1
    else:
        # partial overlap: window starts inside the annotation and runs past its end.
        # Written as a plain `else` so that every iteration advances a pointer, even
        # for a malformed annotation row (Start > End), instead of looping forever.
        label[bed_pointer].update([annot_kind[annot_pointer], 'junction'])
        annot_pointer += 1
               

In [None]:
# Turn each window's set of labels into one comma-separated string.
# Iteration order of a set of strings is not stable across interpreter runs, so the
# labels are put in a fixed order first (junction, exon, UTR; anything else
# alphabetically afterwards). This keeps strings such as 'junction,exon' reproducible.
label_order = {'junction': 0, 'exon': 1, 'UTR': 2}
label_list = [
    sorted(ele, key=lambda name: (label_order.get(name, len(label_order)), name))
    for ele in label
]
# `label` itself is left untouched (still the array of sets) so this cell can be
# re-run without corrupting the labels.
bed_df['Label'] = [','.join(ele) for ele in label_list]
# Windows with no overlapping annotation have an empty string; call them introns.
bed_df['Label'] = bed_df['Label'].replace(r'^\s*$', 'intron', regex=True)

Unnamed: 0,Chr,Start,End,Strand,Gene,Label
0,chr1,11869,12381,-,ENSG00000223972.5,"junction,exon"
1,chr1,12381,12893,-,ENSG00000223972.5,"junction,exon"
2,chr1,12893,13405,-,ENSG00000223972.5,"junction,exon"
3,chr1,13405,13917,-,ENSG00000223972.5,exon
4,chr1,13917,14429,-,ENSG00000223972.5,"junction,exon"
...,...,...,...,...,...,...
3819693,chrY,57212184,57212696,+,ENSG00000227159.8_PAR_Y,exon
3819694,chrY,57212696,57213208,+,ENSG00000227159.8_PAR_Y,"junction,exon"
3819695,chrY,57213208,57213720,+,ENSG00000227159.8_PAR_Y,"junction,exon"
3819696,chrY,57213720,57214232,+,ENSG00000227159.8_PAR_Y,"junction,exon"


In [30]:
# Write the labelled windows as a tab-separated file without header or index column.
bed_df.to_csv(
    '../data/pre-train/rna_selection.bed',
    sep='\t',
    header=False,
    index=False,
)

In [12]:
from tqdm import tqdm

# Read the FASTA file and collect its sequence lines, upper-cased and stripped of
# surrounding whitespace. Header lines (those starting with '>') are skipped.
# The `with` block guarantees the file handle is closed; it was previously left open.
with open('../data/pre-train/rna_seq.fa', 'r') as fasta:
    lines = fasta.readlines()

seq = []
for line in tqdm(lines):
    if line.startswith('>'):
        # header line, not sequence (the old bare `next` here was a no-op expression)
        continue
    seq.append(line.strip().upper())

100%|██████████| 7639512/7639512 [00:03<00:00, 2057202.43it/s]


In [13]:
import h5py
import numpy as np
# Store the collected sequences in an HDF5 file under the dataset name 'seq'.
# The context manager closes the file even if create_dataset raises.
# NOTE(review): depending on the h5py version, a list of Python str may need an
# explicit string dtype (e.g. h5py.string_dtype()) to be written — confirm.
file_name = '../data/pre-train/rna_seq.h5'
with h5py.File(file_name, 'w') as h5f:
    x = h5f.create_dataset('seq', data=seq)

In [2]:
import pandas as pd
import numpy as np

In [8]:
# Reload the saved region table (tab-separated, first row is the header)
# and show the distinct label strings it contains.
region_path = '../data/pre-train/region_selection.bed'
bed_df = pd.read_csv(region_path, index_col=None, sep='\t')

bed_df.loc[:, 'Label'].unique()

array(['junction,exon', 'exon', 'intron', 'junction,exon,UTR',
       'junction,UTR'], dtype=object)

In [13]:
# Show the rows whose label string contains "intron".
is_intron = bed_df['Label'].str.contains("intron")
bed_df.loc[is_intron]

Unnamed: 0,Chr,Start,End,Strand,Gene,Label
13,chr1,18500,19012,-,ENSG00000227232.5,intron
14,chr1,19012,19524,-,ENSG00000227232.5,intron
15,chr1,19524,20036,-,ENSG00000227232.5,intron
16,chr1,20036,20548,-,ENSG00000227232.5,intron
17,chr1,20548,21060,-,ENSG00000227232.5,intron
...,...,...,...,...,...,...
3819741,chrY,57203191,57203703,-,ENSG00000185203.12_PAR_Y,intron
3819742,chrY,57207481,57207993,+,ENSG00000182484.15_PAR_Y,intron
3819743,chrY,57207993,57208505,+,ENSG00000182484.15_PAR_Y,intron
3819747,chrY,57210041,57210553,+,ENSG00000182484.15_PAR_Y,intron


Count of reads containing each label

'UTR':19953

'exon':363682

'junction': 284194

'intron': 3455744



The labels are not balanced (intron windows dominate). Sub-sample?

Total reads: 3819756

In [17]:
# Scratch check: length of one example sequence fragment.
example_fragment = 'CTTCTATTTATTTATTTATTTATTTATTTGTTTGTTTTAGAAGATTCTATGTTAATATTTTATGTGT'
len(example_fragment)

67