# Use the GENCODE v19 annotation GTF file to pull out introns with AT-AC splice junction motifs

In [2]:
import pandas as pd
import numpy as np
import os
from Bio import SeqIO

In [3]:
# Path to the hg19 intron FASTA; it is parsed below with SeqIO.parse(..., "fasta").
intron_sequences = '/projects/ps-yeolab3/bay001/annotations/hg19.introns.fa'

In [4]:
from Bio import SeqIO
# Collect every intron record whose termini match the AT-AC motif.
#   * records tagged '_f::' must start with AT and end with AC;
#   * records tagged '_r::' must start with GT and end with AT, which is the
#     reverse complement of AT...AC (presumably because the FASTA stores the
#     forward-strand sequence for these records -- TODO confirm).
# Both termini are upper-cased before comparison so soft-masked (lower-case)
# bases are treated the same at either end.
# NOTE(review): the original only upper-cased the 5' end; if the FASTA contains
# lower-case 3' termini, the record count may now exceed the previous 955.
atac_sequences = []
# Plain "r" mode: the legacy "rU" flag was removed in Python 3.11. The `with`
# block guarantees the handle is closed even if parsing raises.
with open(intron_sequences, "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        five_prime = str(record.seq[:2]).upper()
        three_prime = str(record.seq[-2:]).upper()
        if five_prime == 'AT' and three_prime == 'AC' and '_f::' in record.name:
            atac_sequences.append(record)
        elif five_prime == 'GT' and three_prime == 'AT' and '_r::' in record.name:
            atac_sequences.append(record)

In [5]:
# Write the selected intron records back out as FASTA. The value returned by
# SeqIO.write is shown as the cell output; it should be 955.
outfile = '/projects/ps-yeolab3/bay001/annotations/hg19.gencode.v19.atac_introns.fa'
n_records_written = SeqIO.write(atac_sequences, outfile, "fasta")
n_records_written

955

In [6]:
def get_strand(name):
    """Translate a record name's orientation suffix into a BED strand symbol.

    Returns '-' when ``name`` ends with '_r', '+' when it ends with '_f',
    and None when it ends with neither.
    """
    for suffix, strand in (('_r', '-'), ('_f', '+')):
        if name.endswith(suffix):
            return strand
    return None

bedfile = '/projects/ps-yeolab3/bay001/annotations/hg19.gencode.v19.atac_introns.bed'

# Write one six-column BED line per selected intron.
# Each FASTA header is expected to split on ':' into exactly four parts
# (name, an empty field from the '::' separator, chrom, 'start-end'); a header
# with a different shape raises ValueError on unpacking. The `with` block makes
# sure the output file is flushed and closed even in that case.
with open(bedfile, 'w') as o:
    for sequence in atac_sequences:
        name, _, chrom, pos = sequence.name.split(':')
        start, end = pos.split('-')
        # The score column is fixed at 0; the strand comes from the name suffix.
        o.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            chrom, start, end, name, 0, get_strand(name)
        ))

# Alternatively, use the hg17 gff annotations to get a bedfile (convert to bed and liftover)

- Don't use this method: these annotations are old and outdated, and they don't make sense.

In [None]:
# Load the hg17 U12DB GFF with explicit column names (the file has no header
# row). The sixth and eighth fields are named 'score' and 'frame' after the
# standard GFF layout; the original used '.' for both, and pandas rejects
# duplicate entries in `names`.
gff = pd.read_table(
    '/projects/ps-yeolab3/bay001/annotations/hg17.u12db.annotations.atac_introns.gff',
    names=['chrom','src','featuretype','start','end','score','strand','frame','attr']
)
gff.head()

In [None]:
def gff_to_bed(gff_file):
    """Convert a headerless, tab-separated GFF file into a six-column BED frame.

    Parameters
    ----------
    gff_file : str
        Path to a nine-column GFF file.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom, start, end, attr, score, strand``. ``start`` is the
        GFF start minus one, ``score`` is always 0, and the GFF attribute
        field is used as the BED name.
    """
    # The GFF's own score/frame fields get unique names: pandas does not allow
    # duplicate entries in `names`, and the BED score is set independently.
    df = pd.read_table(
        gff_file,
        names=['chrom','src','featuretype','start','end','gff_score','strand','frame','attr']
    )
    # Work on an explicit copy so the assignments below do not act on a slice
    # of `df` (which triggers SettingWithCopyWarning).
    bed_df = df[['chrom','start','end','attr','strand']].copy()
    bed_df['start'] = bed_df['start'] - 1
    bed_df['score'] = 0
    return bed_df[['chrom','start','end','attr','score','strand']]

In [None]:
# Convert the hg17 U12DB GFF to BED and save it tab-separated, with no header
# row and no index column.
hg17_bed = gff_to_bed(
    '/projects/ps-yeolab3/bay001/annotations/hg17.u12db.annotations.atac_introns.gff'
)
hg17_bed.to_csv(
    '/projects/ps-yeolab3/bay001/annotations/hg17.u12db.annotations.atac_introns.bed',
    sep='\t',
    header=False,
    index=False,
)

# Use the UCSC liftOver tool (hg17 → hg19)
- /projects/ps-yeolab3/bay001/annotations/hg19.u12db.annotations.atac_introns.hg17liftover.bed
- Again, don't use this.

In [8]:
# Load the lifted-over BED file (it has no header row) and preview the first rows.
bed_columns = ['chrom','start','end','name','score','strand']
hg19_liftover = pd.read_table(
    '/projects/ps-yeolab3/bay001/annotations/hg19.u12db.annotations.atac_introns.hg17liftover.bed',
    sep='\t',
    names=bed_columns,
)
hg19_liftover.head(5)

Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,2411416,2411620,23080,0,+
1,chr1,23720526,23724037,23156,0,-
2,chr1,23845622,23847404,23160,0,-
3,chr6,20483151,20486919,23169,0,+
4,chr1,24301567,24305202,23180,0,-


# Re-format the intron BED file to make it work with Eric's scripts.

In [56]:
# Column layout used to read the example junction list below.
eric_names = [
    'intron_type', 'chrom', 'strand',
    'low_exon', 'hi_exon', 'name',
]
# An existing input file, loaded only to preview the target format.
example_file = '/home/elvannostrand/data/ENCODE/RNAseq/scripts/exon_junction_counts/gencodev19.CIandRIlist.txt'
# BED file of AT-AC introns that will be reshaped into that format.
atac_intron_bedfile = '/projects/ps-yeolab3/bay001/annotations/hg19.gencode.v19.atac_introns.bed'

example_junctions = pd.read_table(example_file, names=eric_names)
example_junctions.head()

Unnamed: 0,intron_type,chrom,strand,low_exon,hi_exon,name
0,CI,GL949744.1,+,185858-186149,215657-215798,ENSG00000260819.1
1,CI,GL949744.1,-,218214-218338,220865-221042,ENSG00000261547.1
2,CI,GL949744.1,-,121620-121839,126716-126796,ENSG00000260679.1
3,CI,GL949744.1,-,10206-10683,16302-16406,ENSG00000261155.1
4,CI,GL949744.1,-,16302-16406,18696-19007,ENSG00000261155.1


In [62]:
# Load the AT-AC intron BED file. Rows that share coordinates but belong to
# different transcripts are kept (no de-duplication), because the later merges
# match on the transcript name.
intron_columns = [
    'intron_chrom', 'intron_start', 'intron_stop',
    'intron_name', 'intron_score', 'intron_strand',
]
atac_introns = pd.read_table(atac_intron_bedfile, names=intron_columns)
print("{} introns found.".format(len(atac_introns)))

# The transcript ID is everything before the first underscore of the BED name.
atac_introns['intron_tx_name'] = [
    intron_name.split('_')[0] for intron_name in atac_introns['intron_name']
]
atac_introns.head()

955 introns found.


Unnamed: 0,intron_chrom,intron_start,intron_stop,intron_name,intron_score,intron_strand,intron_tx_name
0,chr1,67108547,67109226,ENST00000237247.6_intron_6_0_chr1_67108548_f,0,+,ENST00000237247.6
1,chr1,67108547,67109226,ENST00000371039.1_intron_5_0_chr1_67108548_f,0,+,ENST00000371039.1
2,chr1,67108547,67109226,ENST00000424320.1_intron_6_0_chr1_67108548_f,0,+,ENST00000424320.1
3,chr1,67108547,67109226,ENST00000371035.3_intron_3_0_chr1_67108548_f,0,+,ENST00000371035.3
4,chr1,67108547,67109226,ENST00000468286.1_intron_4_0_chr1_67108548_f,0,+,ENST00000468286.1


In [63]:
# intersect with exons (can be more than one exon per intron)
exons = pd.read_table(
    '/projects/ps-yeolab3/bay001/annotations/gencode.v19.annotations.exons.bed',
    names=['exon_chrom','exon_start','exon_stop','exon_name','exon_score','exon_strand']
) # .drop_duplicates(['exon_chrom','exon_start','exon_stop','exon_strand'])
print("{} exons found.".format(exons.shape[0]))
exons['exon_tx_name'] = exons['exon_name'].apply(lambda x: x.split('_')[0])
exons[exons['exon_tx_name']=='ENST00000465537.1']

1214384 exons found.


Unnamed: 0,exon_chrom,exon_start,exon_stop,exon_name,exon_score,exon_strand,exon_tx_name
260611,chr3,160073800,160073938,ENST00000465537.1_exon_0_0_chr3_160073801_r,0,-,ENST00000465537.1
260612,chr3,160075276,160075366,ENST00000465537.1_exon_1_0_chr3_160075277_r,0,-,ENST00000465537.1
260613,chr3,160083830,160083940,ENST00000465537.1_exon_2_0_chr3_160083831_r,0,-,ENST00000465537.1
260614,chr3,160093599,160093668,ENST00000465537.1_exon_3_0_chr3_160093600_r,0,-,ENST00000465537.1
260615,chr3,160095217,160095328,ENST00000465537.1_exon_4_0_chr3_160095218_r,0,-,ENST00000465537.1
260616,chr3,160099290,160099512,ENST00000465537.1_exon_5_0_chr3_160099291_r,0,-,ENST00000465537.1
260617,chr3,160102376,160102434,ENST00000465537.1_exon_6_0_chr3_160102377_r,0,-,ENST00000465537.1
260618,chr3,160116933,160116995,ENST00000465537.1_exon_7_0_chr3_160116934_r,0,-,ENST00000465537.1


In [64]:
# merge on the upstream junction

# Attach to each intron the exon (same chromosome, strand and transcript) whose
# stop coordinate equals the intron start. "Upstream" therefore means the
# lower-coordinate flank, regardless of strand.
# The exon columns are prefixed with 'upstream_' before the merge, which gives
# the same column names and order as relabelling them after the merge.
upstream_exons = exons.add_prefix('upstream_')
merged_upstream = atac_introns.merge(
    upstream_exons,
    how='left',
    left_on=['intron_chrom','intron_start','intron_strand','intron_tx_name'],
    right_on=[
        'upstream_exon_chrom','upstream_exon_stop',
        'upstream_exon_strand','upstream_exon_tx_name',
    ],
)
# x = merged_upstream.fillna(0)
# x[x['upstream_exon_start']==0]

In [65]:

# merge on the downstream junctions

# Attach the exon (same chromosome, strand and transcript) whose start
# coordinate equals the intron stop, i.e. the higher-coordinate flank.
intron_stop_keys = ['intron_chrom','intron_stop','intron_strand','intron_tx_name']
exon_start_keys = ['exon_chrom','exon_start','exon_strand','exon_tx_name']
merged = merged_upstream.merge(
    exons,
    how='left',
    left_on=intron_stop_keys,
    right_on=exon_start_keys,
)
merged.head()

Unnamed: 0,intron_chrom,intron_start,intron_stop,intron_name,intron_score,intron_strand,intron_tx_name,upstream_exon_chrom,upstream_exon_start,upstream_exon_stop,...,upstream_exon_score,upstream_exon_strand,upstream_exon_tx_name,exon_chrom,exon_start,exon_stop,exon_name,exon_score,exon_strand,exon_tx_name
0,chr1,67108547,67109226,ENST00000237247.6_intron_6_0_chr1_67108548_f,0,+,ENST00000237247.6,chr1,67108492,67108547,...,0,+,ENST00000237247.6,chr1,67109226,67109402,ENST00000237247.6_exon_7_0_chr1_67109227_f,0,+,ENST00000237247.6
1,chr1,67108547,67109226,ENST00000371039.1_intron_5_0_chr1_67108548_f,0,+,ENST00000371039.1,chr1,67108492,67108547,...,0,+,ENST00000371039.1,chr1,67109226,67109402,ENST00000371039.1_exon_6_0_chr1_67109227_f,0,+,ENST00000371039.1
2,chr1,67108547,67109226,ENST00000424320.1_intron_6_0_chr1_67108548_f,0,+,ENST00000424320.1,chr1,67108492,67108547,...,0,+,ENST00000424320.1,chr1,67109226,67109402,ENST00000424320.1_exon_7_0_chr1_67109227_f,0,+,ENST00000424320.1
3,chr1,67108547,67109226,ENST00000371035.3_intron_3_0_chr1_67108548_f,0,+,ENST00000371035.3,chr1,67108492,67108547,...,0,+,ENST00000371035.3,chr1,67109226,67109402,ENST00000371035.3_exon_4_0_chr1_67109227_f,0,+,ENST00000371035.3
4,chr1,67108547,67109226,ENST00000468286.1_intron_4_0_chr1_67108548_f,0,+,ENST00000468286.1,chr1,67108492,67108547,...,0,+,ENST00000468286.1,chr1,67109226,67109402,ENST00000468286.1_exon_5_0_chr1_67109227_f,0,+,ENST00000468286.1


In [66]:
# Keep only the flanking-exon coordinates plus the chromosome, strand and
# transcript name.
flank_columns = [
    'intron_chrom',
    'upstream_exon_start', 'upstream_exon_stop',
    'exon_start', 'exon_stop',
    'intron_strand', 'intron_tx_name',
]
merged = merged[flank_columns]
merged.head()

Unnamed: 0,intron_chrom,upstream_exon_start,upstream_exon_stop,exon_start,exon_stop,intron_strand,intron_tx_name
0,chr1,67108492,67108547,67109226,67109402,+,ENST00000237247.6
1,chr1,67108492,67108547,67109226,67109402,+,ENST00000371039.1
2,chr1,67108492,67108547,67109226,67109402,+,ENST00000424320.1
3,chr1,67108492,67108547,67109226,67109402,+,ENST00000371035.3
4,chr1,67108492,67108547,67109226,67109402,+,ENST00000468286.1


In [67]:
def _format_coord(value):
    """Render one coordinate as text, writing whole-number floats without '.0'.

    A column that contains any missing value holds floats, so an intact
    coordinate such as 67108492.0 would otherwise be written as '67108492.0'.
    Non-integral values (including NaN) are formatted unchanged.
    """
    if isinstance(value, float) and value.is_integer():
        return '{}'.format(int(value))
    return '{}'.format(value)

def format_upstream_pos(row):
    """Return 'start-stop' built from the row's upstream exon coordinates."""
    return '{}-{}'.format(
        _format_coord(row['upstream_exon_start']),
        _format_coord(row['upstream_exon_stop']),
    )
def format_downstream_pos(row):
    """Return 'start-stop' built from the row's 'exon_start' and 'exon_stop'."""
    return '{}-{}'.format(
        _format_coord(row['exon_start']),
        _format_coord(row['exon_stop']),
    )

# Add the formatted flank spans and a constant label, then reorder the columns.
# `assign` returns a new frame, so nothing is written into the column subset
# taken in the previous cell (item assignment there triggers
# SettingWithCopyWarning).
merged = merged.assign(
    upstream=merged.apply(format_upstream_pos, axis=1),
    downstream=merged.apply(format_downstream_pos, axis=1),
    intron_label='atac_intron',
)[['intron_label','intron_chrom','intron_strand','upstream','downstream','intron_tx_name']]
merged

Unnamed: 0,intron_label,intron_chrom,intron_strand,upstream,downstream,intron_tx_name
0,atac_intron,chr1,+,67108492-67108547,67109226-67109402,ENST00000237247.6
1,atac_intron,chr1,+,67108492-67108547,67109226-67109402,ENST00000371039.1
2,atac_intron,chr1,+,67108492-67108547,67109226-67109402,ENST00000424320.1
3,atac_intron,chr1,+,67108492-67108547,67109226-67109402,ENST00000371035.3
4,atac_intron,chr1,+,67108492-67108547,67109226-67109402,ENST00000468286.1
5,atac_intron,chr1,+,67108492-67108547,67109226-67109402,ENST00000371036.3
6,atac_intron,chr1,+,67108492-67108547,67109226-67109402,ENST00000371037.4
7,atac_intron,chr1,+,67108492-67108547,67109226-67109695,ENST00000483060.1
8,atac_intron,chr1,+,181452879-181453146,181479612-181479718,ENST00000524607.1
9,atac_intron,chr1,-,23720371-23720526,23724037-23724094,ENST00000450454.2


In [69]:
# Save the flanking-exon table tab-separated, with no header row and no index
# column.
flanking_exons_tab = '/projects/ps-yeolab3/bay001/annotations/hg19.gencode.v19.atac_introns.flanking_exons.tab'
merged.to_csv(flanking_exons_tab, sep='\t', index=False, header=False)