In [4]:
# Split genic regions into non-overlapping parts

import os
import pybedtools


# Path to the FASTA index of the r6.33 genome, relative to this notebook's directory.
# It is passed as the genome file (g=...) to the bedtools flank and complement calls below.
GENOME_FAI = '../../../reference-data/dmel-all-chromosome-r6.33.fasta.fai'

In [5]:
if not os.path.exists('dmel-all-r6.33.gtf.gz'):
    !wget ftp://ftp.flybase.net/genomes/Drosophila_melanogaster/dmel_r6.33_FB2020_02/gtf/dmel-all-r6.33.gtf.gz

--2020-08-12 11:11:26--  ftp://ftp.flybase.net/genomes/Drosophila_melanogaster/dmel_r6.33_FB2020_02/gtf/dmel-all-r6.33.gtf.gz
           => ‘dmel-all-r6.33.gtf.gz’
Resolving ftp.flybase.net (ftp.flybase.net)... 52.23.126.124
Connecting to ftp.flybase.net (ftp.flybase.net)|52.23.126.124|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /genomes/Drosophila_melanogaster/dmel_r6.33_FB2020_02/gtf ... done.
==> SIZE dmel-all-r6.33.gtf.gz ... 3933975
==> PASV ... done.    ==> RETR dmel-all-r6.33.gtf.gz ... done.
Length: 3933975 (3.8M) (unauthoritative)


2020-08-12 11:11:30 (1.46 MB/s) - ‘dmel-all-r6.33.gtf.gz’ saved [3933975]



In [6]:
# Sequence names kept for the analysis; rows on any other sequence are dropped.
analysed_chromosomes = {'X', 'Y', '2L', '2R', '3L', '3R', '4'}

# Sort the annotation with bedtools, load it as a table, then keep the analysed sequences only.
sorted_annotation = pybedtools.BedTool('dmel-all-r6.33.gtf.gz').sort()
df = sorted_annotation.to_dataframe()
on_analysed_chromosome = df['seqname'].isin(analysed_chromosomes)
df = df[on_analysed_chromosome]


For the calculation of the distribution of Kismet-bound GATC sites in the genome in Figure 4A, the Drosophila gene annotation GTF was downloaded from FlyBase version 6.13 (Gramates et al., 2017). The GTF file was filtered to retain only 3′UTR coding, 5′UTR coding, exon and gene features. The file was then split into a single file per genomic feature, and overlapping features were merged using bedtools. Using bedtools subtract, exonic regions were subtracted from genic regions to obtain intronic regions, and exonic regions were subtracted from overlapping 3′UTR and 5′UTR coding regions. Significantly bound GATC sites were classified as belonging to one of these regions using bedtools intersect. (Note: the text above cites version 6.13, whereas the cells in this notebook download and use FlyBase release 6.33.)

In [7]:
# Partition the filtered annotation into region classes and write each class to its own file.
# NOTE(review): the files written below hold bedtools merge/flank/subtract/complement output
# rather than the original GTF records, despite the '.gtf' extension — confirm that downstream
# readers expect this.

# Row subsets of the annotation table. `exon` stays a DataFrame; `genic` is rebound to a BedTool.
exon = df[df['feature'] == 'exon']
genic = df[df['feature'] == 'gene']
genic = pybedtools.BedTool.from_dataframe(genic).sort()

# Build the exon intervals once and reuse them for both the intronic and the CDS step.
# They are sorted explicitly because bedtools merge needs position-sorted input; previously the
# exon merge depended on `df` having been sorted in another cell.
exon_intervals = pybedtools.BedTool.from_dataframe(exon).sort()

# Gene footprint regardless of strand.
genic_merged = genic.merge()
# Gene footprint merged per strand. c/o carry the first value of input columns 3, 6 and 7 of
# each merged group (feature, score and strand in GTF column order), so the strand is available
# to the strand-aware flank below.
genic_merged_strand_specific = genic.merge(s=True, c="3,6,7", o='first,first,first')

# 1000 bases flanking the strand-relative left side of each strand-specific gene block
# (l=1000, r=0 with s=True), with any part that overlaps a gene block removed.
promoter_proximal = (
    genic_merged_strand_specific
    .flank(s=True, l=1000, r=0, g=GENOME_FAI)
    .subtract(genic_merged_strand_specific)
    .sort()
    .moveto('1K-upstream.gtf')
)

# Everything outside the merged genes (restricted by L=True to sequences present in the input),
# minus the promoter-proximal regions.
intergenic = (
    genic_merged
    .complement(L=True, g=GENOME_FAI)
    .subtract(promoter_proximal)
    .moveto('Intergenic.gtf')
)

# intronic is genic - exon, minus the promoter-proximal regions
intronic = (
    genic_merged
    .subtract(exon_intervals)
    .sort()
    .subtract(promoter_proximal)
    .moveto('Intronic.gtf')
)

# 5'UTR as it appears in gtf
five_utr = df[df['feature'] == '5UTR']
five_utr = pybedtools.BedTool.from_dataframe(five_utr).sort().merge().moveto('5P-UTR.gtf')

# 3'UTR as it appears in gtf; the merged set itself is kept in memory and only the version
# with 5'UTR overlap removed is written out.
three_utr = df[df['feature'] == '3UTR']
three_utr = pybedtools.BedTool.from_dataframe(three_utr).sort().merge()
three_utr_minus_five_utr = three_utr.subtract(five_utr).moveto('3P-UTR.gtf')

# CDS, exon - utrs. The variable keeps its original spelling in case later cells refer to it.
constituitive_exon = (
    exon_intervals
    .merge()
    .subtract(five_utr)
    .subtract(three_utr)
    .sort()
    .moveto('CDS.gtf')
)

In [8]:

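# Disabled cell: would widen each 1 kb upstream region by a further 1 kb on its strand-relative right side,
# presumably to cover a TSS +/- 1 kb window. Uses GENOME_FAI instead of a machine-specific absolute path.
# promoter_proximal.slop(s=True, l=0, r=1000, g=GENOME_FAI).sort().moveto('TSS +- 1K.gtf')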

In [9]:
def length(df):
    """Return the total span covered by an interval table.

    Parameters
    ----------
    df : table with 'start' and 'end' columns (e.g. a pandas DataFrame).

    Returns
    -------
    The sum over all rows of ``end - start``. Overlapping rows are counted
    once per row, so merge the intervals first if unique coverage is wanted.
    """
    spans = df['end'] - df['start']
    return spans.sum()


# Summed length of the merged 5'UTR intervals.
five_utr_table = five_utr.to_dataframe()
length(five_utr_table)

4520358

In [10]:
# Summed length of the merged 3'UTR intervals, before any 5'UTR overlap is removed.
# Uses the `length` helper defined in the previous cell instead of redefining it here.
length(three_utr.to_dataframe())

7637721

In [11]:
# Summed 3'UTR length once the parts shared with the merged 5'UTR set have been subtracted.
three_utr_only_table = three_utr_minus_five_utr.to_dataframe()
length(three_utr_only_table)

7539902