In [1]:
import pandas as pd
import pybedtools
import os
import glob
import gffutils
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook, trange

In [16]:
def classify_utr(utr_feature, cds_dict):
    """
    Given a feature classified as a UTR, return whether or not it is
    upstream (5') or downstream (3') based on CDS positions

    :param utr_feature: gffutils.Feature
        feature already classified as UTR (within a coding transcript but
        outside of CDS regions).
    :param cds_dict: dict
        dictionary containing cds positions for every transcript
    :return:
    """
    three_prime_utr = False
    five_prime_utr = False

    for transcript_id in utr_feature.attributes['transcript_id']:
        if utr_feature.strand == '+':
            if cds_dict[transcript_id]['low'] > utr_feature.end:
                five_prime_utr = True
            if cds_dict[transcript_id]['hi'] < utr_feature.start + 1:
                three_prime_utr = True
        elif utr_feature.strand == '-':
            if cds_dict[transcript_id]['low'] > utr_feature.end:
                three_prime_utr = True
            if cds_dict[transcript_id]['hi'] < utr_feature.start + 1:
                five_prime_utr = True

    if five_prime_utr:
        return '5utr'
        # return 'five_prime_utr'
    elif three_prime_utr:
        return '3utr'
        # return 'three_prime_utr'
    else:
        return 'unclassified_utr'
def get_all_cds_dict(db, cds_key):
    """
    For every cds-annotated transcript id (ENST), return a
    dictionary containing the lowest and highest
    cds start and end vals for that transcript.

    :return cds_dict : defaultdict{transcript:{'start':START, 'end':END}}
    """
    # cds_dict = defaultdict(lambda: {'low': MAXVAL, 'hi': MINVAL})
    cds_dict = defaultdict(dict)
    for cds_feature in db.features_of_type(cds_key):
        for transcript_id in cds_feature.attributes['transcript_id']:
            # if cds_feature.start <= cds_dict[transcript_id]['low']:
            if cds_feature.start <= cds_dict[transcript_id].get("low", MAXVAL):
                cds_dict[transcript_id]['low'] = cds_feature.start
            # if cds_feature.end >= cds_dict[transcript_id]['hi']:
            if cds_feature.end >= cds_dict[transcript_id].get("hi", MINVAL):
                cds_dict[transcript_id]['hi'] = cds_feature.end
    return cds_dict
def create_bedtools(features, keys):
    """
    Given a list of features and keys dictionary, create a bedtool
    containing intervals of features whose name is specified
    using keys['gene_id']

    :param features: list
        list of gffutils features (1-based) for which to convert
        to bedtool intervals
    :param keys: dict
        a set of keys and values which helps translate different
        GTF/GFF nomenclatures (ie. 'cds'
    :return:
    """
    intervals = []
    for feature in features:
        for i in range(len(feature.attributes[keys['gene_id']])):
            interval = pybedtools.create_interval_from_list([
                feature.seqid, str(feature.start - 1), str(feature.end),
                feature.attributes[keys['gene_id']][i], '0',
                feature.strand
            ])
            intervals.append(interval)
    bedtool = pybedtools.BedTool(intervals)
    return bedtool

def merge_bedtool_by_gene(bedtool):
    """
    Takes a bedtool and does a merge, but preserves the distinct names.
    This is different than using bedtools merge.

    ie.

    chr1    100 200 GENE1   0   +
    chr1    150 230 GENE1   0   +
    chr1    150 250 GENE2   0   +

    ->

    chr1    100 230 GENE1   0   +
    chr1    150 250 GENE2   0   +

    :param bedtool: pybedtools.BedTool

    :return merged: pandas.DataFrame
        table containing unique merged entries per distinct name.
    """
    df = bedtool.to_dataframe()
    df.columns = ['chrom','start','end','name','score','strand']
    merged = pd.DataFrame(columns=df.columns)
    progress = trange(len(set(df['name'])))
    for name in set(df['name']):
        # create sorted bedtool from dataframe containing just the names
        dx = df[df['name'] == name]
        bt = pybedtools.BedTool.from_dataframe(dx)
        bt = bt.sort()
        # merge bedtool individually
        dy = bt.merge(s=True, c='4,5,6', o='distinct,distinct,distinct').to_dataframe()
        dy.columns = ['chrom','start','end','strand','name','score','strand2']
        dy = dy[['chrom','start','end','name','score','strand']]
        #concat to master bedtool
        merged = pd.concat([merged, dy])
        progress.update(1)
    return merged

In [3]:
i1 = pybedtools.create_interval_from_list(['chr1','100','200','gene1','0','+'])
i2 = pybedtools.create_interval_from_list(['chr1','150','230','gene1','0','+'])
i3 = pybedtools.create_interval_from_list(['chr1','150','250','gene2','0','+'])
bt = pybedtools.BedTool([i1, i2, i3])

In [4]:
bt.merge(s=True, c='4,5,6', o='distinct,distinct,distinct').to_dataframe()

Unnamed: 0,chrom,start,end,name,score,strand,thickStart
0,chr1,100,250,+,"gene1,gene2",0,+


In [5]:
merge_bedtool_by_gene(bt)

100%|██████████| 2/2 [00:00<00:00, 19.64it/s]


Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,100,230,gene1,0,+
0,chr1,150,250,gene2,0,+


In [24]:
def create_utr_region_bedfiles(db, keys, cds_dict, utr3_out, utr5_out):
    # Creates the 5 and 3' UTR regions
    five_prime_utr_features = []
    three_prime_utr_features = []
    for utr_feature in db.features_of_type(keys['utr']):
        classified_utr = classify_utr(utr_feature, cds_dict)
        if classified_utr == '5utr':
            five_prime_utr_features.append(utr_feature)
        elif classified_utr == '3utr':
            three_prime_utr_features.append(utr_feature)
    utr5 = create_bedtools(five_prime_utr_features, keys)
    utr3 = create_bedtools(three_prime_utr_features, keys)
    return utr3, utr5

In [18]:
keys = {'utr':'UTR', 'cds':'CDS', 'gene_id':'gene_id'}
MAXVAL = 1000000000
MINVAL = 0
db_file = '/projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
db = gffutils.FeatureDB(db_file)
featuretypes = [x.lower() for x in list(db.featuretypes())]
cds_dict = get_all_cds_dict(db, keys['cds'])


In [25]:
utr3, utr5 = create_utr_region_bedfiles(db, keys, cds_dict, None, None)

In [79]:
df = utr3.to_dataframe()
df = df[df['chrom']=='chr1']
df.sort_values(['start'], inplace=True)
print(df.shape)
df.head()

(8676, 6)


Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,70005,70008,ENSG00000186092.4,0,+
2,chr1,134900,135802,ENSG00000237683.5,0,-
1,chr1,137620,138532,ENSG00000237683.5,0,-
3,chr1,368594,368634,ENSG00000235249.1,0,+
4,chr1,621058,621098,ENSG00000185097.2,0,-


In [80]:
%%timeit
dx = df.groupby(['name']).apply(
    lambda x: pybedtools.BedTool.from_dataframe(x).merge(
        s=True, c='4,5,6', o='distinct,distinct,distinct'
    )
)

1 loop, best of 3: 1min 33s per loop


In [None]:
utr3chr1 = utr3.to_dataframe()
utr3chr1 = utr3chr1[utr3chr1['chrom']=='chr1']
utr3chr1 = pybedtools.BedTool.from_dataframe(utr3chr1)

In [61]:
%%timeit
dx = bt.to_dataframe()
dy = dx.groupby(['chrom','strand','name']).apply(lambda x: pybedtools.BedTool.from_dataframe(x).sort().merge(s=True, c='4,5,6', o='distinct,distinct,distinct'))

1 loop, best of 3: 248 ms per loop


In [62]:
%%timeit
merge_bedtool_by_gene(bt)

100%|██████████| 2/2 [00:00<00:00,  9.59it/s]
100%|██████████| 2/2 [00:00<00:00, 10.31it/s]
100%|██████████| 2/2 [00:00<00:00, 10.76it/s]
100%|██████████| 2/2 [00:00<00:00, 10.87it/s]

1 loop, best of 3: 199 ms per loop





In [87]:
def merge_bedtool_by_gene(bedtool):
    df = bedtool.to_dataframe()
    merged = pd.DataFrame(
        columns=['chrom','start','end','name','score','strand', 'thickStart']
    )
    progress = trange(len(set(df['chrom'])))
    for chrom in set(df['chrom']):
        progress.set_description("parsing {}".format(chrom))
        dx = df[df['chrom']==chrom]
        dx.sort_values('start', inplace=True)
        dy = dx.groupby(['name']).apply(
            lambda x: pybedtools.BedTool.from_dataframe(x).merge(
                s=True, c='4,5,6', o='distinct,distinct,distinct'
            )
        )
        for d in dy:
            merged = pd.concat([merged, d.to_dataframe()], axis=0)
        progress.update(1)
    merged.columns = ['chrom','start','end','strand','name','score','strand2']
    return merged[['chrom','start','end','name','score','strand']]

In [88]:
merge_bedtool_by_gene(utr3)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.



parsing chrY:   4%|▍         | 1/25 [00:03<01:25,  3.57s/it][A[A[A


parsing chrX:   8%|▊         | 2/25 [00:44<05:41, 14.84s/it][A[A[A


parsing chr13:  12%|█▏        | 3/25 [01:01<05:38, 15.37s/it][A[A[A


parsing chr12:  16%|█▌        | 4/25 [01:54<09:21, 26.73s/it][A[A[A


parsing chr11:  20%|██        | 5/25 [03:00<12:50, 38.54s/it][A[A[A


parsing chr10:  24%|██▍       | 6/25 [03:40<12:19, 38.90s/it][A[A[A


parsing chr17:  28%|██▊       | 7/25 [04:42<13:47, 45.97s/it][A[A[A


parsing chr16:  32%|███▏      | 8/25 [05:27<12:56, 45.67s/it][A[A[A


parsing chr15:  36%|███▌      | 9/25 [05:59<11:03, 41.44s/it][A[A[A


parsing chr14:  40%|████      | 10/25 [06:34<09:53, 39.60s/it][A[A[A


parsing chr19:  44%|██

Unnamed: 0,chrom,start,end,name,score,strand
0,chrY,21867300,21867883,ENSG00000012817.11,0,-
0,chrY,15030031,15032390,ENSG00000067048.12,0,+
0,chrY,2848031,2850547,ENSG00000067646.7,0,+
0,chrY,6959530,6959724,ENSG00000092377.9,0,+
0,chrY,4972399,4973485,ENSG00000099715.10,0,+
1,chrY,5605980,5610265,ENSG00000099715.10,0,+
0,chrY,6733958,6734116,ENSG00000099721.9,0,-
0,chrY,14971338,14972764,ENSG00000114374.8,0,+
0,chrY,2734932,2735309,ENSG00000129824.11,0,+
0,chrY,16168736,16168838,ENSG00000129862.6,0,+





[A[A[A