In [1]:
from collections import defaultdict
import gzip
import pandas as pd
import re


# Names of the eight fixed, tab-separated GTF columns, in file order.
GTF_HEADER  = ['seqname', 'source', 'feature', 'start', 'end', 'score',
               'strand', 'frame']
# Separator between the attributes held in the ninth column.
R_SEMICOLON = re.compile(r'\s*;\s*')
# Separator between the items of a multi-valued attribute.
R_COMMA     = re.compile(r'\s*,\s*')
# Separator between an attribute's key and its value: either '=' (with
# optional surrounding whitespace) or plain whitespace. The '=' alternative
# must be tried first; otherwise, for 'key = value', the whitespace
# alternative matches the single space before '=' and the value comes out as
# '= value'. The capture group keeps the separator in the split result.
R_KEYVALUE  = re.compile(r'(\s*=\s*|\s+)')


def dataframe(filename):
    """Read an optionally gzipped GTF file into a pandas.DataFrame.

    Every dict produced by lines(filename) becomes one row. The columns are
    the union of all keys seen; a row that lacks a key gets None there.
    """
    # Column name -> list of cell values, one entry per row read so far.
    columns = defaultdict(list)

    for row_number, record in enumerate(lines(filename)):
        # A column first seen on this row is back-filled with None for all
        # earlier rows so that every column list stays the same length.
        for name in record:
            if name not in columns:
                columns[name] = [None] * row_number

        # Extend every known column by exactly one cell for this row.
        for name, cells in columns.items():
            cells.append(record.get(name))

    return pd.DataFrame(columns)



def lines(filename):
    """Open an optionally gzipped GTF file and generate a dict for each line.

    Comment lines (starting with '#') and blank lines are skipped; every
    other line is handed to parse().

    Args:
        filename: Path to a GTF file. A name ending in '.gz' is read through
            gzip.open(), anything else through the built-in open().

    Yields:
        The dict returned by parse() for each data line.
    """
    fn_open = gzip.open if filename.endswith('.gz') else open

    with fn_open(filename) as fh:
        for line in fh:
            # gzip.open() defaults to binary mode, so on Python 3 it yields
            # bytes. Decode them so the str operations here and in parse()
            # work. Lines that are already str (plain open(), or any file on
            # Python 2) are left untouched.
            if not isinstance(line, str):
                line = line.decode('utf-8')

            # Blank lines carry no record and would make parse() fail.
            if not line.strip() or line.startswith('#'):
                continue

            yield parse(line)


def parse(line):
    """Parse a single GTF line and return a dict.

    The first eight tab-separated fields are stored under the names in
    GTF_HEADER. The ninth field, if present, is split on ';' into attributes
    and each attribute is split once into key and value with R_KEYVALUE. An
    attribute with no recognisable key is stored under 'INFO<n>', where n is
    its 1-based position among the attributes. Every value is normalised by
    _get_value(); attributes without a value are dropped.

    Args:
        line: One data line of a GTF file (not a comment line).

    Returns:
        dict mapping column and attribute names to their normalised values.

    Raises:
        IndexError: If the line has fewer than eight tab-separated fields.
    """
    result = {}

    # Strip only the line terminator before splitting. A bare rstrip() also
    # removes trailing tabs, which drops empty trailing columns and made a
    # line with an empty attribute column fail with IndexError.
    fields = line.rstrip('\r\n').split('\t')
    # Trailing whitespace at the very end of the line is still insignificant.
    fields[-1] = fields[-1].rstrip()

    for i, col in enumerate(GTF_HEADER):
        result[col] = _get_value(fields[i])

    # INFO field consists of "key1=value;key2=value;...". Treat a missing
    # ninth column the same as an empty one.
    attributes = fields[8].rstrip() if len(fields) > 8 else ''
    infos = [x for x in R_SEMICOLON.split(attributes) if x.strip()]

    for i, info in enumerate(infos, 1):
        # It should be key="value". R_KEYVALUE has a capture group, so a
        # successful split yields [key, separator, value].
        try:
            key, _, value = R_KEYVALUE.split(info, maxsplit=1)
        # But sometimes it is just "value".
        except ValueError:
            key = 'INFO{}'.format(i)
            value = info
        # Ignore the field if there is no value.
        if value:
            result[key] = _get_value(value)

    return result


def _get_value(value):
    if not value:
        return None

    # Strip double and single quotes.
    value = value.strip('"\'')

    # Return a list if the value has a comma.
    if ',' in value:
        value = re.split(R_COMMA, value)
    # These values are equivalent to None.
    elif value in ['', '.', 'NA']:
        return None

    return value

In [3]:
# Parse the GENCODE v19 annotation GTF into a DataFrame (one row per line).
# NOTE(review): the path is an absolute, machine-specific location; point it
# at a local copy of the file before re-running.
gtf = dataframe("/projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf")

In [31]:
# Preview the first rows whose feature type is 'gene'.
gtf.loc[gtf['feature'] == 'gene'].head()

Unnamed: 0,ccdsid,end,exon_id,exon_number,feature,frame,gene_id,gene_name,gene_status,gene_type,...,score,seqname,source,start,strand,tag,transcript_id,transcript_name,transcript_status,transcript_type
0,,14412,,,gene,,ENSG00000223972.4,DDX11L1,KNOWN,pseudogene,...,,chr1,HAVANA,11869,+,,ENSG00000223972.4,DDX11L1,KNOWN,pseudogene
21,,29806,,,gene,,ENSG00000227232.4,WASH7P,KNOWN,pseudogene,...,,chr1,HAVANA,14363,-,,ENSG00000227232.4,WASH7P,KNOWN,pseudogene
82,,31109,,,gene,,ENSG00000243485.2,MIR1302-11,NOVEL,lincRNA,...,,chr1,HAVANA,29554,+,,ENSG00000243485.2,MIR1302-11,NOVEL,lincRNA
92,,36081,,,gene,,ENSG00000237613.2,FAM138A,KNOWN,lincRNA,...,,chr1,HAVANA,34554,-,,ENSG00000237613.2,FAM138A,KNOWN,lincRNA
100,,54936,,,gene,,ENSG00000268020.2,OR4G4P,KNOWN,pseudogene,...,,chr1,HAVANA,52473,+,,ENSG00000268020.2,OR4G4P,KNOWN,pseudogene


In [None]:
# Build a gene_id -> gene_name lookup with one row per unique gene_id.
# The previous version joined with on='gene_id' against a frame that has no
# such column (the IDs were only in its index), which raises KeyError, and
# would have matched against the row-number index of gtf['gene_name'] anyway.
# Selecting both columns and de-duplicating keeps each ID paired with the
# name from its first occurrence.
gene_ids_to_name = (
    gtf[['gene_id', 'gene_name']]
    .drop_duplicates(subset='gene_id')
    .set_index('gene_id')
)
# NOTE(review): placeholder output path; replace before running.
gene_ids_to_name.to_csv("/path/where/to/save/gene_ids_to_name_gencodev19.csv")

In [32]:
# Start an empty frame whose index is the gene_id of every 'gene' row.
names = pd.DataFrame(index=gtf.loc[gtf['feature'] == 'gene', 'gene_id'])
names.head()

ENSG00000223972.4
ENSG00000227232.4
ENSG00000243485.2
ENSG00000237613.2
ENSG00000268020.2


In [33]:
# Keep only the rows whose feature type is 'gene' and preview them.
new = gtf.loc[gtf['feature'] == 'gene']
new.head()

Unnamed: 0,ccdsid,end,exon_id,exon_number,feature,frame,gene_id,gene_name,gene_status,gene_type,...,score,seqname,source,start,strand,tag,transcript_id,transcript_name,transcript_status,transcript_type
0,,14412,,,gene,,ENSG00000223972.4,DDX11L1,KNOWN,pseudogene,...,,chr1,HAVANA,11869,+,,ENSG00000223972.4,DDX11L1,KNOWN,pseudogene
21,,29806,,,gene,,ENSG00000227232.4,WASH7P,KNOWN,pseudogene,...,,chr1,HAVANA,14363,-,,ENSG00000227232.4,WASH7P,KNOWN,pseudogene
82,,31109,,,gene,,ENSG00000243485.2,MIR1302-11,NOVEL,lincRNA,...,,chr1,HAVANA,29554,+,,ENSG00000243485.2,MIR1302-11,NOVEL,lincRNA
92,,36081,,,gene,,ENSG00000237613.2,FAM138A,KNOWN,lincRNA,...,,chr1,HAVANA,34554,-,,ENSG00000237613.2,FAM138A,KNOWN,lincRNA
100,,54936,,,gene,,ENSG00000268020.2,OR4G4P,KNOWN,pseudogene,...,,chr1,HAVANA,52473,+,,ENSG00000268020.2,OR4G4P,KNOWN,pseudogene


In [34]:
# Re-index the gene rows by gene_id. This mutates `new` in place, so the
# cell is not idempotent: NOTE(review) a second run presumably fails because
# gene_id is no longer a column - confirm before re-running.
new.set_index("gene_id", inplace=True)

In [35]:
# Attach each gene's name to its ID; the join aligns on the index of both
# frames.
names = names.join(new['gene_name'], how="left")
# Call print() as a function: this prints the same tuple on Python 2 and,
# unlike the print statement, is not a SyntaxError on Python 3.
print(names.shape)
names.head()

(57820, 1)


Unnamed: 0_level_0,gene_name
gene_id,Unnamed: 1_level_1
ENSG00000223972.4,DDX11L1
ENSG00000227232.4,WASH7P
ENSG00000243485.2,MIR1302-11
ENSG00000237613.2,FAM138A
ENSG00000268020.2,OR4G4P


In [None]:
# Save the gene_id / gene_name table as a tab-separated file.
# NOTE(review): the path below is a placeholder; replace it before running.
names.to_csv("/directory/to/save/gene_ids_to_name.txt", sep="\t")