In [1]:
import sys
sys.path.append('/usr/users/fsimone/trans-eqtl-pipeline/preprocess/gtex_v8/scripts')
import os
import numpy as np
import gzip
# from containers import GeneInfo
import re
import collections

# Field order defines the positional layout of every GeneInfo record.
GENEINFO_FIELDS = [
    'name',
    'ensembl_id',
    'chrom',
    'start',
    'end',
    'typ',
]


class GeneInfo(collections.namedtuple('_GeneInfo', GENEINFO_FIELDS)):
    """Immutable record describing one gene; its fields are GENEINFO_FIELDS."""

    # Empty __slots__ keeps instances from growing a per-instance __dict__.
    __slots__ = ()


def _gtf_attr_value(field):
    """Return the unquoted value of one 'key "value"' GTF attribute field."""
    return field.strip().split(' ')[1].replace('"', '')


def gencode_v12(filepath, feature = 'gene', trim=False, biotype=['protein_coding'], include_chrom = 0, include_chroms=['{:d}'.format(x + 1) for x in range(22)]):
    """Read gene records from a gzip-compressed, tab-separated annotation file.

    Parameters
    ----------
    filepath : str
        Path to the annotation file. If the resolved path ends with "affy"
        the file is handed to `affy_exon_chip` instead. If the given path
        contains "v26" the v26 attribute layout is used, otherwise the v19
        layout (the position of the type and name attributes differs).
    feature : str
        Only rows whose third column equals this value are kept.
    trim : bool
        If True, everything after the first "." of the gene id is dropped.
    biotype : list of str
        Gene types to keep. If it contains "lncRNA", a fixed list of long
        non-coding RNA types is accepted as well. An empty list disables the
        type filter.
    include_chrom : int
        If > 0, only this chromosome is kept (overrides `include_chroms`).
    include_chroms : list of str
        Chromosome names (without the 3-character prefix) to keep.

    Returns
    -------
    list of GeneInfo

    Raises
    ------
    ValueError
        If a selected row has a strand other than '+' or '-'.
    IOError
        If the file cannot be read; the message is prefixed with the path.
    """
    annotfile = os.path.realpath(filepath)

    # Affymetrix tables have a different layout and their own parser.
    if annotfile.endswith("affy"):
        return affy_exon_chip(annotfile, include_chrom = include_chrom, include_chroms=include_chroms)

    lncRNA_list = ["macro_lncRNA","non_coding","bidirectional_promoter_lncRNA","3prime_overlapping_ncRNA","sense_overlapping","processed_transcript","sense_intronic","TEC","antisense","lincRNA"]
    # `+` builds a new list, so neither the caller's list nor the default is mutated.
    mybiotype = biotype + lncRNA_list if "lncRNA" in biotype else biotype

    # Position of the type / name attributes inside column 9.
    if re.search("v26", filepath):
        type_idx, name_idx = 1, 2   # v26 layout
    else:
        type_idx, name_idx = 2, 4   # v19 layout

    # The single-chromosome override does not depend on the row: decide once.
    if include_chrom > 0:
        include_chroms = ['{:d}'.format(include_chrom)]

    geneinfo = list()
    try:
        with gzip.open(annotfile, 'r') as mfile:
            for line in mfile:
                linesplit = line.decode().strip().split('\t')
                # skip blank lines and header/comment lines
                if not linesplit[0] or linesplit[0][0] == '#': continue
                if linesplit[2] != feature: continue

                chrom = linesplit[0][3:]   # drop the 3-character prefix (e.g. "chr")
                if chrom not in include_chroms: continue

                infolist = linesplit[8].split(';')

                # Always read the type so that `typ` is defined (and belongs to
                # this row) even when no biotype filter is requested.
                rowtype = _gtf_attr_value(infolist[type_idx])
                if len(mybiotype) > 0 and rowtype not in mybiotype: continue
                gene_name = _gtf_attr_value(infolist[name_idx])

                # TSS: gene start (0-based coordinates for BED)
                if linesplit[6] == '+':
                    start = np.int64(linesplit[3]) - 1
                    end   = np.int64(linesplit[4])
                elif linesplit[6] == '-':
                    start = np.int64(linesplit[3])  # last base of gene
                    end   = np.int64(linesplit[4]) - 1
                else:
                    raise ValueError('Strand not specified.')

                # For simulation: coordinates are taken as given
                if linesplit[1] == 'SIMULATION':
                    start = np.int64(linesplit[3])
                    end   = np.int64(linesplit[4])

                gene_id = _gtf_attr_value(infolist[0])
                if trim:
                    gene_id = gene_id.split(".")[0]

                geneinfo.append(GeneInfo(name       = gene_name,
                                         ensembl_id = gene_id,
                                         chrom      = int(chrom),
                                         start      = start,
                                         end        = end,
                                         typ        = rowtype))
    except IOError as err:
        # strerror is None for some OSError subclasses; fall back to str(err)
        # so that building the message cannot itself fail.
        raise IOError('{:s}: {:s}'.format(annotfile, err.strerror or str(err))) from err

    return geneinfo

def affy_exon_chip(filepath, include_chrom = 0, include_chroms=['{:d}'.format(x + 1) for x in range(22)]):
    """Read gene records from a tab-separated Affymetrix-style annotation table.

    The first line is treated as a header and skipped. Columns are read by
    position: 0 = cluster id, 2 = chromosome (with a 3-character prefix),
    3 = strand, 4 = start, 5 = end, 7 = gene assignment (the text before the
    first "//" is used as gene name).

    Parameters
    ----------
    filepath : str
        Path to the plain-text table.
    include_chrom : int
        If > 0, only this chromosome is kept (overrides `include_chroms`).
    include_chroms : list of str
        Chromosome names (without prefix) to keep.

    Returns
    -------
    list of GeneInfo
        `ensembl_id` holds the cluster id from column 0. `typ` is None because
        this parser reads no gene-type column.

    Raises
    ------
    ValueError
        If a selected row has a strand other than '+' or '-'.
    IOError
        If the file cannot be read; the message is prefixed with the path.
    """
    # The single-chromosome override does not depend on the row: decide once.
    if include_chrom > 0:
        include_chroms = ['{:d}'.format(include_chrom)]

    geneinfo = list()
    try:
        with open(filepath, 'r') as mfile:
            next(mfile, None) # skip header (an empty file simply yields no genes)
            for line in mfile:
                linesplit = line.strip().split('\t')
                # skip blank lines and comment lines
                if not linesplit[0] or linesplit[0][0] == '#': continue

                chrom = linesplit[2][3:]
                if chrom not in include_chroms: continue

                # TSS: gene start (0-based coordinates for BED)
                if linesplit[3] == '+':
                    start = np.int64(linesplit[4]) - 1
                    end   = np.int64(linesplit[5])
                elif linesplit[3] == '-':
                    start = np.int64(linesplit[4])  # last base of gene
                    end   = np.int64(linesplit[5]) - 1
                else:
                    raise ValueError('Strand not specified.')

                gene_name = linesplit[7].split("//")[0].rstrip()
                transcript_cluster_id = linesplit[0]
                # GeneInfo requires all six fields; no gene type is available here.
                this_gene = GeneInfo(name       = gene_name,
                                     ensembl_id = transcript_cluster_id,
                                     chrom      = int(chrom),
                                     start      = start,
                                     end        = end,
                                     typ        = None)

                geneinfo.append(this_gene)
    except IOError as err:
        # Report the path that was actually opened; strerror may be None.
        raise IOError('{:s}: {:s}'.format(filepath, err.strerror or str(err))) from err

    return geneinfo

In [2]:
# Annotation file (absolute path on the analysis cluster).
infile = "/cbscratch/franco/datasets/GENCODE/gencode.v26.annotation.gtf.gz"
# Keep protein-coding genes; 'lncRNA' additionally enables the long non-coding
# RNA biotypes listed inside gencode_v12.
gene_info = gencode_v12(
    infile,
    biotype=['protein_coding', 'lncRNA'],
)

In [23]:
def read_samples(donorfile):
    """Return the sample names listed in a whitespace-delimited sample file.

    The first two lines are skipped. From every following line that does not
    start with '#', the first whitespace-separated token is collected. Blank
    lines are ignored, and a file with fewer than two lines yields an empty
    list.

    Parameters
    ----------
    donorfile : str
        Path to the sample file.

    Returns
    -------
    list of str
        Sample names in file order.
    """
    samplenames = list()
    with open(donorfile, 'r') as samfile:
        # skip first two lines; the default avoids StopIteration on short files
        next(samfile, None)
        next(samfile, None)
        for line in samfile:
            if line.startswith('#'):
                continue
            fields = line.split()
            if not fields:
                # blank / whitespace-only line: nothing to record
                continue
            samplenames.append(fields[0])
    return samplenames

def filter_donors(df, donors):
    """Select the columns of `df` named in `donors`, in the order of `donors`.

    Names in `donors` that are not columns of `df` are dropped. A one-line
    summary (columns kept vs. columns available) is printed.

    Returns the column subset of `df`.
    """
    available = df.columns
    kept = []
    for donor in donors:
        if donor in available:
            kept.append(donor)
    print("{:d} donors remained from {:d}".format(len(kept), len(available)))
    return df[kept]

def filter_rows(df, genedict, gene_dict_type):
    """Keep the rows of `df` whose index label is flagged in `genedict`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table indexed by gene id.
    genedict : mapping
        Gene id -> truthy flag for genes to keep. Ids that are missing are
        treated as "drop"; the mapping is only read, never modified (a
        defaultdict does not grow as a side effect).
    gene_dict_type : mapping
        Gene id -> type label; must contain every kept gene.

    Returns
    -------
    (pandas.DataFrame, list)
        The kept rows, and the type labels of the kept genes in row order.

    A one-line summary (rows kept vs. rows available) is printed.
    """
    gx_gene_list = df.index
    # One boolean mask drives both the row selection and the type lookup,
    # so the two results cannot get out of step.
    keep = [bool(genedict.get(gene, False)) for gene in gx_gene_list]
    current_types = [gene_dict_type[gene] for gene, kept in zip(gx_gene_list, keep) if kept]
    print("{:d} genes remained from {:d}".format(sum(keep), len(gx_gene_list)))
    # .loc always selects rows; plain df[mask] turns into a column selection
    # when the mask is empty.
    return df.loc[keep], current_types

In [24]:
from collections import defaultdict
import pandas as pd

# Input locations (absolute paths on the analysis cluster).
donor_file = "/cbscratch/franco/datasets/gtex_v8/genotypes/gtex_v8.sample"
gx_file    = "/cbscratch/franco/trans-eqtl/new_preprocess_aug2019/gtex_v8/expression/tpms/wb_tpms_qcfilter.txt"

donors = read_samples(donor_file)

# Lookup tables keyed by gene id: "is annotated" flag (False for unknown ids)
# and the annotated gene type.
gene_dict = defaultdict(lambda: False, ((g.ensembl_id, True) for g in gene_info))
gene_dict_type = {g.ensembl_id: g.typ for g in gene_info}

# Expression table: first column is used as the row index.
gx_df = pd.read_table(gx_file, sep="\t", header=0, index_col=0)
print(gx_df.shape)

# Keep annotated genes first, then the donors found in the sample file.
new_gx_df, current_types = filter_rows(gx_df, gene_dict, gene_dict_type)
sorted_gx_df = filter_donors(new_gx_df, donors)

print(sorted_gx_df.shape)

(12470, 670)
12046 genes remained from 12470
670 donors remained from 670
(12046, 670)


In [26]:
# Per gene type: number of annotated genes vs. number of genes that remained
# in the filtered expression table.
types = [gene.typ for gene in gene_info]
types_uniq = list(set(types))

annotated_counts = collections.Counter(types)
remaining_counts = collections.Counter(current_types)
row_format = "{:>35}\t{:>d}\t{:>d}"

for t in types_uniq:
    print(row_format.format(t, annotated_counts[t], remaining_counts[t]))
print(row_format.format("Total", len(types), len(current_types)))

                         non_coding	3	0
                       macro_lncRNA	1	0
                                TEC	1053	58
                     sense_intronic	888	57
                     protein_coding	18901	10414
                  sense_overlapping	184	40
                          antisense	5419	945
      bidirectional_promoter_lncRNA	8	0
               processed_transcript	520	103
                            lincRNA	7327	427
           3prime_overlapping_ncRNA	30	2
                              Total	34334	12046
