In [1]:
%load_ext autoreload
%autoreload 2


import sys, os
# Add the parent directory to the module search path before the project-local imports below.
sys.path.append("../")

import numpy as np
# NOTE(review): the star import hides where names come from. `gene_info` and `chrom`,
# used by the gene-list cell without being defined there, presumably originate
# here — confirm against config.py.
from config import *

from helper_functions import load_target_genes


In [3]:

# Load gene list
# `gene_info` and `chrom` are not defined in this cell; presumably they are
# provided by `from config import *` — TODO confirm.
genelistfile = "genes4testing_high_and_low_r2_0.001"
selected_gene_ids = load_target_genes(genelistfile, gene_info, chrom)

Read 641 high r2 genes

Gene ENSG00000151065.9, CHR 12, R2 value: 0.002064254894591133 
Gene ENSG00000078237.4, CHR 12, R2 value: 0.0014273084981956 
Gene ENSG00000139194.3, CHR 12, R2 value: 0.002142493224296209 
Gene ENSG00000173262.7, CHR 12, R2 value: 0.001068218937071411 
Gene ENSG00000171860.4, CHR 12, R2 value: 0.5476060358199526 
Gene ENSG00000205846.3, CHR 12, R2 value: 0.1061251194649761 
Gene ENSG00000166527.3, CHR 12, R2 value: 0.09676138334560905 
Gene ENSG00000256660.1, CHR 12, R2 value: 0.22039412271947317 
Gene ENSG00000172243.13, CHR 12, R2 value: 0.03639756962208574 
Gene ENSG00000139112.6, CHR 12, R2 value: 0.010146791070866234 
Gene ENSG00000013583.4, CHR 12, R2 value: 0.020975032822613083 
Gene ENSG00000123104.7, CHR 12, R2 value: 0.02096216113437217 
Gene ENSG00000064115.6, CHR 12, R2 value: 0.15144348128426358 
Gene ENSG00000139117.9, CHR 12, R2 value: 0.0025246906892602973 
Gene ENSG00000139174.6, CHR 12, R2 value: 0.050114437343469154 
Gene ENSG00000161800.8, C

In [17]:
# Gene IDs for which fixture files (annotation, genotype, expression) are extracted.
candidates = ["ENSG00000090382.2", "ENSG00000127337.2", "ENSG00000139610.1", "ENSG00000171860.4", "ENSG00000256660.1", "ENSG00000205846.3"]
# NOTE(review): machine-specific absolute path; adjust when running on another host.
home = '/home/fsimone'
# Directory (relative to the notebook's working directory) receiving all fixture files.
outdir = "fixtures"
# exist_ok=True keeps the cell re-runnable: without it a second execution
# (or Restart & Run All with the directory already present) raises FileExistsError.
os.makedirs(outdir, exist_ok=True)


In [18]:
# Extract gtf data
# For every candidate gene, copy its raw 'gene' record from the annotation file
# into <outdir>/<gene_id>.gtf.gz (one single-line gzip file per gene).
import gzip 
import numpy as np
import os

gtfpath = os.path.join(home,"datasets/gtex/gencode.v19.annotation.gtf.gz")
annotfile = os.path.realpath(gtfpath)
feature = 'gene'
biotype=['protein_coding']
include_chrom = 12
# Chromosome filter, built once instead of on every input line:
# a single chromosome when include_chrom > 0, otherwise chromosomes 1..22.
if include_chrom > 0:
    include_chroms = ['{:d}'.format(include_chrom)]
else:
    include_chroms = ['{:d}'.format(x + 1) for x in range(22)]


def _parse_gtf_attributes(field):
    """Parse a GTF attribute column (`key "value"; key "value"; ...`) into a dict.

    Quotes are stripped from the values. When a key occurs more than once,
    the first occurrence is kept.
    """
    attributes = {}
    for item in field.split(';'):
        item = item.strip()
        if not item:
            continue
        key, _, value = item.partition(' ')
        attributes.setdefault(key, value.replace('"', ''))
    return attributes


with gzip.open(annotfile, 'r') as mfile:
    for line in mfile:
        linesplit = line.decode().strip().split('\t')
        # Skip blank lines and '#' header lines.
        if not linesplit[0] or linesplit[0][0] == '#': continue
        # Skip truncated records and every feature other than the requested one.
        if len(linesplit) < 9 or linesplit[2] != feature: continue

        # Drop the first three characters of the sequence name (e.g. 'chr12' -> '12').
        # Deliberately not called `chrom`, so the notebook-level `chrom` used
        # by other cells is not overwritten with a string.
        line_chrom = linesplit[0][3:]
        if line_chrom not in include_chroms: continue

        # Look attributes up by key rather than by position in the column.
        attributes = _parse_gtf_attributes(linesplit[8])

        # Any particular biotype selected?
        if len(biotype) > 0 and attributes.get('gene_type') not in biotype: continue

        # Records must carry an explicit strand.
        if linesplit[6] not in ('+', '-'):
            raise ValueError('Strand not specified.')

        gene_id = attributes.get('gene_id')
        if gene_id in candidates:
            # Write the untouched (bytes) annotation line for this gene.
            with gzip.open(os.path.join(outdir, gene_id+".gtf.gz"), "wb") as outstream:
                outstream.write(line)

In [51]:
# Extract cis-snps for each candidate gene
# For every candidate, write the genotype lines whose position lies within
# `window` bp of the gene to <outdir>/<gene>.genotype.gtex.gz.
from iotools import readgtf
chrom = 12
gtfile = "GTEx_450Indiv_genot_imput_info04_maf01_HWEp1E6_dbSNP135IDs_donorIDs_dosage_chr{:d}.gz".format(chrom)
# Half-width of the cis window around each gene, in base pairs.
# The loop reads `window`; the cell originally defined only `windows`, which
# raises NameError on a fresh kernel. The old name is kept as an alias.
window = 1000000
windows = window

for gene in candidates:
    genegtfpath = os.path.join(outdir, gene+".gtf.gz")
    # Not named `gene_info`, so the notebook-level `gene_info` used by the
    # gene-list cell is not overwritten by a single-gene list.
    candidate_info = readgtf.gencode_v12(genegtfpath, include_chrom = chrom)
    print(candidate_info)
    if len(candidate_info) == 0:
        raise ValueError('No annotation record found in {:s}'.format(genegtfpath))

    # Window bounds are constant for a gene; compute them once, not per line.
    lower = int(candidate_info[0].start - window)
    upper = int(candidate_info[0].end + window)

    with gzip.open(os.path.join(outdir, gene+".genotype.gtex.gz"), 'wb') as outstream, \
         gzip.open(gtfile, 'r') as instream:
        for line in instream:
            # The position is the third whitespace-separated field; split only
            # the leading fields instead of the whole line.
            pos = int(line.split(None, 3)[2])
            if pos < lower:
                continue
            if pos > upper:
                # Stops at the first position past the window
                # (presumably the file is sorted by position — verify).
                break
            outstream.write(line)



[GeneInfo(name='LYZ', ensembl_id='ENSG00000090382.2', chrom=12, start=69742120, end=69748014)]
[GeneInfo(name='YEATS4', ensembl_id='ENSG00000127337.2', chrom=12, start=69753482, end=69784576)]
[GeneInfo(name='CELA1', ensembl_id='ENSG00000139610.1', chrom=12, start=51722227, end=51740462)]
[GeneInfo(name='C3AR1', ensembl_id='ENSG00000171860.4', chrom=12, start=8210898, end=8219066)]
[GeneInfo(name='CLEC12B', ensembl_id='ENSG00000256660.1', chrom=12, start=10163225, end=10171218)]
[GeneInfo(name='CLEC6A', ensembl_id='ENSG00000205846.3', chrom=12, start=8608521, end=8630926)]


In [53]:
# Extract gene expressions for GTEx
# For every candidate gene present in the expression file, write
# <outdir>/<gene>.expression.gtex.txt containing the header line and the gene's row.

exprfile = "gtex_wholeblood_normalized.lm_corr.exp.klinikum.txt"

# One pass over the expression file with set lookup, instead of re-reading
# the whole file once per candidate.
wanted = set(candidates)
found = set()
with open(exprfile, 'r') as instream:
    headers = instream.readline()
    for line in instream:
        # Only the first whitespace-separated field (the gene ID) is needed.
        fields = line.split(None, 1)
        if not fields or fields[0] not in wanted:
            continue  # blank line or a gene that was not requested
        gene = fields[0]
        with open(os.path.join(outdir, gene+".expression.gtex.txt"), 'w') as outstream:
            outstream.write(headers)
            outstream.write(line)
        found.add(gene)

# Report candidates without an expression row instead of silently skipping them.
missing = sorted(wanted - found)
if missing:
    print("No expression row found for: " + ", ".join(missing))