In [2]:
from eda_import import *

import pysam

In [3]:
from extract_targets import extract_info

In [5]:
# Path to the Ensembl GTF annotation file (GRCh37, release 75 per the filename).
gtf = './reference_data/Homo_sapiens.GRCh37.75.gtf'

In [6]:
# Column names for the nine tab-separated GTF fields, as described at
# http://uswest.ensembl.org/info/website/upload/gff.html
names = ['seqname', 'source', 'feature', 'start', 'end', 'score',
         'strand', 'frame', 'attribute']
# df: annotation dataframe; the file has no header row and text after '#'
# is treated as a comment by the parser.
print('reading {0}...'.format(gtf))
df = pd.read_csv(gtf, header=None, sep='\t', comment='#', low_memory=False, names=names)

reading ./reference_data/Homo_sapiens.GRCh37.75.gtf...


In [7]:
# NOTE(review): the return value is discarded and later outputs show len,
# transcript_id, gene_id and gene_name columns on df, so extract_info
# presumably modifies df in place -- confirm in extract_targets.
extract_info(df)

extracting length...
extracting transcript id...
extracting gene id...
extracting gene name...


### Do the target genes overlap at all?

In [8]:
# Gene symbols of interest, taken from the "Gene" column of the targets table.
TARGET_GENES = (
    pd.read_csv('./reference_data/target_genes_with_type.tsv', sep='\t')['Gene']
    .tolist()
)

In [9]:
# Show the whole gene list on one line for a quick visual check.
' '.join(TARGET_GENES)

'ABL1 AKT1 AKT2 ALK AMER1 APC ATM AXIN1 BAP1 BCL6 BRAF BRCA1 BRCA2 CARD11 CCNE1 CDH1 CDKN2A CDKN2C CEBPA CREBBP CTNNB1 CYLD DICER1 EGFR EP300 ERBB2 EZH2 FAS FBXW7 FGFR2 FGFR3 FLT3 GATA3 GNA11 GNAQ GNAS HNF1A HRAS IDH1 JUN KDM6A KDR KIT KRAS MAP2K1 MAP2K2 MAP2K4 MAX MDM2 MDM4 MEN1 MET MITF MLH1 KMT2A MSH2 MSH6 MYC MYCL MYCN MYD88 NF1 NF2 NFE2L2 NKX2-1 NOTCH1 NOTCH2 NRAS PAX5 PDGFRA PIK3CA PIK3R1 PRKAR1A PTCH1 PTEN RB1 REL RET RNF43 SETD2 SMAD4 SMARCA4 SMARCB1 SMO SOCS1 SOX2 STAT3 STK11 SUFU TERT TET2 TNFAIP3 TP53 TRAF7 TSC1 TSC2 TSHR VHL WT1 CCND1 CCND2 FGF2 IGF2BP1 RAB10 CD47 CHURC1 DRAM1 NFE2L3 NFYA NSUN5 RTCA SEC24A STK17A TXNL4A'

In [10]:
% time ndf = df.query('gene_name in {0}'.format(TARGET_GENES))
# remove those from unassembled region
ndf = ndf[-ndf.seqname.str.contains('PATCH')]

In [11]:
# Peek at the first two rows of the filtered annotation.
ndf.head(2)

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,len,transcript_id,gene_id,gene_name
63846,1,protein_coding,gene,40361098,40367928,.,-,.,6831,,ENSG00000116990,MYCL
63847,1,protein_coding,transcript,40361098,40367685,.,-,.,6588,ENST00000397332,ENSG00000116990,MYCL


In [13]:
# Sanity check: 114 distinct gene ids and 114 distinct gene names remain
# after filtering.
assert len(ndf.gene_id.unique()) == 114
assert len(ndf.gene_name.unique()) == 114

In [14]:
def check_intersection(grp):
    """Report whether any two genes in ``grp`` overlap.

    Every gene is collapsed to one closed interval
    ``[min(start), max(end)]`` over all of its rows, and each pair of genes
    is compared. Both ends are treated as inclusive, so two genes that share
    only a single boundary base count as overlapping.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows with at least the columns ``gene_name``, ``start`` and ``end``.

    Returns
    -------
    bool
        True if at least one pair of genes overlaps, False otherwise
        (including when ``grp`` holds fewer than two genes). For each
        overlapping pair, both gene names, their spans and the shared
        ``(first, last)`` interval are printed.
    """
    spans = grp.groupby('gene_name').agg({'start': 'min', 'end': 'max'})
    genes = list(zip(spans.index, spans['start'], spans['end']))

    has_intersection = False
    for (ki, si, ei), (kj, sj, ej) in itertools.combinations(genes, 2):
        # Two closed intervals overlap iff the later start does not lie
        # beyond the earlier end. This avoids materialising every base
        # position of each gene in a set.
        lo, hi = max(si, sj), min(ei, ej)
        if lo <= hi:
            print(ki, si, ei, kj, sj, ej, (int(lo), int(hi)))
            has_intersection = True
    return has_intersection

In [15]:
# Run the overlap check separately for each seqname; any() is True as soon
# as one seqname reports overlapping genes.
ndf.groupby('seqname').apply(check_intersection).any()

False

The check returns `False`, which means there is no overlap/intersection among the target genes, so the calculation of expression level is straightforward.

### Total length of target genes

In [16]:
# Number of distinct gene names left after filtering.
ndf.gene_name.unique().shape

(114,)

In [17]:
# Per-gene span: smallest start and largest end over all of the gene's rows,
# plus the span length. The +1 counts both end coordinates as part of the span.
# Built as a single chain (no in-place rename) so re-running the cell is safe.
target_gene_coords = (
    ndf.groupby('gene_name')
    .agg({'start': 'min', 'end': 'max'})
    .reindex(columns=['start', 'end'])
    .assign(len=lambda coords: coords['end'] - coords['start'] + 1)
)

In [18]:
# Regression check on the summed span length of all target genes.
assert target_gene_coords['len'].sum() == 9351022

Corroborated with a calculation at the gene level:

In [21]:
# Same total computed from the "gene" records only, using vectorised column
# arithmetic instead of a row-by-row apply.
(
    ndf.query('feature == "gene"')
    .pipe(lambda genes: genes['end'] - genes['start'] + 1)
    .sum()
)

9351022