# Recurrent amps
Find interesting genomic ranges:
- how many regions are amplified in at least *n* tumors? (get_recurrent_amps)
- how many with 2 or more oncogenes? (get_oncogene_clusters)
- what oncogenes are on recurrent amps? (get_recurrently_amp_oncogenes)
- how many without an oncogene? (get_oncogene_deserts)
- how many with genes but no oncogenes? (get_amps_w_genes_no_oncogenes)
- what uninterrupted gene sequences are present on the above? (find_whole_genes_in_oncogene_deserts)

Requires the .bdg output from bed-pileup.ipynb.  
Requires pyranges. See `./pyranges.yml`.

In [None]:
import pyranges as pr
import pandas as pd
from pathlib import Path
import warnings

import sys
sys.path.append('../src')
from data_imports import *

# Notebook display settings: render up to 100 rows and do not cap the number of columns.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

In [None]:
# These operations take a while, so we run them once and keep the results in global variables
def get_gencode(path='../data/local/gencode.v47.basic.annotation.gff3'):
    """Load a GFF3 annotation and keep only its gene-level records.

    Args:
        path: Location of the GFF3 file passed to ``pr.read_gff3``.

    Returns:
        The rows of the parsed annotation whose ``Feature`` column equals
        ``'gene'``.
    """
    annotation = pr.read_gff3(path)
    is_gene = annotation.Feature == 'gene'
    return annotation[is_gene]
def get_oncogene_locations(gencode=None):
    """Return the annotation records of the canonical oncogenes.

    Args:
        gencode: Gene annotation to search. When ``None`` it is loaded with
            ``get_gencode()``.

    Returns:
        The subset of ``gencode`` whose ``gene_name`` appears among the genes
        that ``import_genes()`` flags via ``is_canonical_oncogene``.
    """
    annotation = get_gencode() if gencode is None else gencode
    gene_table = import_genes()
    canonical = gene_table[gene_table.is_canonical_oncogene]
    oncogene_names = canonical.gene.values
    return annotation[annotation.gene_name.isin(oncogene_names)]
    
# Build the gene annotation and its oncogene subset once; later cells pass these
# globals to the helper functions instead of reloading the annotation.
GENCODE47 = get_gencode()
ONCOGENE_LOCATIONS = get_oncogene_locations(GENCODE47)

In [None]:
# Functions to find interesting intervals
def get_recurrent_amps(path="bedgraph/ecDNA_all.bdg", cn=3, slack=10000):
    """Return recurrently amplified regions, merging nearby intervals.

    Args:
        path: Pileup file read with ``pr.read_bed``. Its ``Name`` column is
            compared against ``cn``.
            NOTE(review): presumably the number of tumors covering each
            interval, as produced by bed-pileup.ipynb - confirm.
        cn: Minimum ``Name`` value (inclusive) an interval must have to be
            kept. Defaults to 3.
        slack: Passed to ``merge``; the default of 10000 joins neighboring
            intervals within 10kb of each other.

    Returns:
        The merged intervals that passed the ``cn`` threshold.
    """
    pileup = pr.read_bed(path)
    recurrent = pileup[pileup.Name >= cn]
    return recurrent.merge(slack=slack)
def get_oncogene_clusters(oncogene_locations=None):
    """Return recurrent amplifications that overlap two or more oncogenes.

    Args:
        oncogene_locations: Oncogene intervals to count. When ``None`` they
            are loaded with ``get_oncogene_locations()``.

    Returns:
        The intervals from ``get_recurrent_amps()`` (default arguments) whose
        ``count`` column, as filled in by ``pr.count_overlaps``, exceeds 1.
    """
    oncogenes = (
        get_oncogene_locations() if oncogene_locations is None else oncogene_locations
    )
    amps = get_recurrent_amps()
    # Warnings raised while counting are deliberately silenced.
    with warnings.catch_warnings(action="ignore"):
        counted = pr.count_overlaps({"count": oncogenes}, amps)
    return counted[counted.count > 1]
def get_oncogene_deserts(oncogene_locations=None):
    """Return the 'deserts': recurrent amplifications overlapping no oncogene.

    Args:
        oncogene_locations: Oncogene intervals to count. When ``None`` they
            are loaded with ``get_oncogene_locations()``.

    Returns:
        The intervals from ``get_recurrent_amps()`` (default arguments) whose
        ``count`` column, as filled in by ``pr.count_overlaps``, is below 1.
    """
    oncogenes = (
        get_oncogene_locations() if oncogene_locations is None else oncogene_locations
    )
    amps = get_recurrent_amps()
    # Warnings raised while counting are deliberately silenced.
    with warnings.catch_warnings(action="ignore"):
        counted = pr.count_overlaps({"count": oncogenes}, amps)
    return counted[counted.count < 1]
def get_amps_w_genes_no_oncogenes(oncogene_locations=None,gencode=None):
    """Return oncogene deserts selected against the gene annotation.

    Intended to find recurrent amplifications that carry genes but no known
    oncogene.

    Args:
        oncogene_locations: Oncogene intervals used to define the deserts.
            When ``None`` they are loaded with ``get_oncogene_locations()``.
        gencode: Gene annotation. When ``None`` it is loaded with
            ``get_gencode()``.

    Returns:
        The result of ``deserts.overlap(gencode, how='containment')``.
    """
    oncogenes = (
        get_oncogene_locations() if oncogene_locations is None else oncogene_locations
    )
    annotation = get_gencode() if gencode is None else gencode
    oncogene_free = get_oncogene_deserts(oncogenes)
    # NOTE(review): with how='containment' pyranges appears to keep only the
    # caller's intervals that lie entirely inside an interval of the argument,
    # i.e. deserts located within a gene rather than deserts that contain
    # genes - confirm this is the intended direction.
    return oncogene_free.overlap(annotation, how='containment')
def find_whole_genes_in_oncogene_deserts(oncogene_locations=None,gencode=None):
    """Return the annotated genes amplified within the oncogene deserts.

    Args:
        oncogene_locations: Oncogene intervals used to define the deserts.
            When ``None`` they are loaded with ``get_oncogene_locations()``.
        gencode: Gene annotation to select from. When ``None`` it is loaded
            with ``get_gencode()``.

    Returns:
        The result of ``gencode.overlap(deserts, how='containment')``;
        presumably the genes lying wholly inside a desert (verify against the
        pyranges documentation).
    """
    oncogenes = (
        get_oncogene_locations() if oncogene_locations is None else oncogene_locations
    )
    annotation = get_gencode() if gencode is None else gencode
    oncogene_free = get_oncogene_deserts(oncogenes)
    return annotation.overlap(oncogene_free, how='containment')
def get_recurrently_amp_oncogenes(oncogene_locations=None):
    """Return the oncogenes found on recurrent amplifications.

    Args:
        oncogene_locations: Oncogene intervals to select from. When ``None``
            (now the default, matching the sibling helpers) they are loaded
            with ``get_oncogene_locations()``.

    Returns:
        The result of ``oncogene_locations.overlap(recurrent_amps,
        how='containment')``, where ``recurrent_amps`` comes from
        ``get_recurrent_amps()`` with its default arguments.
    """
    if oncogene_locations is None:
        oncogene_locations = get_oncogene_locations()
    recurrent_amps = get_recurrent_amps()
    return oncogene_locations.overlap(recurrent_amps, how='containment')

In [None]:
# Show the recurrent amplifications computed with the default path, cn and slack.
get_recurrent_amps()

In [None]:
# Recurrent amplifications that overlap more than one canonical oncogene.
get_oncogene_clusters(ONCOGENE_LOCATIONS)

In [None]:
# Recurrent amplifications that overlap no canonical oncogene.
deserts = get_oncogene_deserts(ONCOGENE_LOCATIONS)
deserts.summary()
# Preview the first rows of the deserts.
deserts.head()

In [None]:
# Oncogene deserts selected against the gene annotation, followed by their summary.
putative_oncoregions = get_amps_w_genes_no_oncogenes(ONCOGENE_LOCATIONS,GENCODE47)
putative_oncoregions.summary()

In [None]:
# Genes selected from the annotation against the oncogene deserts,
# then the number of gene_name entries per gene_type.
putative_oncogenes = find_whole_genes_in_oncogene_deserts(ONCOGENE_LOCATIONS,GENCODE47)
putative_oncogenes.df[['gene_name','gene_type']].groupby('gene_type').count()


In [None]:
# Keep only the coordinate and identifier columns for export.
putative_oncogenes = putative_oncogenes[['Chromosome','Start','End','Strand','gene_id','gene_type',
                                         'gene_name','hgnc_id']]
# Make sure the output directory exists: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
Path('out').mkdir(parents=True, exist_ok=True)
putative_oncogenes.df.to_csv('out/putative_oncogenes.tsv',sep='\t',index=False)
# Display only the protein-coding candidates.
putative_oncogenes[putative_oncogenes.gene_type == 'protein_coding']

In [None]:
# (rows, columns) of the oncogenes selected against the recurrent amplifications.
get_recurrently_amp_oncogenes(ONCOGENE_LOCATIONS).df.shape

In [None]:
# (rows, columns) of all canonical oncogene records loaded from the annotation.
ONCOGENE_LOCATIONS.df.shape