# recurrent-amp-significance-threshold
Previously we used an arbitrary threshold of n >= 3 to consider a recurrently amplified locus "interesting".* Here we define a statistical threshold for enriched loci as the 99.9% confidence bound of a null distribution: the coverage at or above which a region would be unlikely to be recurrently amplified by chance, assuming ecDNA are uniformly distributed across the mappable genome.

The permutation test is defined as follows:
- shuffle the ecDNA regions across the mappable genome:  
  ```
  cat *.bed | \
  bedtools shuffle -excl [blacklist.bed] -i [all_ecDNAs.bed] -g [hg38.genome] | \
  ```
- get genome-wide coverage of shuffled regions:
  ```
  bedtools sort | \
  bedtools genomecov -bga -i - -g [hg38.genome] | \
  bedtools subtract -a - -b $blacklist  > \
  tmp/${i}.bdg
  ```

## Requirements
- Run bed-pileup.ipynb to get the actual (observed) distribution of ecDNA genome coverage
- Run run_bed_pileup_permutation_test.sh to get the null distribution

## Conclusion
n >= 3 corresponds to the 99.9% confidence threshold of a null distribution of independent, randomly distributed ecDNA.

\* Twice is coincidence, thrice is a pattern.

In [None]:
import pyranges as pr
import pandas as pd
from pathlib import Path
import warnings
import numpy as np
from scipy.stats import poisson
import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.append('../src')
from data_imports import *

# Notebook display settings: show up to 100 rows and do not limit the number
# of columns when pandas renders a DataFrame.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

## Utility functions

In [None]:
def load_null_coverage(path='tmp/ecDNA'):
    '''
    Read every "*.bdg" file found directly inside `path` (no recursion) with
    pr.read_bed and return them concatenated into a single pyranges object.

    path: directory holding the bedgraph files.
    '''
    bdg_files = Path(path).glob("*.bdg")
    return pr.concat([pr.read_bed(str(bdg_file)) for bdg_file in bdg_files])

def load_observed_coverage(path='bedgraph/ecDNA_all.bdg', null_path='tmp/ecDNA/1.bdg'):
    '''
    Load the observed coverage bedgraph and pad it with zero-coverage bases.

    The total interval length of the file at `path` is compared with that of a
    reference bedgraph at `null_path`. The difference is appended as a single
    placeholder interval on the made-up chromosome "chrS" with Name 0, so that
    downstream histograms contain a coverage-0 bin.
    (Presumably the observed bedgraph omits zero-coverage regions while the
    reference spans the whole mappable genome -- verify against the scripts
    that produce these files.)

    path: observed bedgraph, read with pr.read_bed.
    null_path: reference bedgraph whose total length defines the full extent.
        Defaults to the path that used to be hard-coded here.

    Returns the observed pyranges, with the placeholder interval appended when
    the reference is longer than the observations. If it is not longer, the
    observations are returned without padding (with a warning when the
    observations are longer than the reference).
    '''
    observations = pr.read_bed(path)
    reference = pr.read_bed(null_path)
    missing_bases = reference.length - observations.length
    if missing_bases <= 0:
        # Nothing to pad. A negative gap would otherwise create an interval
        # with End < Start and corrupt the coverage-0 total.
        if missing_bases < 0:
            warnings.warn(
                f"{path} covers more bases than the reference {null_path}; "
                "no zero-coverage padding was added",
                stacklevel=2,
            )
        return observations
    dummy_region = pr.PyRanges(pd.DataFrame({
        "Chromosome":["chrS"],
        "Start":[0],
        "End":[missing_bases],
        "Name":[0]
    }))
    return pr.concat([observations,dummy_region])

In [None]:
def hist_transform(bedgraph):
    '''
    Summarise a bedgraph as a dict {coverage: total length at that coverage}.

    Coverage values are read from the `Name` column; each total is the
    `length` attribute of the subset of rows carrying that coverage.

    Eg. Input (taking an interval's length as End - Start)
        chr1 	0 	10 	0
        chr1 	10 	25 	1
        chr1 	25 	30 	0
    Output:
        0: 15
        1: 15
    '''
    return {
        level: bedgraph[bedgraph.Name == level].length
        for level in bedgraph.Name.unique()
    }
    
def fit_lambda(hist_transform):
    '''
    Estimate the Poisson rate of a coverage histogram as its mean coverage.

    hist_transform: mapping of coverage value -> count (e.g. number of bases).
        Note that this parameter shadows the hist_transform() function inside
        this body.

    For Poisson-distributed data the parameter lambda equals the expected
    value, so the estimate is sum(coverage * count) / sum(count).
    '''
    zero = np.int64()
    weighted_total = sum((cov * count for cov, count in hist_transform.items()), zero)
    count_total = sum(hist_transform.values(), zero)
    return weighted_total / count_total

In [None]:
# plot
def format_df_helper(pyrange,name):
    '''
    Build a per-coverage table for one pyranges object.

    The result is indexed by coverage (index name 'coverage') and has two
    columns: 'freq_<name>', the values returned by hist_transform, and
    'frac_<name>', each of those values divided by the column total.
    '''
    table = pd.DataFrame.from_dict(hist_transform(pyrange), orient='index')
    table.index.name = 'coverage'
    freq_col = 'freq_' + name
    table.columns = [freq_col]
    frac_col = 'frac_' + name
    table[frac_col] = table[freq_col] / table[freq_col].sum()
    return table
    
def format_dataframe(observed,null):
    '''
    Combine the coverage histograms of `observed` and `null` into one table.

    Returns a pd.DataFrame indexed by coverage in ascending order, with the
    columns freq_h1, frac_h1 (observed) followed by freq_h0, frac_h0 (null).
    A coverage value present in only one input gets 0 in the other's columns.
    '''
    null_table = format_df_helper(null, 'h0')
    observed_table = format_df_helper(observed, 'h1')
    combined = pd.merge(
        observed_table,
        null_table,
        how='outer',
        left_index=True,
        right_index=True,
    )
    combined = combined.fillna(0)
    return combined.sort_index()

def plot_poisson_null(observed,null,n=10):
    '''
    Bar-plot observed vs. permuted coverage fractions with a fitted Poisson PMF.

    observed, null: pyranges objects passed to format_dataframe.
    n: number of lowest coverage values to draw; n <= 0 draws all of them.

    Returns the matplotlib Axes holding the plot (y axis on a log scale).
    '''
    df = format_dataframe(observed,null)
    # Fit lambda on the complete null histogram *before* truncating for
    # display, so the rate shown in the legend does not depend on how many
    # bars are drawn.
    _lambda_ = np.average(df.index, weights=df.freq_h0)
    if n > 0:
        df = df.head(n)
    poisson_pmf = poisson.pmf(df.index, _lambda_)
    bar_width=0.35
    x_positions = np.arange(len(df.index))
    fig, ax = plt.subplots()
    ax.bar(x_positions-bar_width/2,df.frac_h0,width=bar_width,color='skyblue',label='permutations (H0)')
    ax.bar(x_positions+bar_width/2,df.frac_h1,width=bar_width,color='orange',label='observed data (H1)')
    ax.plot(x_positions, poisson_pmf, 'r-', marker='o', label=f'Poisson PMF (λ = {_lambda_:.2f})')
    # Bars sit at positions 0..len-1; label each with the coverage value it
    # represents so the axis stays correct if the coverage index has gaps.
    ax.set_xticks(x_positions)
    ax.set_xticklabels(df.index)
    # Set the scale on this axes rather than on pyplot's "current" axes.
    ax.set_yscale('log')
    ax.legend()
    return ax
    

In [None]:
def savefig(basename):
    '''
    Save the current pyplot figure twice: to basename + ".png" as PNG and to
    basename + ".svg" as SVG.
    '''
    for suffix, fmt in ((".png", 'png'), (".svg", 'svg')):
        plt.savefig(basename + suffix, format=fmt)

## Null distribution of shuffled ecDNA sequences
_lambda_ = 0.29  
95% confidence threshold = 1  
99.9% confidence threshold = 3

In [None]:
# Null: concatenated coverage of every *.bdg in the default directory tmp/ecDNA.
permutations = load_null_coverage()
# Observed: bedgraph/ecDNA_all.bdg (the default), padded with a zero-coverage
# placeholder interval by load_observed_coverage.
observations = load_observed_coverage()
# Preview the first rows in the notebook output.
observations.head()

In [None]:
# Get confidence thresholds
# The Poisson rate is the mean coverage of the permutation (null) histogram.
_lambda_ = fit_lambda(hist_transform(permutations))
print(_lambda_)
# poisson.ppf(q, mu) is the smallest coverage k with P(X <= k) >= q under
# Poisson(mu).
# NOTE(review): k itself is still inside the q-quantile; confirm whether the
# reported cutoff is meant to be "n >= k" or "n > k".
confidence_95 = poisson.ppf(0.95, _lambda_)
print(confidence_95)
confidence_999 = poisson.ppf(0.999, _lambda_)
print(confidence_999)

In [None]:
# Plot observed vs. permuted coverage fractions (default n=10 coverage values)
# together with the Poisson PMF, on a log-scaled y axis.
plot = plot_poisson_null(observations,permutations)
# Writes out/recurrent_ecDNA_null_logscale.png and .svg
savefig('out/recurrent_ecDNA_null_logscale')

## Null of shuffled intrachromosomal sequences
_lambda_ = 0.24  
95% confidence threshold = 1  
99.9% confidence threshold = 3

In [None]:
# Null: concatenated coverage of every *.bdg in tmp/intrachromosomal.
permutations = load_null_coverage(path='tmp/intrachromosomal')
# Observed: bedgraph/intrachromosomal_all.bdg, padded with a zero-coverage
# placeholder interval.
# NOTE(review): by default load_observed_coverage sizes that padding from
# tmp/ecDNA/1.bdg, not from a file in tmp/intrachromosomal -- confirm both
# permutation sets span the same total length.
observations = load_observed_coverage(path='bedgraph/intrachromosomal_all.bdg')
# Preview the first rows in the notebook output.
observations.head()

In [None]:
# Get confidence thresholds
# The Poisson rate is the mean coverage of the permutation (null) histogram.
_lambda_ = fit_lambda(hist_transform(permutations))
print(_lambda_)
# poisson.ppf(q, mu) is the smallest coverage k with P(X <= k) >= q under
# Poisson(mu).
# NOTE(review): k itself is still inside the q-quantile; confirm whether the
# reported cutoff is meant to be "n >= k" or "n > k".
confidence_95 = poisson.ppf(0.95, _lambda_)
print(confidence_95)
confidence_999 = poisson.ppf(0.999, _lambda_)
print(confidence_999)

In [None]:
# Plot observed vs. permuted coverage fractions (default n=10 coverage values)
# together with the Poisson PMF, on a log-scaled y axis.
plot = plot_poisson_null(observations,permutations)
# Writes out/recurrent_intrachromosomal_null_logscale.png and .svg
savefig('out/recurrent_intrachromosomal_null_logscale')