# Gene copy number by amplification type
Produces a boxplot comparing gene copy number (CN) between ecDNA and chromosomal amplifications for the following genes:  
MYCN, PDGFRA, MYC, MET, CCND2, SREBF1, EGFR, CCNE1  
See **Fig. 3c**.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager
import seaborn as sns
import scipy.stats as stats
import statistics
from statsmodels.stats.multitest import fdrcorrection
from pathlib import Path
import sys
# Extend the module search path before the star import below.
# NOTE(review): presumably data_imports lives in ../src - confirm.
sys.path.append('../src')
# Make sure the output directory used by the savefig() calls below exists.
Path("out").mkdir(parents=True, exist_ok=True)
# NOTE(review): import_genes() / import_biosamples() are presumably provided by
# this star import - confirm against data_imports.
from data_imports import *
# Show every row and column when a dataframe is displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Load the tables used by every cell below.
# genes: the functions below read the columns sample_name, gene, gene_cn,
#   feature, is_canonical_oncogene and amplicon_number from it.
# biosamples: indexed by sample name, with a boolean in_unique_tumor_set column.
genes = import_genes()
biosamples = import_biosamples()

# Functions

In [None]:
def savefig(basename):
    """Write the current matplotlib figure to disk as PDF, PNG and SVG.

    basename: output path without extension (string); ".pdf", ".png" and
        ".svg" are appended in that order.
    """
    for fmt in ('pdf', 'png', 'svg'):
        plt.savefig(basename + "." + fmt, format=fmt)

def get_interesting_genes(genes,biosamples):
    """Return the canonical oncogenes amplified often enough in both classes.

    genes: dataframe with the columns sample_name, gene, gene_cn,
        is_canonical_oncogene and classification.
    biosamples: dataframe indexed by sample name with a boolean
        in_unique_tumor_set column.

    Only rows from the unique tumor set, with a known copy number, flagged as
    canonical oncogenes and classified as 'ecDNA' or 'chromosomal' are counted.

    Returns a pandas Index of gene names that have at least 3 rows in the
    'ecDNA' class AND at least 3 rows in the 'chromosomal' class.
    """
    amp_classes = ['ecDNA','chromosomal']
    unique_tumors = set(biosamples[biosamples.in_unique_tumor_set].index)
    genes = genes[genes.sample_name.isin(unique_tumors)]
    genes = genes[genes.gene_cn.notna()]
    genes = genes[genes.is_canonical_oncogene]
    genes = genes[genes.classification.isin(amp_classes)]

    # Rows = gene, columns = classification, values = number of occurrences.
    counts = genes.groupby(['gene', 'classification']).size().unstack(fill_value=0)
    # Guarantee a column for each class. Without this, a class that is absent
    # from the filtered data yields no column at all, and genes with zero
    # amplifications of that class would wrongly pass the ">= 3 in every
    # class" test below.
    counts = counts.reindex(columns=amp_classes, fill_value=0)

    return counts[counts.ge(3).all(axis=1)].index

def map_amplicon_type(feature):
    """Collapse an amplicon feature label into its amplification class.

    feature: string such as 'BFB_1'; only the text before the first
        underscore is considered.

    Returns 'ecDNA', 'chromosomal' (for BFB, Linear and Complex-non-cyclic)
    or 'unclassified' (for unknown). Any other prefix is returned unchanged.
    """
    prefix, _, _ = feature.partition('_')
    class_by_prefix = {
        'ecDNA': 'ecDNA',
        'BFB': 'chromosomal',
        'Linear': 'chromosomal',
        'Complex-non-cyclic': 'chromosomal',
        'unknown': 'unclassified',
    }
    return class_by_prefix.get(prefix, prefix)

def run_statistics(df):
    """Print per-gene tests of ecDNA vs chromosomal copy number.

    df: dataframe with the columns gene, classification and gene_cn.

    For each gene a one-sided Mann-Whitney U test (alternative='greater',
    ecDNA as the first sample) is run. Genes with fewer than two values in
    either class get a p-value of 1.0 instead. All p-values are then passed
    through fdrcorrection() and both the raw and the corrected value are
    printed for every gene. Returns None.
    """
    def copy_numbers(rows, label):
        # gene_cn values of one classification, as a float array
        return rows.loc[rows['classification'] == label, 'gene_cn'].values.astype(float)

    gene_names = []
    raw_p = []
    for name, rows in df.groupby('gene'):
        ecdna_cn = copy_numbers(rows, 'ecDNA')
        chromosomal_cn = copy_numbers(rows, 'chromosomal')
        if len(ecdna_cn) <= 1 or len(chromosomal_cn) <= 1:
            # Not enough data for the test: record a non-significant p-value
            p = 1.0
        else:
            _, p = stats.mannwhitneyu(ecdna_cn, chromosomal_cn, alternative='greater')
        gene_names.append(name)
        raw_p.append(p)

    # Multiple-testing correction across genes
    _, adjusted_p = fdrcorrection(raw_p)

    for name, p_uncor, p_corr in zip(gene_names, raw_p, adjusted_p):
        print(f"Gene: {name}, p-value: {p_uncor}, Corrected p-value: {p_corr}")

def italicize(text):
    """Wrap text in a matplotlib mathtext italic command: '$\\it{' + text + '}$'."""
    return ''.join(('$\\it{', text, '}$'))

def add_n(ax,genes):
    """Print and return the number of rows per (gene, classification) pair.

    ax: unused for now; kept so the counts can later be drawn on the axes.
    genes: dataframe with the columns gene and classification.

    Returns the pandas Series of counts.
    """
    # TODO: plot these counts instead of printing
    counts = genes.value_counts(subset=['gene', 'classification'])
    # too much pain to add n to this plot programmatically so we'll do it post hoc in illustrator
    print(counts)
    return counts

def plot_oncogene_cn_by_amplicon_class(oncogenes,biosamples=None,genes=None):
    '''
    Box + strip plot of gene copy number per oncogene, split by amplification class.

    oncogenes: list of strings. eg. ['MYC'].
    biosamples: dataframe indexed by sample name with a boolean
        in_unique_tumor_set column. Loaded with import_biosamples() if None.
    genes: dataframe with the columns:
        sample_name, gene, gene_cn, classification
        If None, it is loaded with import_genes() and classification is
        derived from the feature column via map_amplicon_type.

    Side effects: prints the per-gene statistics (run_statistics) and the
    group sizes (add_n), and resets the seaborn theme.
    Returns the matplotlib Axes holding the plot.
    '''
    # Fixed class order so that ecDNA is always red and chromosomal always
    # magenta, independent of row order in the data.
    amp_classes = ['ecDNA','chromosomal']
    if biosamples is None:
        biosamples = import_biosamples()
    if genes is None:
        genes = import_genes()
        genes['classification'] = genes.feature.map(map_amplicon_type)

    # get data subset of interest
    unique_tumors = set(biosamples[biosamples.in_unique_tumor_set].index)
    keep = (
        genes.sample_name.isin(unique_tumors)
        & genes.gene_cn.notna()
        & genes.gene.isin(oncogenes)
        & genes.classification.isin(amp_classes)
    )
    # Copy AFTER filtering: the column assignment below must not write into a
    # slice of the caller's dataframe (avoids SettingWithCopyWarning).
    genes = genes[keep].copy()

    # Stats
    run_statistics(genes)

    # Format gene names
    genes['gene'] = genes.gene.map(italicize)

    # Plot
    linewidth=0.75
    sns.set(rc={'svg.fonttype':'none',
                'pdf.use14corefonts':True,
                'figure.figsize':(7,2),
                'font.size': 7,'axes.labelsize': 7,'axes.titlesize': 7,'xtick.labelsize': 7,
                'ytick.labelsize': 7,'legend.fontsize': 7,
                'font.family':'sans-serif',
                'font.sans-serif':'Arial',
                'axes.linewidth':linewidth,
                'ytick.major.width':linewidth,
                })
    sns.set_style("white")
    palette = ['red','magenta']
    # fliersize=0 hides the boxplot outlier markers; every point is drawn by the stripplot instead
    ax = sns.boxplot(data=genes,x='gene',y='gene_cn',hue='classification',hue_order=amp_classes,
                     fliersize=0,linewidth=linewidth,palette=palette)
    sns.stripplot(data=genes,x='gene',y='gene_cn',hue='classification',hue_order=amp_classes,
                  dodge=True,ax=ax,linewidth=linewidth,legend=False,palette=palette)
    sns.despine()
    ax.set_yscale("log")
    ax.set_xlabel("")
    ax.set_ylabel("Estimated copy number")
    ax.tick_params(axis='y', left=True)
    ax.set_ylim(bottom=4,top=1000)
    # Rebuild the legend from the handles only, which drops the legend title
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles=handles, labels=labels)
    add_n(ax,genes)
    return ax

# Analyses

In [None]:
# Derive the amplification class ('ecDNA', 'chromosomal', ...) of every row from its feature label.
genes['classification'] = genes.feature.map(map_amplicon_type)
# Candidate oncogenes with at least 3 amplifications in each class; printed for manual review.
oncogenes=get_interesting_genes(genes,biosamples)
print(oncogenes)
# The computed list is then replaced by a hand-picked list.
oncogenes=['MET','CCND2','CCNE1','PDGFRA','SREBF1','EGFR','MYC','MYCN'] # deduplicated: multiple oncogenes sit on the same amplicons

In [None]:
# Per-oncogene box/strip plot; written to out/oncogene_cn.{pdf,png,svg}.
ax = plot_oncogene_cn_by_amplicon_class(oncogenes,biosamples,genes)
savefig('out/oncogene_cn')

In [None]:
def plot_all_cn_by_amplicon_class(biosamples=None,genes=None):
    '''
    Violin plot of the maximum gene copy number per amplicon, ecDNA vs chromosomal.

    biosamples: dataframe indexed by sample name with a boolean
        in_unique_tumor_set column. Loaded with import_biosamples() if None.
    genes: dataframe with the columns:
        sample_name, amplicon_number, gene_cn, classification
        If None, it is loaded with import_genes() and classification is
        derived from the feature column via map_amplicon_type.

    Side effects: prints the class medians with a one-sided Mann-Whitney U
    p-value and the group sizes (add_n), and resets the seaborn theme.
    Returns the matplotlib Axes holding the plot.
    '''
    amp_classes = ['ecDNA','chromosomal']
    if biosamples is None:
        biosamples = import_biosamples()
    if genes is None:
        genes = import_genes()
        genes['classification'] = genes.feature.map(map_amplicon_type)

    # get data subset of interest
    unique_tumors = set(biosamples[biosamples.in_unique_tumor_set].index)
    keep = (
        genes.sample_name.isin(unique_tumors)
        & genes.gene_cn.notna()
        & genes.classification.isin(amp_classes)
    )
    # One value per amplicon: the highest gene copy number found on it.
    # The groupby result is a new dataframe, so the caller's data is never modified.
    genes = genes[keep].groupby(['sample_name','amplicon_number','classification'],as_index=False)['gene_cn'].max()

    # Stats
    class1_values = (genes[genes['classification'] == 'ecDNA']['gene_cn']).values.astype(float)
    class2_values = (genes[genes['classification'] == 'chromosomal']['gene_cn']).values.astype(float)
    stat, p_value = stats.mannwhitneyu(class1_values, class2_values, alternative='greater')
    print(f'All. median ecDNA: {statistics.median(class1_values)}. median chromosomal: {statistics.median(class2_values)}. p-value: {p_value}')

    # Plot
    genes['gene'] = 'All' # add dummy variable to match plot formatting of (c)
    # kernel estimates don't work on log-scaled axes, so the data itself is log10-transformed
    genes['gene_cn'] = np.log10(genes['gene_cn'].astype(float))
    linewidth=0.75
    sns.set(rc={'svg.fonttype':'none',
                'pdf.use14corefonts':True,
                'figure.figsize':(1,2),
                'font.size': 7,'axes.labelsize': 7,'axes.titlesize': 7,'xtick.labelsize': 7,
                'ytick.labelsize': 7,'legend.fontsize': 7,
                'font.family':'sans-serif',
                'font.sans-serif':'Arial',
                'axes.linewidth':linewidth,
                'ytick.major.width':linewidth,
                })
    sns.set_style("white")
    palette = ['red','magenta']
    ax = sns.violinplot(data=genes, x="gene", y="gene_cn", hue="classification", hue_order=amp_classes,inner="box",
                        density_norm="width", palette=palette, linewidth=linewidth, legend=False)
    sns.despine()
    ax.set_xlabel("")
    ax.set_ylabel("Estimated copy number")
    ax.tick_params(axis='y', left=True)
    # The axis is linear in log10(copy number). Label the decades as powers of
    # ten so the ticks agree with the "Estimated copy number" axis label
    # instead of showing raw log10 values.
    decades = [1, 2, 3]
    ax.set_yticks(decades)
    ax.set_yticklabels([f'$10^{{{d}}}$' for d in decades])
    ax.set_ylim(bottom=np.log10(4),top=np.log10(1000))
    # The violins are drawn with legend=False; only build a legend if handles exist.
    handles, labels = ax.get_legend_handles_labels()
    if handles:
        ax.legend(handles=handles, labels=labels)
    add_n(ax,genes)
    return ax

In [None]:
# Violin plot across all amplicons; written to out/all_cn.{pdf,png,svg}.
ax = plot_all_cn_by_amplicon_class(biosamples,genes)
savefig('out/all_cn')