In [None]:
# How closely is ecDNA amplification associated with oncogene overexpression?

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import sys
from pathlib import Path

sys.path.append('../src')
Path("out").mkdir(parents=True, exist_ok=True)

import data_imports

In [None]:
def clean_cbtn_rnaseq(df):
    """Tidy the raw CBTN expression matrix and restrict it to cohort samples.

    Steps, in order:
      1. Strip everything up to and including the first underscore from each
         row label (presumably the Ensembl ID of an ``<ENSG>_<symbol>`` label).
      2. Drop rows whose remaining label starts with ``ENSG`` or ``PAR_Y``
         (no gene symbol, or non-unique names), but keep the two C19MC
         entries listed in ``keep_ids``.
      3. Drop every row whose label occurs more than once (no copy is kept).
      4. Keep only columns whose name appears in the ``external_sample_id``
         field of ``data_imports.import_biosamples()``.

    The input frame is left untouched; a new frame is returned.

    Raises:
        KeyError: if either C19MC entry is missing after step 1.
    """
    keep_ids = ['ENSG00000269842','ENSG00000269564']
    # Relabel on a new object instead of overwriting the caller's index.
    stripped = df.index.map(lambda label: '_'.join(label.split('_')[1:]))
    df = df.set_axis(stripped, axis=0)
    # We want these two ENSG entries (C19MC) but no others.
    c19mc = df.loc[keep_ids]
    unnamed = df.index.str.startswith('ENSG') | df.index.str.startswith('PAR_Y')
    df = pd.concat([df[~unnamed], c19mc])
    # keep=False discards all members of a duplicated label, not just extras.
    df = df[~df.index.duplicated(keep=False)]
    # Samples in the cohort
    cohort = data_imports.import_biosamples().external_sample_id
    return df.loc[:, df.columns.isin(cohort)]

def import_cbtn_rnaseq(path='/Users/ochapman/Library/CloudStorage/OneDrive-SanfordBurnhamPrebysMedicalDiscoveryInstitute/projects/2023-pedpancan/data/gex/CBTN-gex.tpm.tsv'):
    """Read the tab-separated CBTN expression table and clean it.

    The first column of the file becomes the row index; the frame is then
    passed through ``clean_cbtn_rnaseq`` and the result is returned.
    """
    raw = pd.read_csv(path, sep='\t', index_col=0)
    return clean_cbtn_rnaseq(raw)

def import_biosamples():
    """Load the biosample table with non-ecDNA amplicon classes merged.

    'Complex noncyclic', 'BFB' and 'Linear' in ``amplicon_class`` are all
    relabelled 'chromosomal'. Rows with no ``external_sample_id`` are removed.
    """
    to_chromosomal = dict.fromkeys(['Complex noncyclic', 'BFB', 'Linear'], 'chromosomal')
    biosamples = data_imports.import_biosamples()
    biosamples['amplicon_class'] = biosamples['amplicon_class'].replace(to_chromosomal)
    # Discard rows lacking an external sample id.
    return biosamples.dropna(subset=['external_sample_id'])

def import_genes():
    """Load the amplified-gene table with simplified ``feature`` labels.

    Each ``feature`` value is cut at its first underscore (dropping the
    suffix), then 'Complex-non-cyclic', 'BFB' and 'Linear' are relabelled
    'chromosomal'. Rows whose feature is 'unknown' are removed.
    """
    to_chromosomal = {
        'Complex-non-cyclic': 'chromosomal',
        'BFB': 'chromosomal',
        'Linear': 'chromosomal',
    }
    genes = data_imports.import_genes()
    base_feature = genes['feature'].map(lambda label: label.partition('_')[0])
    genes['feature'] = base_feature.replace(to_chromosomal)
    # 'unknown' in this table is not reported in other AC results afaik.
    return genes[genes.feature != 'unknown']

In [None]:
# GLOBALS
# Shared tables read by the functions and cells below; loaded once here.
#   RNA        - cleaned expression matrix from import_cbtn_rnaseq()
#   BIOSAMPLES - biosample table from import_biosamples()
#   GENES      - amplified-gene table from import_genes()

RNA=import_cbtn_rnaseq()
BIOSAMPLES=import_biosamples()
GENES = import_genes()


### Schemas

In [None]:
# Preview the first rows of the expression matrix.
RNA.head() 

In [None]:
# Preview the first rows of the biosample table.
BIOSAMPLES.head()

In [None]:
# Preview the first rows of the amplified-gene table.
GENES.head()

# all oncogene expression

In [None]:
def get_all_amp_oncogenes():
    """Return the set of canonical oncogenes amplified in cohort biosamples.

    A GENES row counts when ``is_canonical_oncogene`` is true and its
    ``sample_name`` is one of the BIOSAMPLES index labels.
    """
    in_cohort = GENES.sample_name.isin(BIOSAMPLES.index)
    selected = GENES[GENES.is_canonical_oncogene & in_cohort]
    return set(selected.gene)
    
def amp_class(ex_sample_id, gene):
    '''
    Classify how `gene` is amplified in the sample `ex_sample_id`.

    Looks up the BIOSAMPLES rows with that external_sample_id and the GENES
    rows for those biosamples and that gene.
    Returns 'no amplification' when there are no such GENES rows, 'ecDNA'
    when any of them has feature 'ecDNA', and 'chromosomal' otherwise.
    '''
    sample_ids = BIOSAMPLES.loc[BIOSAMPLES.external_sample_id == ex_sample_id].index
    hits = GENES[GENES.sample_name.isin(sample_ids) & (GENES.gene == gene)]
    if hits.shape[0] == 0:
        return 'no amplification'
    return 'ecDNA' if (hits.feature == 'ecDNA').any() else 'chromosomal'

def construct_oncogene_amp_dataframe():
    """Build a long table of oncogene expression annotated with amplification.

    Returns a frame indexed by sample (the RNA column labels) with one row per
    (sample, oncogene) pair. Columns are the melted gene column (read below as
    'Gene'), 'value' (the expression from RNA) and 'amplicon_class' (the
    result of ``amp_class`` for that pair).
    """
    # Expression of every amplified oncogene, samples as rows.
    oncogenes = get_all_amp_oncogenes()
    by_sample = RNA[RNA.index.isin(oncogenes)].T
    long = by_sample.melt(ignore_index=False)
    # Annotate amp status. Iterating the two columns directly avoids building
    # a Series per row as DataFrame.apply(axis=1) does, and also yields a
    # well-formed (empty) column when there are no rows.
    # TODO amp_class still filters the full tables on every call; this is slow.
    long['amplicon_class'] = [
        amp_class(sample, gene) for sample, gene in zip(long.index, long['Gene'])
    ]
    return long

def savefig(plot,basename):
    """Save the current matplotlib figure as ``basename`` + .pdf, .png and .svg.

    Args:
        plot: unused; kept so existing callers keep working. The figure that
            is written is whichever one ``plt`` treats as current.
        basename: output path without extension. Its parent directory is
            created if missing, so saving into a folder that does not exist
            yet (e.g. ``figures/``) no longer fails.
    """
    Path(basename).parent.mkdir(parents=True, exist_ok=True)
    for fmt in ("pdf", "png", "svg"):
        plt.savefig(f"{basename}.{fmt}", format=fmt)

def plot_oncogene_exp(df):
    """Boxplot of oncogene expression per amplicon class, plus rank tests.

    Expects the long table from ``construct_oncogene_amp_dataframe`` (columns
    'Gene', 'value', 'amplicon_class'; index = sample). Classes are ordered by
    descending median value and drawn on a log2-scaled axis. The figure is
    written via ``savefig`` to 'figures/gex_boxplot' and shown.

    Afterwards the ecDNA median is printed. For 'chromosomal' and
    'no amplification', a Mann-Whitney U test against ecDNA and the group
    median are printed when both groups have more than two values.

    Returns the object returned by ``sns.boxplot``.
    """
    class_order = (
        df.groupby(by=["amplicon_class"])['value']
        .median()
        .sort_values(ascending=False)
        .index
    )
    print(class_order)
    ax = sns.boxplot(data=df,x='amplicon_class',y='value',order=class_order,log_scale=2)

    # Tick labels carry the number of rows in each class.
    n_per_class = df.amplicon_class.value_counts()
    tick_labels = [f"{cls}\n(n={n_per_class.loc[cls]})" for cls in class_order]
    plt.xticks(range(len(tick_labels)), tick_labels)
    n_genes = len(df.Gene.unique())
    n_tumors = len(df.index.unique())
    plt.xlabel(f' {n_genes} oncogenes across {n_tumors} tumors')
    plt.ylabel('log_2 TPM')
    sns.despine()
    savefig(ax,'figures/gex_boxplot')
    plt.show()

    # statistical tests
    ecdna_values = df[df.amplicon_class=='ecDNA']['value']
    print(f'Median oncogene expression in ecDNA: {ecdna_values.median()} (TPM)')
    for comparison in ['chromosomal','no amplification']:
        other_values = df[df.amplicon_class==comparison]['value']
        if len(ecdna_values) <= 2 or len(other_values) <= 2:
            # Too few observations for the test; skip this comparison.
            continue
        result = scipy.stats.mannwhitneyu(ecdna_values,other_values)
        print(f'Mann-Whitney U test, ecDNA + vs {comparison}: {result}')
        print(f'Median oncogene expression in {comparison}: {other_values.median()} (TPM)')
    return ax

In [None]:
# Build the long (sample, oncogene) expression table with amplification class.
# This calls amp_class once per row, so it can take a while.
asdf = construct_oncogene_amp_dataframe()
asdf.head()

In [None]:
# Draw the boxplot and print the rank tests.
# NOTE: plot_oncogene_exp returns the seaborn boxplot object, so `df` here is
# a plot handle, not a DataFrame.
df = plot_oncogene_exp(asdf)

# RCMB56

In [None]:
def import_case11_rnaseq(path='/Users/ochapman/Library/CloudStorage/OneDrive-SanfordBurnhamPrebysMedicalDiscoveryInstitute/projects/2023-pedpancan/data/gex/case11.genes.results'):
    """Read a tab-separated gene results file and return its FPKM column.

    The first column of the file is used as the index. The returned Series
    is named 'case11' and its index is named 'Gene'.
    """
    table = pd.read_csv(path, sep='\t', index_col=0)
    return table['FPKM'].rename('case11').rename_axis('Gene')
def import_mb_genes(paths=(
        "/Users/ochapman/projects/medullo-ecdna/circos/RCMB56/genes_1.txt",
        "/Users/ochapman/projects/medullo-ecdna/circos/RCMB56/genes_2.txt",
)):
    """Read gene names from headerless, tab-separated gene files.

    Args:
        paths: files to read, each with four columns (an ignored first
            column, start, end, gene name). Defaults to the two RCMB56
            gene files, so calling with no arguments behaves as before.

    Returns:
        Series named 'Gene' holding the gene names of all files in order.
        Each file keeps its own 0-based row labels, so index labels repeat
        across files.
    """
    cols=["_","start","end","Gene"]
    tables = [pd.read_csv(p,sep='\t',names=cols) for p in paths]
    return pd.concat(tables).Gene

In [None]:
def violinplot_gex(gex,highlight_set,path=None):
    """Violin plot of one sample's expression with selected genes overlaid.

    Args:
        gex: Series of expression values indexed by gene.
        highlight_set: gene labels to overlay as swarm points; each point is
            annotated with its gene label.
        path: if given, the figure is saved there before being shown.

    Returns:
        The highlighted subset ``gex.loc[highlight_set]``.

    Note: a second function with this same name is defined further down the
    notebook and replaces this one once its cell has run.
    """
    sns.violinplot(data=gex,orient='v',color='C0')
    highlighted = gex.loc[highlight_set]
    sns.swarmplot(data=highlighted,color='C1')
    label_style = dict(horizontalalignment='center', verticalalignment='bottom',
                       fontsize=10, color='black')
    for gene, level in highlighted.items():
        # Place the label just right of the single violin at x=0.
        plt.text(0.1, level, gene, **label_style)
    plt.ylabel("Log gene expression (log10 TPM)")
    sns.despine()
    if path is not None:
        plt.savefig(path)
    plt.show()
    return highlighted

In [None]:
# Load the case11 FPKM values and log10(x+1)-transform them.
data = import_case11_rnaseq()
data = np.log10(data+1)
# Genes from the RCMB56 gene files, limited to those present in the data.
highlight = import_mb_genes()
highlight = highlight[highlight.isin(data.index)]
# `genes` holds the transformed values of the highlighted genes.
genes = violinplot_gex(data,highlight,'out/case11-gex.svg')

In [None]:
# Export highlighted genes whose log10(FPKM+1) exceeds 0.5.
genes[genes > .5].to_csv("out/rcmb56-ecdna-highly-expressed.tsv",sep='\t')


In [None]:
# Differential expression sans X01

# ETMR PT_00G007DM
BS_AQMKA8NC 	Male 	PT_00G007DM 	7316-2577 	Recurrence  
BS_K07KNTFY 	Male 	PT_00G007DM 	7316-272 	Diagnosis  

In [None]:
def violinplot_gex(gex,highlight_set,path=None):
    """Violin plot of expression per sample with selected genes overlaid.

    This definition replaces the earlier ``violinplot_gex`` in the notebook,
    so it accepts both input shapes and returns the same thing the earlier
    one did. Cells written for either version keep working whatever order
    they are run in.

    Args:
        gex: expression values indexed by gene. Either a Series (one sample)
            or a DataFrame with one column per sample.
        highlight_set: gene labels to overlay as swarm points; each point is
            annotated with its gene label.
        path: if given, the figure is saved there before being shown.

    Returns:
        The highlighted subset ``gex.loc[highlight_set]``.
    """
    sns.violinplot(data=gex,orient='v',color='C0')
    swarm = gex.loc[highlight_set]
    sns.swarmplot(data=swarm,color='C1')
    # Treat a Series as a single-column plot so both inputs share one loop.
    if isinstance(swarm, pd.Series):
        per_sample = [swarm]
    else:
        per_sample = [series for _, series in swarm.items()]
    for position, series in enumerate(per_sample):
        for gene, level in series.items():
            # Labels sit 0.1 to the right of violin number `position`.
            plt.text(position + 0.1, level, gene, horizontalalignment='center',
                     verticalalignment='bottom', fontsize=10, color='black')
    plt.ylabel("Log gene expression (log10 TPM)")
    sns.despine()
    if path is not None:
        plt.savefig(path)
    plt.show()
    return swarm


In [None]:
# Expression of the two PT_00G007DM samples, log10(x+1)-transformed.
data = RNA[['7316-272','7316-2577']]
data = np.log10(data+1)
# Highlight the genes listed for biosample BS_K07KNTFY plus the two C19MC
# ENSG entries kept by clean_cbtn_rnaseq.
highlight = GENES[GENES.sample_name == 'BS_K07KNTFY']['gene'].to_list()
highlight += ['ENSG00000269842','ENSG00000269564']

violinplot_gex(data,highlight,'out/PT_00G007DM-gex.svg')

In [None]:
# Same two samples; highlight the genes listed one per line in the C19MC file.
data = RNA[['7316-272','7316-2577']]
data = np.log10(data+1)
path = '/Users/ochapman/projects/pedpancan_ecdna/data/source/AmpliconClassifier/beds_by_tumor_type/ETMR/C19MC.genes.txt'
with open(path, "r") as file:
    # Strip surrounding whitespace (including the newline) from every line.
    highlight = list(map(str.strip,file.readlines()))

violinplot_gex(data,highlight,'out/PT_00G007DM-C19MC.svg')
# C19MC miRNA expression not detected

In [None]:
# Same two samples; highlight the two C19MC ENSG entries and MIR17HG.
data = RNA[['7316-272','7316-2577']]
data = np.log10(data+1)
highlight = ['ENSG00000269842','ENSG00000269564','MIR17HG']

# NOTE(review): this is the same output path as the previous cell, so the
# figure written there is overwritten — confirm that is intended.
violinplot_gex(data,highlight,'out/PT_00G007DM-C19MC.svg')
data.loc[highlight]
# 1/2 preprocessed RNAs expressed.

# DEAD CODE (amp gene expression comparison)

In [None]:
def construct_gene_amplicon_dataframe(gene,diagnosis=None):
    """Join one gene's expression with each sample's amplicon annotation.

    Args:
        gene: row label in RNA and value in ``GENES.gene``.
        diagnosis: if given, only biosamples whose ``cancer_type`` equals it
            are used; otherwise all biosamples.

    Returns:
        DataFrame indexed by external sample id, sorted ascending by
        expression, with columns:
          * ``gene``           - expression taken from RNA
          * ``amplicon_class`` - the biosample's amplicon class
          * ``is_amp``         - True when GENES has a row for this biosample
            and gene whose ``feature`` contains the biosample's amplicon
            class (``str.contains`` match)
        Only samples present in both RNA and the biosample table are kept.
    """
    # Work on a copy in both branches so adding a column never writes into a
    # slice of BIOSAMPLES (the source of SettingWithCopyWarning).
    if diagnosis is not None:
        bs = BIOSAMPLES[BIOSAMPLES.cancer_type == diagnosis].copy()
    else:
        bs = BIOSAMPLES.copy()

    # The gene filter is the same for every sample, so apply it once.
    gene_rows = GENES[GENES.gene == gene]

    def _gene_on_amplicon(sample, amplicon_class):
        # Is the gene on an amplicon of this sample's class?
        hits = (gene_rows.sample_name == sample) & gene_rows.feature.str.contains(amplicon_class)
        return bool(hits.any())

    bs['is_amp'] = [
        _gene_on_amplicon(sample, amplicon_class)
        for sample, amplicon_class in zip(bs.index, bs['amplicon_class'])
    ]
    bs = bs.set_index('external_sample_id')[['amplicon_class','is_amp']]
    # subset gene of interest
    df = RNA.loc[[gene]].T.join(bs,how='inner')
    return df.sort_values(gene)

def plot_gene_amplicon(gene,diagnosis=None):
    """Boxplot of one gene's expression by amplicon class and amplification.

    Boxes are grouped by ``amplicon_class`` and split by ``is_amp`` (whether
    the gene itself is on the amplicon). Tick labels show the sample counts
    as ``(n=<not amplified>, <amplified>)``. For every other (class, is_amp)
    group, a Mann-Whitney U test against the ecDNA/gene-amplified group is
    printed when both groups have more than two samples.

    Args:
        gene: gene to plot; passed to ``construct_gene_amplicon_dataframe``.
        diagnosis: optional cancer type to restrict the samples to.

    Returns:
        The per-sample frame from ``construct_gene_amplicon_dataframe``.
    """
    df=construct_gene_amplicon_dataframe(gene,diagnosis)
    my_order = df.groupby(by=["amplicon_class"])[gene].median().iloc[::-1].index
    sns.boxplot(data=df,x='amplicon_class',y=gene,order=my_order,hue='is_amp')

    # Sample count for every (class, is_amp) pair; absent pairs count as 0.
    all_pairs = pd.MultiIndex.from_product([my_order, [True,False]])
    value_counts = (
        df.groupby(['amplicon_class','is_amp'])[gene]
        .count()
        .reindex(all_pairs, fill_value=0)
    )
    labels = [f"{val}\n(n={value_counts.loc[(val,False)]}, {value_counts.loc[(val,True)]})" for val in my_order]
    plt.xticks(range(len(labels)), labels)
    plt.xlabel(f'{"all tumors" if diagnosis is None else str(diagnosis)} (n={len(df)})')
    plt.ylabel(f'{gene} (TPM)')
    sns.despine()

    # Reference group: ecDNA samples in which the gene itself is amplified.
    # No lookup of 'ecDNA' in the index is done, so cohorts without any ecDNA
    # sample no longer raise KeyError; their tests are skipped below.
    ecdna_amp = df[(df.amplicon_class=='ecDNA') & df.is_amp][gene]
    for amp_cls, gene_amp in value_counts.index:
        if (amp_cls, gene_amp) == ('ecDNA',True):
            continue
        group = df[(df.amplicon_class==amp_cls) & (df.is_amp==gene_amp)][gene]
        if len(group) > 2 and len(ecdna_amp) > 2:
            stat = scipy.stats.mannwhitneyu(group,ecdna_amp)
            print(f'Mann-Whitney U test, ecDNA_{gene}+ vs {amp_cls}_{gene}{"+" if gene_amp else "-"}: {stat}')
    return df

In [None]:
# MYCN expression by amplicon class across all tumors; save the figure.
df = plot_gene_amplicon('MYCN')
plt.gcf().savefig('out/all_tumors_mycn_expression.png')
plt.show()
# Most MYCN overexpression attributable to ecDNA

In [None]:
# NOTE(review): x=True is not a column of df; presumably x='is_amp' was
# intended — confirm before relying on this plot.
sns.violinplot(data=df,y='MYCN',x=True)

In [None]:
# MYCN expression restricted to samples with cancer_type 'MBL'.
df=plot_gene_amplicon('MYCN','MBL')
plt.gcf().savefig('out/mb_mycn_expression.png')
plt.show()

In [None]:
# MYC expression restricted to samples with cancer_type 'MBL'.
df = plot_gene_amplicon('MYC','MBL')
plt.gcf().savefig('out/mb_myc_expression.png')
plt.show()

In [None]:
# CDK4 expression across all tumors (figure shown but not saved).
df = plot_gene_amplicon('CDK4')
#plt.gcf().savefig('out/all_tumors_cdk4_expression.png')
plt.show()

In [None]:
# AGAP2 expression across all tumors (figure shown but not saved).
df = plot_gene_amplicon('AGAP2')
# NOTE(review): the disabled save below still names the CDK4 file — update
# the path before re-enabling it.
#plt.gcf().savefig('out/all_tumors_cdk4_expression.png')
plt.show()

In [None]:
# ecDNA samples with the highest AGAP2 expression (df comes from the AGAP2 cell).
df[df.amplicon_class == 'ecDNA'].sort_values('AGAP2',ascending=False).head()