In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('../src')
from data_imports import *
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Load the three tables used throughout this notebook.
# NOTE(review): the import_* helpers are presumably provided by the star import from
# `data_imports`; their return schemas are not visible here. Later cells treat each
# result as a pandas DataFrame -- confirm against the helper module.
patients = import_patients()
biosamples = import_biosamples()
amplicons = import_amplicons()

# Summary statistics

In [None]:
# Share of patients whose amplicon_class is 'ecDNA'.
# `ec`  = number of ecDNA+ patients
# `nec` = TOTAL number of patients (the denominator, not the ecDNA-negative count)
ec = int((patients.amplicon_class == 'ecDNA').sum())
nec = len(patients.index)
print(f"Number of ecDNA+ patients: {ec}")
print(f"Total number of patients: {nec}")
print(f"fraction of ecDNA+ patients: {ec/nec}")

In [None]:
# How many tumor samples, unique tumors and tumor types are there?
print(f"Number of tumor samples: {len(biosamples)}")
print(f"Number of unique tumors: {len(biosamples[biosamples.in_unique_tumor_set])}")
print(f"Number of tumor types: {len(biosamples.cancer_type.unique())}")

# How many tumor types with >= n unique tumors?
# `n` is also read by the later ">= n tumors" barplot cell.
n=5
tmp = biosamples[biosamples.in_unique_tumor_set]
# Per tumor type: count of non-null `ecDNA_sequences_detected` values among unique tumors.
gby = tmp.groupby("cancer_type").count().ecDNA_sequences_detected
print(f"Number of tumor types with >= {n} tumors: {len(gby[gby >=n])}")

# How many tumor types with ecDNA?
# (taken over all biosamples, not only the unique-tumor set)
ect = biosamples[biosamples.amplicon_class == "ecDNA"].cancer_type.unique()
print(f"Number of tumor types with ecDNA: {len(ect)}")
print(f"Tumor types with ecDNA: {', '.join(ect)}")

# How many tumor types with >= n unique tumors and ecDNA?
ect = set(ect)
m = set(gby[gby >= n].index)
print(f"Number of tumor types with ecDNA and >= {n} tumors: {len(ect & m)}")

# Which ecDNA+ tumor types are excluded when we set an n threshold?
# Sorted before joining: iteration order of a set of strings is not stable
# across kernel sessions, so the unsorted listing was not reproducible.
print(f"ecDNA+ tumors with fewer than {n} unique biosamples: {', '.join(sorted(ect - m))}")


In [None]:
# ecDNA sequences: amplicon rows whose "ecDNA+" column equals "Positive".
ss = amplicons.loc[amplicons["ecDNA+"] == "Positive"]
n_ecDNA = ss.shape[0]
print(f"Number of ecDNA sequences detected: {n_ecDNA}")

# ecDNA+ samples: biosamples whose amplicon_class is "ecDNA".
bb = biosamples.loc[biosamples.amplicon_class == "ecDNA"]
n_ecDNA_samples = bb.shape[0]
print(f"Number of ecDNA+ samples: {n_ecDNA_samples}")

# ecDNA+ patients: distinct patient_id values among those samples
# (dropna=False so a missing id is counted the same way unique() counts it).
n_ecDNA_patients = bb.patient_id.nunique(dropna=False)
print(f"Number of ecDNA+ patients: {n_ecDNA_patients}")

In [None]:
# Count table: one row per cancer type, columns 'no ecDNA' / 'ecDNA' / 'Total',
# built from the unique-tumor set and ordered by descending total.
tmp = biosamples.loc[biosamples.in_unique_tumor_set]
ct = pd.crosstab(tmp.cancer_type, tmp.ecDNA_sequences_detected > 0)
ct = ct.assign(Total=ct.sum(axis='columns'))
ct = ct.sort_values('Total', ascending=False)
ct = ct.rename(columns={False:'no ecDNA',True:'ecDNA'})
ct.head()

In [None]:
def all_tumors_x_ecDNA_stacked_barplot(ct, svgfile=None):
    """Draw a stacked bar chart of ecDNA / no-ecDNA counts per tumor type.

    Parameters
    ----------
    ct : DataFrame
        Must contain 'ecDNA' and 'no ecDNA' columns; its index supplies the
        x-axis categories (bars follow the row order of ``ct``).
    svgfile : optional
        When not None, passed to ``plt.savefig`` after the plot is drawn.

    Returns
    -------
    None. A new 12x5 figure is created as a side effect.
    """
    cats = ['ecDNA','no ecDNA']
    colors={'ecDNA':'red','no ecDNA':'grey'}

    # legend: proxy rectangles, one per category, attached to a fresh figure
    plt.figure(figsize=(12,5))
    plt.legend(handles=[plt.Rectangle((0,0),1,1,fc=colors[c],edgecolor='none') for c in cats],
               labels=cats, loc='upper right',frameon=False,ncol=1,prop={'size':14})
    # plot
    p=recursive_stacked_barplot(ct,cats,colors)
    # axes
    sns.despine(left=True)
    p.set_xticklabels(p.get_xticklabels(), rotation=-45,horizontalalignment='left',rotation_mode="anchor")
    p.set_xlabel("tumor type")
    p.set_ylabel("count")
    #p.set_ylim(0,20)
    #p.set_yticks(range(0,20,5))
    for item in ([p.xaxis.label, p.yaxis.label] +
            p.get_yticklabels()):
        item.set_fontsize(14)
    # x tick labels are kept small (7pt) because every tumor type is shown
    for item in p.get_xticklabels():
        item.set_fontsize(7)
    # save
    if svgfile is not None:
        plt.savefig(svgfile)


def recursive_stacked_barplot(df,cats,colors):
    """Draw stacked bars as overlaid seaborn barplots and return the Axes.

    Layers are drawn back to front: first the row-wise sum over all of
    ``cats`` in the colour of the last category, then the sum over
    ``cats[:-1]`` in the colour of its last category, and so on. Each
    shorter bar paints over the lower part of the previous one, which
    produces the stacked look.

    Parameters
    ----------
    df : DataFrame
        Holds one column per entry of ``cats``; its index is used as the
        x-axis categories.
    cats : list
        Column names, bottom of the stack first.
    colors : mapping
        Column name -> bar colour.

    Returns
    -------
    The matplotlib Axes that was drawn on (the current Axes). With an empty
    ``cats`` nothing is drawn and the current Axes is still returned; the
    previous implementation returned None in that case, which made callers
    fail on the first ``p.set_...`` call.
    """
    ax = plt.gca()
    for depth in range(len(cats), 0, -1):
        layer = cats[:depth]
        heights = df[layer].sum(axis='columns')
        sns.barplot(x=df.index, y=heights, color=colors[layer[-1]], ax=ax)
    return ax
    
## With every tumor type on the x-axis this plot is hard to read, but it shows that
## we have a long tail of tumor types, and that the long tail is ecDNA-.
all_tumors_x_ecDNA_stacked_barplot(ct)

In [None]:
def geqn_tumors_x_ecDNA_stacked_barplot(ct, svgfile=None):
    """Draw a stacked ecDNA / no-ecDNA bar chart with full-size x tick labels.

    The function does no filtering itself; it plots whatever rows ``ct``
    contains and sets all axis and tick labels to 14pt.

    Parameters
    ----------
    ct : DataFrame
        Must contain 'ecDNA' and 'no ecDNA' columns; its index supplies the
        x-axis categories (bars follow the row order of ``ct``).
    svgfile : optional
        When not None, passed to ``plt.savefig`` after the plot is drawn.

    Returns
    -------
    None. A new 12x5 figure is created as a side effect.
    """
    cats = ['ecDNA','no ecDNA']
    colors={'ecDNA':'red','no ecDNA':'grey'}

    # legend: proxy rectangles, one per category, attached to a fresh figure
    plt.figure(figsize=(12,5))
    plt.legend(handles=[plt.Rectangle((0,0),1,1,fc=colors[c],edgecolor='none') for c in cats],
               labels=cats, loc='upper right',frameon=False,ncol=1,prop={'size':14})
    # plot
    p=recursive_stacked_barplot(ct,cats,colors)
    # axes
    sns.despine(left=True)
    p.set_xticklabels(p.get_xticklabels(), rotation=-45,horizontalalignment='left',rotation_mode="anchor")
    p.set_xlabel("tumor type")
    p.set_ylabel("count")
    #p.set_ylim(0,20)
    #p.set_yticks(range(0,20,5))
    for item in ([p.xaxis.label, p.yaxis.label] +
            p.get_xticklabels() + p.get_yticklabels()):
        item.set_fontsize(14)
    #p.set_yscale("log")
    # save
    if svgfile is not None:
        plt.savefig(svgfile)
        
# Plot only tumor types with at least `n` unique tumors (`n` is set in an earlier cell).
tmp = biosamples.loc[biosamples.in_unique_tumor_set]
gby = tmp.groupby("cancer_type")["ecDNA_sequences_detected"].count()
geqn = gby.loc[gby >= n].index
geqn_tumors_x_ecDNA_stacked_barplot(ct.loc[ct.index.isin(geqn)])

## What changed? ecDNA+ fractions by tumor type, compared with earlier drafts

In [None]:
def fraction_ecDNA(cancer_type, samples=None):
    """Print the ecDNA+ fraction among unique tumors of one cancer type.

    Parameters
    ----------
    cancer_type :
        Value compared for equality against the ``cancer_type`` column.
    samples : DataFrame, optional
        Table with ``in_unique_tumor_set``, ``cancer_type`` and
        ``ecDNA_sequences_detected`` columns. Defaults to the notebook-level
        ``biosamples``.

    Returns
    -------
    None. Prints ``"<ecDNA+>/<total>, <percent>%"``; when no unique tumor
    has the requested type, prints the 0/0 counts with a note instead of
    raising ZeroDivisionError.
    """
    if samples is None:
        samples = biosamples
    unique_tumors = samples[samples.in_unique_tumor_set]
    of_type = unique_tumors[unique_tumors.cancer_type == cancer_type]
    a=len(of_type[of_type.ecDNA_sequences_detected > 0])
    b=len(of_type)
    if b == 0:
        # No percentage can be computed; report the empty result explicitly.
        print(f"{a}/{b}, no unique tumors with cancer_type {cancer_type!r}")
        return
    print(f"{a}/{b}, {a/b*100}%")
def subfraction_ecDNA(cancer_type, samples=None):
    """Print the ecDNA+ fraction per ``cancer_subclass`` for one cancer type.

    Parameters
    ----------
    cancer_type :
        Value compared for equality against the ``cancer_type`` column.
    samples : DataFrame, optional
        Table with ``in_unique_tumor_set``, ``cancer_type``,
        ``cancer_subclass`` and ``ecDNA_sequences_detected`` columns.
        Defaults to the notebook-level ``biosamples``.

    Returns
    -------
    None. Prints one ``"<subclass>: <ecDNA+>/<total>, <percent>%"`` line per
    subclass present among the unique tumors of that type.
    """
    if samples is None:
        samples = biosamples
    unique_tumors = samples[samples.in_unique_tumor_set]
    of_type = unique_tumors[unique_tumors.cancer_type == cancer_type]
    counts = pd.crosstab(of_type.cancer_subclass, of_type.ecDNA_sequences_detected > 0)
    # crosstab only creates columns for values that occur; make sure both the
    # False and True columns exist so a type with no (or only) ecDNA+ tumors
    # does not raise KeyError below.
    counts = counts.reindex(columns=[False, True], fill_value=0)
    for s in counts.index:
        a=counts.loc[s,True]
        b=counts.loc[s,False]+a
        print(f"{s}: {a}/{b}, {a/b*100}%")
    return

In [None]:
# PNST: overall ecDNA+ fraction, then the per-subclass breakdown.
fraction_ecDNA('PNST')
subfraction_ecDNA('PNST')

In [None]:
# CPT: overall ecDNA+ fraction, then the per-subclass breakdown.
fraction_ecDNA('CPT')
subfraction_ecDNA('CPT')

In [None]:
## LGG
# Earlier results kept for comparison with the current output
# (presumably ecDNA+ / total from previous dataset versions -- confirm):
# Sunita's 12/23 draft: 1/290
# Revision: 1/282
# XO1: 1/549
# 10/2/2024: 1/599
fraction_ecDNA('LGG')
# Uncomment to list the ecDNA+ LGG biosamples:
#biosamples[(biosamples.cancer_type=='LGG') & (biosamples.amplicon_class == 'ecDNA')]


In [None]:
## HGG
# Earlier results kept for comparison with the current output
# (presumably ecDNA+ / total from previous dataset versions -- confirm):
# Sunita's 12/23 draft: 31/157
# Revision:  31/159
# 10/2/2024: 77/379
fraction_ecDNA('HGG')
subfraction_ecDNA('HGG')

In [None]:
## MBL
# Earlier results kept for comparison with the current output
# (presumably ecDNA+ / total from previous dataset versions -- confirm):
# Sunita's 12/23 draft: 25/177
# Revision: 24/178
# 10/2/2024: 55/347
fraction_ecDNA('MBL')
subfraction_ecDNA('MBL')

In [None]:
## NBL
# Earlier results kept for comparison with the current output
# (presumably ecDNA+ / total from previous dataset versions -- confirm):
# Sunita's 12/23 draft: 32/106
# Revision: 33/111
# 10/2/2024: 40/159
fraction_ecDNA('NBL')

In [None]:
## PBL -> PINT
# Earlier results kept for comparison with the current output
# (presumably ecDNA+ / total from previous dataset versions -- confirm):
# Sunita's 12/23 draft: 1/4
# Revision: 1/6
# 10/2/2024: 1/28, but changed to PINT to include low-grade pineal tumors
# The calls below therefore query 'PINT' rather than 'PBL'.
fraction_ecDNA('PINT')
subfraction_ecDNA('PINT')

In [None]:
## EPN
# Earlier results kept for comparison with the current output
# (presumably ecDNA+ / total from previous dataset versions -- confirm):
# Sunita's 12/23 draft: 2/73
# Revision: 2/76
# 10/2/2024: 3/239
fraction_ecDNA('EPN')
# Uncomment to list the ecDNA+ EPN biosamples:
#biosamples[(biosamples.cancer_type=='EPN') & (biosamples.amplicon_class == 'ecDNA')]

In [None]:
## OS (queried below under the label 'OST')
# Earlier results kept for comparison with the current output
# (presumably ecDNA+ / total from previous dataset versions -- confirm):
# Sunita's 12/23 draft: 27/57
# Revision: 26/55
# 10/2/2024: 35/70
fraction_ecDNA('OST')

In [None]:
## RMS
# Earlier results kept for comparison with the current output
# (presumably ecDNA+ / total from previous dataset versions -- confirm):
# Sunita's 12/23 draft: 14/35
# Revision: 16/49
fraction_ecDNA('RMS')
subfraction_ecDNA('RMS')

In [None]:
## RBL
# Earlier results kept for comparison with the current output
# (presumably ecDNA+ / total from previous dataset versions -- confirm):
# Sunita's 12/23 draft: 6/32
# Revision: 6/32
# 10/2/2024: 6/44
fraction_ecDNA('RBL')
# Uncomment to list the ecDNA+ RBL biosamples:
#biosamples[(biosamples.cancer_type=='RBL') & (biosamples.amplicon_class == 'ecDNA')]

In [None]:
## ACC
# Earlier results kept for comparison with the current output
# (presumably ecDNA+ / total from previous dataset versions -- confirm):
# Sunita's 12/23 draft: 2/21
# Revision: 3/20
# 10/2/2024: 4/23
fraction_ecDNA('ACC')
# Uncomment to list the ecDNA+ ACC biosamples:
#biosamples[(biosamples.cancer_type=='ACC') & (biosamples.amplicon_class == 'ecDNA')]

In [None]:
## CPG
# Earlier results kept for comparison with the current output
# (presumably ecDNA+ / total from previous dataset versions -- confirm):
# Sunita's 12/23 draft: 1/39
# Revision: 1/51
# 10/2/2024: 1/101
fraction_ecDNA('CPG')
# Uncomment to list the ecDNA+ CPG biosamples:
#biosamples[(biosamples.cancer_type=='CPG') & (biosamples.amplicon_class == 'ecDNA')]

In [None]:
## GNT -> GG
# NOTE(review): the header suggests a rename to GG, but the call below still
# queries 'GNT' -- confirm which label the current data uses.
# Earlier results kept for comparison with the current output
# (presumably ecDNA+ / total from previous dataset versions -- confirm):
# Sunita's 12/23 draft: 1/44
# Revision: 1/90
# 10/2/2024: 1/192
fraction_ecDNA('GNT')
# Uncomment to list the ecDNA+ GNT biosamples:
#biosamples[(biosamples.cancer_type=='GNT') & (biosamples.amplicon_class == 'ecDNA')]

In [None]:
## ETMR
# Earlier results kept for comparison with the current output
# (presumably ecDNA+ / total from previous dataset versions -- confirm):
# Sunita's 12/23 draft: 4/4
# Revision: 4/6
# 10/2/2024: 4/9
fraction_ecDNA('ETMR')
# Display the ecDNA+ ETMR biosamples (all biosamples, not restricted to the unique-tumor set).
biosamples[(biosamples.cancer_type=='ETMR') & (biosamples.amplicon_class == 'ecDNA')]

In [None]:
# SARC: overall ecDNA+ fraction, then the per-subclass breakdown.
fraction_ecDNA('SARC')
subfraction_ecDNA('SARC')

In [None]:
# Overall ecDNA+ fractions for MST, GCT, WLM and CARC (one output line each, in this order).
fraction_ecDNA('MST')
fraction_ecDNA('GCT')
fraction_ecDNA('WLM')
fraction_ecDNA('CARC')

In [None]:
# Overall ecDNA+ fractions for HBL and MEL (one output line each, in this order).
fraction_ecDNA('HBL')
fraction_ecDNA('MEL')

In [None]:
# Overall ecDNA+ fraction for EWS.
fraction_ecDNA('EWS')

In [None]:
# Overall ecDNA+ fraction for EMBT.
fraction_ecDNA('EMBT')

In [None]:
# Overall ecDNA+ fraction for BENG, followed by a display of the ecDNA+ BENG
# biosamples (all biosamples, not restricted to the unique-tumor set).
fraction_ecDNA('BENG')
biosamples[(biosamples.cancer_type=='BENG') & (biosamples.amplicon_class == 'ecDNA')]

In [None]:
# Overall ecDNA+ fractions for ATRT and MNG (one output line each, in this order).
fraction_ecDNA('ATRT')
fraction_ecDNA('MNG')

## 9/2022 Dataset

In [None]:
def read_progress(path='../2022-02-23_sj_samples/out/progress.txt'):
    """Return the set of non-blank, whitespace-stripped lines of a progress file.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. Defaults to the progress file this notebook has always
        used, so existing ``read_progress()`` calls are unaffected.

    Returns
    -------
    set of str
        One entry per distinct non-blank line, with surrounding whitespace
        removed. Blank lines are skipped so that an empty or trailing blank
        line does not add ``''`` to the set.
    """
    with open(path,'r') as f:
        stripped = (line.strip() for line in f)
        return {entry for entry in stripped if entry}
# Entries listed in the progress file; a later cell matches them against the biosamples index.
subset = read_progress()

In [None]:
# Keep only the biosamples whose index label appears in `subset`.
# Note: unlike earlier cells, this is not restricted to the unique-tumor set.
tmp = biosamples[biosamples.index.isin(subset)]

In [None]:
# Count table for the progress-file subset `tmp`: one row per cancer type,
# columns 'no ecDNA' / 'ecDNA' / 'Total', ordered by descending total; then plot it.
ct = (
    pd.crosstab(tmp.cancer_type, tmp.ecDNA_sequences_detected > 0)
    .pipe(lambda counts: counts.assign(Total=counts.sum(axis='columns')))
    .sort_values('Total', ascending=False)
    .rename(columns={False:'no ecDNA',True:'ecDNA'})
)
all_tumors_x_ecDNA_stacked_barplot(ct)

In [None]:
# Peek at the last rows of the subset.
tmp.tail()

In [None]:
# Number of biosamples in the subset.
len(tmp)