# notes by tumor type
Notes on each ecDNA sequence organized by tumor type. See also summary_statistics.ipynb.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('../src')
from data_imports import *
# Lift pandas' display truncation: passing None as the limit makes displayed
# DataFrames render every row and every column instead of an abbreviated view.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Load the project tables used throughout this notebook.
# NOTE(review): the import_* loaders are presumably supplied by the star import from
# data_imports (../src); their schemas are not visible here. The cells below rely on
# `biosamples` having cancer_type, cancer_subclass, amplicon_class and
# in_unique_tumor_set columns, and on `genes` having sample_name and gene columns.
patients = import_patients()
biosamples = import_biosamples()
amplicons = import_amplicons()
genes = import_genes()
# Disabled filter: if uncommented, `biosamples` would be restricted to the rows flagged
# in_unique_tumor_set for every later cell that reads the global table.
#biosamples = biosamples[biosamples.in_unique_tumor_set]

In [None]:
def get_ecDNA_biosamples(tumor_type):
    """Return the ecDNA-classified biosamples of one tumor type.

    Reads the notebook-level ``biosamples`` table and keeps the rows whose
    ``cancer_type`` equals ``tumor_type`` and whose ``amplicon_class`` equals
    ``'ecDNA'``. No ``in_unique_tumor_set`` filter is applied, so every
    matching row of the table is returned.

    Parameters
    ----------
    tumor_type : value compared against ``biosamples.cancer_type``
        (the notebook passes short string codes such as ``'HBL'``).

    Returns
    -------
    pandas.DataFrame
        The matching rows with all columns of ``biosamples``; empty when
        nothing matches.
    """
    is_requested_type = biosamples.cancer_type == tumor_type
    is_ecDNA = biosamples.amplicon_class == 'ecDNA'
    return biosamples[is_requested_type & is_ecDNA]
def ecDNA_by_subgroup(tumor_type):
    """Count amplicon classes per cancer subclass for one tumor type.

    Reads the notebook-level ``biosamples`` table, keeps the rows whose
    ``cancer_type`` equals ``tumor_type`` and whose ``in_unique_tumor_set``
    mask entry is true, and cross-tabulates the remainder.

    Parameters
    ----------
    tumor_type : value compared against ``biosamples.cancer_type``.

    Returns
    -------
    pandas.DataFrame
        Frequency table with one row per ``cancer_subclass`` and one column
        per ``amplicon_class`` found among the selected rows.
    """
    # in_unique_tumor_set is used directly as a boolean mask
    # (presumably a bool column -- confirm against the loader).
    selected = (biosamples.cancer_type == tumor_type) & (biosamples.in_unique_tumor_set)
    unique_tumors = biosamples[selected]
    return pd.crosstab(unique_tumors.cancer_subclass, unique_tumors.amplicon_class)

# HBL
- SJST031395_D1: No known oncogenes. ABCB11 ACVR1 ACVR1C AHCTF1P1 B3GALT1 BAZ2B CCDC148 CD302 CERS6 COBLL1 CSRNP3 CYTIP DAPL1 DHRS9 DPP4 ERMN FAM133DP FAP FIGN G6PC2 GALNT3 GALNT5 GCA GCG GRB14 IFIH1 ITGB6 KCNH7 LRP2 LY75 MARCHF7 NOSTRIN PKP4 PLA2R1 PSMD14 RBMS1 SCN1A SCN2A SCN3A SCN7A SCN9A SLC38A11 SLC4A10 SNORA70F SPC25 STK39 TANC1 TANK TBR1 TTC21B UPP2 WDSUB1 XIRP2
- SJST031890_D1: No known oncogenes. AHCTF1P1 BAZ2B CD302 DPP4 FAP FIGN GCA GCG IFIH1 ITGB6 KCNH7 LY75 MARCHF7 PLA2R1 PSMD14 RBMS1 SLC4A10 TANK TBR1

Neither sample carries a known oncogene, but they share a minimal recurrently amplified region at chr2:159297000-164350000. Notably, amplification of this locus appears to be exclusive to HBL.

In [None]:
# Display every HBL biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('HBL')

# MEL
- SJMEL031086_D3: Large, chromothriptic high-copy amp. ACTN4 / AKT2 / AXL / CNOT3 / RAC1 / SERTAD1 / SERTAD3 / TFPT / ZNF331 

In [None]:
# Display every MEL biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('MEL')

# PNST
- BS_N3FWKZB8: AGAP2 / CDK4 / DDIT3 / GLI1 / HMGA2 / IFNG / MDM2 / WIF1
- BS_A1DV9T7G: amp1 no oncogenes (FYB1, RICTOR); amp4 TWIST1. HOXA@ on 'unknown' classification.
- BS_WH8KWW5J: CAV1 / CUX1 / FEZF1 / MET / PAX4 / SMO
Note that BS_A1DV9T7G and BS_WH8KWW5J are longitudinal samples from the same tumor. Their similarity score is low, but the partial overlap suggests the ecDNA is probably recombinant.

In [None]:
# Display every PNST biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('PNST')

In [None]:
# Subclass x amplicon-class counts for PNST, limited to rows flagged in_unique_tumor_set.
ecDNA_by_subgroup('PNST')

In [None]:
# Number of unique-tumor-set biosamples whose cancer_type is 'NFP'.
# NOTE(review): this cell sits in the PNST section but matches 'NFP' against cancer_type;
# confirm that 'NFP' is a cancer_type value rather than a cancer_subclass of PNST.
len(biosamples[(biosamples.cancer_type == 'NFP') & (biosamples.in_unique_tumor_set)])

# BENG
- SJST032574_D1/D2: chr2:57170434-57220435

In [None]:
# Display every BENG biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('BENG')

# GCT
- BS_MZS6FZKF: MYCN, RRAS2
- SJST030529_D1: HMGA2, IFNG, MDM2, TBC1D15, YEATS4, ALDH1A1

In [None]:
# Display every GCT biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('GCT')

In [None]:
# Column sums of the GCT subclass x amplicon-class table: one total per amplicon_class
# across all GCT subclasses (unique-tumor set only).
ecDNA_by_subgroup('GCT').sum()

# ACC
- SJACT001_D: no known oncogenes (chr2q23.3 ARL5A, CACNB4, NEB 5' partial). All less than 10 CN.
- SJACT005_D: no known oncogenes (chr14q32.33 ELK2AP). All less than 10 CN.
- SJACT062_D: no known oncogenes (chr8q21.11 KCNB2 partial). All less than 10 CN.
- SJACT069_D: amp2 chr11q13.4 INPPL1, NUMA1; amp3 SPI1 partial. All less than 10 CN.

In [None]:
# Display every ACC biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('ACC')

In [None]:
# For comparison with the ecDNA rows: ACC biosamples whose amplicon_class is
# 'intrachromosomal' (no unique-tumor-set filter is applied here).
biosamples[(biosamples.cancer_type == 'ACC') & (biosamples.amplicon_class == 'intrachromosomal')]

# WLM
- SJST030043_D2: No known oncogenes (ARAP2, DTHD1, RNF212, TMED11P). FGFR3 and NSD2 amp on unknown amplicon.
- SJWLM031333_D1: chr20 (BCL2L1, ID1, PLAGL2, SRC)
- SJST031620_D1: MYCL, YBX1
- SJWLM031677_D1: ID2, MYCN, YWHAQ

In [None]:
# Display every WLM biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('WLM')

# GNT
- BS_9M49SGAQ: PLAG1 (5' partial; oncogene). CN 6.

In [None]:
# Display every GNT biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('GNT')

# LGG
- SJLGG040_D:  MYB (5' partial; oncogene), CT69 (lncRNA). CN 11.

In [None]:
# Display every LGG biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('LGG')

# CPG
- BS_PMFR8NPN: 2 ecDNA. amplicon1 MET; amplicon2 chr13 no known oncogenes. Progressive, partial resection. 

In [None]:
# Display every CPG biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('CPG')

In [None]:
def cpg():
    """Cross-tabulate amplicon class against extent of resection for CPG.

    Loads a fresh biosample table with ``import_biosamples()`` (the
    notebook-level ``biosamples`` variable is not used), keeps the rows whose
    ``cancer_type`` is ``'CPG'`` and whose ``in_unique_tumor_set`` mask entry
    is true, and counts them.

    Returns
    -------
    pandas.DataFrame
        Frequency table with one row per ``amplicon_class`` and one column per
        ``extent_of_tumor_resection`` value among the selected rows.
    """
    table = import_biosamples()
    keep = (table.cancer_type == 'CPG') & table.in_unique_tumor_set
    cpg_rows = table[keep]
    return pd.crosstab(cpg_rows.amplicon_class, cpg_rows.extent_of_tumor_resection)
# Display the amplicon_class x extent_of_tumor_resection counts for unique-set CPG tumors.
cpg()

# PNST


In [None]:
# Repeats the query from the earlier PNST section: PNST biosamples whose amplicon_class is 'ecDNA'.
get_ecDNA_biosamples('PNST')

# EPN
- BS_QMY84KF4: MYCN, CN 26
- BS_9GJHMA3J: No known oncogenes. CN 8. (some partial) CLYBL FGF14 FKSG29 GPR18 GPR183 ITGBL1 NALCN PCCA TM9SF2 UBAC2 ZIC2 ZIC5  
    Best guess oncogene NALCN
- SJEPD031010_D1: No known oncogenes. CN 3. LHX3, QSOX2, NACC2-CCDC187.  
    Best guess QSOX2

In [None]:
# Display every EPN biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('EPN')

In [None]:
# Subclass x amplicon-class counts for EPN, limited to rows flagged in_unique_tumor_set.
ecDNA_by_subgroup('EPN')

# CPT
- BS_YMYESCY7 (CPP): CN < 2. Looks low-quality. Probable false positive. Update 20241112: Reran AA, reclassified.
- BS_XEVMEYFS (CPC): CN 3-4. Low copy, no known oncogenes. Some developmental TFs.

In [None]:
# Display every CPT biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('CPT')

In [None]:
# Subclass x amplicon-class counts for CPT, limited to rows flagged in_unique_tumor_set.
ecDNA_by_subgroup('CPT')

# PINT
- BS_XJZQA3AP: CN 20. MYC.

In [None]:
# Display every PINT biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('PINT')

# NBL

MYCN-amp: BS_YC4RWPH3, BS_4NPEMNAD, BS_25VZM81R, SJNBL004_D, SJNBL008_D, SJNBL010_D, SJNBL012_D, SJNBL016_D, SJNBL017_D,
SJNBL019_D, SJNBL021_D, SJNBL023_D, SJNBL027_D, SJNBL030_D, SJNBL033_D, SJNBL044_D, SJNBL101_D, SJNBL117_D, SJNBL124_D,
SJNBL186_R, SJNBL189_R, SJNBL191_R, SJNBL192_R, SJNBL194_R, SJNBL195_R, SJNBL198_R, SJNBL030096_D1, SJNBL030107_D1,
SJNBL030177_D1, SJNBL030820_D1, SJNBL031239_D1, SJNBL031668_D1, SJNBL031668_D2, SJNBL032779_D1, SJNBL032876_D1
(35 total, 34 unique)

non-MYCN-amp:  
- SJNBL014_D (TERT, CN 7)  
- SJNBL196_R (STEAP1B, CN 9)  
- SJNBL197_R (no genes, CN 4)  
- SJNBL030810_D2 (TERT, CN 5)  
- SJNBL031802_D2 (KANSL1 partial, CN 8)
  

In [None]:
# NBL biosamples whose amplicon_class is 'ecDNA'. The helper does not filter on
# in_unique_tumor_set, so the printed length counts every matching row of the table.
df = get_ecDNA_biosamples('NBL')
print(len(df))
# Bare last expression so the notebook renders the full table below the count.
df

# RBL
- 2 MYCN amp (SJRB001_D, SJRB011_D)
- 2 RB1 fusion amp (SJRB039_D, SJRB030065_D1)
- SMARCA5 amp (SJRB030199_D1)
- ND (SJRB030188_D1). CN 72(!). No known oncogenes. CLRN2, DCAF16, FAM184B, HS3ST1, LAP3, LCORL, MED28, NCAPG, QDPR, SEL1L3, SNORA75B

In [None]:
# RBL biosamples whose amplicon_class is 'ecDNA'. The helper does not filter on
# in_unique_tumor_set, so the printed length counts every matching row of the table.
# Note that this rebinds the notebook-level name `df`.
df = get_ecDNA_biosamples('RBL')
print(len(df))
# Bare last expression so the notebook renders the full table below the count.
df

# Misc. sarcomas
- SJBT030809_D1 (HGCS): GALK2, FAM227B
- SJST030131_D3 (SCT): KIT, PDGFRA; APIP, CAT, CD44
- SJST032561_D1 (SYNS): low-copy (CN 3), no genes, short (48kbp)
- BS_X3KSK1AF (NOS): MYC, CN 14
- BS_AVMVWGX0 (NOS): No known oncogenes; CN 10; ARGLU1, DAOA, EFNB2, FAM155A, SLC10A2, SNORD31B. 
- BS_9E2FF9ZP (NOS): No genes; cycles file super unclear.

In [None]:
# The miscellaneous sarcomas are queried with cancer_type 'SARC'; shows the rows whose amplicon_class is 'ecDNA'.
get_ecDNA_biosamples('SARC')

# ETMR
BS_K07KNTFY, BS_TE8QFF7T, BS_69VS8PS1, BS_GGY4Q2C9
All C19MC

In [None]:
# Display every ETMR biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('ETMR')

# EMBT
- BS_VRDQTVHR: chr6 chromothriptic. No full oncogenes, SGK1 3'.
- BS_7M145V8M: chr6 chromothriptic. No oncogenes. Progression of above. Minimal shared amplicon includes HYMAI / MAP3K5 / MPC1 / PEX7 / PLAGL1 / RPS6KA2 / SF3B5 / SFT2D1 / ZC2HC1B.
- BS_PCMHH474: C19MC, TTYH1 5'.
- BS_3PATZ29V: chr12 chromothriptic. AGAP2 / BTG1 / CDK4 / DDIT3 / FGF6 / GLI1 / HMGA2 / IFNG / KRAS / LDHB / LRIG3 / MDM2 / YEATS4.
- BS_WMXKD55A: chr12 chromothriptic. Recurrence of above. AGAP2 / CDK4 / DDIT3 / FGF6 / GLI1 / HMGA2 / IFNG / KRAS / LDHB / LRIG3 / MDM2 / YEATS4.

In [None]:
# Display every EMBT biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('EMBT')

# OST
- 17p11.2-p12: SJOS001101_M5, SJOS001108_M1, SJOS001126_D1 (also chr19 including CCNE1), SJOS003_D, SJOS012_D, SJOS016_D, SJOS019_D, SJOS030101_D1, SJOS018_D (also MDM2, CCND3, FOXP4), SJOS031130_D1, SJOS031782_D1
- BIRC2/BIRC3/YAP1: SJOS001104_M1, SJOS030759_D2
- 12q13 (AGAP2/CDK4/GLI1/LRIG3): SJOS001109_D1 (also FOXP4, BOC, MLLT1, SEC62, EPCAM, FBXO11, IGF1R, MSH2, MSH6 on another ecDNA),
  SJOS001110_D1, SJOS002_D, SJOS001115_D1, SJOS031125_D1 (also FOXP4)
- 6p21 (FOXP4): SJOS001109_D1, SJOS030272_D1 (17p11.2 classified as 'unknown'), SJOS031125_D1 (also 12q13), SJOS018_D (also 17p11.2)
- MYC: SJOS015_D, SJOS030605_D1
- CCNE1: SJOS001126_D1 (also including 17p11.2), SJOS016_D, SJOS030422_D1, SJOS030861_D3, SJOS032521_D1 (also RAD21 on CNC)
- SJOS001105_D1 (most of amplicon3 is categorized as 'unknown', including putative oncogenes ADGRF1, CCND3, CUL7, DNPH1, ERBB3, LRIG3, NACA, PA2G4, POLH, PTK7, TFEB; but a small segment is 'ecDNA')
- SJOS001111_M1 (amplicon3 including MYC is 'unknown' but amplicon4 is ecDNA with a small fragment of ecDNA (CNBD1, CNGB3))
- SJOS001120_D1 (no genes on ecDNA, but oncogenes on other amplicons)
- SJOS001128_D1 (amplicon6 is mostly 'unknown' including IGF1R, but a small fragment is ecDNA+ NR2F2)
- SJOS004_D (no oncogenes on ecDNA but some on 'unknown')
- SJOS006_D (no oncogenes. LIPH, SENP2, TMEM41A)
- SJOS014_D (RAD21)
- SJOS030645_D1/2 (E2F3, HLA-B; 17p11.2 classed as 'unknown')
- SJOS031478_D2 (a bit of highly repetitive chr5 incl. PCDHB1-19, PCDHA1-12, PCDHC3-5)
- SJST031623_D1 (no ecDNA genes, but lots on 'unknown')
- SJST032337_D1 (PIM2)
- SJOS032468_D1 (no ecDNA genes, but 17p11.2 on CNC)

In [None]:
# Display every OST biosample whose amplicon_class is 'ecDNA' (not restricted to the unique-tumor set).
get_ecDNA_biosamples('OST')

In [None]:
# Index labels of the OST biosamples whose amplicon_class is 'ecDNA'.
# NOTE(review): assumes the biosamples index holds the same identifiers as
# genes.sample_name -- confirm against the loaders.
osts = get_ecDNA_biosamples('OST').index
# Rows of the gene table for those samples where the gene is FOXP4.
genes[genes.sample_name.isin(osts) & (genes.gene == 'FOXP4')]

In [None]:
# Peek at the first rows of the gene table to check its columns.
genes.head()