In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('../src')
from data_imports import *
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Load the tables used throughout this notebook. The import_* helpers are not
# defined in this notebook; presumably they come from `from data_imports import *`.
patients = import_patients()
biosamples = import_biosamples()
amplicons = import_amplicons()
genes = import_genes()
# Disabled filter: uncomment to keep only biosamples flagged `in_unique_tumor_set`.
#biosamples = biosamples[biosamples.in_unique_tumor_set]

In [None]:
def get_ecDNA_biosamples(tumor_type):
    """Select ecDNA-classified biosamples of one tumor type.

    Reads the notebook-level `biosamples` table (it is not passed in).

    Args:
        tumor_type: Value compared for equality against the `cancer_type` column.

    Returns:
        The rows of `biosamples` whose `cancer_type` equals `tumor_type` and
        whose `amplicon_class` equals 'ecDNA'.
    """
    matches_tumor_type = biosamples['cancer_type'] == tumor_type
    is_ecdna = biosamples['amplicon_class'] == 'ecDNA'
    return biosamples.loc[matches_tumor_type & is_ecdna]

# PNST


In [None]:
get_ecDNA_biosamples('PNST')

# EPN
- BS_QMY84KF4: MYCN, CN 26
- BS_9GJHMA3J: No known oncogenes. CN 8. (some partial) CLYBL FGF14 FKSG29 GPR18 GPR183 ITGBL1 NALCN PCCA TM9SF2 UBAC2 ZIC2 ZIC5  
    Best guess oncogene NALCN
- SJEPD031010_D1: No known oncogenes. CN 3. LHX3, QSOX2, NACC2-CCDC187.  
    Best guess QSOX2

In [None]:
get_ecDNA_biosamples('EPN')

# CPT
- BS_YMYESCY7 (CPP): CN < 2. Looks low-quality. Probable false positive.
- BS_XEVMEYFS (CPC): CN 3-4. Low copy, no known oncogenes. Some developmental TFs.

In [None]:
get_ecDNA_biosamples('CPT')

# PINT
- BS_XJZQA3AP: CN 20. MYC.

In [None]:
get_ecDNA_biosamples('PINT')

# NBL

MYCN-amp: BS_YC4RWPH3, BS_4NPEMNAD, BS_25VZM81R, SJNBL004_D, SJNBL008_D, SJNBL010_D, SJNBL012_D, SJNBL016_D, SJNBL017_D,
SJNBL019_D, SJNBL021_D, SJNBL023_D, SJNBL027_D, SJNBL030_D, SJNBL033_D, SJNBL044_D, SJNBL101_D, SJNBL117_D, SJNBL124_D,
SJNBL186_R, SJNBL189_R, SJNBL191_R, SJNBL192_R, SJNBL194_R, SJNBL195_R, SJNBL198_R, SJNBL030096_D1, SJNBL030107_D1,
SJNBL030177_D1, SJNBL030820_D1, SJNBL031239_D1, SJNBL031668_D1, SJNBL031668_D2, SJNBL032779_D1, SJNBL032876_D1
(35 total, 34 unique)

non-MYCN-amp:  
- SJNBL014_D (TERT, CN 7)  
- SJNBL196_R (STEAP1B, CN 9)  
- SJNBL197_R (no genes, CN 4)  
- SJNBL030810_D2 (TERT, CN 5)  
- SJNBL031802_D2 (KANSL1 partial, CN 8)
  

In [None]:
# Print the number of ecDNA-classified NBL biosamples, then display the rows.
df = get_ecDNA_biosamples('NBL')
print(len(df))
df

# RBL
No new ecDNA+ tumors from previous draft.  
TODO: add sample names.  
- 2 MYCN amp (SJRB001_D, SJRB011_D)
- 2 RB1 fusion amp (SJRB039_D, SJRB030065_D1)
- SMARCA5 amp (SJRB030199_D1)
- ND (SJRB030188_D1). CN 72(!). No known oncogenes. CLRN2, DCAF16, FAM184B, HS3ST1, LAP3, LCORL, MED28, NCAPG, QDPR, SEL1L3, SNORA75B

In [None]:
# Print the number of ecDNA-classified RBL biosamples, then display the rows.
# Note: this rebinds `df`, replacing the NBL selection from the earlier cell.
df = get_ecDNA_biosamples('RBL')
print(len(df))
df

# Misc. sarcomas
- SJBT030809_D1 (HGCS): GALK2, FAM227B
- SJST030131_D3 (SCT): KIT, PDGFRA; APIP, CAT, CD44
- SJST032561_D1 (SYNS): low-copy (CN 3), no genes, short (48kbp)
- BS_X3KSK1AF (NOS): MYC, CN 14
- BS_AVMVWGX0 (NOS): No known oncogenes; CN 10; ARGLU1, DAOA, EFNB2, FAM155A, SLC10A2, SNORD31B. 
- BS_9E2FF9ZP (NOS): No genes; cycles file super unclear.

In [None]:
get_ecDNA_biosamples('SARC')

# ETMR

In [None]:
get_ecDNA_biosamples('ETMR')

# OST
- 17p11.2-p12: SJOS001101_M5, SJOS001108_M1, SJOS001126_D1 (also chr19 including CCNE1), SJOS003_D, SJOS012_D, SJOS016_D, SJOS019_D, SJOS030101_D1, SJOS018_D (also MDM2, CCND3, FOXP4), SJOS031130_D1, SJOS031782_D1
- BIRC2/BIRC3/YAP1: SJOS001104_M1, SJOS030759_D2
- 12q13 (AGAP2/CDK4/GLI1/LRIG3): SJOS001109_D1 (also FOXP4, BOC, MLLT1, SEC62, EPCAM, FBXO11, IGF1R, MSH2, MSH6 on another ecDNA),
  SJOS001110_D1, SJOS002_D, SJOS001115_D1, SJOS031125_D1 (also FOXP4)
- 6p21 (FOXP4): SJOS001109_D1, SJOS030272_D1 (17p11.2 classified as 'unknown'), SJOS031125_D1 (also 12q13), SJOS018_D (also 17p11.2)
- MYC: SJOS015_D, SJOS030605_D1
- CCNE1: SJOS001126_D1 (also including 17p11.2), SJOS016_D, SJOS030422_D1, SJOS030861_D3, SJOS032521_D1 (also RAD21 on CNC)
- SJOS001105_D1 (most of amplicon3 is categorized as 'unknown', including putative oncogenes ADGRF1, CCND3, CUL7, DNPH1, ERBB3, LRIG3, NACA, PA2G4, POLH, PTK7, TFEB; but a small segment is 'ecDNA')
- SJOS001111_M1 (amplicon3 including MYC is 'unknown', but amplicon4 is ecDNA with a small fragment of ecDNA (CNBD1, CNGB3))
- SJOS001120_D1 (no genes on ecDNA, but oncogenes on other amplicons)
- SJOS001128_D1 (amplicon6 is mostly 'unknown' including IGF1R, but a small fragment is ecDNA+ NR2F2)
- SJOS004_D (no oncogenes on ecDNA but some on 'unknown')
- SJOS006_D (no oncogenes. LIPH, SENP2, TMEM41A)
- SJOS014_D (RAD21)
- SJOS030645_D1/2 (E2F3, HLA-B; 17p11.2 classed as 'unknown')
- SJOS031478_D2 (a bit of highly repetitive chr5 incl. PCDHB1-19, PCDHA1-12, PCDHC3-5)
- SJST031623_D1 (no ecDNA genes, but lots on 'unknown')
- SJST032337_D1 (PIM2)
- SJOS032468_D1 (no ecDNA genes, but 17p11.2 on CNC)

In [None]:
get_ecDNA_biosamples('OST')

In [None]:
# Index labels of the ecDNA-classified OST biosamples.
# NOTE(review): assumes the `biosamples` index holds the same identifiers as
# `genes.sample_name` — confirm against import_biosamples / import_genes.
osts = get_ecDNA_biosamples('OST').index
# Rows of `genes` for those samples where the gene is FOXP4.
genes[genes.sample_name.isin(osts) & (genes.gene == 'FOXP4')]

In [None]:
genes.head()