# Create GCT

In [7]:
import pandas as pd
import os
import sys

In [8]:
# Absolute path of the project data directory. It is appended to sys.path, which
# makes Python modules stored there importable.
# NOTE(review): no import from this directory is visible in the cells shown here —
# confirm the sys.path entry is still needed.
data='/mnt/c/Users/ochapman/Documents/Mesirov/pedpancan_ecdna/data'
sys.path.append(data)

In [9]:
# source https://medium.com/intothegenomics/annotate-genes-and-genomic-coordinates-using-python-9259efa6ffc2
def gene_info(x):
    """Extract gene name, gene type and annotation level from a GFF3 attribute string.

    Parameters
    ----------
    x : str
        Semicolon-separated ``key=value`` pairs, e.g.
        ``"ID=...;gene_type=protein_coding;gene_name=ABC1;level=2"``.

    Returns
    -------
    tuple
        ``(gene_name, gene_type, level)`` with ``level`` converted to ``int``.

    Raises
    ------
    KeyError
        If ``gene_name``, ``gene_type`` or ``level`` is absent from ``x``.
    ValueError
        If the ``level`` value is not an integer.
    """
    fields = {}
    for item in x.split(";"):
        key, sep, value = item.partition("=")
        if sep:
            # Keys are matched exactly (not by substring) and the first
            # occurrence of a key wins; partition keeps any '=' inside the value.
            fields.setdefault(key.strip(), value)
    return (fields["gene_name"], fields["gene_type"], int(fields["level"]))

def get_protein_coding_genes(path='/mnt/c/Users/ochapman/Documents/circos/genes/gencode.v33.basic.annotation.gff3'):
    """Return one row per protein-coding gene from a GENCODE GFF3 annotation.

    Parameters
    ----------
    path : str, optional
        Location of the GFF3 file. Defaults to the path originally hard-coded
        in this notebook.

    Returns
    -------
    pandas.DataFrame
        Columns ``seqname``, ``start``, ``end``, ``attribute``, ``gene_name``,
        ``gene_type`` and ``gene_level``. Only ``feature == "gene"`` rows whose
        ``gene_type`` is ``protein_coding`` are kept. When a gene name occurs
        more than once, the row that sorts first by (``gene_level``,
        ``seqname``) is retained.
    """
    gff_columns = ['seqname', 'source', 'feature', 'start', 'end',
                   'score', 'strand', 'frame', 'attribute']
    gencode = pd.read_table(path, comment="#", sep="\t", names=gff_columns)

    genes = gencode.loc[gencode.feature == "gene",
                        ['seqname', 'start', 'end', 'attribute']].reset_index(drop=True)

    # Building a frame from the parsed tuples (rather than unpacking zip(*...))
    # also works when the file contains no gene rows.
    parsed = pd.DataFrame(
        [gene_info(attr) for attr in genes.attribute],
        columns=['gene_name', 'gene_type', 'gene_level'],
        index=genes.index,
    )
    genes = pd.concat([genes, parsed], axis=1)

    genes = genes[genes['gene_type'] == 'protein_coding']
    genes = (
        genes.sort_values(['gene_level', 'seqname'], ascending=True)
        .drop_duplicates('gene_name', keep='first')
        .reset_index(drop=True)
    )
    return genes
# Example usage
#gene_ids = get_protein_coding_genes()
#print("Protein-coding gene IDs:", gene_ids)

In [10]:
def write_gct_from_df(df,outfile):
    """Write ``df`` to ``outfile`` as tab-separated text preceded by a ``#1.2`` header.

    The file starts with the line ``#1.2`` and a line holding the row count and
    the column count minus one, followed by the frame itself (index included).
    ``outfile`` is overwritten if it exists.
    """
    n_rows, n_cols = df.shape
    # One column of the frame is not counted in the header line (DESCRIPTION in
    # the frames produced by gct_from_tsv).
    n_data_cols = n_cols - 1
    preamble = '#1.2\n' + f'{n_rows}\t{n_data_cols}\n'
    with open(outfile, 'w') as handle:
        handle.write(preamble)
    # Append the table below the two header lines.
    df.to_csv(outfile, mode='a', sep='\t')
    
def gct_from_tsv(path, min_total_expression=100):
    """Build a GCT-style expression table from a tab-separated expression matrix.

    Steps, in order:
      1. read the matrix (first column becomes the row index);
      2. order rows from highest to lowest total expression;
      3. derive ``NAME`` as the part of each row label after the first ``_``
         and keep only the first (highest-expressed) row per ``NAME``;
      4. drop rows whose total expression is not above ``min_total_expression``;
      5. keep only names returned by ``get_protein_coding_genes()``;
      6. store the original row label in ``DESCRIPTION`` and index by ``NAME``.

    Parameters
    ----------
    path : str
        Tab-separated matrix with one row per gene and one column per sample.
    min_total_expression : float, optional
        Rows must have a row sum strictly greater than this value to be kept.
        Defaults to 100, the value originally hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``NAME``; first column ``DESCRIPTION``, then the sample columns.
    """
    print('reading gene expression matrix...')
    gex = pd.read_csv(path,sep='\t',index_col=0)

    print('sorting by expression...')
    rowsums = gex.sum(axis=1, numeric_only=True)
    descending = rowsums.to_numpy().argsort()[::-1]
    gex = gex.iloc[descending]

    # drop duplicates: rows are already ordered by expression, so the
    # highest-expressed entry of each gene name is the one that survives
    gex.insert(0, 'NAME', gex.index.map(lambda gene_id: gene_id.partition('_')[2]))
    n_before = len(gex)
    gex = gex.drop_duplicates(subset=['NAME'])
    print(f'Dropped {n_before-len(gex)} duplicate gene entries...')

    # drop nonexpressed genes
    # numeric_only=True keeps the string NAME column out of the row sums;
    # without it recent pandas versions fail on the mixed str/float row.
    n_before = len(gex)
    gex = gex[gex.sum(axis=1, numeric_only=True) > min_total_expression]
    print(f'Dropped {n_before-len(gex)} nonexpressed genes...')

    # only protein-coding genes
    print('subset only protein-coding genes...')
    gene_ids = get_protein_coding_genes()
    n_before = len(gex)
    gex = gex[gex.NAME.isin(gene_ids.gene_name.values)]
    print(f'Dropped {n_before-len(gex)} non-protein-coding genes...')

    # set gene names to index
    gex = gex.copy()
    gex.insert(0,'DESCRIPTION',gex.index)
    gex = gex.set_index('NAME')
    return gex

In [11]:
# Build the GCT-style expression table from the CBTN TPM matrix and preview
# its first rows.
path='/mnt/c/Users/ochapman/Documents/Mesirov/pedpancan_ecdna/data/gex/CBTN-gex.tpm.tsv'
gex = gct_from_tsv(path)
gex.head()

reading gene expression matrix...
sorting by expression...
Dropped 1216 duplicate gene entries...


  gex = gex[gex.sum(axis=1) > 100]


Dropped 29300 nonexpressed genes...
subset only protein-coding genes...
Dropped 13730 non-protein-coding genes...


Unnamed: 0_level_0,DESCRIPTION,7316-235,7316-1953,7316-4032,7316-216,7316-2566,7316-637,7316-7541,7316UP-1104,7316-9815,...,7316-393,7316-2720,7316-6884,7316-2899,7316-3204,7316-612,7316-5277,7316-3768,7316-9066.6,7316-1955
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MT-ATP8,ENSG00000228253.1_MT-ATP8,2716.02,3275.83,15625.2,1421.59,1326.61,8632.86,22770.45,772.03,32027.25,...,1497.96,4120.38,63463.09,1551.12,1559.29,1861.08,39262.79,13591.82,17889.55,1230.14
MT-ND4L,ENSG00000212907.2_MT-ND4L,801.39,2278.33,9205.29,507.85,464.87,4686.8,13914.41,764.67,24669.84,...,830.92,3401.61,64374.74,596.3,667.76,746.16,27760.11,5440.01,13765.19,177.25
MT-CO1,ENSG00000198804.2_MT-CO1,2520.55,5609.3,6859.23,1415.1,1640.34,3750.15,9140.8,1977.47,15204.08,...,2812.45,13262.54,15432.33,1239.22,1402.13,1643.32,12802.08,13084.47,2440.27,1262.23
MT-ND3,ENSG00000198840.2_MT-ND3,1004.62,2023.32,7308.71,608.32,707.26,2614.15,7735.32,613.88,12914.83,...,866.3,3112.91,48507.84,729.49,699.2,821.11,18601.98,5000.52,6234.32,192.67
MT-CO3,ENSG00000198938.2_MT-CO3,1924.4,3669.34,8384.68,1209.34,1568.75,3960.95,8618.47,490.25,13683.26,...,2095.27,7914.33,27804.84,1129.44,1090.15,1433.76,13312.4,10692.34,7380.77,949.65


In [33]:
# Write the table to a GCT file; the relative name resolves against the
# current working directory.
# NOTE(review): write_gct_from_df subtracts one column from the header count,
# so gex must still contain DESCRIPTION here. The recorded execution counts
# (this cell In [33], the cell that pops DESCRIPTION In [12]) suggest it may
# have been run after the pop — confirm the header of the written file.
outfile='CBTN-gex.gct'
write_gct_from_df(gex,outfile)

In [12]:
# Remove DESCRIPTION from gex in place (the popped column is discarded), then
# save the remaining table as a tab-separated file.
# NOTE(review): not idempotent — running this cell a second time raises
# KeyError because the column is already gone.
gex.pop('DESCRIPTION')
path = '/mnt/c/Users/ochapman/Documents/Mesirov/pedpancan_ecdna/data/gex/CBTN-gex-protein-coding.tpm.tsv'
gex.to_csv(path,sep='\t')

# Clustering
(Matrix too large)

In [51]:
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.stats import kendalltau
from matplotlib import pyplot as plt

In [44]:
#df = gex.pop('DESCRIPTION')

In [None]:
# Hierarchical clustering of the rows of gex using a Kendall-tau distance.
# The original attempt did not work. Two defects are fixed here:
#   * argsort() returns the permutation that sorts a column, not the rank of
#     each value; rank() gives the intended ordinal transform.
#   * kendalltau() returns a (statistic, p-value) result, whereas linkage needs
#     a metric that returns one scalar distance per pair of rows.
# NOTE(review): a Python-callable metric is evaluated once per pair of rows, so
# the cost grows quadratically with the number of rows; this is presumably
# still impractical on the full matrix — try it on a subset first.
# NOTE(review): kendalltau yields NaN for a constant row, which linkage rejects.
def kendall_distance(u, v):
    """Return 1 - Kendall's tau for two equally long vectors."""
    tau = kendalltau(u, v)[0]
    return 1.0 - tau

ordinal_gex = gex.rank(axis=0)
Z = linkage(ordinal_gex,method='weighted',metric=kendall_distance)
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(Z)

# Metadata

In [2]:
import pandas as pd
import os
import sys

# Project data directory, appended to sys.path.
# NOTE(review): this repeats the setup cell at the top of the notebook; every
# run appends another copy of the same entry to sys.path.
data='/mnt/c/Users/ochapman/Documents/Mesirov/pedpancan_ecdna/data'
sys.path.append(data)

In [79]:
def get_metadata(path='/mnt/c/Users/ochapman/Documents/Mesirov/pedpancan_ecdna/data/metadata/histologies.tsv'):
    """Load the master histologies table with every column read as text.

    NOTE: this notebook defines ``get_metadata`` again further down with a
    different default file; whichever definition ran last is the one in effect.

    Parameters
    ----------
    path : str, optional
        Tab-separated metadata file. Defaults to the master histologies table
        originally hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, all columns loaded with ``dtype='str'``.
    """
    return pd.read_csv(path, sep='\t', dtype='str')
def get_gex(path='/mnt/c/Users/ochapman/Documents/Mesirov/pedpancan_ecdna/data/gex/CBTN-gex-protein-coding.tpm.tsv'):
    """Load the protein-coding expression matrix saved earlier in this notebook.

    Parameters
    ----------
    path : str, optional
        Tab-separated matrix whose first column holds the row labels. Defaults
        to the file originally hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Indexed by the first column of the file; remaining columns as read.
    """
    return pd.read_csv(path, sep='\t', index_col=0)

In [80]:
# Load the metadata table and the expression matrix into notebook-level names.
# Note that gex is rebound here to the matrix read back from disk.
meta = get_metadata()
gex = get_gex()

In [81]:
# Column labels of the expression matrix (one per sample); preprocess_metadata
# reads this notebook-level name when filtering the metadata.
samples = gex.columns
samples

Index(['7316-235', '7316-1953', '7316-4032', '7316-216', '7316-2566',
       '7316-637', '7316-7541', '7316UP-1104', '7316-9815', '7316-3202',
       ...
       '7316-393', '7316-2720', '7316-6884', '7316-2899', '7316-3204',
       '7316-612', '7316-5277', '7316-3768', '7316-9066.6', '7316-1955'],
      dtype='object', length=1922)

In [88]:
def preprocess_metadata(meta, sample_ids=None):
    """Collapse the histologies table to one tumor row per sample.

    Parameters
    ----------
    meta : pandas.DataFrame
        Must contain ``sample_id``, ``experimental_strategy``, ``sample_type``,
        ``Kids_First_Participant_ID`` and ``Kids_First_Biospecimen_ID``.
    sample_ids : iterable of str, optional
        Sample IDs to keep. When omitted, the notebook-level ``samples`` is
        used, which matches the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``sample_id``, one row per sample. Only rows with
        ``sample_type == 'Tumor'`` and a participant ID starting with ``PT_``
        are considered. ``Kids_First_Biospecimen_ID`` holds every biospecimen
        ID of the sample joined with ``;``; the other columns come from the
        sample's first row after sorting by ``experimental_strategy``.
    """
    if sample_ids is None:
        # Backward-compatible default: the global defined from gex.columns.
        sample_ids = samples

    # Strategies listed in the original note: 'RNA-Seq', 'WGS', 'Methylation',
    # 'WXS', 'Targeted Sequencing'.
    # NOTE(review): the sort is alphabetical, so when a sample has several
    # strategies the retained row is not necessarily the RNA-Seq one — confirm
    # this is intended for an expression-matched table.
    ordered = meta.sort_values(['sample_id', 'experimental_strategy'])

    is_tumor = ordered.sample_type == 'Tumor'
    # na=False drops rows with a missing participant ID instead of raising.
    is_patient = ordered.Kids_First_Participant_ID.str.startswith('PT_', na=False)
    tumors = ordered[is_tumor & is_patient]

    # All biospecimen IDs per sample, skipping missing values.
    biospecimen_ids = (
        tumors.groupby('sample_id')['Kids_First_Biospecimen_ID']
        .agg(lambda ids: ';'.join(ids.dropna()))
        .to_frame()
    )

    per_sample = (
        tumors.drop_duplicates('sample_id', keep='first')
        .set_index('sample_id')
        .drop(columns='Kids_First_Biospecimen_ID')
    )
    merged = per_sample.merge(biospecimen_ids, left_index=True, right_index=True)
    return merged[merged.index.isin(sample_ids)]
# Per-sample tumor metadata restricted to the samples in the expression matrix.
submeta=preprocess_metadata(meta)

In [90]:
# Save the filtered metadata as a tab-separated file; the sample_id index is
# written as the first column.
path = '/mnt/c/Users/ochapman/Documents/Mesirov/pedpancan_ecdna/data/gex/CBTN-gex-histologies.tsv'
submeta.to_csv(path,sep='\t')

In [87]:
# Expression-matrix samples that have no row in the filtered metadata.
samples[~samples.isin(submeta.index)]

Index(['7316-2189.1', '7316-14.1', '7316-9066.1', '7316-8053.1', '7316-161.1',
       '7316-6477.1', '7316-8051.1', '7316-4509.1', '7316-8053.2',
       '7316-2186.1',
       ...
       '7316-1893.1', '7316-9062.2', '7316-278.1', '7316-7955.1',
       '7316-3776.2', '7316-8023.2', '7316-2582.1', '7316-1455.1',
       '7316-2176.2', '7316-9066.6'],
      dtype='object', length=159)

# Load data

In [2]:
import pandas as pd

In [1]:
def get_gex(path='/mnt/c/Users/ochapman/Documents/Mesirov/pedpancan_ecdna/data/gex/CBTN-gex-protein-coding.tpm.tsv'):
    """Load the protein-coding expression matrix.

    Parameters
    ----------
    path : str, optional
        Tab-separated matrix whose first column holds the row labels. Defaults
        to the file originally hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Indexed by the first column of the file; remaining columns as read.
    """
    return pd.read_csv(path, sep='\t', index_col=0)
def get_metadata(path='/mnt/c/Users/ochapman/Documents/Mesirov/pedpancan_ecdna/data/gex/CBTN-gex-histologies.tsv'):
    """Load the expression-matched histologies table with every column read as text.

    NOTE: an earlier cell of this notebook defines ``get_metadata`` with a
    different default file (the master histologies table); this definition
    replaces it once the cell has run.

    Parameters
    ----------
    path : str, optional
        Tab-separated metadata file. Defaults to the ``CBTN-gex-histologies.tsv``
        path originally hard-coded here.

    Returns
    -------
    pandas.DataFrame
        All columns loaded with ``dtype='str'``. No index column is set, so
        ``sample_id`` is an ordinary column.
    """
    return pd.read_csv(path, sep='\t', dtype='str')

In [3]:
# Load the expression matrix and display it (the rendered output is truncated).
gex = get_gex()
gex

Unnamed: 0_level_0,7316-235,7316-1953,7316-4032,7316-216,7316-2566,7316-637,7316-7541,7316UP-1104,7316-9815,7316-3202,...,7316-393,7316-2720,7316-6884,7316-2899,7316-3204,7316-612,7316-5277,7316-3768,7316-9066.6,7316-1955
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MT-ATP8,2716.02,3275.83,15625.20,1421.59,1326.61,8632.86,22770.45,772.03,32027.25,1657.10,...,1497.96,4120.38,63463.09,1551.12,1559.29,1861.08,39262.79,13591.82,17889.55,1230.14
MT-ND4L,801.39,2278.33,9205.29,507.85,464.87,4686.80,13914.41,764.67,24669.84,793.74,...,830.92,3401.61,64374.74,596.30,667.76,746.16,27760.11,5440.01,13765.19,177.25
MT-CO1,2520.55,5609.30,6859.23,1415.10,1640.34,3750.15,9140.80,1977.47,15204.08,2112.39,...,2812.45,13262.54,15432.33,1239.22,1402.13,1643.32,12802.08,13084.47,2440.27,1262.23
MT-ND3,1004.62,2023.32,7308.71,608.32,707.26,2614.15,7735.32,613.88,12914.83,545.43,...,866.30,3112.91,48507.84,729.49,699.20,821.11,18601.98,5000.52,6234.32,192.67
MT-CO3,1924.40,3669.34,8384.68,1209.34,1568.75,3960.95,8618.47,490.25,13683.26,1816.44,...,2095.27,7914.33,27804.84,1129.44,1090.15,1433.76,13312.40,10692.34,7380.77,949.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TP53TG3F,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.02,0.00,...,0.00,0.00,0.59,0.00,0.00,0.00,0.00,0.00,0.00,0.00
IZUMO1R,0.04,0.00,0.00,0.01,0.00,0.00,0.00,0.00,0.01,0.01,...,0.00,0.00,0.00,0.03,0.00,0.00,0.00,0.01,0.00,0.00
LIPK,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
ASB15,0.00,0.00,0.00,0.05,0.00,0.00,0.00,0.00,0.01,0.05,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [4]:
# Load the expression-matched metadata and display it (the rendered output is truncated).
meta = get_metadata()
meta

Unnamed: 0,sample_id,aliquot_id,Kids_First_Participant_ID,experimental_strategy,sample_type,composition,tumor_descriptor,primary_site,reported_gender,race,...,dkfz_v12_methylation_mgmt_estimated,molecular_subtype,integrated_diagnosis,Notes,harmonized_diagnosis,molecular_subtype_methyl,broad_histology,short_histology,cancer_group,Kids_First_Biospecimen_ID
0,7316-10,1114560,PT_K8ZV7APT,Methylation,Tumor,Solid Tissue,Initial CNS Tumor,Spinal Cord- Lumbar/Thecal Sac,Female,Black or African American,...,0.00278215388511586,,,,Neurofibroma/Plexiform,,Tumor of cranial and paraspinal nerves,Neurofibroma,Neurofibroma/Plexiform,BS_458X47PJ;BS_GDHH6T5A;BS_1RFBH1SP
1,7316-100,1112049,PT_6TZR2DH1,Methylation,Tumor,Solid Tissue,Initial CNS Tumor,Suprasellar/Hypothalamic/Pituitary,Female,Black or African American,...,0.00398842742564261,"CRANIO, ADAM",Adamantinomatous craniopharyngioma,Updated via OpenPedCan subtyping,Adamantinomatous craniopharyngioma,,Tumors of sellar region,Craniopharyngioma,Adamantinomatous Craniopharyngioma,BS_V4PBW4WH;BS_BHR08WGW;BS_SFZ3A07S
2,7316-101,1112551,PT_CWD717Q0,Methylation,Tumor,Solid Tissue,Initial CNS Tumor,Temporal Lobe,Male,Black or African American,...,0.02092587213451,"GNG, other MAPK","Ganglioglioma, other MAPK",Updated via OpenPedCan subtyping,"Ganglioglioma, other MAPK","GNT, MAPK",Low-grade astrocytic tumor,Ganglioglioma,Ganglioglioma,BS_Q37DRN94;BS_QV51J756;BS_4RS1SC48
3,7316-1017,1251886,PT_0DWRY9ZX,Methylation,Tumor,Solid Tissue,Recurrence,Frontal Lobe,Male,White,...,0.0220976046770112,"MB, Group4","Medulloblastoma, group 4",Subtype based on prediction;Updated via OpenPe...,"Medulloblastoma, group 4","MB, Group4",Embryonal tumor,Medulloblastoma,Medulloblastoma,BS_24KYA5PH;BS_FB6ADVMD;BS_SD4SNCCX;BS_ZGSTZCYN
4,7316-1038,1116046,PT_STRDJQ01,Methylation,Tumor,Solid Tissue,Initial CNS Tumor,Cerebellum/Posterior Fossa,Female,Reported Unknown,...,0.00977657395132449,"MB, SHH","Medulloblastoma, SHH-activated",Subtype based on prediction;Updated via OpenPe...,"Medulloblastoma, SHH-activated","MB, SHH",Embryonal tumor,Medulloblastoma,Medulloblastoma,BS_Y0CPEZ59;BS_VPBMDMQX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1758,7316-9975,1277236,PT_GFERC3PT,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Skull,Female,White,...,,,,,Langerhans Cell histiocytosis,,Histiocytic tumor,Langerhans cell histiocytosis,Langerhans Cell histiocytosis,BS_4RAHHT12;BS_4PFG3PX6
1759,7316UP-1104,1105139,PT_X5SGG610,RNA-Seq,Tumor,Solid Tissue,Primary Tumor,Cerebrum,Female,White/Caucasian,...,,"HGG, To be classified",,,Astrocytoma,,Diffuse astrocytic and oligodendroglial tumor,HGAT,Astrocytoma,BS_8E8JKP01
1760,7316UP-2035,1141315,PT_6RZ363TJ,RNA-Seq,Tumor,Solid Tissue,Primary Tumor,Cerebrum,Female,White/Caucasian,...,,"HGG, To be classified",,,Glioblastoma,,Diffuse astrocytic and oligodendroglial tumor,HGAT,Glioblastoma,BS_0C00AYTE
1761,7316UP-310,1105138,PT_DB281Z4K,RNA-Seq,Tumor,Solid Tissue,Primary Tumor,Cerebrum,Female,White/Caucasian,...,,"HGG, To be classified",,,Glioblastoma,,Diffuse astrocytic and oligodendroglial tumor,HGAT,Glioblastoma,BS_0XHT9W4Q
