# Create inputs for differential expression (DE)
See next: cbtn-edger.ipynb

In [1]:
import pandas as pd
import numpy as np
#import seaborn as sns
#import matplotlib.pyplot as plt
#import scipy.stats
import sys
from pathlib import Path
from collections import OrderedDict

# Put ../src on the module search path so the project-local import below
# (data_imports) can be resolved — presumably that is where it lives; verify.
sys.path.append('../src')
# Make sure the relative "out" directory exists before any cell writes to it.
Path("out").mkdir(parents=True, exist_ok=True)

import data_imports

In [2]:
def clean_cbtn_rnaseq(df):
    """Reduce an expression matrix to uniquely named genes and cohort samples.

    Steps, in order:
      1. Strip everything up to and including the first underscore from each
         row label (assumed to be an Ensembl-ID prefix, e.g.
         ``ENSG..._SYMBOL`` — TODO confirm against the input file).
      2. Drop rows whose remaining name starts with ``ENSG``, ``PAR_Y`` or
         ``LINC`` (entries without a HUGO name or with non-unique names).
      3. Drop every row whose name occurs more than once (all copies go).
      4. Keep only columns listed in ``external_sample_id`` of
         ``data_imports.import_biosamples()``.

    The input frame is left untouched; a new frame is returned.

    NOTE: the two C19MC entries (ENSG00000269842, ENSG00000269564) were
    meant to be rescued from step 2 at some point, but that rescue is not
    implemented, so they are dropped with the other ENSG-named rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Genes in rows (string labels), samples in columns.

    Returns
    -------
    pandas.DataFrame
        Filtered matrix indexed by the stripped gene names.
    """
    # Labels without any underscore collapse to the empty string here.
    gene_names = df.index.map(lambda x: '_'.join(x.split('_')[1:]))

    unwanted = (
        gene_names.str.startswith('ENSG')
        | gene_names.str.startswith('PAR_Y')
        | gene_names.str.startswith('LINC')
    )
    keep = ~unwanted

    # Filter first, then relabel the *new* frame: assigning to df.index
    # directly would silently rename the rows of the caller's object.
    cleaned = df.loc[keep]
    cleaned.index = gene_names[keep]

    # keep=False flags every member of a duplicated group, not just repeats.
    cleaned = cleaned[~cleaned.index.duplicated(keep=False)]

    # Samples in the cohort
    samples = data_imports.import_biosamples().external_sample_id
    return cleaned.loc[:, cleaned.columns.isin(samples)]

def import_cbtn_rnaseq(path='/Users/ochapman/Library/CloudStorage/OneDrive-SanfordBurnhamPrebysMedicalDiscoveryInstitute/projects/2023-pedpancan/data/gex/pbta-rsem-genes.expected_count.tsv'):
    """Read the RSEM expected-count table and run it through the cleaner.

    This is deliberately a different file from the one used in cbtn-gsea:
    GSEA requires normalized values (TPM), whereas DESeq2 requires raw
    counts.

    Parameters
    ----------
    path : str
        Tab-separated file whose first column holds the row labels.

    Returns
    -------
    pandas.DataFrame
        Whatever ``clean_cbtn_rnaseq`` returns for the loaded table.
    """
    raw_counts = pd.read_csv(path, sep='\t', index_col=0)
    return clean_cbtn_rnaseq(raw_counts)

def import_biosamples():
    """Load the biosample table and prepare it for the DE phenotype file.

    Processing applied to ``data_imports.import_biosamples()``:
      * keep only rows where ``in_unique_tumor_set`` is true;
      * collapse ``amplicon_class`` labels: 'Complex noncyclic', 'BFB' and
        'Linear' become 'chromosomal', 'No amplification' becomes 'no_amp'
        (any other label is left as is);
      * drop rows that have no ``external_sample_id``.

    The frame handed back by the loader is not modified.

    Returns
    -------
    pandas.DataFrame
        Filtered biosample table with the recoded ``amplicon_class`` column.
    """
    amplicon_recode = {
        'Complex noncyclic': 'chromosomal',
        'BFB': 'chromosomal',
        'Linear': 'chromosomal',
        'No amplification': 'no_amp',
    }

    biosamples = data_imports.import_biosamples()
    biosamples = biosamples[biosamples['in_unique_tumor_set']]

    # assign() builds a new frame instead of writing a column onto the
    # filtered slice, which is what triggers SettingWithCopyWarning.
    biosamples = biosamples.assign(
        amplicon_class=biosamples['amplicon_class'].replace(amplicon_recode)
    )

    # Drop samples lacking an external ID (list form of `subset` also works
    # on pandas versions that do not accept a bare label).
    return biosamples.dropna(subset=['external_sample_id'])

def import_genes():
    """Load the gene table and normalise its ``feature`` labels.

    Processing applied to ``data_imports.import_genes()``:
      * cut each ``feature`` value at its first underscore (drops the
        suffix);
      * collapse labels: 'Complex-non-cyclic', 'BFB' and 'Linear' become
        'chromosomal', 'No amplification' becomes 'no_amp';
      * drop rows whose feature is 'unknown' (unknown in this table are not
        reported in other AC results, as far as is known).

    The loader's frame is not modified. Writing the recoded column back
    onto it would make a second call on a shared frame turn 'no_amp' into
    'no', because the suffix cut would run again on already-recoded labels.

    Returns
    -------
    pandas.DataFrame
        Gene table with cleaned ``feature`` values and no 'unknown' rows.
    """
    feature_recode = {
        'Complex-non-cyclic': 'chromosomal',
        'BFB': 'chromosomal',
        'Linear': 'chromosomal',
        'No amplification': 'no_amp',
    }

    genes = data_imports.import_genes()

    # Suffix cut must happen before recoding: 'no_amp' itself contains '_'.
    feature = genes['feature'].map(lambda x: x.split('_')[0])
    feature = feature.replace(feature_recode)

    genes = genes.assign(feature=feature)
    return genes[genes['feature'] != 'unknown']

In [3]:
# GLOBALS
# Module-level tables loaded once with the loaders' default arguments and
# read by the cells below.

# Cleaned expression matrix (columns are sample IDs).
RNA=import_cbtn_rnaseq()
# Preprocessed biosample table.
BIOSAMPLES=import_biosamples()
# Preprocessed gene table (not referenced in the cells shown here).
GENES = import_genes()


In [4]:
# Quick look at which sample IDs survived the cohort filter.
RNA.columns

Index(['7316-8817', '7316-5314', '7316-2146', '7316-923', '7316-1463',
       '7316-6378', '7316-820', '7316-1957', '7316-8739', '7316-562',
       ...
       '7316-8911', '7316-4468', '7316-6388', '7316-393', '7316-6884',
       '7316-2899', '7316-3204', '7316-5277', '7316-3768', '7316-1955'],
      dtype='object', length=1565)

In [5]:
def generate_deseq2_phenotype_table():
    """Write and return the per-sample phenotype table for the DE run.

    Takes the module-level BIOSAMPLES table, indexes it by
    ``external_sample_id``, keeps only samples that are also columns of the
    module-level RNA matrix, names the index "Sample ID", and saves the
    result to ``out/sample_phenotypes.csv``.

    Returns
    -------
    pandas.DataFrame
        The table that was written to disk.
    """
    # set_index returns a new frame, so BIOSAMPLES itself stays as it was.
    by_sample = BIOSAMPLES.set_index("external_sample_id")
    has_expression = by_sample.index.isin(RNA.columns)
    phenotypes = by_sample[has_expression].rename_axis("Sample ID")
    phenotypes.to_csv("out/sample_phenotypes.csv")
    return phenotypes

In [7]:
# Build the phenotype table; this also writes out/sample_phenotypes.csv.
df = generate_deseq2_phenotype_table()

In [9]:
# Preview the first rows of the phenotype table.
df.head()

Unnamed: 0_level_0,sex,patient_id,tumor_history,age_at_diagnosis,cohort,extent_of_tumor_resection,cancer_type,cancer_subclass,ecDNA_sequences_detected,amplicon_class,in_unique_tumor_set,in_unique_patient_set
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
7316-272,Male,PT_00G007DM,Diagnosis,464.0,PBTA-X00,Gross/Near total resection,ETMR,,1,ecDNA,True,True
7316-447,Male,PT_01MZ62KG,Progressive,546.0,PBTA-X00,Partial resection,ETMR,,1,ecDNA,True,True
7316-6365,Male,PT_01SH4F1X,Diagnosis,3838.0,PBTA-X01,Gross/Near total resection,GNT,WT,0,no_amp,True,True
7316-898,Male,PT_02J5CWN5,Progressive,3722.0,PBTA-X00,Partial resection,LGG,MAPK,0,no_amp,True,True
7316-1702,,PT_02MVZZSW,Diagnosis,4666.0,PBTA-X00,Partial resection,MNG,,0,no_amp,True,True
