# Create inputs for differential expression (DE) analysis
Next step: see cbtn-edger.ipynb

In [None]:
import pandas as pd
import numpy as np
#import seaborn as sns
#import matplotlib.pyplot as plt
#import scipy.stats
import sys
from pathlib import Path
from collections import OrderedDict

sys.path.append('../src')
Path("out").mkdir(parents=True, exist_ok=True)

import data_imports

In [None]:
def clean_cbtn_rnaseq(df):
    """Reduce an RNA-seq matrix to uniquely named genes and cohort samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix with string row labels and one column per sample. Everything
        up to and including the first underscore of each row label is
        stripped (presumably labels look like ``<ENSG id>_<gene name>`` --
        confirm against the input file).

    Returns
    -------
    pandas.DataFrame
        A new frame. The caller's ``df``, including its index, is left
        unmodified.
    """
    # Strip the leading "<prefix>_" from every row label. The labels are
    # attached to a new frame so the caller's index is not overwritten.
    stripped = df.index.map(lambda label: label.partition('_')[2])
    out = df.set_axis(stripped, axis=0)
    # NOTE: an exception that retained the two C19MC entries
    # (ENSG00000269842, ENSG00000269564) used to live here and is currently
    # disabled, so every row whose name starts with ENSG is dropped.
    # Drop entries without HUGO names or with non-unique names.
    names = out.index.str
    unwanted = (
        names.startswith('ENSG')
        | names.startswith('PAR_Y')
        | names.startswith('LINC')
    )
    out = out[~unwanted]
    # keep=False discards every occurrence of a repeated name, not just the
    # later ones.
    out = out[~out.index.duplicated(keep=False)]
    # Keep only the columns whose label is an external_sample_id of the cohort.
    samples = data_imports.import_biosamples().external_sample_id
    return out.loc[:, out.columns.isin(samples)]

def import_cbtn_rnaseq(path='/Users/ochapman/Library/CloudStorage/OneDrive-SanfordBurnhamPrebysMedicalDiscoveryInstitute/projects/2023-pedpancan/data/gex/pbta-rsem-genes.expected_count.tsv'):
    """Read the tab-separated expected-count table and clean it.

    Note that this is a different file than the one used in cbtn-gsea:
    GSEA requires normalized values (TPM), whereas DESeq2 requires raw
    counts.

    Parameters
    ----------
    path : str
        Location of the TSV file; its first column becomes the row index.

    Returns
    -------
    pandas.DataFrame
        The table after it has been passed through ``clean_cbtn_rnaseq``.
    """
    raw_counts = pd.read_csv(path, index_col=0, sep='\t')
    return clean_cbtn_rnaseq(raw_counts)

def import_biosamples():
    """Load the biosample table and preprocess it for this notebook.

    Steps, in order:
      1. keep only rows flagged ``in_unique_tumor_set``;
      2. collapse ``amplicon_class`` so that 'Complex noncyclic', 'BFB' and
         'Linear' become 'chromosomal' and 'No amplification' becomes
         'no_amp' (other values are left as they are);
      3. drop rows whose ``external_sample_id`` is missing.

    Returns
    -------
    pandas.DataFrame
        The filtered, relabelled table.
    """
    df = data_imports.import_biosamples()
    # preprocessing
    # Take an explicit copy of the filtered rows so the column assignment
    # below acts on an independent frame instead of a slice (avoids pandas'
    # SettingWithCopyWarning).
    df = df[df.in_unique_tumor_set].copy()
    df['amplicon_class'] = df['amplicon_class'].replace(
        {'Complex noncyclic': 'chromosomal',
         'BFB': 'chromosomal',
         'Linear': 'chromosomal',
         'No amplification': 'no_amp',
         })
    # Drop rows with a missing external_sample_id (this removes missing IDs;
    # it does not de-duplicate). A list is passed because a bare string
    # subset is only accepted by newer pandas releases.
    df = df.dropna(subset=['external_sample_id'])
    return df

def import_genes():
    """Load the gene table and normalise its ``feature`` column.

    Each ``feature`` value is cut at its first underscore, then
    'Complex-non-cyclic', 'BFB' and 'Linear' are relabelled 'chromosomal'
    and 'No amplification' is relabelled 'no_amp'. Rows whose feature is
    'unknown' are removed from the returned frame.

    Returns
    -------
    pandas.DataFrame
        The relabelled table without 'unknown' features.
    """
    genes = data_imports.import_genes()
    relabel = {
        'Complex-non-cyclic': 'chromosomal',
        'BFB': 'chromosomal',
        'Linear': 'chromosomal',
        'No amplification': 'no_amp',
    }
    # Keep only the text before the first underscore (drops the suffix).
    base_feature = genes['feature'].map(lambda value: value.partition('_')[0])
    genes['feature'] = base_feature.replace(relabel)
    # 'unknown' in this table is not reported in other AC results afaik.
    known = genes.feature != 'unknown'
    return genes[known]

In [None]:
# GLOBALS
# Tables loaded once here; generate_deseq2_phenotype_table reads RNA and BIOSAMPLES.

# Cleaned count matrix, read from import_cbtn_rnaseq's default path.
RNA=import_cbtn_rnaseq()
# Biosample table after the preprocessing in import_biosamples().
BIOSAMPLES=import_biosamples()
# Gene table after the preprocessing in import_genes().
GENES = import_genes()


In [None]:
# Display the columns that remain in the cleaned count matrix.
RNA.columns

In [None]:
def generate_deseq2_phenotype_table():
    """Build the per-sample phenotype table and save it as CSV.

    Works on a copy of the ``BIOSAMPLES`` global indexed by
    ``external_sample_id``, keeps only the rows whose ID is a column of the
    ``RNA`` global, removes the ``file_name`` column, and names the index
    "Sample ID". The result is written to ``out/sample_phenotypes.csv``.

    Returns
    -------
    pandas.DataFrame
        The table that was written to disk.
    """
    by_sample = BIOSAMPLES.copy().set_index("external_sample_id")
    # Restrict to samples that are present in the count matrix.
    has_counts = by_sample.index.isin(RNA.columns)
    phenotypes = by_sample[has_counts].drop(columns="file_name")
    phenotypes.index.name = "Sample ID"
    phenotypes.to_csv("out/sample_phenotypes.csv")
    return phenotypes

In [None]:
# Build the phenotype table; this also writes out/sample_phenotypes.csv.
df = generate_deseq2_phenotype_table()

In [None]:
# Number of rows in the phenotype table.
len(df)

In [None]:
# Preview the first rows of the phenotype table.
df.head()