In [None]:
# Subset mutations for Sellers lab
# only include CCLE2 lines

import os

import dalmatian as dm
import pandas as pd

In [None]:
ccle2_ws = "fccredits-silver-tan-7621/CCLE_v2"

In [None]:
ccle2wm = dm.WorkspaceManager(ccle2_ws).disable_hound()

In [None]:
ccle2samples = ccle2wm.get_samples()

In [None]:
ccle2samples.index

In [None]:
from taigapy import TaigaClient
tc = TaigaClient()
merged_somatic_mutations = tc.get(name='mutations-latest-ed72', version=24, file='merged_somatic_mutations')

In [None]:
merged_somatic_mutations

In [None]:
subset_mut = merged_somatic_mutations[merged_somatic_mutations.DepMap_ID.isin(ccle2samples.index)]
subset_mut

In [None]:
subset_mut.to_csv("ccle2_mutations.csv", index=None)

In [None]:
merged_segments_all = tc.get(name='cn-latest-d8d4', version=26, file='merged_segments_all')


In [None]:
merged_segments_all

In [None]:
GBM_names = ["42MGBA", "8MGBA", "A172", "AM38", "BECKER", "CAS1", "CCFSTTG1", "DBTRG05MG", "DKMG", "GAMG", "GB1", "GI1", "GMS10", "GOS3", "HS683", "KALS1", "KG1C", "KNS42", "KNS60", "KNS81", "LN18", "LN229", "M059K", "NMCG1", "SF126", "SF268", "SF295", "SF539", "SNB75", "SNU1105", "SNU201", "SNU466", "SNU489", "SNU626", "SNU738", "SW1088", "SW1783", "T98G", "TM31", "U118MG", "U251MG", "U87MG", "YH13", "YKG1"]

In [None]:
from depmapomics import tracker as track
trackerobj = track.initTracker()
t = trackerobj.read_tracker()
t

In [None]:
GBM_rows = t[t.stripped_cell_line_name.isin(GBM_names)]
GBM_rows

In [None]:
len(GBM_rows.arxspan_id.unique())

In [None]:
GBM_seg = merged_segments_all[merged_segments_all.DepMap_ID.isin(GBM_rows.arxspan_id.unique())]
GBM_seg

In [None]:
GBM_seg.to_csv("ccle_seg.csv", index=None)

In [None]:
# Check cclf twist snv output
twistwm = dm.WorkspaceManager("nci-mimoun-bi-org/PANCAN_TWIST copy").disable_hound()

In [None]:
twist_pairsets = twistwm.get_pair_sets()
twist_pairsets

In [None]:
twist41snv = pd.read_csv(twist_pairsets.loc["CCLF_TWIST41_pairs", "aggregate_snvs"], sep="\t")
twist41snv

In [None]:
twist41subset = twist41snv[twist41snv.external_id == "ACH-003025_BOM_P7_3D_1"]
twist41subset[twist41subset.Chromosome == "16"]['Hugo_Symbol']

In [None]:
twist41subset[twist41subset.Chromosome == "16"][['Start_position', 'End_position']]

In [None]:
twist41snv[twist41snv.external_id == "ACH-003025_BOM_P7_3D_1"].to_csv("ACH003025.csv", index=None)

In [None]:
maf = pd.read_csv("gs://fc-secure-bd7b8bc9-f665-4269-997e-5a402088a369/1cdc902e-234f-450c-9303-406fb34ff6ad/WGS_pipeline/b6bc83be-af09-4ac2-8c91-57bf070b8870/call-filterMaf/CDS-0JJKl3_outMAFfn.txt", sep="\t")
maf


In [None]:
maf.dbNSFP_Polyphen2_HVAR_pred.unique()

In [None]:
proteincoding_genes_tpm_logp1 = tc.get(name='expression-d035', version=21, file='proteincoding_genes_tpm_logp1')
proteincoding_genes_tpm_logp1

In [None]:
filter_col = [col for col in proteincoding_genes_tpm_logp1 if col.startswith(('ERBB2 (', 'ESR1 (', 'PGR ('))]
filter_col

In [None]:
from scipy.stats import zscore
zscores = proteincoding_genes_tpm_logp1.apply(zscore)
zscores

In [None]:
minerva_lines = ["ACH-002949", "ACH-002839", "ACH-002741", "ACH-002885", "ACH-002950", "ACH-002951", "ACH-002954", "ACH-002968", "ACH-002981", "ACH-002883", "ACH-002835", "ACH-002744", "ACH-002837", "ACH-002730", "ACH-002742", "ACH-002743", "ACH-002947", "ACH-002871", "ACH-002735", "ACH-002757", "ACH-002972", "ACH-002869", "ACH-002737", "ACH-002853", "ACH-002738", "ACH-002979", "ACH-002841", "ACH-002736", "ACH-002884", "ACH-002745", "ACH-002967", "ACH-002733"]
minerva_zscores = zscores[zscores.index.isin(minerva_lines)].T
minerva_zscores.to_csv("minerva_expression_zscores.csv")

In [None]:
# minerva_zscores was transposed when it was built, so its rows are the
# "SYMBOL (id)" labels and its columns are the ACH ids: index as (gene, line).
minerva_zscores.loc["PGR (5241)", "ACH-002949"]

In [None]:
from biomart import BiomartServer
import io

def _fetchFromServer(ensemble_server, attributes):
    """Query the hsapiens_gene_ensembl BioMart dataset and return it as a DataFrame.

    Args:
        ensemble_server: BioMart server URL handed to `BiomartServer`.
        attributes: attribute names requested from the dataset.

    Returns:
        pd.DataFrame: the response body parsed as tab-separated text.
    """
    dataset = BiomartServer(ensemble_server).datasets["hsapiens_gene_ensembl"]
    # header=1 is forwarded to the search call; the body is then decoded to text.
    response = dataset.search({"attributes": attributes}, header=1)
    tsv_text = response.content.decode()
    return pd.read_csv(io.StringIO(tsv_text), sep="\t")

def generateGeneNames(
    ensemble_server="http://nov2020.archive.ensembl.org/biomart",
    useCache=False,
    cache_folder="",
    attributes=[],
):
    """Build a gene annotation table from Ensembl's BioMart.

    The table is read from `<cache_folder>/.biomart.csv` when `useCache` is
    set and that file exists; otherwise it is downloaded with
    `_fetchFromServer` and the cache file is (re)written. Columns are renamed
    to the requested attribute names and rows without an HGNC symbol are
    dropped.

    Args:
        ensemble_server (str, optional): BioMart server URL to query.
            Defaults to the nov2020 Ensembl archive.
        useCache (bool, optional): read the cache file instead of downloading
            when it exists. Defaults to False.
        cache_folder (str, optional): folder holding `.biomart.csv`; `~` is
            expanded. Defaults to "" (the current working directory).
        attributes (list, optional): extra BioMart attribute names, appended
            after the four base attributes. Never mutated here. Defaults to [].

    Raises:
        ValueError: if the loaded object is not a DataFrame, or if its number
            of columns does not match the requested attributes (typically a
            cache file written with a different attribute list).

    Returns:
        pd.DataFrame: one column per requested attribute, restricted to rows
        whose `hgnc_symbol` is not missing.
    """
    attr = [
        "ensembl_gene_id",
        "hgnc_symbol",
        "gene_biotype",
        "entrezgene_id",
    ]
    columns = attr + list(attributes)

    cache_folder = os.path.expanduser(cache_folder)
    cachefile = os.path.join(cache_folder, ".biomart.csv")
    # Boolean `and` (not bitwise `&`) so any truthy/falsy useCache behaves sanely.
    from_cache = bool(useCache) and os.path.isfile(cachefile)
    if from_cache:
        print("fetching gene names from biomart cache")
        res = pd.read_csv(cachefile)
    else:
        print("downloading gene names from biomart")
        res = _fetchFromServer(ensemble_server, columns)

    # Validate before touching the object (renaming columns / writing the cache).
    if not isinstance(res, pd.DataFrame):
        raise ValueError("should be a dataframe")
    if len(res.columns) != len(columns):
        raise ValueError(
            "biomart table has {} columns but {} attributes were requested{}".format(
                len(res.columns),
                len(columns),
                " (stale cache file {}?)".format(cachefile) if from_cache else "",
            )
        )

    if not from_cache:
        # Make sure the target folder exists; "" means the current directory.
        if cache_folder:
            os.makedirs(cache_folder, exist_ok=True)
        res.to_csv(cachefile, index=False)

    res.columns = columns
    return res[res["hgnc_symbol"].notna()]

In [None]:
from genepy.utils import helper as h

mybiomart = generateGeneNames(ensemble_server="http://dec2021.archive.ensembl.org/biomart", useCache=False)
mybiomart

In [None]:
mybiomart[mybiomart.entrezgene_id == 1394.0]›

In [None]:
protmybiomart = mybiomart[mybiomart.gene_biotype == "protein_coding"]

In [None]:
protmybiomart

In [None]:
mybiomart_unique = protmybiomart.drop_duplicates(subset='ensembl_gene_id', keep="first")
mybiomart_unique

In [None]:
grouped = mybiomart_unique.groupby(['entrezgene_id'])

In [None]:
dups = []
for name, group in grouped:
    if len(group) > 1 and len(set(group.hgnc_symbol)) > 1:
        print(set(group.hgnc_symbol))
        dups.append((str(int(name)), set(group.hgnc_symbol)))

In [None]:
dups

In [None]:
mybiomart[mybiomart.entrezgene_id == 3811.0]

In [None]:
myoldbiomart = generateGeneNames(ensemble_server=ENSEMBL_SERVER_V, useCache=False)
myoldbiomart

In [None]:
protmyoldbiomart = myoldbiomart[myoldbiomart.gene_biotype == "protein_coding"]
myoldbiomart_unique = protmyoldbiomart.drop_duplicates(subset='ensembl_gene_id', keep="first")
groupedold = myoldbiomart_unique.groupby(['entrezgene_id'])

dupsold = []
for name, group in groupedold:
    if len(group) > 1 and len(set(group.hgnc_symbol)) > 1:
        print(set(group.hgnc_symbol))
        dupsold.append((str(int(name)), set(group.hgnc_symbol)))

In [None]:
dupsold

In [None]:
myoldbiomart[myoldbiomart.entrezgene_id == 3811.0]

In [None]:
myoldbiomart[myoldbiomart.hgnc_symbol == "CRHR1"]

In [None]:
from taigapy import TaigaClient
tc = TaigaClient()

CCLE_expression = tc.get(name='internal-22q1-1778', version=24, file='CCLE_expression')
CCLE_expression

In [None]:
CCLE_expression[['KIR3DL1 (3811)']]

In [None]:
rnawm = dm.WorkspaceManager('terra-broad-cancer-prod/CCLE_DepMap_RNAseq')
rnasamples = rnawm.get_samples()
rnasamples

In [None]:
from taigapy import TaigaClient
tc = TaigaClient()
achilles_gene_cn = tc.get(name='cn-achilles-version-06ca', version=70, file='achilles_gene_cn')

In [None]:
achilles_gene_cn

In [None]:
all_21Q2_gene_cn = tc.get(name='cn-achilles-version-06ca', version=60, file='all_21Q2_gene_cn')

In [None]:
all_21Q2_gene_cn

In [None]:
pd.read_csv("../temp/21Q4v2/merged_somatic_mutations_boolmatrix_other.csv")

In [None]:
# Hard-coded "ACH-" ids; a later cell uses this list to select rows of the
# expression matrix by index. NOTE(review): presumably the immortalized lines,
# going by the variable name — confirm the list's provenance.
immortalized = ["ACH-000043", "ACH-000049", "ACH-000063", "ACH-000064", "ACH-000071", "ACH-000079", "ACH-000083", "ACH-000088", "ACH-000119", "ACH-000125", "ACH-000131", "ACH-000134", "ACH-000135", "ACH-000154", "ACH-000165", "ACH-000170", "ACH-000175", "ACH-000180", "ACH-000184", "ACH-000185", "ACH-000194", "ACH-000199", "ACH-000214", "ACH-000224", "ACH-000229", "ACH-000230", "ACH-000240", "ACH-000275", "ACH-000284", "ACH-000306", "ACH-000340", "ACH-000413", "ACH-000494", "ACH-000526", "ACH-000529", "ACH-000531", "ACH-000539", "ACH-000540", "ACH-000642", "ACH-000742", "ACH-000797", "ACH-000850", "ACH-001018", "ACH-001087", "ACH-001093", "ACH-001142", "ACH-001169", "ACH-001179", "ACH-001180", "ACH-001181", "ACH-001187", "ACH-001207", "ACH-001310", "ACH-001357", "ACH-001453", "ACH-001646", "ACH-001767", "ACH-001827", "ACH-001828", "ACH-001854", "ACH-002210", "ACH-002247", "ACH-002270", "ACH-002316", "ACH-002319", "ACH-002321", "ACH-002327", "ACH-002336", "ACH-002342", "ACH-002343", "ACH-002344", "ACH-002346", "ACH-002347", "ACH-002348", "ACH-002350", "ACH-002351", "ACH-002352", "ACH-002353", "ACH-002354", "ACH-002355", "ACH-002356", "ACH-002357", "ACH-002358", "ACH-002359", "ACH-002360", "ACH-002361", "ACH-002362", "ACH-002363", "ACH-002364", "ACH-002365", "ACH-002366", "ACH-002367", "ACH-002368", "ACH-002369", "ACH-002370", "ACH-002371", "ACH-002372", "ACH-002373", "ACH-002374", "ACH-002375", "ACH-002376", "ACH-002377", "ACH-002378", "ACH-002379", "ACH-002380", "ACH-002381", "ACH-002382", "ACH-002383", "ACH-002384", "ACH-002402", "ACH-002462", "ACH-002463", "ACH-002464", "ACH-002465", "ACH-002466", "ACH-002467", "ACH-002521", "ACH-002549", "ACH-002558", "ACH-002576", "ACH-002591", "ACH-002592", "ACH-002593", "ACH-002594", "ACH-002609", "ACH-002610", "ACH-002611", "ACH-002612", "ACH-002623", "ACH-002624", "ACH-002625", "ACH-002712", "ACH-003016", "ACH-003017", "ACH-003020", "ACH-003062", "ACH-003068", "ACH-003104", "ACH-003117", "ACH-003118", "ACH-003132", 
"ACH-003133", "ACH-003135", "ACH-003136", "ACH-003137", "ACH-003138", "ACH-003139", "ACH-003140", "ACH-003141", "ACH-003142", "ACH-003143", "ACH-003144", "ACH-003145", "ACH-003146", "ACH-003147", "ACH-003148", "ACH-003149", "ACH-003150", "ACH-003151", "ACH-003152", "ACH-003153", "ACH-003154", "ACH-003155", "ACH-003156", "ACH-003157", "ACH-003158", "ACH-003159", "ACH-003160", "ACH-003161"]

In [None]:
from taigapy import TaigaClient
tc = TaigaClient()

OmicsExpressionProteinCodingGenesTPMLogp1 = tc.get(name='internal-22q4-56d4', version=81, file='OmicsExpressionProteinCodingGenesTPMLogp1')


In [None]:
subset_immortalized = OmicsExpressionProteinCodingGenesTPMLogp1[OmicsExpressionProteinCodingGenesTPMLogp1.index.isin(immortalized)]

In [None]:
subset_immortalized.to_csv("immortalized_proteincoding.csv")

In [None]:
from taigapy import TaigaClient
tc = TaigaClient()

proteinCoding_genes_tpm_logp1_withReplicates = tc.get(name='expression-d035', version=29, file='proteinCoding_genes_tpm_logp1_withReplicates')


In [None]:
subset_blacklisted = proteinCoding_genes_tpm_logp1_withReplicates[proteinCoding_genes_tpm_logp1_withReplicates.index.isin(["CDS-I2xehY", "CDS-vvQhsI"])]

In [None]:
subset_blacklisted.rename(index={"CDS-I2xehY": "ACH-003140", "CDS-vvQhsI": "ACH-003144"}).to_csv("immortalized_blacklisted.csv")