In [None]:
import pandas as pd
from taigapy import TaigaClient
tc = TaigaClient()

In [None]:
cosmic_cmc = pd.read_csv("/home/xiaomeng/bin/depmap_omics/cmc_export.tsv.gz", sep="\t")

In [None]:
# Drop variants that have no GRCh38 genome position. An explicit copy is taken
# so that the column assignments that follow write to an independent frame
# instead of a slice of cosmic_cmc (which triggers SettingWithCopyWarning).
cosmic_cmc_reformatted = cosmic_cmc[cosmic_cmc["Mutation genome position GRCh38"].notna()].copy()
# The chromosome is the part of the position string before the first ":".
cosmic_cmc_reformatted["chrom"] = cosmic_cmc_reformatted["Mutation genome position GRCh38"].str.split(":").str[0]

In [None]:
# Prefix chromosome names with "chr", discard the chr25 rows, and rename the
# numeric codes chr23/chr24 to chrX/chrY.
cosmic_cmc_reformatted["chrom"] = "chr" + cosmic_cmc_reformatted["chrom"]
not_chr25 = cosmic_cmc_reformatted.chrom != "chr25"
cosmic_cmc_reformatted = cosmic_cmc_reformatted[not_chr25].replace(
    {'chrom': {"chr23": "chrX", "chr24": "chrY"}}
)

In [None]:
cosmic_cmc_reformatted["pos"] = cosmic_cmc_reformatted["Mutation genome position GRCh38"].str.replace('-', ':').str.split(":").str[1]

In [None]:
cosmic_cmc_reformatted["chrom"].value_counts()

In [None]:
# rename ref and alt columns
cosmic_cmc_reformatted = cosmic_cmc_reformatted.rename(columns={"GENOMIC_WT_ALLELE_SEQ": "ref", "GENOMIC_MUT_ALLELE_SEQ": "alt"})

In [None]:
# keep only tier 1
cosmic_cmc_reformatted = cosmic_cmc_reformatted[cosmic_cmc_reformatted.MUTATION_SIGNIFICANCE_TIER == "1"]

In [None]:
cosmic_cmc_reformatted.to_csv("cosmic_cmc_20230509_tier1.csv", index=False)

In [None]:
cosmic_cmc_reformatted.columns

In [None]:
cosmic_tier1 = pd.read_csv("gs://cds-cosmic/cosmic_cmc_20230509_tier1.csv")

In [None]:
cosmic_tier1

In [None]:
len(cosmic_tier1.chrom.unique())

In [None]:
# MIN_SIFT_PRED: D/T/NaN, D == damaging and T == tolerated

In [None]:
cosmic_cols_tokeep = ['CLINVAR_CLNSIG', 'CLINVAR_TRAIT', 'GERP++_RS', 'MIN_SIFT_SCORE', 'MIN_SIFT_PRED', 'MUTATION_SIGNIFICANCE_TIER']

In [None]:
cosmic_tier1[["chrom", "pos", "ref", "alt"] + cosmic_cols_tokeep]

In [None]:
from taigapy import TaigaClient
tc = TaigaClient()

OmicsSomaticMutationsMatrixHotspot = tc.get(name='internal-22q4-56d4', version=93, file='OmicsSomaticMutationsMatrixHotspot')


In [None]:
OmicsSomaticMutationsMatrixHotspot.sum(axis=1).sort_values(ascending=False)

In [None]:
new_maf = pd.read_csv("gs://fc-secure-c1c2ec19-6efc-4c8d-8410-9bcf6ae87d81/submissions/0328a1f4-3a30-4b73-8fd6-2f5f667e2492/run_vcf_to_depmap/48b70bce-e298-40ba-9617-7ce6740af461/call-vcf_to_depmap/CDS-3oyqwn-maf-coding_somatic-subset.csv.gz")

In [None]:
both = new_maf[(new_maf.hess_driver == "Y") & (~new_maf.cosmic_tier.isna())]
neither = new_maf[(new_maf.hess_driver != "Y") & (new_maf.cosmic_tier.isna())]
hess_only = new_maf[(new_maf.hess_driver == "Y") & (new_maf.cosmic_tier.isna())]
cosmic_only = new_maf[(new_maf.hess_driver != "Y") & (~new_maf.cosmic_tier.isna())]

In [None]:
pd.DataFrame(data=[[len(both), len(cosmic_only)], [len(hess_only), len(neither)]], columns=["in hess", "not in hess"], index=["in cosmic", "not in cosmic"])

In [None]:
hess_driver = pd.read_csv("/home/xiaomeng/bin/depmap_omics/WGS_pipeline/hess_drivers/data/hess_drivers.csv.gz")

In [None]:
hess_driver

In [None]:
hess_driver[hess_driver.POS == 26779863]

In [None]:
# aggregating mafs for comparison

In [None]:
from depmapomics import mutations
import dalmatian as dm

wm = dm.WorkspaceManager("broad-firecloud-ccle/DEV_DepMap_WES_CN_hg38")

In [None]:
agg_maf = mutations.aggregateMAFs(wm, mafcol="depmap_maf_withcosmic")

In [None]:
wm = dm.WorkspaceManager("broad-firecloud-ccle/DepMap_WGS_CN_cosmic_run")
agg_maf_wgs = mutations.aggregateMAFs(wm, mafcol="depmap_maf_withcosmic")

In [None]:
maf_all = pd.concat([agg_maf, agg_maf_wgs], ignore_index=True)

In [None]:
maf_all

In [None]:
agg_maf[(agg_maf.hugo_symbol == "APC") & (~agg_maf.cosmic_tier.isna())][["protein_change", "popaf"]]

In [None]:
agg_maf[(agg_maf.hugo_symbol == "TP53") & (agg_maf.cosmic_tier.isna()) & (agg_maf.hess_driver == "Y")].protein_change.value_counts()

In [None]:
maf_all["hugo_protein_change"] = maf_all["hugo_symbol"] + " " + maf_all["protein_change"]

In [None]:
both = maf_all[(maf_all.hess_driver == "Y") & (~maf_all.cosmic_tier.isna())]
neither = maf_all[(maf_all.hess_driver != "Y") & (maf_all.cosmic_tier.isna())]
hess_only = maf_all[(maf_all.hess_driver == "Y") & (maf_all.cosmic_tier.isna())]
cosmic_only = maf_all[(maf_all.hess_driver != "Y") & (~maf_all.cosmic_tier.isna())]

In [None]:
dict(cosmic_only.hugo_symbol.value_counts())

In [None]:
dict(hess_only.hugo_symbol.value_counts())

In [None]:
# multiple occurrences per variant:
pd.DataFrame(data=[[len(both), len(cosmic_only)], [len(hess_only), len(neither)]], columns=["in hess", "not in hess"], index=["in cosmic", "not in cosmic"])

In [None]:
# only looking at unique variants:
unique_variants = maf_all.drop_duplicates(subset=['chrom','pos','ref','alt'])

In [None]:
both_unique = unique_variants[(unique_variants.hess_driver == "Y") & (~unique_variants.cosmic_tier.isna())]
neither_unique = unique_variants[(unique_variants.hess_driver != "Y") & (unique_variants.cosmic_tier.isna())]
hess_only_unique = unique_variants[(unique_variants.hess_driver == "Y") & (unique_variants.cosmic_tier.isna())]
cosmic_only_unique = unique_variants[(unique_variants.hess_driver != "Y") & (~unique_variants.cosmic_tier.isna())]
pd.DataFrame(data=[[len(both_unique), len(cosmic_only_unique)], [len(hess_only_unique), len(neither_unique)]], columns=["in hess", "not in hess"], index=["in cosmic", "not in cosmic"])

In [None]:
dict(cosmic_only_unique.hugo_protein_change.value_counts())

In [None]:
dict(hess_only_unique.hugo_protein_change.value_counts())

In [None]:
# transform into pr-level
from depmap_omics_upload import tracker

mytracker = tracker.SampleTracker()
seq_table = mytracker.read_seq_table()
renaming_dict = dict(zip(seq_table.index, seq_table['ProfileID']))

In [None]:
maf_all["DepMap_ID"] = maf_all["DepMap_ID"].replace(renaming_dict)

In [None]:
OmicsSomaticMutationsProfile = tc.get(name='internal-23q2-1e49', version=97, file='OmicsSomaticMutationsProfile')


In [None]:
cosmic_maf_23q2 = maf_all[maf_all.DepMap_ID.isin(set(OmicsSomaticMutationsProfile.ProfileID))]

# actually run depmap mutation postprocessing to get comparable maf

In [None]:
import pandas as pd

cosmic_maf_23q2 = pd.read_csv("../output/23Q2/merged_somatic_mutations_profile.csv")

In [None]:
cosmic_maf_23q2 = cosmic_maf_23q2[cosmic_maf_23q2.DepMap_ID.isin(set(OmicsSomaticMutationsProfile.ProfileID))]

In [None]:
cosmic_maf_23q2.CosmicTier.value_counts()

# on the sample-level, calculate overlap and plot

In [None]:
pr_ids = cosmic_maf_23q2.DepMap_ID.unique()
overlap_df = pd.DataFrame(columns=["hess_only", "cosmic_only", "neither", "both", "percent_hess", "percent_cosmic"], index=pr_ids)

In [None]:
# For every profile, count the variants flagged by Hess, by COSMIC, by both, or
# by neither, and store the counts in overlap_df (created in the previous cell).
# A single groupby pass is used instead of re-filtering the whole table once
# per profile; rows whose DepMap_ID is missing are not grouped.
pr_ids = cosmic_maf_23q2.DepMap_ID.unique()
for pr, variants in cosmic_maf_23q2.groupby("DepMap_ID", sort=False):
    is_hess = variants.HessDriver
    in_cosmic = ~variants.CosmicTier.isna()

    n_both = len(variants[is_hess & in_cosmic])
    n_neither = len(variants[~is_hess & ~in_cosmic])
    n_hess_only = len(variants[is_hess & ~in_cosmic])
    n_cosmic_only = len(variants[~is_hess & in_cosmic])

    overlap_df.loc[pr, "both"] = n_both
    overlap_df.loc[pr, "neither"] = n_neither
    overlap_df.loc[pr, "hess_only"] = n_hess_only
    overlap_df.loc[pr, "cosmic_only"] = n_cosmic_only

    # percent_hess: share of Hess-flagged variants that are also in COSMIC.
    # Left as NaN when the profile has no Hess-flagged variant.
    if n_hess_only + n_both != 0:
        overlap_df.loc[pr, "percent_hess"] = n_both / (n_hess_only + n_both)

    # percent_cosmic: share of COSMIC-flagged variants that are also Hess drivers.
    # Left as NaN when the profile has no COSMIC-flagged variant.
    if n_cosmic_only + n_both != 0:
        overlap_df.loc[pr, "percent_cosmic"] = n_both / (n_cosmic_only + n_both)


In [None]:
overlap_df

In [None]:
overlap_df_toplot = overlap_df[(~overlap_df.percent_hess.isna()) & (~overlap_df.percent_cosmic.isna())]

In [None]:
# numpy is only imported further down in this notebook, so import it here to
# keep this cell runnable on a fresh kernel (Restart & Run All).
import numpy as np

# Per-profile overlap fractions as plain arrays for the scatter plots below.
x = np.array(overlap_df_toplot.percent_hess.tolist())
y = np.array(overlap_df_toplot.percent_cosmic.tolist())

In [None]:
import matplotlib.pyplot as plt

plt.scatter(x, y, alpha=0.1)

plt.xlabel("% Hess")
plt.ylabel("% COSMIC")

In [None]:
from cds import plotting

In [None]:
plotting.density_scatter(x, y)

In [None]:
overlap_df[(~overlap_df.percent_hess.isna()) & (overlap_df.percent_cosmic.isna())]

In [None]:
overlap_df[(overlap_df.percent_hess.isna()) & (~overlap_df.percent_cosmic.isna())]

# Comparison on profiles released in 23Q2

In [None]:
def _variant_key(table, id_col):
    """Return a "<id>:<chrom>:<pos>:<ref>:<alt>" key for every row of table.

    The fields are joined with ":" because plain concatenation is ambiguous:
    "chr1" + "12345" and "chr11" + "2345" would produce the same text.
    """
    return (
        table[id_col]
        + ":" + table["Chrom"]
        + ":" + table["Pos"].astype(str)
        + ":" + table["Ref"]
        + ":" + table["Alt"]
    )


# Both tables must build the key the same way, since the next step compares
# them with isin(). assign() is used for cosmic_maf_23q2 because it was
# produced by filtering another frame.
cosmic_maf_23q2 = cosmic_maf_23q2.assign(unique_var=_variant_key(cosmic_maf_23q2, "DepMap_ID"))
OmicsSomaticMutationsProfile["unique_var"] = _variant_key(OmicsSomaticMutationsProfile, "ProfileID")

In [None]:
OmicsSomaticMutationsProfile

In [None]:
cosmic_maf_23q2

In [None]:
new_variants = cosmic_maf_23q2[~cosmic_maf_23q2.unique_var.isin(set(OmicsSomaticMutationsProfile.unique_var))]

In [None]:
new_variants[(new_variants.CivicScore > 8)]

In [None]:
new_dann = new_variants[(new_variants.DannScore > 0.96) & (new_variants.HessDriver != True) & ~(new_variants.CivicScore > 8) & (new_variants.CosmicTier != 1)]

In [None]:
new_civic = new_variants[(new_variants.DannScore < 0.96) & (new_variants.HessDriver != True) & (new_variants.CivicScore > 8) & (new_variants.CosmicTier != 1)]

In [None]:
new_civic

In [None]:
new_variants_minus_dann = new_variants[~((new_variants.DannScore > 0.96) & (new_variants.HessDriver != True) & ~(new_variants.CivicScore > 8) & (new_variants.CosmicTier != 1))]

In [None]:
dict(new_variants_minus_dann.HugoSymbol.value_counts())

# map civic variants to clinvar significance/pathogenicity score and see overlap

In [None]:
civic = pd.read_csv("/home/xiaomeng/bin/depmap_omics/WGS_pipeline/civic_export_09222022.csv").drop(columns=["chromosome_37", "start_37"])

In [None]:
civic

In [None]:
cosmic_cmc_reformatted.pos = cosmic_cmc_reformatted.pos.astype("int64")

In [None]:
cosmic_cmc_reformatted[["chrom", "pos", "ref", "alt", "CLINVAR_CLNSIG", "MUTATION_SIGNIFICANCE_TIER"]]

In [None]:
merged_civic_cosmic = civic.merge(cosmic_cmc_reformatted[["chrom", "pos", "ref", "alt", "CLINVAR_CLNSIG", "MUTATION_SIGNIFICANCE_TIER"]], on=["chrom", "pos", "ref", "alt"], how="left")

In [None]:
merged_civic_cosmic[merged_civic_cosmic.civic_actionability_score > 8].MUTATION_SIGNIFICANCE_TIER.value_counts(dropna=False)

In [None]:
merged_civic_cosmic[merged_civic_cosmic.civic_actionability_score > 8]

In [None]:
# https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/

In [None]:
clinvar_variants = pd.read_csv("/home/xiaomeng/bin/depmap_omics/variant_summary.txt.gz", sep="\t")

In [None]:
clinvar_variants[["Chromosome", "Start", "Stop", "Assembly", "GeneSymbol", "ReferenceAlleleVCF", "AlternateAlleleVCF", "ClinSigSimple"]]

In [None]:
clinvar_variants.NumberSubmitters.value_counts()

In [None]:
clinvar_variants.loc[100]

In [None]:
clinvar_variants.columns

In [None]:
new_variants_minus_dann[new_variants_minus_dann.HugoSymbol == "ABCB1"].CivicScore.value_counts()

In [None]:
new_variants_minus_dann[new_variants_minus_dann.HugoSymbol == "ABCB1"][["DannScore", "HessDriver", "CivicScore", "CosmicTier", "LikelyLoF"]]

In [None]:
from mgenepy.utils import helper as h

tumor_suppressor_list = h.fileToList("../WGS_pipeline/oncokb_dm/data/tumor_suppressor_oncokb.txt")

In [None]:
# Remove variants in tumor-suppressor genes whose only support is a DANN score
# above 0.96: not a Hess driver, not COSMIC tier 1, and no CIViC score.
# NOTE(review): this cell uses snake_case columns (hugo_symbol, dann_score, ...)
# while other cells address cosmic_maf_23q2 with CamelCase columns
# (CosmicTier, HessDriver, ...) — confirm which version of the table is expected.
dann_only_mask = (
    (cosmic_maf_23q2.hugo_symbol.isin(set(tumor_suppressor_list)))
    & (cosmic_maf_23q2.dann_score > 0.96)
    & (cosmic_maf_23q2.hess_driver != "Y")
    & (cosmic_maf_23q2.cosmic_tier != 1)
    & (cosmic_maf_23q2.civic_score.isna())
)
cosmic_minus_dann = cosmic_maf_23q2[~dann_only_mask]

In [None]:
# Remove variants whose only support is a CIViC score above 8 (not a Hess
# driver and not COSMIC tier 1). Every part of the mask is taken from
# cosmic_minus_dann itself, so the mask has exactly the index of the frame it
# filters; mixing in a series from cosmic_maf_23q2 forces pandas to reindex
# the boolean key and breaks if index labels are not unique.
civic_only_mask = (
    (cosmic_minus_dann.hess_driver != "Y")
    & (cosmic_minus_dann.cosmic_tier != 1)
    & (cosmic_minus_dann.civic_score > 8)
)
minus_civic = cosmic_minus_dann[~civic_only_mask]

In [None]:
OmicsSomaticMutationsProfile.HugoSymbol.value_counts()

In [None]:
# Per-gene variant counts in the new table and in the released profile table.
new_dict = dict(cosmic_minus_dann.hugo_symbol.value_counts())
old_dict = dict(OmicsSomaticMutationsProfile.HugoSymbol.value_counts())

# Keep the genes present in both tables whose count grew by more than 100.
diff = {
    gene: new_count - old_dict[gene]
    for gene, new_count in new_dict.items()
    if gene in old_dict and new_count - old_dict[gene] > 100
}

In [None]:
diff

In [None]:
cosmic_maf_23q2[(cosmic_maf_23q2.hugo_symbol == "TP53") & (cosmic_maf_23q2.civic_score > 8)].civic_score.value_counts()

In [None]:
OmicsSomaticMutationsProfile[(OmicsSomaticMutationsProfile.HugoSymbol == "FLT3") & (OmicsSomaticMutationsProfile.CivicScore > 8)][["CivicScore", "HessDriver"]]

In [None]:
cosmic_maf_23q2[cosmic_maf_23q2.civic_score.isna()]

In [None]:
cosmic_maf_23q2.columns

In [None]:
cosmic_maf_23q2[cosmic_maf_23q2.funseq2_score > 0.5]

In [None]:
cosmic_maf_23q2.funseq2_score.value_counts()

# Compare Civic and COSMIC actionability scores

In [None]:
import pandas as pd

cosmic_actionability = pd.read_csv("../ActionabilityData.tsv", sep="\t")
civic = pd.read_csv("/home/xiaomeng/bin/depmap_omics/01-Jun-2023-VariantSummaries.tsv", sep="\t")

In [None]:
cosmic_actionability

In [None]:
civic

In [None]:
#!/usr/bin/env python

import io
import os
import pandas as pd


def read_vcf(path):
    """Load a VCF file into a DataFrame.

    Lines starting with "##" are discarded; the remaining text (the "#CHROM"
    header line and the records) is parsed as tab-separated values. POS is
    read as int, the other fixed VCF columns as str, and the "#CHROM" column
    is renamed to "CHROM".

    Args:
        path (str): path to an uncompressed VCF file.

    Returns:
        pandas.DataFrame: one row per VCF record.
    """
    column_types = {
        '#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
        'QUAL': str, 'FILTER': str, 'INFO': str,
    }
    with open(path, 'r') as handle:
        body = ''.join(line for line in handle if not line.startswith('##'))
    table = pd.read_csv(io.StringIO(body), dtype=column_types, sep='\t')
    return table.rename(columns={'#CHROM': 'CHROM'})

In [None]:
vcf = read_vcf('../clinvar_20230604.vcf')

In [None]:
vcf.loc[0, "INFO"]

In [None]:
vcf.CHROM.unique()

In [None]:
import numpy as np

def transformGeneInfo(row):
    """Extract the first gene symbol from a VCF INFO string.

    Args:
        row (str): semicolon-separated INFO field.

    Returns:
        str or float: for the first entry starting with "GENEINFO", the text
        between "=" and the first ":"; np.nan when there is no such entry.
    """
    for field in row.split(';'):
        if field.startswith('GENEINFO'):
            return field.split("=")[1].split(":")[0]
    return np.nan

In [None]:
def transformSig(row):
    """Extract the CLNSIG value from a VCF INFO string.

    Only the exact key "CLNSIG" is matched. Matching on the bare prefix
    "CLNSIG" would also pick up keys such as CLNSIGCONF or CLNSIGINCL and
    return their values (e.g. "2443913:Pathogenic") as if they were the
    variant's own significance.

    Args:
        row (str): semicolon-separated INFO field,
            e.g. "ALLELEID=1;CLNSIG=Pathogenic".

    Returns:
        str or float: the text after "CLNSIG=", or np.nan when the INFO
        string has no CLNSIG entry.
    """
    prefix = "CLNSIG="
    for field in row.split(';'):
        if field.startswith(prefix):
            return field[len(prefix):]
    return np.nan

In [None]:
vcf["GeneName"] = vcf.apply(lambda x: (transformGeneInfo(x["INFO"])), axis=1)

In [None]:
vcf["Sig"] = vcf.apply(lambda x: (transformSig(x["INFO"])), axis=1)

In [None]:
len(vcf.GeneName.unique())

In [None]:
vcf

In [None]:
from mgenepy.utils import fetch_biomart as bm

mybiomart = bm.generateGeneNames(
        useCache=False,
        attributes=["start_position", "end_position", "chromosome_name"],
    )

In [None]:
mybiomart

In [None]:
overlap = mybiomart[(mybiomart.hgnc_symbol.isin(vcf.GeneName)) & (mybiomart.chromosome_name.isin(["9", "13", "21", "X", "16"]))]

In [None]:
overlap[['chromosome_name', "start_position", "end_position", "hgnc_symbol"]]

In [None]:
# Prefix chromosome names with "chr" and rename the coordinate columns.
# overlap was produced by boolean-filtering mybiomart, so a new frame is built
# with assign() rather than writing a column into that slice (which raises
# SettingWithCopyWarning).
overlap = overlap.assign(chromosome_name="chr" + overlap['chromosome_name']).rename(
    columns={'chromosome_name': 'sequence', 'start_position': 'sequenceStart', 'end_position': 'sequenceEnd'}
)


In [None]:
overlap = overlap[["sequence", "sequenceStart", "sequenceEnd", "hgnc_symbol"]]

In [None]:
overlap.to_csv("overlap.bed", sep='\t', index=False)

In [None]:
overlap[overlap.hgnc_symbol == "U2AF1"]

In [None]:
missing = vcf[(~vcf.GeneName.isin(mybiomart.hgnc_symbol)) & (vcf.CHROM.isin(["9", "13", "21", "X", "16"]))]

In [None]:
missing.Sig.unique()

In [None]:
missing_sig = missing[missing.Sig.isin(['Conflicting_interpretations_of_pathogenicity','Likely_pathogenic',
       'Pathogenic/Likely_pathogenic', 'Pathogenic', '2443913:Pathogenic'])]

In [None]:
missing_sig

In [None]:
missing_sig[missing_sig.CHROM == "16"]

In [None]:
vcf[vcf.GeneName == "TRPM3"]

In [None]:
vcf.loc[991950, "INFO"]

In [None]:
# NOTE(review): BiomartServer is not imported anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. Presumably it comes from the
# `biomart` package (from biomart import BiomartServer) — confirm and add the
# import to the imports cell.
# Connect to the Ensembl BioMart archive at the URL below and select the
# "hsapiens_gene_ensembl" dataset.
server = BiomartServer("http://feb2023.archive.ensembl.org/biomart")
ensmbl = server.datasets["hsapiens_gene_ensembl"]

In [None]:
ensmbl.search({"attributes": ["attributes"]}, header=1)

In [None]:
ensmbl.attributes

# Cutoff=0.05, check recurrent/immortalized mutations

In [None]:
# model-level:
OmicsSomaticMutations = tc.get(name='internal-23q2-1e49', version=97, file='OmicsSomaticMutations')


In [None]:
OmicsSomaticMutations

In [None]:
from collections import Counter

def annotateLikelyImmortalized(
    maf,
    sample_col="ModelID",
    genome_change_col="DNAChange",
    chrom_col="Chrom",
    pos_col="Pos",
    hotspotcol="cosmic_hotspot",
    max_recurrence=0.05,
):
    """Flag mutations that recur in an unusually large fraction of samples.

    A mutation is identified by chromosome, position and genome change. It is
    flagged when the number of rows carrying it is strictly greater than
    max_recurrence times the number of distinct samples in the table.

    Args:
        maf (pandas.DataFrame): mutation table containing sample_col,
            genome_change_col, chrom_col and pos_col. It is modified in place.
        sample_col (str, optional): column holding the sample id.
            Defaults to "ModelID".
        genome_change_col (str, optional): column holding the genome change.
            Defaults to "DNAChange".
        chrom_col (str, optional): column holding the chromosome.
            Defaults to "Chrom".
        pos_col (str, optional): column holding the position. Defaults to "Pos".
        hotspotcol (str, optional): currently unused; kept so existing callers
            keep working. Defaults to "cosmic_hotspot".
        max_recurrence (float, optional): maximum fraction of samples a
            mutation may occur in before it is flagged. Defaults to 0.05.

    Returns:
        pandas.DataFrame: the same maf object with two added columns:
        "combined_mut" (the "<chrom>_<pos>_<genome change>" key) and the
        boolean "is_likely_immortalization". Rows whose key is missing are
        never flagged.
    """
    maf["combined_mut"] = (
        maf[chrom_col] + "_" + maf[pos_col].astype(str) + "_" + maf[genome_change_col]
    )
    n_samples = len(set(maf[sample_col]))

    # Number of rows per mutation key (missing keys are not counted).
    occurrences = maf["combined_mut"].value_counts()
    recurrent = occurrences.index[occurrences > max_recurrence * n_samples]

    # Assign the flag column directly on maf. Chained indexing such as
    # maf[mask]["col"] = True would write to a temporary copy and leave maf
    # unchanged.
    maf["is_likely_immortalization"] = maf["combined_mut"].isin(recurrent)
    return maf

In [None]:
annotated = annotateLikelyImmortalized(OmicsSomaticMutations)

In [None]:
annotated[annotated.is_likely_immortalization]

In [None]:
annotated.combined_mut.value_counts()