# DepMap WGS and WES variant filtering and whitelisting evaluation



# Outline

- [Setup](#setup)
   - [Dependencies](#load-packages)
   - [Datasets](#load-datasets)
- [Sanity checks of variants](#sanity-checks)
- [CompareToPastRelease]()

# Setup

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# %%capture
# !pip install plotly

In [None]:
# ! pip install -U kaleido

## Load packages

In [None]:
import pandas as pd
import numpy as np
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
from taigapy import TaigaClient
import os
import plotnine
import plotly.io as pio

pio.renderers.default = "jpeg"
tc = TaigaClient()

In [None]:
%pwd

## Load Datasets

In [None]:
# Generate aggregated maf from remote Terra workspaces

!python combine_mafs.py

In [None]:
assert os.path.exists("23Q4_mutation_maf_latest.tsv")

In [None]:
# Push the aggregated MAF to Taiga so that the draft is under data version control.

update = True

# The file descriptor and the description text are the same for both Taiga calls.
maf_upload = {
    "path": "23Q4_mutation_maf_latest.tsv",
    "name": "MAF",  # optional, will use file name if not provided
    "format": "Raw",  # or "NumericMatrixCSV" or "TableCSV"
    "encoding": "utf-8",  # optional (but recommended), will use iso-8859-1 if not provided
}
draft_note = "this is a draft for 23Q4 maf of mutation"

if not update:
    # First upload: create a brand-new dataset in the given folder.
    new_dataset_id = tc.create_dataset(
        "23Q4_mutation_maf",
        dataset_description=draft_note,
        upload_files=[maf_upload],
        folder_id="a9eedc220a6a4e70b8f1e64d2e57ed87",  # optional, will default to your home folder if not provided
    )
else:
    # Later uploads: add a new version to the existing dataset, keeping its other files.
    new_dataset_id = tc.update_dataset(
        "23q4-mutation-maf-f431",
        changes_description=draft_note,
        upload_files=[maf_upload],
        add_all_existing_files=True,
    )

# new_dataset_id

In [None]:
%%capture
maf_variants_23q4 = tc.download_to_cache(name='23q4-mutation-maf-f431', version=2, file='MAF')  # download_to_cache for raw
maf_variants_23q4 = pd.read_table(maf_variants_23q4)

# release_standard_maf is profile-based
# release_maf_23q2 is model-based
# We removed variants without gene symbols
release_maf_23q2 = tc.get(name='internal-23q2-1e49', version=98, file='OmicsSomaticMutations')
release_standard_maf = tc.download_to_cache(name='internal-23q2-1e49', version=98, file='OmicsSomaticMutationsMAFProfile.maf')  # download_to_cache for raw
release_standard_maf_23q2 = pd.read_table(release_standard_maf)

# Sanity checks

## Sanity checks for compound variants that have multiple consequences even with --pick

- Typical examples are splicing variants that may be missense mutation as well

In [None]:
maf_variants_23q4.loc[:, 'gnomade_af'] = maf_variants_23q4.loc[:, 'gnomade_af'].fillna(0)

In [None]:
maf_variants_23q4.loc[:, 'gnomadg_af'] = maf_variants_23q4.loc[:, 'gnomadg_af'].fillna(0)

In [None]:
maf_variants_23q4.loc[:, 'gnomade_af'].describe()

In [None]:
check_columns = ['pos', 'gnomadg_af', 'hess_driver', 'vep_impact', 'cosmic_tier', 'civic_score', 'clnsig', 'hugo_symbol', 'vep_mane_select', 'variant_info', 'protein_change', 'oncokb_oncogenic', 'oncokb_effect', 'oncokb_hotspot', 'rescue']

In [None]:
# splicing mutation do not have protein changes
# when co-exist with synonymous mutation or missense mutation
# there will be protein changes
maf_variants_23q4.query("variant_info.str.contains('&')")[check_columns].head()

In [None]:
# Remaining compound synonymous mutations are caused by rescue
# 25 of them are from julian's papers
maf_variants_23q4.query("variant_info.str.contains('&') & variant_info.str.contains('syno')").value_counts("hess_driver"), maf_variants_23q4.query("variant_info.str.contains('&') & variant_info.str.contains('syno')").value_counts("rescue").sum()

In [None]:
# No missense and Synonymous mutation co-occur
# This is a sanity check that we chose only one transcript per variant
((maf_variants_23q4.variant_info.str.contains('synony')) & (maf_variants_23q4.variant_info.str.contains('mis'))).sum()

## Sanity checks of repeat elements

In [None]:
# Some segmental duplicates are masking the oncokb annotations
# but rescued back
maf_variants_23q4.query("segdup == 'Y'").shape, maf_variants_23q4.query("segdup == 'Y'").value_counts('oncokb_effect')

In [None]:
# all of the repeat masking variants are from high impact variants on Tumor Suppressor or Oncogene
# We only whitelisting high impact mutations that locate in the Tumor Suppressor or Oncogene
maf_variants_23q4.query("rm == 'Y'").shape, maf_variants_23q4.query("rm == 'Y'").value_counts('vep_impact')

## Sanity checks with St Jude clinical pipeline

In [None]:
# Miss two only St Jude InDel now

## MONOMAC1	chr11.47355346.GC.	chr11.47355353.CGCC.CC	InDel	TP53	NA
## SKNAS	chr6.156778971..GCA	chr6.156778980.GG.GCAGG	InDel
## For St Jude results to be left-aligned as well
maf_variants_23q4.loc[maf_variants_23q4.pos.isin(np.array([105272634, 41224922, 124064038, 48791113, 23803335, 7675067, 47355346, 68293321, 156778971, 226064457, 49041176]) - 1), check_columns].shape

## Sanity checks of Somatic filter 

- Gnomad genome frequencies
- Gnomad exome frequencies

In [None]:
# We will not keep low frequency synonymous mutation that are never compound with splicing or other consequences
maf_variants_23q4['gnomadg_af'].describe()

In [None]:
maf_variants_23q4.gnomadg_af.isnull().sum() / maf_variants_23q4.shape[0], ((maf_variants_23q4.gnomadg_af.astype('float32') < 1e-5)).sum() / maf_variants_23q4.shape[0], ((maf_variants_23q4.gnomadg_af.astype('float32') >= 1e-5)).sum() / maf_variants_23q4.shape[0]

In [None]:
(maf_variants_23q4.gnomadg_af.astype('float32') >= 1e-5).sum() 

In [None]:
maf_variants_23q4.shape

In [None]:
import plotly.express as px

# Flag variants whose gnomAD genome OR exome allele frequency is above 1e-5.
above_gnomad_cutoff = (maf_variants_23q4.gnomadg_af.astype('float32') > 1e-5) | (
    maf_variants_23q4.gnomade_af.astype('float32') > 1e-5
)

# Name both columns explicitly instead of relying on the defaults produced by
# value_counts().reset_index(): the count column is called 0 in pandas 1.x but
# "count" in pandas 2.x, so `values=0` only works on the older versions.
gnomad_flag_counts = above_gnomad_cutoff.value_counts().rename_axis("index").reset_index(name="count")

fig = px.pie(gnomad_flag_counts, names="index", values="count")
fig.show()

## Sanity checks of Whitelisted variants number

In [None]:
maf_variants_23q4.rescue.dropna().shape

In [None]:
maf_variants_23q4.rescue.dropna().sum(), maf_variants_23q4.rescue.dropna().shape[0]

In [None]:
# Majority of simple synonymous mutations are rescued by Julian's paper
# Let's remove hess_driver in rescue list
maf_variants_23q4.query("variant_info.str.contains('syno')")[check_columns].value_counts("hess_driver"), (maf_variants_23q4.hess_driver == 'Y').sum()

In [None]:
# Variants on the rescue (whitelist) list, with both gnomAD allele frequencies
# melted to long format so they can be compared within each annotation category.
rescue_categories = ['cosmic_tier', 'oncokb_effect', 'oncokb_hotspot', 'oncokb_oncogenic', 'hess_driver']
rescued_maf_variants = maf_variants_23q4.loc[maf_variants_23q4.rescue, :]
rescued_maf_variants_sub_categores = rescued_maf_variants.loc[:, rescue_categories + ['gnomadg_af', 'gnomade_af']].melt(id_vars=rescue_categories)

## Most of the OncoKB hotspot mutations are also very permissive:
## they bring in variants whose OncoKB mutation effect is inconclusive.
## Let's remove hotspot mutations from the whitelist for now;
## we may consider running a hotspot method later.

sns.set_style("white")
fig, ax = plt.subplots(1, len(rescue_categories))
fig.set_size_inches(15, 4)
ax = ax.flatten()

# One panel per annotation category, gnomAD genome vs exome AF as hue.
for curr_ax, category in zip(ax, rescue_categories):
    sns.violinplot(data=rescued_maf_variants_sub_categores, x=category, y='value', hue='variable', ax=curr_ax)

last_ax_index = len(ax) - 1
for ax_index, curr_ax in enumerate(ax):
    for tick in curr_ax.get_xticklabels():
        tick.set_rotation(45)
        tick.set_ha('right')
    # Legend handling belongs to the axis, not to each tick label: keep a single
    # shared legend on the last panel and hide the others.
    if ax_index == last_ax_index:
        curr_ax.legend(bbox_to_anchor=(0.5, 1.2), ncol=3, fancybox=True, shadow=True, loc='upper center')
    else:
        curr_ax.legend().set_visible(False)
# despine() acts on every axis of the current figure, so one call is enough.
sns.despine()
#plt.tight_layout()


In [None]:
sns.ecdfplot(data=rescued_maf_variants, y="gnomadg_af", label='Gnomad Genome')
sns.ecdfplot(data=rescued_maf_variants, y="gnomade_af", label='Gnomad Exome')
plt.ylabel("AF")
plt.legend()

# Trim the rescue list to finalize the MAF file

In [None]:
# gnomAD allele-frequency ceiling (version 1); read by the later cell that drops
# variants whose gnomAD genome or exome AF exceeds it.
adjusted_gnomad_af_cutoff = 1e-1 # version 1 

# Hotspot-specific trimming of the rescue list (OncoKB hotspot and Hess driver)
# is switched off; the disabled code is kept below for reference.
# maf_variants_23q4_hotspot = maf_variants_23q4.query("(oncokb_hotspot == 'Y') | (hess_driver == 'Y')")

## # Only remove the hotspot with synonymous mutations and high allele frequency
#maf_variants_23q4_clean = maf_variants_23q4.drop(maf_variants_23q4_hotspot.index[((maf_variants_23q4_hotspot.gnomadg_af > adjusted_gnomad_af_cutoff) | (maf_variants_23q4_hotspot.gnomade_af > adjusted_gnomad_af_cutoff)) & (maf_variants_23q4_hotspot.variant_info.str.contains("^syno", regex=True))], axis=0)

# Even with the rescue-list filter above there were still ~110 synonymous mutations,
# so synonymous variants are removed regardless of their gnomAD AF.
# The query below drops every variant whose leftmost (strongest) consequence
# starts with "synony"; it applies no OncoKB-based exception.
# Compound variants that list a synonymous consequence after another one are kept.

maf_variants_23q4_clean = maf_variants_23q4.query("~variant_info.str.contains('^synony', regex=True)")

In [None]:
maf_variants_23q4_clean.shape

In [None]:
# Re-plot the rescued variants after the synonymous-variant filter.
rescue_categories = ['cosmic_tier', 'oncokb_effect', 'oncokb_hotspot', 'oncokb_oncogenic', 'hess_driver']
rescued_maf_variants = maf_variants_23q4_clean.loc[maf_variants_23q4_clean.rescue, :]
rescued_maf_variants_sub_categores = rescued_maf_variants.loc[:, rescue_categories + ['gnomadg_af', 'gnomade_af']].melt(id_vars=rescue_categories)

## Most of the OncoKB hotspot mutations are also very permissive:
## they bring in variants whose OncoKB mutation effect is inconclusive.
## Let's remove hotspot mutations from the whitelist for now;
## we may consider running a hotspot method later.

sns.set_style("white")
fig, ax = plt.subplots(1, len(rescue_categories))
fig.set_size_inches(18, 4)
ax = ax.flatten()

# One panel per annotation category, gnomAD genome vs exome AF as hue.
for curr_ax, category in zip(ax, rescue_categories):
    sns.violinplot(data=rescued_maf_variants_sub_categores, x=category, y='value', hue='variable', ax=curr_ax)

last_ax_index = len(ax) - 1
for ax_index, curr_ax in enumerate(ax):
    for tick in curr_ax.get_xticklabels():
        tick.set_rotation(45)
        tick.set_ha('right')
    # Legend handling belongs to the axis, not to each tick label: keep a single
    # shared legend on the last panel and hide the others.
    if ax_index == last_ax_index:
        curr_ax.legend(bbox_to_anchor=(0.5, 1.2), ncol=3, fancybox=True, shadow=True, loc='upper center')
    else:
        curr_ax.legend().set_visible(False)
# despine() acts on every axis of the current figure, so one call is enough.
sns.despine()
plt.tight_layout()

In [None]:
rescued_maf_variants.gnomade_af.max(), rescued_maf_variants.gnomadg_af.max()

In [None]:
# Turn off oncoKB hotspot rescue list  
# maf_variants_23q4_oncokb = maf_variants_23q4_clean.query("(oncokb_hotspot == 'Y')")

## Only remove the hotspot with synonymous mutations and high allele frequency
#maf_variants_23q4_clean = maf_variants_23q4_clean.drop(maf_variants_23q4_oncokb.index[((maf_variants_23q4_oncokb.gnomadg_af > adjusted_gnomad_af_cutoff) | (maf_variants_23q4_oncokb.gnomade_af > adjusted_gnomad_af_cutoff)) | \
#                                                                                       (maf_variants_23q4_oncokb.oncokb_effect.str.contains("^Inconclusive", regex=True))], axis=0)

# We do not remove any inclusive mutation which are hotspot in oncoKB
maf_variants_23q4_clean = maf_variants_23q4_clean.drop(maf_variants_23q4_clean.index[((maf_variants_23q4_clean.gnomadg_af > adjusted_gnomad_af_cutoff) | (maf_variants_23q4_clean.gnomade_af > adjusted_gnomad_af_cutoff))], axis=0)

maf_variants_23q4_clean.shape                                                                                    

In [None]:
# Re-plot the rescued variants after dropping variants above the gnomAD AF cutoff.
rescue_categories = ['cosmic_tier', 'oncokb_effect', 'oncokb_hotspot', 'oncokb_oncogenic', 'hess_driver']
rescued_maf_variants = maf_variants_23q4_clean.loc[maf_variants_23q4_clean.rescue, :]
rescued_maf_variants_sub_categores = rescued_maf_variants.loc[:, rescue_categories + ['gnomadg_af', 'gnomade_af']].melt(id_vars=rescue_categories)

## Most of the OncoKB hotspot mutations are also very permissive:
## they bring in variants whose OncoKB mutation effect is inconclusive.
## Let's remove hotspot mutations from the whitelist for now;
## we may consider running a hotspot method later.

sns.set_style("white")
fig, ax = plt.subplots(1, len(rescue_categories))
fig.set_size_inches(18, 4)
ax = ax.flatten()

# One panel per annotation category, gnomAD genome vs exome AF as hue.
for curr_ax, category in zip(ax, rescue_categories):
    sns.violinplot(data=rescued_maf_variants_sub_categores, x=category, y='value', hue='variable', ax=curr_ax)

last_ax_index = len(ax) - 1
for ax_index, curr_ax in enumerate(ax):
    for tick in curr_ax.get_xticklabels():
        tick.set_rotation(45)
        tick.set_ha('right')
    # Legend handling belongs to the axis, not to each tick label: keep a single
    # shared legend on the last panel and hide the others.
    if ax_index == last_ax_index:
        curr_ax.legend(bbox_to_anchor=(0.5, 1.2), ncol=3, fancybox=True, shadow=True, loc='upper center')
    else:
        curr_ax.legend().set_visible(False)
# despine() acts on every axis of the current figure, so one call is enough.
sns.despine()
plt.tight_layout()

In [None]:
# # Trim COSMIC tier 1 with high frequency
# maf_variants_23q4_cosmic = maf_variants_23q4_clean.query("(cosmic_tier == 1) | (cosmic_tier == 3)")

# # Only remove the hotspot with synonymous mutations and high allele frequency
# maf_variants_23q4_clean = maf_variants_23q4_clean.drop(maf_variants_23q4_cosmic.index[((maf_variants_23q4_cosmic.gnomadg_af > adjusted_gnomad_af_cutoff) | (maf_variants_23q4_cosmic.gnomade_af > adjusted_gnomad_af_cutoff))], axis=0)

maf_variants_23q4_clean.shape                                                                                    

### Check COSMIC census genes overlap with different adjusted gnomad for rescue list

In [None]:
# Simplify the whole rescue list adjustment process
adjusted_gnomad_af_cutoff = 1e-2 # version 2
maf_variants_23q4_clean_v2 = maf_variants_23q4_clean.drop(maf_variants_23q4_clean.index[((maf_variants_23q4_clean.gnomadg_af > adjusted_gnomad_af_cutoff) | (maf_variants_23q4_clean.gnomade_af > adjusted_gnomad_af_cutoff))], axis=0)

In [None]:
# Simplify the whole rescue list adjustment process
adjusted_gnomad_af_cutoff = 1e-3 # version 3
maf_variants_23q4_clean_v3 = maf_variants_23q4_clean.drop(maf_variants_23q4_clean.index[((maf_variants_23q4_clean.gnomadg_af > adjusted_gnomad_af_cutoff) | (maf_variants_23q4_clean.gnomade_af > adjusted_gnomad_af_cutoff))], axis=0)

In [None]:
adjusted_gnomad_af_cutoff = 1e-4 # version 4
maf_variants_23q4_clean_v4 = maf_variants_23q4_clean.drop(maf_variants_23q4_clean.index[((maf_variants_23q4_clean.gnomadg_af > adjusted_gnomad_af_cutoff) | (maf_variants_23q4_clean.gnomade_af > adjusted_gnomad_af_cutoff))], axis=0)

In [None]:
# Check the version 3 and version 4 differences
# We chose 1e-3 due to 1e-4 lose JAK2 gain-of-functions
maf_variants_23q4_clean_v3.loc[~maf_variants_23q4_clean_v3.index.isin(maf_variants_23q4_clean_v4.index), :].to_csv("rescue_gnomad1e3_gnomad1e4_changes.tsv", sep='\t')

In [None]:
!wc -l rescue_gnomad1e3_gnomad1e4_changes.tsv 

In [None]:
%pwd

In [None]:
# Strictest cutoff in the comparison: drop every variant whose gnomAD genome or
# exome AF exceeds 1e-5. The result is stored as the "_v5" frame.
adjusted_gnomad_af_cutoff = 1e-5 # version 5
maf_variants_23q4_clean_v5 = maf_variants_23q4_clean.drop(maf_variants_23q4_clean.index[((maf_variants_23q4_clean.gnomadg_af > adjusted_gnomad_af_cutoff) | (maf_variants_23q4_clean.gnomade_af > adjusted_gnomad_af_cutoff))], axis=0)

In [None]:
cosmic_tier1 = pd.read_csv("/home/ubuntu/COSMIC_Census_Tier1_allSun Sep 24 20_50_28 2023.csv")

In [None]:
maf_variants_23q4_clean.columns

In [None]:
cosmic_tier1.iloc[:, 0].unique().shape,  maf_variants_23q4_clean.loc[:, 'hugo_symbol'].unique().shape

In [None]:
# For every gnomAD AF cutoff applied to the rescue list, record how many COSMIC
# census genes and how many rescued variants remain in the filtered MAF.
cosmic_census_genes_overlap = []
cosmic_tier_variants = []
rescue_gnomad_cutoff = []
total_rescued = []
# NOTE: despite its name, this list holds the number of rescued variants whose
# gnomAD genome or exome AF is ABOVE the fixed 1e-5 evaluation threshold.
rescued_variants_without_gnomadaf = []

# Gene symbols come from the first column of the COSMIC table and do not depend
# on the cutoff, so compute them once. Both inputs of intersect1d stay plain
# lists, exactly as before, so NumPy infers the array dtype from the values.
cosmic_census_genes = list(cosmic_tier1.iloc[:, 0].unique())

# The evaluation threshold is fixed and independent of the cutoff being compared.
evaluation_af_cutoff = 1e-5

for maf, cutoff in zip([maf_variants_23q4_clean, maf_variants_23q4_clean_v2, maf_variants_23q4_clean_v3, maf_variants_23q4_clean_v4, maf_variants_23q4_clean_v5],
                       ['1e-1', '1e-2', '1e-3', '1e-4', '1e-5']):
    cosmic_census_genes_overlap.append(np.intersect1d(cosmic_census_genes, list(maf.loc[:, 'hugo_symbol'].unique())).shape[0])
    cosmic_tier_variants.append((maf.cosmic_tier == 1).sum())
    rescue_gnomad_cutoff.append(cutoff)
    total_rescued.append(maf.rescue.sum())
    # Every label in this loop is numeric, so the former `cutoff == 'no'`
    # branch could never run here and has been removed.
    print(float(cutoff))
    rescued_variants_without_gnomadaf.append(((maf.rescue) & ((maf.gnomadg_af > evaluation_af_cutoff) | (maf.gnomade_af > evaluation_af_cutoff))).sum())

# `maf` intentionally stays bound to the last frame of the loop: the following
# cells read `maf.cds_id` to obtain the number of samples.

In [None]:
maf.cds_id.unique().shape[0]

In [None]:
rescue_gnomad_cosmic_census_df = pd.DataFrame({"cosmic_census_genes_overlap": cosmic_census_genes_overlap, 
                                               "cosmic_tier_variants": cosmic_tier_variants, 
                                               "rescue_gnomad_cutoff": rescue_gnomad_cutoff, 
                                               "total_rescued": total_rescued,
                                               "average_rescued_per_samplee": np.array(total_rescued)/maf.cds_id.unique().shape[0] ,
                                               "average_rescued_variants_without_gnomadaf": np.array(rescued_variants_without_gnomadaf)/maf.cds_id.unique().shape[0],
                                               "rescued_variants_without_gnomadaf": rescued_variants_without_gnomadaf})

In [None]:
rescue_gnomad_cosmic_census_df

In [None]:
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(8, 4)

sns.barplot(rescue_gnomad_cosmic_census_df, x='rescue_gnomad_cutoff', y='cosmic_census_genes_overlap', ax=ax[0])
for container in ax[0].containers:
    ax[0].bar_label(container, fmt='%.1f')
ax2 = ax[0].twinx()
sns.lineplot(rescue_gnomad_cosmic_census_df, x='rescue_gnomad_cutoff', y='cosmic_tier_variants', 
             color='red', ax=ax2)

sns.barplot(rescue_gnomad_cosmic_census_df, x='rescue_gnomad_cutoff', y='average_rescued_per_samplee', ax=ax[1])
for container in ax[1].containers:
    ax[1].bar_label(container, fmt='%.1f')
ax2 = ax[1].twinx()
sns.lineplot(rescue_gnomad_cosmic_census_df, x='rescue_gnomad_cutoff', y='average_rescued_variants_without_gnomadaf', 
             color='red', ax=ax2)
fig.tight_layout()

In [None]:
np.setdiff1d(np.intersect1d(list(cosmic_tier1.iloc[:, 0].unique()), list(maf_variants_23q4_clean_v2.loc[:, 'hugo_symbol'].unique())), 
             np.intersect1d(list(cosmic_tier1.iloc[:, 0].unique()), list(maf_variants_23q4_clean_v3.loc[:, 'hugo_symbol'].unique())))

## Force the oncoKB variant frequency

In [None]:
# This considers NaN which does not > or < 1e-3
maf_variants_23q4_clean_v3 = maf_variants_23q4_clean.drop(maf_variants_23q4_clean.index[((maf_variants_23q4_clean.gnomadg_af > 1e-3) | (maf_variants_23q4_clean.gnomade_af > 1e-3))], axis=0)

maf_variants_23q4_clean_v3.shape

In [None]:
# Compare the unfiltered MAF (label 'no') with the 1e-3 cutoff version on the
# same metrics: COSMIC census gene overlap, COSMIC tier 1 variant count and
# rescued variant counts.
cosmic_census_genes_overlap = []
cosmic_tier_variants = []
rescue_gnomad_cutoff = []
total_rescued = []
# NOTE(review): the two rows of this list measure different things - see the
# if/else below.
rescued_variants_without_gnomadaf = []

for maf, cutoff in zip([maf_variants_23q4, maf_variants_23q4_clean_v3],
                       ['no', '1e-3']):
    # Number of genes (first column of the COSMIC table) that also occur in this MAF.
    cosmic_census_genes_overlap.append(np.intersect1d(list(cosmic_tier1.iloc[:, 0].unique()), list(maf.loc[:, 'hugo_symbol'].unique())).shape[0])
    cosmic_tier_variants.append((maf.cosmic_tier == 1).sum())
    rescue_gnomad_cutoff.append(cutoff)
    total_rescued.append(maf.rescue.sum())
    if cutoff == 'no':
        # Unfiltered MAF: every rescued variant is counted.
        rescued_variants_without_gnomadaf.append(maf.rescue.sum())
    else:
        print(float(cutoff))
        # Filtered MAF: count rescued variants whose gnomAD genome or exome AF
        # is above the fixed 1e-5 evaluation threshold.
        rescued_variants_without_gnomadaf.append(((maf.rescue) & ((maf.gnomadg_af > float(1e-5)) | (maf.gnomade_af > float(1e-5)))).sum())

# NOTE(review): `maf` is the loop variable left over from the last iteration
# (the 1e-3 frame), so both per-sample averages below - including the one for
# the unfiltered row - are divided by that frame's number of unique cds_id
# values. Confirm this shared denominator is intended.
rescue_gnomad_cosmic_census_df = pd.DataFrame({"cosmic_census_genes_overlap": cosmic_census_genes_overlap, 
              "cosmic_tier_variants": cosmic_tier_variants, 
              "rescue_gnomad_cutoff": rescue_gnomad_cutoff, 
              "total_rescued": total_rescued,
              "average_rescued_per_samplee": np.array(total_rescued)/maf.cds_id.unique().shape[0] ,
              "average_rescued_variants_without_gnomadaf": np.array(rescued_variants_without_gnomadaf)/maf.cds_id.unique().shape[0],
              "rescued_variants_without_gnomadaf": rescued_variants_without_gnomadaf})
rescue_gnomad_cosmic_census_df

In [None]:
# Re-plot the rescued variants for the 1e-2 cutoff frame.
# NOTE(review): this cell reads maf_variants_23q4_clean_v2 although the
# surrounding section works on maf_variants_23q4_clean_v3 - confirm which
# version is meant to be inspected here.
rescue_categories = ['cosmic_tier', 'oncokb_effect', 'oncokb_hotspot', 'oncokb_oncogenic', 'hess_driver']
rescued_maf_variants = maf_variants_23q4_clean_v2.loc[maf_variants_23q4_clean_v2.rescue, :]
rescued_maf_variants_sub_categores = rescued_maf_variants.loc[:, rescue_categories + ['gnomadg_af', 'gnomade_af']].melt(id_vars=rescue_categories)

## Most of the OncoKB hotspot mutations are also very permissive:
## they bring in variants whose OncoKB mutation effect is inconclusive.
## Let's remove hotspot mutations from the whitelist for now;
## we may consider running a hotspot method later.

sns.set_style("white")
fig, ax = plt.subplots(1, len(rescue_categories))
fig.set_size_inches(18, 4)
ax = ax.flatten()

# One panel per annotation category, gnomAD genome vs exome AF as hue.
for curr_ax, category in zip(ax, rescue_categories):
    sns.violinplot(data=rescued_maf_variants_sub_categores, x=category, y='value', hue='variable', ax=curr_ax)

last_ax_index = len(ax) - 1
for ax_index, curr_ax in enumerate(ax):
    for tick in curr_ax.get_xticklabels():
        tick.set_rotation(45)
        tick.set_ha('right')
    # Legend handling belongs to the axis, not to each tick label: keep a single
    # shared legend on the last panel and hide the others.
    if ax_index == last_ax_index:
        curr_ax.legend(bbox_to_anchor=(0.5, 1.2), ncol=3, fancybox=True, shadow=True, loc='upper center')
    else:
        curr_ax.legend().set_visible(False)
# despine() acts on every axis of the current figure, so one call is enough.
sns.despine()
plt.tight_layout()

In [None]:
# Still have 38 variants with secondary effect belonging to synonymous mutation 
# Will be assigned to be silent mutation later according to maftools
# That seems to be important in oncoKB 

maf_variants_23q4_clean_v3.query("variant_info.str.contains('syno')")[['ref','alt', 'cds_id']+check_columns].shape

## Double-check that the clean MAF overlaps with St Jude

In [None]:
maf_variants_23q4_clean_v3.loc[maf_variants_23q4_clean_v3.pos.isin(np.array([105272634, 41224922, 124064038, 48791113, 23803335, 7675067, 47355346, 68293321, 156778971, 226064457, 49041176]) - 1), check_columns].shape

# VCF 2 MAF standardised formatting to output 

## Convert coordinate and add essential standard MAF columns

In [None]:
def GetMafEndPosition(start: int, ref: str, alt: str) -> tuple:
    """Derive MAF coordinates, variant type and frame status from a VCF record.

    The indel handling is modelled on
    https://github.com/qinqian/vcf2maf/blob/main/vcf2maf.pl#L706

    A lone "-" allele is a placeholder for an empty allele and therefore
    contributes zero bases when the variant type and the frame status are
    determined (e.g. "-" -> "TGA" is an in-frame insertion, not an
    out-of-frame one, and "-" -> "A" is an insertion, not a SNP).

    NOTE(review): the alleles are used exactly as given; no shared leading
    (anchor) base is stripped before the coordinates are computed. Confirm
    that this matches the allele representation of the input table.

    Parameters
    -----------
    start: int
        Position of the variant as given in the input record.
    ref: str
        Reference allele; must not be empty.
    alt: str
        Alternate allele; must not be empty.

    Return
    -----------
    (start, end, var_type, inframe): tuple
        Start and end position, the variant type (one of "SNP", "DNP", "TNP",
        "ONP", "INS", "DEL") and whether an indel changes the length by a
        multiple of three. `inframe` is always False for substitutions.
    """
    assert len(ref) > 0
    assert len(alt) > 0

    # Effective allele lengths: the "-" placeholder stands for "no bases".
    ref_len = 0 if ref == "-" else len(ref)
    alt_len = 0 if alt == "-" else len(alt)
    assert ref_len > 0 or alt_len > 0, "ref and alt cannot both be '-'"

    if ref_len == alt_len:
        # Substitution of equal length; anything longer than three bases is an ONP.
        var_type = {1: "SNP", 2: "DNP", 3: "TNP"}.get(alt_len, "ONP")
        return start, start + alt_len - 1, var_type, False

    # Indel: in frame when the net length change is a multiple of three.
    inframe = abs(ref_len - alt_len) % 3 == 0

    if ref_len < alt_len:
        # Insertion
        if ref == "-":
            # Pure insertion: the event sits between the two flanking bases.
            return start - 1, start, "INS", inframe
        return start, start + ref_len - 1, "INS", inframe

    # Deletion: spans every reference base.
    return start, start + ref_len - 1, "DEL", inframe

In [None]:
formatted_coords = maf_variants_23q4_clean_v3.loc[:, ['pos', 'ref', 'alt']].apply(lambda x: GetMafEndPosition(*x), axis=1, result_type="expand")

In [None]:
import re

# Patterns shared by several rules below.
_FRAMESHIFT = re.compile(r"^(frameshift_variant)")
_PROTEIN_ALTERING = re.compile(r"^(protein_altering_variant)")

# Ordered rules: (MAF classification, pattern matched at the start of the VEP
# consequence string, optional extra condition on (var_type, inframe)).
# The first rule whose pattern and condition both hold wins.
_VARIANT_CLASSIFICATION_RULES = (
    ("Splice_Site", re.compile(r"^(splice_acceptor_variant|splice_donor_variant|transcript_ablation|exon_loss_variant)"), None),
    ("Nonsense_Mutation", re.compile(r"^(stop_gained)"), None),
    ("Frame_Shift_Del", _FRAMESHIFT, lambda kind, in_frame: kind == 'DEL'),
    ("Frame_Shift_Del", _PROTEIN_ALTERING, lambda kind, in_frame: (not in_frame) and kind == 'DEL'),
    ("Frame_Shift_Ins", _FRAMESHIFT, lambda kind, in_frame: kind == 'INS'),
    ("Frame_Shift_Ins", _PROTEIN_ALTERING, lambda kind, in_frame: (not in_frame) and kind == 'INS'),
    ("Nonstop_Mutation", re.compile(r"^(stop_lost)"), None),
    ("Translation_Start_Site", re.compile(r"^(initiator_codon_variant|start_lost)"), None),
    ("In_Frame_Ins", re.compile(r"^(inframe_insertion|conservative_inframe_insertion|disruptive_inframe_insertion)"), None),
    ("In_Frame_Ins", _PROTEIN_ALTERING, lambda kind, in_frame: bool(in_frame) and kind == 'INS'),
    ("In_Frame_Del", re.compile(r"^(inframe_deletion|disruptive_inframe_deletion|conservative_inframe_deletion)"), None),
    ("In_Frame_Del", _PROTEIN_ALTERING, lambda kind, in_frame: bool(in_frame) and kind == 'DEL'),
    ("Missense_Mutation", re.compile(r"^(missense_variant|coding_sequence_variant|conservative_missense_variant|rare_amino_acid_variant)"), None),
    ("Intron", re.compile(r"^(transcript_amplification|intron_variant|INTRAGENIC|intragenic_variant)"), None),
    ("Silent", re.compile(r"^(incomplete_terminal_codon_variant|synonymous_variant|stop_retained_variant|NMD_transcript_variant|start_retained_variant)"), None),
    ("Splice_Region", re.compile(r"^(splice_region_variant|splice_polypyrimidine_tract_variant|splice_donor_5th_base_variant|splice_donor_region_variant)"), None),
    ("RNA", re.compile(r"^(mature_miRNA_variant|exon_variant|non_coding_exon_variant|non_coding_transcript_exon_variant|non_coding_transcript_variant|nc_transcript_variant|coding_transcript_variant)"), None),
    ("5'UTR", re.compile(r"^(5_prime_UTR_variant|5_prime_UTR_premature_start_codon_gain_variant)"), None),
    ("3'UTR", re.compile(r"^3_prime_UTR_variant"), None),
    ("5'Flank", re.compile(r"^upstream_gene_variant"), None),
    ("3'Flank", re.compile(r"^downstream_gene_variant"), None),
    ("IGR", re.compile(r"^(TF_binding_site_variant|regulatory_region_variant|regulatory_region|intergenic_variant|intergenic_region)"), None),
)


def GetVariantClassification(vep_seq_ontology: str, var_type: str, inframe: bool) -> str:
    """Translate a VEP consequence string into a MAF Variant_Classification.

    Only the beginning of `vep_seq_ontology` is inspected, so for a compound
    consequence the first-listed term decides the classification. VEP orders
    consequences as described at
    http://useast.ensembl.org/info/genome/variation/prediction/predicted_data.html

    `var_type` ("INS"/"DEL") and `inframe` are used to tell frameshift from
    in-frame indels, in particular for "protein_altering_variant".

    Returns "NoAnnotation" for an empty consequence string and
    "TargetedRegion" when no rule applies.
    """
    for classification, pattern, condition in _VARIANT_CLASSIFICATION_RULES:
        if not pattern.match(vep_seq_ontology):
            continue
        if condition is None or condition(var_type, inframe):
            return classification

    return "NoAnnotation" if vep_seq_ontology == "" else "TargetedRegion"


In [None]:
maf_variants_23q4_clean_v3.loc[:, 'Strand'] = '+'

In [None]:
maf_variants_23q4_clean_v3.loc[:, 'Start_Position'] = formatted_coords[0]
maf_variants_23q4_clean_v3.loc[:, 'End_Position'] = formatted_coords[1]
maf_variants_23q4_clean_v3.loc[:, 'Variant_Type'] = formatted_coords[2]
maf_variants_23q4_clean_v3.loc[:, 'InFrame'] = formatted_coords[3]
maf_variants_23q4_clean_v3.loc[:, 'Variant_Classification'] = maf_variants_23q4_clean_v3.loc[:, ['variant_info', 'Variant_Type', 'InFrame']].apply(lambda x: GetVariantClassification(*x), axis=1)

In [None]:
# No missing annotations
maf_variants_23q4_clean_v3.loc[maf_variants_23q4_clean_v3.Variant_Classification == 'TargetedRegion', ['ref', 'alt', 'variant_info', 'Variant_Classification', 'variant_info', 'Variant_Type', 'InFrame']].values

In [None]:
maf_variants_23q4_clean_v3.Variant_Classification.unique()

In [None]:
maf_variants_23q4_clean_v3.shape

## Reorder columns to keep consistent with MAF standards

In [None]:
maf_variants_23q4_clean_v3.loc[:, 'Strand'].value_counts()

In [None]:
maf_variants_23q4_clean_v3.rename(
    columns={
        "hugo_symbol": "Hugo_Symbol",
        "chrom": "Chromosome",
        "ref": "Reference_Allele",
        "alt": "Alternate_Allele",
        "cds_id": "Tumor_Sample_Barcode",
        "protein_change": "Protein_Change",
    },
    inplace=True,
)

In [None]:
maf_variants_23q4_clean_v3.loc[:, "NCBI_Build"] = "GRCh38" 
maf_variants_23q4_clean_v3.loc[:, "Center"] = "DepMap" 

In [None]:
maf_variants_23q4_clean_v3.loc[:, "Tumor_Seq_Allele1"] = maf_variants_23q4_clean_v3.loc[
    :, "Reference_Allele"
] 
maf_variants_23q4_clean_v3.loc[:, "Tumor_Seq_Allele2"] = maf_variants_23q4_clean_v3.loc[:, "Alternate_Allele"]

In [None]:
reordered_columns = [
        "Hugo_Symbol",
        "NCBI_Build",
        "Chromosome",
        "Start_Position",
        "End_Position",
        "Variant_Type",
        "Reference_Allele",
        "Tumor_Seq_Allele1",
        "Tumor_Seq_Allele2",
        "Tumor_Sample_Barcode",
        "Variant_Classification",
        "Protein_Change",
    ] 

In [None]:
# Append every remaining column after the standard MAF columns, keeping the
# order the columns already have in the DataFrame. A set difference would give
# an arbitrary order that can differ from one interpreter run to the next,
# which makes the column order of the exported files non-reproducible.
leading_columns = set(reordered_columns)
reordered_columns += [column for column in dict.fromkeys(maf_variants_23q4_clean_v3.columns) if column not in leading_columns]

In [None]:
maf_variants_23q4_clean_v3 = maf_variants_23q4_clean_v3.loc[:, reordered_columns]

In [None]:
maf_variants_23q4_clean_v2.shape

In [None]:
sns.set_style("white")
maf_variants_23q4_clean_v3.Variant_Classification.value_counts().plot.bar()
plt.ylabel("Variant Count")

In [None]:
# Number of variants left after removing the seven least frequent classifications
# (value_counts() sorts by descending count, so [-7:] takes the seven rarest).
# NOTE(review): this is positional - it only equals the result of the explicit
# exclusion list used in the following filter if those seven classes happen to
# be the rarest ones. Confirm against the plotted counts.
maf_variants_23q4_clean_v3.shape[0] - maf_variants_23q4_clean_v3.Variant_Classification.value_counts()[-7:].sum()

In [None]:
# Drop the classifications that maftools treats as silent (non-protein-altering).
# Only the seven classes listed below are removed.
# NOTE(review): GetVariantClassification can also return "3'UTR" and "IGR",
# which are not in this list - confirm whether they should be excluded as well
# or simply do not occur in this table.

maf_variants_23q4_clean_v3 = maf_variants_23q4_clean_v3.loc[~maf_variants_23q4_clean_v3.Variant_Classification.isin(['Silent', 'RNA', 'Intron', "5'UTR", "3'Flank", 'Splice_Region', "5'Flank"]), :]
maf_variants_23q4_clean_v3.shape

In [None]:
# Remove variants without Hugo_Symbol

maf_variants_23q4_clean_v3 = maf_variants_23q4_clean_v3.loc[~maf_variants_23q4_clean_v3.Hugo_Symbol.isnull(), :]

In [None]:
maf_variants_23q4_clean_v3.shape

In [None]:
maf_variants_23q4_clean_v3 = maf_variants_23q4_clean_v3.sort_values(by=["Chromosome", "Start_Position", "End_Position"])

In [None]:
# Get from Gumbo for the meta information
oncotree_meta_profile = pd.read_csv("~/seq_table.csv")

In [None]:
oncotree_meta_profile.head()

In [None]:
# Annotate every variant with model metadata, looked up by the sample's
# sequencing ID. Each pair is (new MAF column, source column in the metadata).
sequencing_ids = oncotree_meta_profile['SequencingID']
for annotation, meta_column in (
    ('oncotree', 'depmap_model_type'),
    ('celltype', 'StrippedCellLineName'),
    ('lineage', 'Lineage'),
):
    lookup = dict(zip(sequencing_ids, oncotree_meta_profile[meta_column]))
    maf_variants_23q4_clean_v3[annotation] = maf_variants_23q4_clean_v3.Tumor_Sample_Barcode.map(lookup)

In [None]:
# maf_variants_23q4_clean['Datatype'] = maf_variants_23q4_clean.Tumor_Sample_Barcode.map(dict(zip(oncotree_meta_profile['MainSequencingID'], oncotree_meta_profile['Datatype'])))

In [None]:
maf_variants_23q4_clean_v3['ModelID'] = maf_variants_23q4_clean_v3.Tumor_Sample_Barcode.map(dict(zip(oncotree_meta_profile['SequencingID'], oncotree_meta_profile['ModelID'])))

In [None]:
# Missing oncotree information
# For the following information
maf_variants_23q4_clean_v3.loc[maf_variants_23q4_clean_v3.oncotree.isnull(), 'Tumor_Sample_Barcode'].unique()

In [None]:
maf_variants_23q4_clean_v3.loc[maf_variants_23q4_clean_v3.celltype.isnull(), 'Tumor_Sample_Barcode'].unique()

## Internal cohort AF

In [None]:
# Interal cohort allele frequencies filtering

internal_afs_23q4 = maf_variants_23q4_clean_v3.loc[:, ['Chromosome', 'Start_Position', 'End_Position', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2']].apply(lambda x: ':'.join(map(str, x)), axis=1)

In [None]:
total_samples = maf_variants_23q4_clean_v3.Tumor_Sample_Barcode.unique().shape[0]

In [None]:
from collections import Counter

# Each variant key is assumed to appear at most once per sample; there are
# only 4 duplicated variants in total, which is negligible.

# Occurrences of each variant key divided by the number of samples.
_variant_counts = Counter(internal_afs_23q4.tolist())
internal_afs_23q4_ratio_dict = {
    variant_key: n_rows / total_samples
    for variant_key, n_rows in _variant_counts.items()
}

In [None]:
# Reference implementation in depmap_omics:
# https://github.com/broadinstitute/depmap_omics/blob/update-vcf-to-depmap/depmapomics/mutations.py#L39

# Store, for every row, the cohort ratio looked up from that row's variant key.
maf_variants_23q4_clean_v3.loc[:, "internal_afs_23q4"] = internal_afs_23q4.map(internal_afs_23q4_ratio_dict)

In [None]:
# Histogram of the internal allele frequencies with log-scaled counts.
fig, ax = plt.subplots()
_ = ax.hist(maf_variants_23q4_clean_v3["internal_afs_23q4"], bins=100, color='blue')
ax.set_yscale('log')
ax.set_xlabel("DepMap internal AF")
ax.set_ylabel("Count")

In [None]:
# (number of rows whose `rescue` flag is False, table dimensions)
(~maf_variants_23q4_clean_v3.rescue).sum(), maf_variants_23q4_clean_v3.shape

In [None]:
# Keep variants whose internal AF is at most `max_recurrence`, plus every
# variant flagged for rescue regardless of its internal AF.
max_recurrence = 0.05

_is_rare = maf_variants_23q4_clean_v3["internal_afs_23q4"] <= max_recurrence
_is_rescued = maf_variants_23q4_clean_v3["rescue"]
maf_variants_23q4_clean_v3_final = maf_variants_23q4_clean_v3.loc[_is_rare | _is_rescued, :]

In [None]:
# Dimensions of the table after the recurrence filter.
maf_variants_23q4_clean_v3_final.shape

In [None]:
# Write the final table as comma-separated text; the cutoff is embedded in the file name.
maf_variants_23q4_clean_v3_final.to_csv(f"23Q4_somatic_mutations_profile_drafted_internal_af{max_recurrence}.csv", index=False)

In [None]:
# Write the same table tab-separated with a .maf extension.
maf_variants_23q4_clean_v3_final.to_csv(f"23Q4_somatic_mutations_profile_drafted_internal_af{max_recurrence}.maf", index=False, sep='\t')

In [None]:
# Dimensions of the exported table.
maf_variants_23q4_clean_v3_final.shape

In [None]:
# Upload to Taiga for data version control

# Paths of the two exports written for the current recurrence cutoff.
_csv_path = f"23Q4_somatic_mutations_profile_drafted_internal_af{max_recurrence}.csv"
_maf_path = f"23Q4_somatic_mutations_profile_drafted_internal_af{max_recurrence}.maf"

update = True
if not update:
    # Create a new dataset; this branch uploads the MAF file only.
    new_dataset_id = tc.create_dataset(
        "23Q4_mutation_maf_clean",
        dataset_description="this is a clean and standarised draft for 23Q4 maf of mutation",
        upload_files=[
            {
                "path": _maf_path,
                "name": "MAF",  # optional, will use file name if not provided
                "format": "Raw",  # or "NumericMatrixCSV" or "TableCSV"
                "encoding": "utf-8",  # optional (but recommended), will use iso-8859-1 if not provided
            },
        ],
        folder_id="a9eedc220a6a4e70b8f1e64d2e57ed87",  # optional, will default to your home folder if not provided
    )
else:
    # Add a new version to the existing dataset with both the CSV and the MAF.
    new_dataset_id = tc.update_dataset(
        "23q4-mutation-maf-clean-4161",
        changes_description="this is a clean and standarised draft for 23Q4 maf of mutation",
        upload_files=[
            {
                "path": _csv_path,
                "name": "23Q4_CSV_mutations",  # optional, will use file name if not provided
                "format": "TableCSV",  # or "NumericMatrixCSV" or "TableCSV"
                "encoding": "utf-8",  # optional (but recommended), will use iso-8859-1 if not provided
            },
            {
                "path": _maf_path,
                "name": "23Q4_MAF_mutations",  # optional, will use file name if not provided
                "format": "Raw",  # or "NumericMatrixCSV" or "TableCSV"
                "encoding": "utf-8",  # optional (but recommended), will use iso-8859-1 if not provided
            },
        ],
        add_all_existing_files=True,
    )



In [None]:
# Export only the rows flagged for rescue.
# NOTE(review): unlike the other exports this call does not pass index=False,
# so the DataFrame index is written as the first CSV column — confirm this is intended.
maf_variants_23q4_clean_v3_final.query("(rescue)").to_csv("23Q4_rescue_list.csv")

In [None]:
# Upload the rescue list as a new version of its Taiga dataset.
# There is no create branch here: nothing is uploaded when `update` is False.
update = True
if update:
    new_dataset_id = tc.update_dataset(
        "23q4-mutation-rescuelist-6aa7",
        changes_description="this is a union set of the variants that are considered as whitelisting mutation", # optional (but recommended)
        upload_files=[
            {
                "path": "23Q4_rescue_list.csv",
                "name": "RescueList", # optional, will use file name if not provided
                "format": "TableCSV", # or "NumericMatrixCSV" or "TableCSV"
                "encoding": "utf-8" # optional (but recommended), will use iso-8859-1 if not provided
            }
        ],
        add_all_existing_files=True,
    )

In [None]:
# Split the final MAF into one tab-separated file per selected oncotree code.

# for cancer_type in maf_variants_23q4_clean.oncotree.unique():
for cancer_type in ('READ', 'ESCA', 'AML', 'LUSC', 'SKCM', 'LUAD', 'BRCA'):
    _in_cancer_type = maf_variants_23q4_clean_v3_final["oncotree"] == cancer_type
    cancer_type_maf = maf_variants_23q4_clean_v3_final.loc[_in_cancer_type]
    cancer_type_maf.to_csv(f"23Q4_somatic_mutations_profile_drafted_{cancer_type}_final.maf", index=False, sep='\t')

In [None]:
# Only two St Jude InDels are still missing now:

## MONOMAC1	chr11.47355346.GC.	chr11.47355353.CGCC.CC	InDel	TP53	NA
## SKNAS	chr6.156778971..GCA	chr6.156778980.GG.GCAGG	InDel
## For St Jude results to be left-aligned as well
# Shape of the subset of final rows whose `pos` equals one of the listed
# positions minus 1 (matched on position only, not on chromosome).
# NOTE(review): the -1 shift presumably accounts for the left-alignment
# mentioned above — confirm.
maf_variants_23q4_clean_v3_final.loc[maf_variants_23q4_clean_v3_final.pos.isin(np.array([105272634, 41224922, 124064038, 48791113, 23803335, 7675067, 47355346, 68293321, 156778971, 226064457, 49041176]) - 1), ].shape

# Check sample specific question

## 0. EGFR mutations

In [None]:
# EGFR variants in the PC9 and HCC827 cell lines, with coordinates and protein change.
maf_variants_23q4_clean_v3_final.loc[(maf_variants_23q4_clean_v3_final.Hugo_Symbol == 'EGFR') & (maf_variants_23q4_clean_v3_final.celltype.isin(['PC9', 'HCC827'])), ['Chromosome', 'celltype', 'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2', 'uniprot_id', 'Protein_Change']]

## 1. The mutations of MTAP don't match with uniprot and ncbi

I found that many of the mutations of MTAP (Uniprot ID: Q13126) don’t match any sequence from UniProt or NCBI. For example, HCT116 has a mutation p.V106A, but on UniProt, residue 106 should be R. Many other MTAP mutations have the same problem.

Answer: We do not have such a protein version in this draft. We also checked the original MAF output from vcf_to_depmap, and there is still no such position.

In [None]:
# All MTAP variants in the final 23Q4 table.
maf_variants_23q4_clean_v3_final.loc[(maf_variants_23q4_clean_v3_final.Hugo_Symbol == 'MTAP'), ['Chromosome', 'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2', 'uniprot_id', 'Protein_Change']]

In [None]:
# First 8 MTAP rows of `release_maf_23q2`, for comparison with the table above.
release_maf_23q2.loc[release_maf_23q2.loc[:, 'HugoSymbol'] == 'MTAP', ['Chrom', 'Pos', 'Ref', 'Alt', 'HugoSymbol', 'UniprotID', 'ProteinChange', 'DNAChange']].head(8)

## KRAS G12D mutation

The PANC1005 line is well characterized as having a KRAS G12D mutation, but this is missing from the latest release of DepMap data while present in the older releases. Is this indeed the case or have I made a mistake in querying the data?

Answer: We do find the G12D mutation (Gly12Asp) in this table. We double-checked 23Q2, and it is not there.

In [None]:
# KRAS protein changes recorded for the PANC1005 cell line.
maf_variants_23q4_clean_v3_final.loc[(maf_variants_23q4_clean_v3_final.celltype == 'PANC1005') & (maf_variants_23q4_clean_v3_final.Hugo_Symbol == 'KRAS'), ['uniprot_id', 'Protein_Change']]

In [None]:
# Rows of `release_maf_23q2` whose protein change is p.G12D.
# The filter is on ProteinChange only, so it is not restricted to KRAS.
ids_23q2_g12d = release_maf_23q2.loc[release_maf_23q2.loc[:, 'ProteinChange'] == 'p.G12D', ['ModelID', 'Ref', 'Alt', 'HugoSymbol', 'UniprotID', 'ProteinChange', 'DNAChange']]

In [None]:
# Number of metadata rows that belong to a p.G12D model and are named PANC1005.
oncotree_meta_profile.loc[oncotree_meta_profile.ModelID.isin(ids_23q2_g12d.ModelID), 'StrippedCellLineName'].isin(['PANC1005']).sum()

## Mutation and protein change

“I have spent a bit of time trying to align the residue numbering on
targets/mutations in DepMap to other public databases.  For some reason the
sequence variants DepMap uses for assigning residue numbers are different
than those used for all other datasources I use.  For example on EGFR, the
common mutations L858R and T790M are listed as “L813R” and “T745M”.  Other
databases like COSMIC, TCGA/cBioPortal, and our own internal database do
not have this issue – the residue numbering on 99%+ of proteins is
identical between the other databases, but then DepMap numbering is
different.”

Answer: L858R and T790M now can be found in the 23Q4 version. 

In [None]:
# L858R: list every EGFR protein change containing 'Leu', to be inspected by eye.
maf_variants_23q4_clean_v3_final.query("Hugo_Symbol == 'EGFR' & Protein_Change.str.contains('Leu')")['Protein_Change']

In [None]:
# Thr790Met -> T790M: list every EGFR protein change containing 'Thr'.
maf_variants_23q4_clean_v3_final.query("Hugo_Symbol == 'EGFR' & Protein_Change.str.contains('Thr')")['Protein_Change']

## Incorrect germline mutation
I had a question that I didn’t want to post to the forum but ask rather privately:
In the gene RBM10:

  *   ACH-000875 has a variant c.e20+1G>T marked as deleterious in 22q2 but not deleterious in 23q2
  *   ACH-000414 (NCIH1944) has a frameshift p.P617fs in 22q2 but SNP p.V616F in 23q2
Could you please clarify the first point as to changes in variant classification and for the second point what changed in the variant caller to call a frameshift a SNP instead? According to CBIOportal CCLE data on NCIH1944 it should also have a frameshift A540Rfs*10 but I don’t think this is in the mutation calls. Would be great to understand the underlying reason.


Answer: 
1. ACH-000875 will have this high-impact mutation marked as deleterious in 23Q4.
2. We will have both frameshift ENSP00000366829.3:p.Ala618ArgfsTer10 and SNP p.Val616Phe for 23Q4.

In [None]:
# RBM10 variants of model ACH-000875: VEP impact, DNA change and protein change.
maf_variants_23q4_clean_v3_final.query("ModelID == 'ACH-000875' & Hugo_Symbol == 'RBM10'")[['vep_impact', 'dna_change', 'Protein_Change']]

In [None]:
# RBM10 variants of model ACH-000414, additionally showing `variant_info`.
maf_variants_23q4_clean_v3_final.query("ModelID == 'ACH-000414' & Hugo_Symbol == 'RBM10'")[['vep_impact', 'variant_info', 'dna_change', 'Protein_Change']]

## L858R hotspot

L858R hotspot is being annotated as L813R and exon 19 deletions should be between 729-761 and most common in-frame del in our data is annotated at 701. Wanted to flag this for you and something to check with our new annotations.

Answer: this has been annotated as ENSP00000275233.7:p.Leu858ArgfsTer47 by VEP now. We also have a few missense mutations

In [None]:
# Drop rows without a protein change first, so the substring match below
# operates on non-null values only.
maf_variants_23q4_clean_l585r = maf_variants_23q4_clean_v3_final.dropna(subset=['Protein_Change'])
_has_leu858arg = maf_variants_23q4_clean_l585r['Protein_Change'].str.contains('Leu858Arg')
maf_variants_23q4_clean_l585r.loc[_has_leu858arg, ['Protein_Change', 'vep_impact', 'variant_info']]

## Start codon change classification


- The OmicsSomaticMutationsMAFProfile.maf file registers missense mutants in the start codon as “silent” when in fact the mutation will most likely lead to a loss of function. I’ve noticed this in one specific gene of interest, but there could be more.
Interestingly, the OmicsSomaticMutations.csv file recognizes it as a START_CODON_SNP and marks the mutant as gene_function_loss .
The contents of OmicsSomaticMutationsMAFProfile.maf
GNA11 GRCh38 19 3094654 3094655 SNP G G A PR-Ds9QdK Silent p.M1I
GNA11 GRCh38 19 3094654 3094655 SNP G G A PR-Yz6AIC Silent p.M1I

- The contents of OmicsSomaticMutations.csv
chr19,3094654,G,A,0.286,9,3,0/1,,SNP,START_CODON_SNP,c.3G>A,p.M1I,GNA11,G protein subunit alpha 11,"G protein subunits alpha, group q",ENST00000078429.9,1.0,+,P29992,False,,,,0.8029925187032418,E,Dom,False,,False,,gene_function_loss;,False,False,False,,,,,7.3,False,True,False,,0.921563,0.9822536945303076,0.596,4.0,,,,,,,ACH-000995,2767.0


In [None]:
# Classification and VEP impact of variants starting at position 3094654.
# The filter is on Start_Position only; Chromosome is shown but not constrained.
maf_variants_23q4_clean_v3_final.query("Start_Position == 3094654")[['Chromosome', 'Start_Position', 'Variant_Classification', 'vep_impact']]

# Statistics Compared to Past release

In [None]:
# Count variants per Variant_Classification for each release.
#
# The output columns are named explicitly via rename_axis / reset_index(name=...)
# instead of renaming whatever labels value_counts() happens to produce. Those
# labels differ between pandas versions (0 / 'index' in 1.x, 'count' in 2.x),
# and the previous renames silently produced the wrong columns on pandas 2.x.
# Both frames end up with exactly ['Variant_Classification', 'counts'].
release_maf_23q2_stat = (
    release_standard_maf_23q2['Variant_Classification']
    .value_counts()
    .rename_axis('Variant_Classification')
    .reset_index(name='counts')
)
release_maf_23q4_stat = (
    maf_variants_23q4_clean_v3_final['Variant_Classification']
    .value_counts()
    .rename_axis('Variant_Classification')
    .reset_index(name='counts')
)

In [None]:
# Tag each count table with its release label.
release_maf_23q2_stat['release'] = '23Q2'
release_maf_23q4_stat['release'] = '23Q4'

In [None]:
ccolor = '#555555'
dodge_text = position_dodge(width=0.9)

# Dodged bars of variant counts per classification, filled by release.
_release_stats = pd.concat([release_maf_23q2_stat, release_maf_23q4_stat], axis=0)
(
    ggplot(_release_stats, aes(x='Variant_Classification', y='counts', fill='release'))
    + geom_col(stat='identity', position='dodge')
    + theme(
        axis_line_x=element_line(color='black'),
        # Vertical tick labels for the classification names.
        axis_text_x=element_text(color=ccolor, angle=90, va='top'),
        panel_border=element_blank(),
    )
)