In [None]:
from taigapy import TaigaClient
# Client object used below to fetch a dataset file.
tc = TaigaClient()

# Fetch one specific, pinned file: the dataset name and version are fixed so the
# same table is retrieved on every run.
OmicsSomaticMutationsProfile = tc.get(
    name="internal-23q2-1e49",
    version=97,
    file="OmicsSomaticMutationsProfile",
)

In [None]:
# Show the freshly loaded mutation table as this cell's output for a quick visual check.
OmicsSomaticMutationsProfile

In [None]:
from collections import Counter

def annotateLikelyImmortalized(
    maf,
    sample_col="ProfileID",
    genome_change_col="DNAChange",
    chrom_col="Chrom",
    pos_col="Pos",
    hotspotcol="cosmic_hotspot",
    max_recurrence=0.05,
):
    """Flag mutations that recur across too many rows as likely immortalized.

    Each row gets a mutation key "<chrom>_<pos>_<genome change>". A row is flagged
    when its key occurs in strictly more than ``max_recurrence * n_samples`` rows,
    where ``n_samples`` is the number of distinct values in ``sample_col``.

    The frame is modified IN PLACE (two columns are added or overwritten) and the
    same object is returned; pass ``maf.copy()`` to keep the input untouched.

    Args:
        maf (pandas.DataFrame): mutation table containing the columns named by
            sample_col, genome_change_col, chrom_col and pos_col.
        sample_col (str, optional): column holding the sample/profile id.
            Defaults to "ProfileID".
        genome_change_col (str, optional): column holding the genome change.
            Defaults to "DNAChange".
        chrom_col (str, optional): column holding the chromosome (must support
            string concatenation). Defaults to "Chrom".
        pos_col (str, optional): column holding the position; converted to str
            when building the key. Defaults to "Pos".
        hotspotcol (str, optional): currently unused; kept so existing callers
            that pass it keep working. Defaults to "cosmic_hotspot".
        max_recurrence (float, optional): fraction of distinct samples above
            which a mutation is flagged. Defaults to 0.05.

    Returns:
        pandas.DataFrame: the same ``maf`` object with two added columns:
            "LikelyImmortalized" (bool) and "combined_mut" (the mutation key).
    """
    # Created first so the two added columns keep their original order.
    maf["LikelyImmortalized"] = False
    maf["combined_mut"] = (
        maf[chrom_col] + "_" + maf[pos_col].astype(str) + "_" + maf[genome_change_col]
    )

    n_samples = len(set(maf[sample_col]))

    # Number of rows sharing each row's key. value_counts() skips missing keys, so
    # rows whose key could not be built get NaN here and are never flagged.
    occurrences = maf["combined_mut"].map(maf["combined_mut"].value_counts())
    is_recurrent = occurrences > max_recurrence * n_samples

    # Assign positionally rather than through index labels: selecting by label
    # would also flag unrelated rows whenever the frame's index is not unique.
    maf["LikelyImmortalized"] = is_recurrent.to_numpy()
    return maf

In [None]:
# Annotate the mutation table. The function adds its columns to the frame it is
# given and returns that same object, so `annotated_maf` and
# `OmicsSomaticMutationsProfile` refer to one and the same table after this cell.
annotated_maf = annotateLikelyImmortalized(OmicsSomaticMutationsProfile)

In [None]:
# For every row, record how many rows of the table carry the same mutation key.
annotated_maf["occurrence"] = annotated_maf["combined_mut"].map(
    annotated_maf["combined_mut"].value_counts()
)

In [None]:
# Spot check: number of rows carrying one specific mutation key.
int((annotated_maf["combined_mut"] == "chr1_13225068_c.653T>C").sum())

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Number of rows in which each distinct mutation key occurs. The name `x` is kept
# because the broken-axis plot in the following cell reads it.
x = annotated_maf["combined_mut"].value_counts().tolist()

# Labelled so the figure can be read on its own when the notebook is skimmed.
plt.hist(x, bins=200)
plt.xlabel("Occurrences of a mutation (rows sharing the same key)")
plt.ylabel("Number of distinct mutations")
plt.title("Recurrence of mutations across the table")
plt.show()

In [None]:
# histogram with a broken y-axis
import numpy as np
import matplotlib.pyplot as plt

# Per-mutation occurrence counts computed in the histogram cell above.
pts = x

# Recurrence cutoff, in rows: the same rule annotateLikelyImmortalized applies with
# its default max_recurrence (0.05 times the number of distinct profiles). Derived
# from the data so the marker stays correct if the dataset changes.
recurrence_cutoff = 0.05 * len(set(annotated_maf["ProfileID"]))

# If we were to simply plot pts, we'd lose most of the interesting
# details due to the outliers. So let's 'break' or 'cut-out' the y-axis
# into two portions - use the top (ax1) for the outliers, and the bottom
# (ax2) for the details of the majority of our data
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
fig.subplots_adjust(hspace=0.05)  # adjust space between axes
fig.set_size_inches(16, 10)

# plot the same data on both axes
ax1.hist(pts, bins=200)
ax2.hist(pts, bins=200)

# zoom-in / limit the view to different portions of the data
ax1.set_ylim(50, 1300000)  # outliers only
ax2.set_ylim(0, 50)  # most of the data

# hide the spines between ax1 and ax2
ax1.spines.bottom.set_visible(False)
ax2.spines.top.set_visible(False)
ax1.xaxis.tick_top()
ax1.tick_params(labeltop=False)  # don't put tick labels at the top
ax2.xaxis.tick_bottom()

# Now, let's turn towards the cut-out slanted lines.
# We create line objects in axes coordinates, in which (0,0), (0,1),
# (1,0), and (1,1) are the four corners of the axes.
# The slanted lines themselves are markers at those locations, such that the
# lines keep their angle and position, independent of the axes size or scale
# Finally, we need to disable clipping.
d = .5  # proportion of vertical to horizontal extent of the slanted line
kwargs = dict(marker=[(-1, -d), (1, d)], markersize=12,
              linestyle="none", color='k', mec='k', mew=1, clip_on=False)
ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs)
ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs)

# Draw the cutoff on BOTH panels: pyplot's axvline only touches the current axes,
# which would leave the upper panel without the marker.
ax1.axvline(x=recurrence_cutoff, color='r', label='0.05 cutoff')
ax2.axvline(x=recurrence_cutoff, color='r')
ax1.legend()  # without a legend the line's label is never shown

ax1.set_title("Recurrence of mutations (broken y-axis)")
ax2.set_xlabel("Occurrences of a mutation (rows sharing the same key)")
ax2.set_ylabel("Number of distinct mutations")

plt.show()

In [None]:
# Number of distinct mutation keys among the rows flagged as likely immortalized.
len(annotated_maf.loc[annotated_maf["LikelyImmortalized"], "combined_mut"].unique())

In [None]:
# Cutoff in rows: the number of distinct profiles scaled by the 0.05 recurrence rate.
0.05 * len(set(annotated_maf["ProfileID"]))

In [None]:
# Show only the most recurrent mutation keys with their counts. Listing every
# (mutation, count) pair would flood the cell output, since the table presumably
# holds a very large number of distinct mutations.
Counter(annotated_maf["combined_mut"].tolist()).most_common(20)