In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from bgreference import hg38
from scipy.stats import mannwhitneyu
from glob import glob
import seaborn as sns
import matplotlib as mpl
from tqdm import tqdm
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import json


In [None]:
# config for matplotlib
def config_params(font_size=7):
    """Reset matplotlib's rcParams and apply this notebook's figure style.

    The rcParams are first restored to ``mpl.rcParamsDefault`` and then a
    fixed set of font / SVG / mathtext overrides is applied on top.

    Parameters
    ----------
    font_size : int or float, optional
        Value stored in ``font.size`` (default 7).
    """
    # Start from a clean slate so earlier style changes do not leak in.
    mpl.rcParams.update(mpl.rcParamsDefault)

    # Overrides are applied one by one, in this order.
    overrides = (
        ('font.sans-serif', ['arial']),
        ('font.size', font_size),
        ('font.family', ['sans-serif']),
        ('svg.fonttype', 'none'),
        ('mathtext.fontset', 'custom'),
        ('mathtext.cal', 'arial'),
        ('mathtext.rm', 'arial'),
    )
    for key, value in overrides:
        plt.rcParams[key] = value
    
def create_snv_class(df):
    """Return the mutation-class label ``5'[REF>ALT]3'`` for one record.

    Parameters
    ----------
    df : mapping (e.g. a DataFrame row)
        Must provide ``'TRIPLET'`` (a 3-character sequence whose middle
        character is the reference base) and ``'alt'`` (the alternate base).

    Returns
    -------
    str
        The label as read from the triplet when the reference base is ``C``
        or ``T``; otherwise every base is complemented and the flanks are
        swapped, so the label is always expressed with a C/T reference.

    Raises
    ------
    KeyError
        In the complemented branch, if a base is not one of A, C, G, T.
    """
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}

    triplet = df['TRIPLET']
    left, ref, right = triplet[0], triplet[1], triplet[2]
    alt = df['alt']

    if ref not in ('C', 'T'):
        # Reverse-complement: the 3' flank becomes the 5' flank and vice versa.
        left, ref, alt, right = (
            complement[right],
            complement[ref],
            complement[alt],
            complement[left],
        )

    return '{}[{}>{}]{}'.format(left, ref, alt, right)
    
def create_xticklabels(ax1, ax2, ax3, ax4, subset, ):
    """Label four stacked x-axes that share one row-per-bar layout.

    Each row of ``subset`` corresponds to one x position. The four axes are
    labelled from the columns of ``subset`` as follows:

    * ``ax1`` - one tick per row, labelled with ``subset['alt']``; the first
      three labels of every block of six are coloured red.
    * ``ax2`` - one tick every three rows (positions 1, 4, 7, ...), labelled
      with ``subset['REF']``; spine moved to 15% below the axes.
    * ``ax3`` - one tick per row, labelled with ``subset['AA_new']``; the
      first nine labels of every block of eighteen are coloured dark blue;
      spine moved to 30% below the axes.
    * ``ax4`` - one tick every nine rows (positions 4, 13, ...), labelled
      with ``subset['AA_old']``; spine moved to 40% below the axes.

    The 3 / 9 grouping presumably reflects three alternate alleles per
    genomic position and nine mutations per codon - TODO confirm with the
    data producer.

    Lengths that are not a multiple of the block sizes are handled: partial
    trailing blocks are coloured as far as they go, and no tick is placed at
    or beyond ``len(subset)``.

    Parameters
    ----------
    ax1, ax2, ax3, ax4 : matplotlib axes
        The host axis and three twin axes (modified in place).
    subset : pandas.DataFrame
        Must contain the columns ``alt``, ``REF``, ``AA_new`` and ``AA_old``.
    """
    n_rows = len(subset)
    every_row = np.arange(n_rows)

    # ---- ax1: alternate allele under every bar -------------------------
    ax1.set_xticks(every_row)
    ax1.set_xticklabels(subset['alt'].tolist())

    # Alternate the label colour in groups of three (red / default).
    # Slicing clips at the end of the list, so a partial final block cannot
    # index out of range.
    alt_labels = ax1.get_xticklabels()
    for start in range(0, n_rows, 6):
        for label in alt_labels[start:start + 3]:
            label.set_color("red")

    ax1.set_xlim(0, n_rows)

    # ---- ax2: reference base, once per group of three ------------------
    ax2.xaxis.set_ticks_position("bottom")
    ax2.xaxis.set_label_position("bottom")
    ax2.spines["bottom"].set_position(("axes", -0.15))
    ax2.set_xlim(0, n_rows)

    # Stop at n_rows (not n_rows + 1) so the last tick always has a row to
    # take its label from.
    ref_ticks = list(range(1, n_rows, 3))
    ref_values = subset['REF'].tolist()
    ax2.set_xticks(ref_ticks)
    ax2.set_xticklabels([ref_values[i] for i in ref_ticks])

    # ---- ax3: 'AA_new' under every bar ---------------------------------
    ax3.set_ylim(-0.0001, 0.005)
    ax3.set_xlim(0, n_rows)

    # Move the twinned axis ticks and label from top to bottom, and offset
    # the spine below the host axis.
    ax3.xaxis.set_ticks_position("bottom")
    ax3.xaxis.set_label_position("bottom")
    ax3.spines["bottom"].set_position(("axes", -0.30))
    ax3.spines["bottom"].set_visible(True)

    ax3.set_xticks(every_row)
    ax3.set_xticklabels(subset['AA_new'].tolist())

    # Alternate the label colour in groups of nine (dark blue / default).
    aa_labels = ax3.get_xticklabels()
    for start in range(0, n_rows, 18):
        for label in aa_labels[start:start + 9]:
            label.set_color("darkblue")

    # ---- ax4: 'AA_old', once per group of nine -------------------------
    ax4.set_ylim(-0.0001, 0.005)
    ax4.set_xlim(0, n_rows)

    ax4.xaxis.set_ticks_position("bottom")
    ax4.xaxis.set_label_position("bottom")
    ax4.spines["bottom"].set_position(("axes", -0.4))
    ax4.spines["bottom"].set_visible(True)

    old_aa_ticks = list(range(4, n_rows, 9))
    old_aa_values = subset['AA_old'].tolist()
    ax4.set_xticks(old_aa_ticks)
    ax4.set_xticklabels([old_aa_values[i] for i in old_aa_ticks])

def plot_subset(subset_df, NAME, positions_wanted, dic_color_sigs, dic_sigs,
                list_signatures=['Signature.1', 'Signature.2', 'Signature.5']):
    """Draw two figures for a window of mutations.

    Figure 1 is a stacked bar chart: for every row of ``subset_df`` the
    values ``dic_sigs[sig][CLASS]`` of the selected signatures are rescaled
    to sum to 1 and stacked, one colour per signature.  Figure 2 is a line
    plot of ``subset_df['PROBABILITY']``.

    Parameters
    ----------
    subset_df : pandas.DataFrame
        One row per bar.  Needs the columns ``CLASS`` and ``PROBABILITY``
        plus those read by ``create_xticklabels`` (``alt``, ``REF``,
        ``AA_new``, ``AA_old``).
    NAME : str
        Currently unused; kept for interface compatibility.
    positions_wanted : list
        Currently unused; kept for interface compatibility.
    dic_color_sigs : dict
        Signature name -> matplotlib colour.
    dic_sigs : dict
        Signature name -> {mutation class -> value}.
    list_signatures : list of str, optional
        Signatures to stack, bottom to top.  The default list is never
        mutated.

    Notes
    -----
    All lengths are taken from ``subset_df`` itself, so the function does
    not depend on any notebook-level variable.  Rows whose selected
    signature values sum to zero produce NaN bar heights.
    """
    n_rows = len(subset_df)
    x_positions = np.arange(n_rows)
    classes = subset_df['CLASS'].tolist()

    # ---- Figure 1: relative signature contribution per mutation --------
    fig = plt.figure(figsize=(10, 2))
    ax1 = fig.add_subplot(111)
    ax2 = ax1.twiny()
    ax3 = ax1.twiny()
    ax4 = ax1.twiny()

    ax1.set_ylabel('Probability')

    # Look each signature up once, then derive the per-row total from it.
    per_signature = [
        np.array([dic_sigs[sig][c] for c in classes]) for sig in list_signatures
    ]
    total = np.zeros(n_rows)
    for vals in per_signature:
        total += vals

    bottom = np.zeros(n_rows)
    for sig, vals in zip(list_signatures, per_signature):
        share = vals / total
        ax1.bar(x_positions, share, color=dic_color_sigs[sig], bottom=bottom)
        bottom += share

    create_xticklabels(ax1, ax2, ax3, ax4, subset_df)
    # Set after create_xticklabels, which also calls set_ylim on the twin axes.
    ax1.set_ylim(0, 1)

    plt.show()

    # ---- Figure 2: PROBABILITY column along the same window ------------
    fig = plt.figure(figsize=(10, 2))
    ax1 = fig.add_subplot(111)
    # NOTE(review): these twin axes are created but never labelled;
    # presumably create_xticklabels was meant to be called here too - confirm.
    ax2 = ax1.twiny()
    ax3 = ax1.twiny()
    ax4 = ax1.twiny()

    ax1.set_ylabel('MutRate')

    ax1.plot(subset_df['PROBABILITY'].tolist(), color='darkred')
    ax1.set_xlim(0, n_rows)

    plt.show()

def order_muts():
    """Return the 96 mutation-class labels ``5'[REF>ALT]3'`` in a fixed order.

    Ordering, from slowest- to fastest-varying: reference base (C, then T),
    alternate base (A, C, G, T, skipping the reference), 5' flank
    (A, C, G, T), 3' flank (A, C, G, T).

    Returns
    -------
    list of str
        For example ``'A[C>A]A'`` first and ``'T[T>G]T'`` last.
    """
    bases = 'ACGT'
    return [
        '{}[{}>{}]{}'.format(five_prime, ref, alt, three_prime)
        for ref in 'CT'
        for alt in bases
        if alt != ref
        for five_prime in bases
        for three_prime in bases
    ]
    
# Apply the notebook-wide matplotlib style (rcParams reset + font size 7).
config_params(7)

# Generate the files

In [None]:
# Signature table, read from a tab-separated file relative to the working directory.
signatures_file = 'SigProfiler_COSMIC.deconstructsigs.tsv'
df_sigs = pd.read_csv(signatures_file, sep ='\t')
# orient='index' gives {row label: {column name: value}}.
# NOTE(review): downstream code indexes this as dic_sigs[signature][mutation class],
# so the rows are presumably signatures and the columns mutation classes - confirm
# against the file (no index_col is passed to read_csv).
dic_sigs = df_sigs.to_dict(orient='index')


In [None]:
# Directory that receives the per-file '*.out.signatures.gz' tables written below.
path_out = '/workspace/datasets/boostdm_runs/20200205//saturation_prediction/signatures/'

In [None]:
# Annotate every '*.prediction.out.gz' table with its trinucleotide context,
# mutation class and one normalised column per signature, then write the
# result to `path_out` as '*.prediction.out.signatures.gz'.

# Create the output directory up front so the first to_csv cannot fail on a
# missing folder.
os.makedirs(path_out, exist_ok=True)

for f in tqdm(glob('/workspace/datasets/boostdm_runs/20200205//saturation_prediction/*.prediction.out.gz')):
    df = pd.read_csv(f, sep='\t')

    # Three reference bases starting one position before `pos`
    # (presumably the mutated base with one flanking base on each side -
    # confirm against the bgreference documentation).
    df['TRIPLET'] = df.apply(lambda x: hg38(x['chr'], int(x['pos']) - 1, 3), axis=1)
    df['CLASS'] = df.apply(create_snv_class, axis=1)

    for signature in dic_sigs:
        # Value of this signature for each row's class, rescaled so the
        # column sums to 1 over the rows of this file.
        df[signature] = df['CLASS'].map(dic_sigs[signature])
        df[signature] = df[signature] / df[signature].sum()

    outname = os.path.join(path_out, os.path.basename(f).replace('out.gz', 'out.signatures.gz'))
    df.to_csv(outname, sep='\t', index=False, header=True, compression='gzip')

In [None]:
# Colour assigned to each signature (hex code or named matplotlib colour).
# Passed to plot_subset, which uses dic_color_sigs[sig] as the bar colour.
# NOTE(review): 'Signature.19' and 'Signature.29' share 'grey', and
# 'Signature.22' is 'white' - check these are distinguishable if plotted together.
dic_color_sigs = {
        'Signature.5': '#F2BD1F',
        'Signature.15': '#b15928',
        'Signature.2': 'darkred',
        'Signature.4': '#6a3d9a',
        'Signature.9': '#cab2d6',
        'Signature.10': '#e31a1c',
        'Signature.18': '#c69f04',
        'Signature.26': '#fdbf6f',
        'Signature.17': '#33a02c',
        'Signature.7': 'black',
        'Signature.13': '#D32181',
        'Signature.28': '#b2df8a',
        'Signature.1': '#4188C6',
        'Signature.27': '#a6cee3',
        'Signature.16': 'darkgreen',
        'Signature.14': '#aa00ff',
        'Signature.3': '#38aa9d',
        'Signature.6': '#9992aa',
        'Signature.12': '#aaa1a1',
        'Signature.30': '#7d3a3b',
        'Signature.11': 'green',
        'Signature.19': 'grey',
        'Signature.20': 'pink',
        'Signature.21': 'blue',
        'Signature.22': 'white',
        'Signature.23': 'darkblue',
        'Signature.24': 'orange',
        'Signature.25': 'darkorange',
        'Signature.29': 'grey',
        'Signature.8': '#E3A663'
    }

## PIK3CA

In [None]:

# Load the signature-annotated prediction table for PIK3CA (BRCA).
df = pd.read_csv('/workspace/datasets/boostdm_runs/20200205//saturation_prediction/signatures/PIK3CA.BRCA.prediction.out.signatures.gz',
                 sep='\t')

signature_columns = [x for x in df.columns if 'Signature' in x]

# Derive the columns used for the x-axis labels.
df['REF'] = df['TRIPLET'].apply(lambda x: x[1])
# NOTE(review): AA_new takes the FIRST character of `aachange` and AA_old the
# LAST.  If `aachange` follows the usual <ref><position><alt> notation these
# two are swapped relative to the KRAS cell (where AA_old is the part before
# '/') - confirm and swap if so.
df['AA_new'] = df['aachange'].apply(lambda x: x[0])
df['AA_old'] = df['aachange'].apply(lambda x: x[-1])
df['Protein_position'] = df['aachange'].apply(lambda x: int(x[1:-1]))

mutrate = '/workspace/projects/driver_potential/site_probability/mutrate_results/TCGA_WXS_BRCA.mutrate_output/norm_PIK3CA.out.json'

# Use a context manager so the file handle is closed deterministically.
with open(mutrate, 'rt') as mutrate_handle:
    dic_mutrate = json.load(mutrate_handle)

# One vector per sample; average them element-wise across samples.
toappend = list(dic_mutrate['PIK3CA'].values())
mean_context = np.mean(toappend, axis=0)

# Pair each mean value with a mutation class by position.
# Assumes the per-sample vectors are ordered like order_muts() - TODO confirm.
order_snvs = order_muts()
dic_probability = {o: mean_context[ix] for ix, o in enumerate(order_snvs)}

# Protein positions 541-549.  `.copy()` makes `subset` an independent frame
# so that adding a column does not trigger SettingWithCopyWarning.
subset = df[(df['Protein_position'] > 540) & (df['Protein_position'] < 550)].copy()
subset['PROBABILITY'] = subset['CLASS'].map(dic_probability)

# plot
plot_subset(subset, "PIK3CA", [9, 36], dic_color_sigs, dic_sigs,)

## KRAS

In [None]:
# Load the signature-annotated table for KRAS (LUAD).
df = pd.read_csv('/workspace/projects/intogen_2017/test/boostDM/output_run_global/saturation_analysis/KRAS.LUAD.annotated.out.signatures.gz',
                 sep='\t')
signature_columns = [x for x in df.columns if 'Signature' in x]

# Derive the columns used for the x-axis labels.
df['REF'] = df['TRIPLET'].apply(lambda x: x[1])

# `Amino_acids` is split on '/': the part before it becomes AA_old, the part
# after it AA_new; values without '/' are used unchanged for both.
df['AA_new'] = df['Amino_acids'].apply(lambda x: x.split('/')[1] if '/' in x else x)
df['AA_old'] = df['Amino_acids'].apply(lambda x: x.split('/')[0] if '/' in x else x)

# Rows with Protein_position strictly between 5 and 15.  `.copy()` makes
# `subset` an independent frame so that adding a column below does not
# trigger SettingWithCopyWarning.
subset = df[(df['Protein_position'] > 5) & (df['Protein_position'] < 15)].copy()

mutrate = '/workspace/projects/driver_potential/site_probability/mutrate_results/TCGA_WXS_LUAD.mutrate_output/norm_KRAS.out.json'

# Use a context manager so the file handle is closed deterministically.
with open(mutrate, 'rt') as mutrate_handle:
    dic_mutrate = json.load(mutrate_handle)

# One vector per sample; average them element-wise across samples.
toappend = list(dic_mutrate['KRAS'].values())
mean_context = np.mean(toappend, axis=0)

# Pair each mean value with a mutation class by position.
# Assumes the per-sample vectors are ordered like order_muts() - TODO confirm.
order_snvs = order_muts()
dic_probability = {o: mean_context[ix] for ix, o in enumerate(order_snvs)}
subset['PROBABILITY'] = subset['CLASS'].map(dic_probability)

# plot (rows are passed in reverse order)
plot_subset(subset[::-1], "KRAS_", [12], dic_color_sigs, dic_sigs,
            list_signatures=['Signature.1',
                             'Signature.2',
                             'Signature.5',
                             'Signature.4'])
