In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Root folder of the source data. NOTE(review): presumably read by the project
# modules imported further down (e.g. `conf`) — confirm.
# A PATH_SOURCE_DATA value that is already set (for instance exported before
# launching Jupyter) takes precedence; the absolute path below is only the
# fallback default, so the notebook can run outside the original workspace.
os.environ.setdefault('PATH_SOURCE_DATA', '/workspace/projects/boostdm/nature-release/source-data')

# Figure 2a.

In [None]:
import sys
sys.path.append('./scripts/')
import warnings
warnings.filterwarnings('ignore')

from fetch_data_needle import create_observed_dataset, get_mutations, get_plot_data
from plot_needle import plot_observed_distribution

In [None]:
# Fetch the observed mutations once at module level; needleplot() below reads
# this global on every call instead of re-fetching it.
obs_muts = get_mutations()

def needleplot(gene, ttype, plotname=None):
    """Draw the needle plot of observed mutations for one gene / tumor type.

    Builds the observed dataset for ``(gene, ttype)`` from the module-level
    ``obs_muts``, derives the plotting inputs with ``get_plot_data`` and hands
    them to ``plot_observed_distribution``.

    Parameters
    ----------
    gene, ttype
        Passed through to ``create_observed_dataset`` and to the plotting call.
    plotname : optional
        Forwarded unchanged as the ``plotname`` keyword of
        ``plot_observed_distribution`` (default ``None``).
    """
    observed = create_observed_dataset(gene, ttype, obs_muts)
    plot_observed_distribution(
        gene,
        ttype,
        *get_plot_data(observed),
        plotname=plotname,
    )

In [None]:
# Needle plot for EGFR in the LUAD cohort; the figure goes under ./raw_plots.
gene = 'EGFR'
ttype = 'LUAD'
needleplot(
    gene,
    ttype,
    plotname=f'./raw_plots/{gene}.{ttype}',
)

In [None]:
# Needle plot for EGFR in the GBM cohort; the figure goes under ./raw_plots.
gene = 'EGFR'
ttype = 'GBM'
needleplot(
    gene,
    ttype,
    plotname=f'./raw_plots/{gene}.{ttype}',
)

# Figure 2c.

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from matplotlib import cm
from sklearn.decomposition import PCA
cmap = cm.RdYlGn_r

import conf
from clustering import generate_hierarchy, draw_flat_cluster
from retrieve import get_shaps

In [None]:
# Cohort table.
df_stats = pd.read_csv(conf.cohorts_path, sep="\t")

# Map each (gene, ttype) pair, and each gene alone, to the color of its role
# (mode of action). Roles missing from conf.dict_colors_role fall back to grey.
# When a key occurs on several rows, the last row wins.

dg = pd.read_csv(conf.drivers_path, sep='\t')
d = {
    (symbol, cancer_type): conf.dict_colors_role.get(role, '#808080')
    for symbol, cancer_type, role in zip(dg.SYMBOL, dg.CANCER_TYPE, dg.ROLE)
}
d_gene = {
    symbol: conf.dict_colors_role.get(role, '#808080')
    for symbol, role in zip(dg.SYMBOL, dg.ROLE)
}

# Load the observed mutations produced by the boostDM discovery step.

mutations = pd.read_csv(
    os.path.join(conf.output_boostdm, 'discovery', 'mutations.tsv'),
    sep='\t',
)

# SHAP table built from the observed mutations (no gene / ttype given).
df = get_shaps(mutations, None, None)

In [None]:
def low_rank(gene, ttype, n_components=10):
    """Fetch the SHAP table for a selection and project its features with PCA.

    Parameters
    ----------
    gene, ttype
        Forwarded as the second and third positional arguments of
        ``get_shaps``. The notebook passes ``None`` for both.
        NOTE(review): assumes ``get_shaps`` treats them as gene / tumor-type
        selectors and that ``None`` means "no restriction" — confirm against
        ``retrieve.get_shaps``.
    n_components : int, optional
        Number of principal components to keep (default 10, the value that
        was previously hard-coded).

    Returns
    -------
    tuple
        ``(features, projected, moa_colors, complexity_colors)`` where
        ``features`` is the SHAP table restricted to ``conf.features``,
        ``projected`` is the whitened PCA projection of ``features``, and the
        last two items are the ``.values`` of the ``'moa'`` and
        ``'linear_complexity'`` columns taken before the column restriction.
    """
    # The original body always called get_shaps(mutations, None, None) and so
    # silently ignored both arguments; they are now passed through.
    shaps = get_shaps(mutations, gene, ttype)

    # Pull the annotation columns out before narrowing to the feature columns.
    moa_colors = shaps['moa'].values
    complexity_colors = shaps['linear_complexity'].values

    features = shaps[conf.features]
    pca = PCA(n_components=n_components, whiten=True)
    projected = pca.fit_transform(features)
    return features, projected, moa_colors, complexity_colors


def cluster_heatmap(gene, ttype, mode='flat', labels=None, title=None, fn=None):
    """Cluster the SHAP features hierarchically and draw the heatmap.

    Parameters
    ----------
    gene, ttype
        Passed to ``low_rank`` to obtain the feature table.
    mode : str, optional
        ``'flat'`` draws the plain heatmap; ``'moa'`` draws it with the
        ``moa_colors`` annotation. Any other value draws nothing.
    labels : optional
        Accepted for interface compatibility; not used by this function.
    title, fn : optional
        Forwarded unchanged to ``draw_flat_cluster``.

    Returns
    -------
    The element at index 2 of the label list returned by
    ``generate_hierarchy``, regardless of ``mode``.
    """
    features, _, moa_colors, _ = low_rank(gene, ttype)
    X, linkage, _, _, labels_list = generate_hierarchy(features)
    flat_labels = labels_list[2]

    # Only the 'moa' mode passes the extra color annotation.
    extra = {'moa_colors': moa_colors} if mode == 'moa' else {}

    if mode in ('flat', 'moa'):
        draw_flat_cluster(
            X, linkage, features.columns, flat_labels, './raw_plots',
            mode=mode, title=title, fn=fn, **extra,
        )

    return flat_labels

In [None]:
# Hierarchical clustering of the unique observed mutations (per gene-ttype)
# that are covered by specific models and called positive by boostDM.
# No gene / ttype is given; the plain ('flat') heatmap is drawn.

labels_hierarchical = cluster_heatmap(
    None,
    None,
    mode='flat',
    title='Specific Models',
    fn='observed.specific.driver.hierarchy',
)

In [None]:
# Same clustering, drawn with the mode-of-action color annotation.
l = cluster_heatmap(
    None,
    None,
    mode='moa',
    title='Specific Models',
    fn='observed.specific.driver.moa',
)