In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib import cm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Global matplotlib styling shared by every figure in this notebook.
SMALL_SIZE = 12
MEDIUM_SIZE = 16
BIGGER_SIZE = 22

plt.rcParams.update({
    'svg.fonttype': 'none',              # SVG font handling ('none' selects matplotlib's text-as-text mode)
    'font.size': SMALL_SIZE,             # default text size
    'axes.titlesize': SMALL_SIZE,        # axes title
    'axes.labelsize': MEDIUM_SIZE,       # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,       # x tick labels
    'ytick.labelsize': SMALL_SIZE,       # y tick labels
    'legend.fontsize': SMALL_SIZE,       # legend entries
    'figure.titlesize': BIGGER_SIZE,     # figure-level title
})

In [None]:
def get_mutated_cell_lines(ccle_df, ccle_map, cell_lines, mut_type_list):
    """Return the cell lines that carry a mutation in any of the given genes.

    Parameters
    ----------
    ccle_df : pandas.DataFrame
        Mutation table; must provide the columns 'Hugo_Symbol' and 'DepMap_ID'.
    ccle_map : dict
        Maps a 'DepMap_ID' value to a cell-line name.
    cell_lines : iterable
        Cell-line names of interest; results are restricted to these.
    mut_type_list : list
        Gene symbols matched against 'Hugo_Symbol'.

    Returns
    -------
    set
        Names from ``cell_lines`` with at least one row in ``ccle_df`` whose
        'Hugo_Symbol' is in ``mut_type_list``. IDs missing from ``ccle_map``
        are skipped.
    """
    wanted = set(cell_lines)

    # Only the ID column of the matching rows is needed, and each ID only once;
    # this avoids materialising every row with iterrows().
    mutated_ids = ccle_df.loc[ccle_df['Hugo_Symbol'].isin(mut_type_list), 'DepMap_ID'].unique()

    mut_cell_lines = set()
    for depmap_id in mutated_ids:
        if depmap_id in ccle_map and ccle_map[depmap_id] in wanted:
            mut_cell_lines.add(ccle_map[depmap_id])
    return mut_cell_lines

In [None]:
def get_cell_lines_by_tissue_type(ccle_df, cell_lines, tissue_type):
    """Return the cell lines of interest that belong to one tissue lineage.

    Parameters
    ----------
    ccle_df : pandas.DataFrame
        Sample annotation table; must provide the columns 'lineage' and
        'CCLE_Name'. The table passed in is the one that is used; the
        function no longer reaches for a notebook-level global.
    cell_lines : iterable
        Cell-line names of interest; results are restricted to these.
    tissue_type : str
        Value matched exactly against the 'lineage' column.

    Returns
    -------
    set
        'CCLE_Name' values whose 'lineage' equals ``tissue_type`` and that
        also appear in ``cell_lines``.
    """
    wanted = set(cell_lines)
    names_in_tissue = ccle_df.loc[ccle_df['lineage'] == tissue_type, 'CCLE_Name']
    return {name for name in names_in_tissue if name in wanted}

In [None]:
def get_embedding_plot(pcs, mut_index):
    """Scatter the first two components, highlighting a subset of points.

    Parameters
    ----------
    pcs : 2-D array
        Column 0 is plotted on the x axis and column 1 on the y axis.
    mut_index : list of int
        Row positions in ``pcs`` to highlight. These are drawn in green on
        top of all remaining rows, which are drawn in silver.

    Returns
    -------
    matplotlib.figure.Figure
        The figure, after it has been shown with ``plt.show()``.
    """
    pc1 = pcs[:, 0]
    pc2 = pcs[:, 1]

    # Set lookup keeps the background filter linear in the number of points
    # instead of rescanning the index list for every row.
    highlighted = set(mut_index)

    pc1_color = [pc1[i] for i in mut_index]
    pc2_color = [pc2[i] for i in mut_index]
    pc1_grey = [pc for i, pc in enumerate(pc1) if i not in highlighted]
    pc2_grey = [pc for i, pc in enumerate(pc2) if i not in highlighted]

    fig = plt.figure(figsize = (5, 5))
    ax = fig.add_subplot(111)
    # Only the x range is fixed; the y range is left to autoscale.
    ax.set_xlim(-5, 5)
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    ax.set_xticks([])
    ax.set_yticks([])
    # Background first so the highlighted points are drawn on top.
    ax.scatter(pc1_grey, pc2_grey, color = 'silver', s = 6)
    ax.scatter(pc1_color, pc2_color, color = 'green', s = 12)
    plt.show()
    return fig

In [None]:
def get_embedding_plot_contigous(pcs, score):
    """Scatter the first two components, coloured by a continuous score.

    Parameters
    ----------
    pcs : 2-D array
        Column 0 is plotted on the x axis and column 1 on the y axis.
    score : array-like
        One value per row of ``pcs``, mapped onto the 'coolwarm' colormap.

    Returns
    -------
    matplotlib.figure.Figure
        The figure, after it has been shown with ``plt.show()``.

    Notes
    -----
    The colour scale is a ``TwoSlopeNorm`` running from 0.0 to 1.2 and centred
    on the median of ``score``. matplotlib requires vmin, vcenter and vmax to
    be in ascending order, so a median outside (0.0, 1.2) raises ``ValueError``.
    """
    pc1 = pcs[:, 0]
    pc2 = pcs[:, 1]

    fig = plt.figure(figsize = (5, 6.2))
    ax = fig.add_subplot(111)
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    # Only the x range is fixed; the y range is left to autoscale.
    ax.set_xlim(-5, 5)
    ax.set_xticks([])
    ax.set_yticks([])

    divnorm = colors.TwoSlopeNorm(vmin=0.0, vcenter=np.median(score), vmax=1.2)
    points = ax.scatter(pc1, pc2, c = score, cmap = 'coolwarm', s = 12, norm = divnorm)
    # Drive the colorbar from the scatter artist itself so the bar always
    # reflects the norm and colormap that were actually used for the points.
    fig.colorbar(points, ax=ax, location='top', shrink=0.3, ticks=[0.0, 0.4, 0.8, 1.2])
    plt.show()
    return fig

In [None]:
def get_hiddens_pcs(root_hidden_df):
    """Project the hidden-state columns of a frame onto their principal components.

    The columns 'cell_line', 'smiles', 'auc' and 'dataset' are dropped, the
    remaining columns are standardised with ``StandardScaler`` and then
    transformed with a default ``PCA`` fitted on that same data.

    Parameters
    ----------
    root_hidden_df : pandas.DataFrame
        Must contain the four metadata columns named above; every other
        column is treated as a feature.

    Returns
    -------
    pandas.DataFrame
        The 'cell_line' column followed by one column per principal component,
        indexed like ``root_hidden_df``.
    """
    metadata_columns = ['cell_line', 'smiles', 'auc', 'dataset']
    features = root_hidden_df.drop(metadata_columns, axis=1)

    standardized = StandardScaler().fit_transform(features)

    decomposition = PCA()
    decomposition.fit(standardized)
    components = decomposition.transform(standardized)

    # Keep every component that PCA produced.
    n_components = len(components[0])
    component_df = pd.DataFrame(components[:, :n_components], index=root_hidden_df.index)

    labelled = pd.concat([root_hidden_df['cell_line'], component_df], axis=1)
    return labelled.reindex(root_hidden_df.index)

In [None]:
# CCLE mutation table (read for its 'Hugo_Symbol' and 'DepMap_ID' columns).
ccle_df = pd.read_csv("../data/CCLE/CCLE_mutations.csv")

# CCLE sample annotations, plus a lookup from DepMap ID to CCLE cell-line name.
ccle_samples_df = pd.read_csv("../data/CCLE/sample_info.csv")
ccle_map = dict(zip(ccle_samples_df['DepMap_ID'], ccle_samples_df['CCLE_Name']))

In [None]:
# Dabrafenib: per-fold PCA embeddings of the NEST root hidden layer,
# coloured once by AUC and once by BRAF mutation status.
ont = 'ctg'
dataset = 'av'
zscore_method = 'auc'
folds = 5
drugs = ['Dabrafenib']

mut_list = ['BRAF']

for drug in drugs:

    for i in range(1, folds+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../models/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(i)

        # Hidden states are joined to the test rows purely by position.
        root_hidden_df = pd.read_csv(modeldir + '/hidden/NEST.hidden', sep=' ', header=None)
        root_hidden_df = pd.concat([test_df, root_hidden_df], axis=1).reindex(test_df.index)

        pc_df = get_hiddens_pcs(root_hidden_df)
        cell_lines = list(set(pc_df['cell_line']))
        mutated_cell_lines = get_mutated_cell_lines(ccle_df, ccle_map, cell_lines, mut_list)
        # Row positions (not labels) of the mutated cell lines; a distinct name
        # avoids confusion with the fold counter `i`.
        mut_index = [row_pos for row_pos, cell in enumerate(pc_df['cell_line']) if cell in mutated_cell_lines]
        print('n =', len(mut_index))

        root_hidden_pc_df = pc_df.drop(['cell_line'], axis=1)
        fig_cont = get_embedding_plot_contigous(root_hidden_pc_df.to_numpy(), test_df['auc'])
        fig_mut = get_embedding_plot(root_hidden_pc_df.to_numpy(), mut_index)

        #fig_cont.savefig('../plots/' + drug + '_auc_embeddings_' + str(i) + '.svg')
        #fig_mut.savefig('../plots/' + drug + '_mutation_embeddings_' + str(i) + '.svg')

        # Release the figures so they do not accumulate across folds.
        plt.close(fig_cont)
        plt.close(fig_mut)

In [None]:
# Trametinib: per-fold PCA embeddings of the NEST root hidden layer,
# coloured once by AUC and once by mutation status of the genes in `mut_list`.
ont = 'ctg'
dataset = 'av'
zscore_method = 'auc'
folds = 5
drugs = ['Trametinib']

mut_list = ['BRAF', 'MAPK1', 'MAP2K1', 'MAP2K2', 'KRAS']

for drug in drugs:

    for i in range(1, folds+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../models/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(i)

        # Hidden states are joined to the test rows purely by position.
        root_hidden_df = pd.read_csv(modeldir + '/hidden/NEST.hidden', sep=' ', header=None)
        root_hidden_df = pd.concat([test_df, root_hidden_df], axis=1).reindex(test_df.index)
        pc_df = get_hiddens_pcs(root_hidden_df)

        cell_lines = list(set(pc_df['cell_line']))
        mutated_cell_lines = get_mutated_cell_lines(ccle_df, ccle_map, cell_lines, mut_list)
        # Row positions (not labels) of the mutated cell lines; a distinct name
        # avoids confusion with the fold counter `i`.
        mut_index = [row_pos for row_pos, cell in enumerate(pc_df['cell_line']) if cell in mutated_cell_lines]
        print('n =', len(mut_index))

        root_hidden_pc_df = pc_df.drop(['cell_line'], axis=1)
        fig_cont = get_embedding_plot_contigous(root_hidden_pc_df.to_numpy(), test_df['auc'])
        fig_mut = get_embedding_plot(root_hidden_pc_df.to_numpy(), mut_index)

        fig_cont.savefig('../plots/' + drug + '_auc_embeddings_' + str(i) + '.svg')
        fig_mut.savefig('../plots/' + drug + '_mutation_embeddings_' + str(i) + '.svg')

        # Release the figures once saved so they do not accumulate across folds.
        plt.close(fig_cont)
        plt.close(fig_mut)

In [None]:
# Palbociclib: per-fold PCA embeddings of the NEST root hidden layer,
# coloured once by AUC and once by RB1 mutation status.
ont = 'ctg'
dataset = 'av'
zscore_method = 'auc'
folds = 5
drugs = ['Palbociclib']

mut_list = ['RB1']

for drug in drugs:

    for i in range(1, folds+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../models/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(i)

        # Hidden states are joined to the test rows purely by position.
        root_hidden_df = pd.read_csv(modeldir + '/hidden/NEST.hidden', sep=' ', header=None)
        root_hidden_df = pd.concat([test_df, root_hidden_df], axis=1).reindex(test_df.index)
        pc_df = get_hiddens_pcs(root_hidden_df)

        cell_lines = list(set(pc_df['cell_line']))
        mutated_cell_lines = get_mutated_cell_lines(ccle_df, ccle_map, cell_lines, mut_list)
        # Row positions (not labels) of the mutated cell lines; a distinct name
        # avoids confusion with the fold counter `i`.
        mut_index = [row_pos for row_pos, cell in enumerate(pc_df['cell_line']) if cell in mutated_cell_lines]
        print('n =', len(mut_index))

        root_hidden_pc_df = pc_df.drop(['cell_line'], axis=1)
        fig_cont = get_embedding_plot_contigous(root_hidden_pc_df.to_numpy(), test_df['auc'])
        fig_mut = get_embedding_plot(root_hidden_pc_df.to_numpy(), mut_index)

        fig_cont.savefig('../plots/' + drug + '_auc_embeddings_' + str(i) + '.svg')
        fig_mut.savefig('../plots/' + drug + '_mutation_embeddings_' + str(i) + '.svg')

        # Release the figures once saved so they do not accumulate across folds.
        plt.close(fig_cont)
        plt.close(fig_mut)

In [None]:
# Gene names taken from the 'G' column of the tab-separated gene2ind file.
all_genes = list(pd.read_csv('../data/training_files_av/gene2ind_ctg_av.txt', sep='\t', header=None, names=['I', 'G'])['G'])
# NOTE(review): `cell_lines` is not assigned in this cell; it is whatever the
# most recently executed fold loop left behind, and the bare expression echoes
# the entire list as the cell output. Looks like leftover inspection - confirm
# whether it is still needed.
cell_lines

In [None]:
# GENIE samples: per-fold PCA embeddings of the NEST root hidden layer for
# Palbociclib, highlighting samples with a mutation in the genes of `mut_list`.
ont = 'ctg'
dataset = 'av'
zscore_method = 'auc'
folds = 5
drugs = ['Palbociclib']

# The mutation matrix has no header: columns are named after the gene list and
# the sample names are attached positionally as an extra 'cell_line' column.
all_genes = list(pd.read_csv('../data/training_files_av/gene2ind_ctg_av.txt', sep='\t', header=None, names=['I', 'G'])['G'])
genie_cell_lines = list(pd.read_csv('../data/GENIE/GENIE_all_cell2ind.txt', sep='\t', header=None, names=['I', 'C'])['C'])
mutation_data = pd.read_csv('../data/GENIE/GENIE_cell2mutation_av.txt', header=None, names=all_genes)
mutation_data['cell_line'] = genie_cell_lines

mut_list = ['RB1']

# Samples flagged 1 for any gene in mut_list. This does not depend on the drug,
# so it is computed once, with a boolean mask instead of a row-by-row scan.
mutated_cell_lines = set()
for gene in mut_list:
    mutated_cell_lines.update(mutation_data.loc[mutation_data[gene] == 1, 'cell_line'])

for drug in drugs:

    # Built from `dataset` and `drug` so the path follows the loop variable.
    test_file = '../data/GENIE/GENIE_test_' + dataset + '_' + drug + '.txt'
    test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

    print(mutated_cell_lines)

    for i in range(1, folds+1):

        modeldir = '../models/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(i)
        # Hidden states are joined to the test rows purely by position.
        root_hidden_df = pd.read_csv(modeldir + '/hidden_genie/NEST.hidden', sep=' ', header=None)
        root_hidden_df = pd.concat([test_df, root_hidden_df], axis=1).reindex(test_df.index)
        pc_df = get_hiddens_pcs(root_hidden_df)

        # Row positions (not labels) of the mutated samples; a distinct name
        # avoids confusion with the fold counter `i`.
        mut_index = [row_pos for row_pos, cell in enumerate(pc_df['cell_line']) if cell in mutated_cell_lines]
        print('n =', len(mut_index))

        root_hidden_pc_df = pc_df.drop(['cell_line'], axis=1)
        fig = get_embedding_plot(root_hidden_pc_df.to_numpy(), mut_index)
        # fig.savefig(modeldir + '/rb1_genie_mutations.png')

        # Release the figure so figures do not accumulate across folds.
        plt.close(fig)