In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from scipy import stats
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.preprocessing import scale
from sklearn.preprocessing import robust_scale
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from tabulate import tabulate
from sklearn.decomposition import PCA
from ddot import Ontology

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole kernel.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Global seaborn look-and-feel used by every figure in this notebook.
sns.set()
sns.set_style("white")
sns.set_style("ticks", {"xtick.major.size":8, "ytick.major.size":8})
# NOTE(review): axes_style() returns a style dict (or is used as a context manager);
# called bare like this it presumably does not change the active style - confirm it is needed.
sns.axes_style("whitegrid")
sns.set_palette("muted")
# Last expression of the cell: the returned palette is displayed as the cell output.
sns.color_palette("muted")

In [None]:
# Write SVG text as text elements instead of converting glyphs to paths,
# so labels stay editable in the exported figures.
plt.rcParams['svg.fonttype'] = 'none'

# Font sizes (in points) shared by all figures.
SMALL_SIZE = 12
MEDIUM_SIZE = 16
BIGGER_SIZE = 22

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [None]:
#Separating data for each drug/cell

def get_pos_map(obj_list, test_df, col):
    """Group the row labels of ``test_df`` by the value stored in column ``col``.

    Args:
        obj_list: Values of interest (e.g. drug or cell-line identifiers).
        test_df: DataFrame to scan.
        col: Name of the column whose values are matched against ``obj_list``.

    Returns:
        dict mapping every entry of ``obj_list`` to the list of index labels of
        the rows whose ``col`` value equals it (empty list when nothing matches).
    """
    pos_map = {}
    for obj in obj_list:
        pos_map[obj] = []

    for row_label, row in test_df.iterrows():
        key = row[col]
        if key not in pos_map:
            # Value is not one of the requested objects.
            continue
        pos_map[key].append(row_label)

    return pos_map

In [None]:
#Arrange the obj_list in the descending order of the scores

def sort_scores(obj_list, scores):
    """Pair each object with its score and order the pairs by descending score.

    Args:
        obj_list: Objects to rank; ``scores[i]`` is the score of ``obj_list[i]``.
        scores: Indexable container of scores.

    Returns:
        dict ``{object: score}`` whose insertion order runs from the highest to
        the lowest score.
    """
    score_map = {obj: scores[pos] for pos, obj in enumerate(obj_list)}
    ranked = sorted(score_map.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

In [None]:
def create_confusion_matrix(sens, res, pred_sens, pred_res):
    """Build a 2x2 contingency table of predicted vs. true response class.

    Every cell carries a +1 pseudo-count, so the odds ratio is always defined
    even when a cell would otherwise be empty.

    Args:
        sens: Names of the truly sensitive samples.
        res: Names of the truly resistant samples.
        pred_sens: Names of the samples predicted sensitive.
        pred_res: Names of the samples predicted resistant.

    Returns:
        (cont_table, odds_ratio) where ``cont_table`` is
        ``[['Pred. Sensitive', n_true_sens, n_true_res],
           ['Pred. Resistant', n_true_sens, n_true_res]]``
        and ``odds_ratio`` is the product of the diagonal counts divided by
        the product of the off-diagonal counts.
    """
    # Sets make each membership test O(1) instead of scanning a list.
    sens_set = set(sens)
    res_set = set(res)

    pred_sens_true_sens = sum(1 for c in pred_sens if c in sens_set) + 1
    pred_sens_true_res = sum(1 for c in pred_sens if c in res_set) + 1
    pred_res_true_sens = sum(1 for c in pred_res if c in sens_set) + 1
    pred_res_true_res = sum(1 for c in pred_res if c in res_set) + 1

    cont_table = [['Pred. Sensitive', pred_sens_true_sens, pred_sens_true_res],
                  ['Pred. Resistant', pred_res_true_sens, pred_res_true_res]]
    odds_ratio = (pred_sens_true_sens * pred_res_true_res) / (pred_sens_true_res * pred_res_true_sens)

    return cont_table, odds_ratio

In [None]:
def create_true_auc_waterfall(test_df, sens_df, res_df, pred_sens_df, pred_res_df, title):
    """Draw a waterfall (sorted bar) plot of true AUC coloured by predicted class.

    Only cell lines that belong to the true sensitive or true resistant class
    are drawn. Bars are ordered by descending AUC; predicted-resistant cells
    are 'goldenrod' and predicted-sensitive cells are 'royalblue'.

    Args:
        test_df: DataFrame with 'cell_line' and 'auc' columns. It is NOT
            modified; a sorted copy is used internally.
        sens_df, res_df: True sensitive / resistant samples ('cell_line' column).
        pred_sens_df, pred_res_df: Predicted sensitive / resistant samples.
            Any of the four class arguments may also be an empty list.
        title: Axes title.

    Returns:
        The matplotlib Figure.
    """
    def _cell_names(class_df):
        # Tolerate an empty list as well as an empty DataFrame for an empty class.
        return set(class_df['cell_line']) if len(class_df) > 0 else set()

    fig, ax = plt.subplots(figsize=(8, 8))

    # Build the membership sets once instead of re-creating lists for every row.
    true_names = _cell_names(res_df) | _cell_names(sens_df)
    pred_res_names = _cell_names(pred_res_df)
    pred_sens_names = _cell_names(pred_sens_df)

    # Sort a copy so the caller's frame keeps its row order.
    ordered_df = test_df.sort_values(by='auc', ascending=False, ignore_index=True)

    data_list = []
    for _, row in ordered_df.iterrows():
        cell_name = str(row['cell_line'])

        if cell_name not in true_names:
            continue

        if cell_name in pred_res_names:
            data_list.append((cell_name, row['auc'], 'goldenrod'))

        if cell_name in pred_sens_names:
            data_list.append((cell_name, row['auc'], 'royalblue'))

    ax.bar([d[0] for d in data_list], [d[1] for d in data_list], color=[d[2] for d in data_list], width=1, edgecolor='none')
    ax.set_xticks([])
    ax.set_ylim((0, np.max(test_df['auc']) + 0.02))
    ax.set_xlabel('Cell lines')
    ax.set_ylabel('True AUC')
    ax.set_title(title)
    return fig

In [None]:
def plot_drug_performance(drug_corr_map, cut_off):
    """Bar plot of per-drug performance, split at ``cut_off``.

    Drugs scoring at or above ``cut_off`` are drawn in red, all others in blue.
    The fraction of red drugs is printed and the figure is shown.

    Args:
        drug_corr_map: dict ``{drug: score}``; bars follow its iteration order
            within each colour group.
        cut_off: Score threshold for the red group.

    Returns:
        The matplotlib Figure.
    """
    fig, ax = plt.subplots(figsize=(8, 8))

    above = [(drug, score) for drug, score in drug_corr_map.items() if score >= cut_off]
    below = [(drug, score) for drug, score in drug_corr_map.items() if not (score >= cut_off)]

    ratio = float(len(above))/float(len(drug_corr_map.keys()))
    print('Red ratio = ' + str(ratio))

    for group, colour in ((above, 'red'), (below, 'blue')):
        ax.bar([drug for drug, _ in group], [score for _, score in group], color=colour, width=1.0, alpha=0.9)

    ax.set_xticks([])
    ax.set_xlabel('Drugs')
    ax.set_ylabel('Performance\nSpearman ρ (Predicted vs. Actual)')
    plt.show()
    return fig

In [None]:
def create_drug_performance_plot(drugs, drug_corr_list, top=False, cut_off=0.1):
    """Rank drugs by score and plot their performance.

    Args:
        drugs: Drug identifiers.
        drug_corr_list: Scores, ``drug_corr_list[i]`` belonging to ``drugs[i]``.
        top: When true, delegate to ``plot_top_drug_performance`` (not defined
            in the visible part of this notebook - must exist elsewhere).
        cut_off: Threshold forwarded to the plotting function.

    Returns:
        The Figure produced by the selected plotting function.
    """
    ranked_map = sort_scores(drugs, drug_corr_list)

    if not top:
        fig_drug_perf = plot_drug_performance(ranked_map, cut_off)
        # The median is only reported for the all-drugs plot.
        print('Median spearman rho:', np.median(list(ranked_map.values())))
        return fig_drug_perf

    return plot_top_drug_performance(ranked_map, cut_off)

In [None]:
def create_scatter_plot(X, Y, x_title, y_title):
    """Scatter ``Y`` against ``X`` on equal axes with an identity line.

    Positions at which either ``X`` or ``Y`` is NaN are removed from both
    sequences before plotting. A one-sided Welch t-test (``Y`` greater than
    ``X``, unequal variances) on the remaining values is printed.

    Args:
        X, Y: Sequences of floats.
        x_title, y_title: Axis labels (also used as DataFrame column names).

    Returns:
        The matplotlib Figure.
    """
    # A set keeps the membership checks below O(1) per element.
    nan_positions = {i for i, x in enumerate(X) if math.isnan(x)}
    nan_positions.update(i for i, y in enumerate(Y) if math.isnan(y))
    X = [x for i, x in enumerate(X) if i not in nan_positions]
    Y = [y for i, y in enumerate(Y) if i not in nan_positions]

    scatter_df = pd.DataFrame({x_title: X, y_title: Y})

    fig, ax = plt.subplots(figsize=(6,6))
    sns.scatterplot(data=scatter_df, x=x_title, y=y_title, s=35, ax=ax)

    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    # Use one common range for both axes so the diagonal is the identity line.
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]
    ax.plot(lims, lims, 'k--', alpha=0.75, zorder=0)
    ax.set_aspect('equal')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')

    print('t-test', stats.ttest_ind(Y, X, alternative='greater', equal_var=False))

    return fig

In [None]:
def get_boxplot(df, x_title, y_title):
    """Draw one box per column of ``df`` (outliers hidden) and show the figure.

    Args:
        df: DataFrame whose columns are the groups to compare.
        x_title, y_title: Axis labels.

    Returns:
        The matplotlib Figure.
    """
    column_labels = list(df.columns)

    fig, ax = plt.subplots(figsize=(9,9))
    ax.boxplot(df, showfliers=False, widths=0.5, patch_artist=True)
    ax.set_xticklabels(column_labels)
    ax.set(xlabel=x_title, ylabel=y_title)

    plt.show()
    return fig

In [None]:
def create_bar_chart(x, data, x_title, y_title):
    """Draw a simple bar chart and show it.

    Args:
        x: Category labels, one per bar.
        data: Bar heights, aligned with ``x``.
        x_title, y_title: Axis labels.

    Returns:
        The matplotlib Figure.
    """
    fig = plt.figure(figsize=(9, 9))
    ax = fig.add_subplot(111)
    labels = list(x)
    x_pos = list(range(len(labels)))
    ax.bar(x_pos, data, align='center', alpha=1.0)
    # Label the bars through the axes object rather than pyplot's "current axes".
    ax.set_xticks(x_pos)
    ax.set_xticklabels(labels)
    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    plt.show()
    return fig

In [None]:
def create_bar_chart_with_errors(x, data, errors, x_title, y_title):
    """Draw a bar chart with red error bars and show it.

    Args:
        x: Category labels, one per bar.
        data: Bar heights, aligned with ``x``.
        errors: Error-bar sizes passed to ``Axes.errorbar`` as ``yerr``, i.e.
            each value is drawn symmetrically above and below the bar top.
        x_title, y_title: Axis labels.

    Returns:
        The matplotlib Figure.
    """
    fig = plt.figure(figsize=(9, 9))
    ax = fig.add_subplot(111)
    labels = list(x)
    x_pos = list(range(len(labels)))
    ax.bar(x_pos, data, align='center', alpha=1.0)
    ax.errorbar(x_pos, data, yerr=errors, fmt='o', color='r', capsize=10)
    # Label the bars through the axes object rather than pyplot's "current axes".
    ax.set_xticks(x_pos)
    ax.set_xticklabels(labels)
    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    plt.show()
    return fig

In [None]:
def create_histogram(data, x_title, y_title):
    """Plot a histogram of ``data`` on a fixed x-range of [0, 1] and show it.

    Args:
        data: Values to bin (bin edges are chosen with ``bins='auto'``).
        x_title, y_title: Axis labels.

    Returns:
        The matplotlib Figure.
    """
    fig, ax = plt.subplots(figsize=(9, 8))
    ax.hist(x=data, bins='auto', alpha=0.7, rwidth=0.85)
    ax.grid(False)
    ax.set(xlim=(0, 1), xlabel=x_title, ylabel=y_title)
    plt.show()
    return fig

In [None]:
def get_violinplot(df, x_title, y_title):
    """Draw one violin per column of ``df`` (with medians) and show the figure.

    Args:
        df: DataFrame whose columns are the groups to compare.
        x_title, y_title: Axis labels.

    Returns:
        The matplotlib Figure.
    """
    column_labels = list(df.columns)

    fig, ax = plt.subplots(figsize=(6,6))
    ax.violinplot(df, showmedians=True)
    # violinplot() places the violins at x = 1..N but does not move the ticks
    # there, so pin the ticks to those positions before labelling them.
    ax.set_xticks(range(1, len(column_labels) + 1))
    ax.set_xticklabels(column_labels, rotation=45)
    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)

    plt.show()

    return fig

In [None]:
def create_line_curve(X, Y, x_title, y_title, x_lim, y_lim):
    """Plot ``Y`` against ``X`` as a line with fixed axis limits and show it.

    Args:
        X, Y: Coordinates of the curve.
        x_title, y_title: Axis labels.
        x_lim, y_lim: Axis limits, forwarded to ``set_xlim`` / ``set_ylim``.

    Returns:
        The matplotlib Figure.
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.plot(X, Y)
    ax.grid(False)

    ax.set_xlim(x_lim)
    ax.set_ylim(y_lim)
    ax.set(xlabel=x_title, ylabel=y_title)

    plt.show()
    return fig

In [None]:
def all_models_scatterplot(performance_df, x_title, y_title):
    """Overlay the per-drug scores of all models on one scatter plot.

    Each model is drawn as its own series with the model score on x and the
    'Drug' column on y; DCoDR is drawn last, in red, with marker size 30.

    Args:
        performance_df: DataFrame with columns 'Drug', 'Random Forest',
            'Multi layer perceptron', 'ElasticNet' and 'DCoDR'.
        x_title, y_title: Axis labels.

    Returns:
        The matplotlib Figure.
    """
    fig, ax = plt.subplots(figsize=(6, 14))

    # (column / legend label, colour, extra plot keywords) in drawing order.
    series_specs = [
        ('Random Forest', 'DarkGreen', {}),
        ('Multi layer perceptron', 'Green', {}),
        ('ElasticNet', 'LightGreen', {}),
        ('DCoDR', 'red', {'s': 30}),
    ]
    for model_name, colour, extra in series_specs:
        performance_df.plot(kind='scatter', x=model_name, y='Drug', color=colour, label=model_name, ax=ax, **extra)

    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    return fig

In [None]:
def calc_std_vals(df, zscore_method):
    """Compute per-drug centering and scaling constants for the 'auc' column.

    Args:
        df: DataFrame with at least 'smiles' and 'auc' columns.
        zscore_method: 'zscore' -> mean / standard deviation,
            'robustz' -> median / inter-quartile range,
            anything else -> center 0.0 and scale 1.0 (no standardisation).

    Returns:
        DataFrame with columns ['smiles', 'center', 'scale'], one row per
        distinct 'smiles' value (empty, with those columns, when ``df`` has no
        rows). A scale that is NaN or 0 is replaced by 1.0 so that dividing by
        it is always safe.
    """
    records = []

    # Group by the column name itself (not a one-element list): with a list key,
    # recent pandas versions yield 1-tuples as group names, which would end up
    # in the 'smiles' column and never match in a later merge.
    for name, group in df.groupby('smiles')['auc']:
        if zscore_method == 'zscore':
            center = group.mean()
            spread = group.std()
        elif zscore_method == 'robustz':
            center = group.median()
            spread = group.quantile(0.75) - group.quantile(0.25)
        else:
            center = 0.0
            spread = 1.0

        # Single-sample or constant groups have no usable spread.
        if math.isnan(spread) or spread == 0.0:
            spread = 1.0

        records.append((name, center, spread))

    # Build the frame once; this also handles the no-groups case.
    return pd.DataFrame(records, columns=['smiles', 'center', 'scale'])

In [None]:
def standardize_data(df, std_df, zscore_method):
    """Apply per-drug centering/scaling constants to the 'auc' column.

    Args:
        df: DataFrame with 'cell_line', 'smiles' and 'auc' columns.
        std_df: DataFrame with 'smiles', 'center' and 'scale' columns.
        zscore_method: Name of the output column holding
            ``(auc - center) / scale``.

    Returns:
        DataFrame with columns ['cell_line', 'smiles', zscore_method], in the
        row order of ``df`` (left join on 'smiles').
    """
    joined = df.merge(std_df, how="left", on=['smiles'], sort=False)
    joined[zscore_method] = joined['auc'].sub(joined['center']).div(joined['scale'])
    return joined[['cell_line', 'smiles', zscore_method]]

In [None]:
def normalize_auc(train_std_df, test_df, zscore_method):
    """Standardise test AUCs, preferring the constants learned on training data.

    Constants are first computed from ``test_df`` itself; for every drug that
    also appears in ``train_std_df`` they are overwritten with the training
    center/scale, so test-derived constants are only a fallback for drugs the
    training set never saw.

    Args:
        train_std_df: DataFrame with 'smiles', 'center' and 'scale' columns.
        test_df: DataFrame with 'cell_line', 'smiles' and 'auc' columns.
        zscore_method: Standardisation method / output column name, forwarded
            to ``calc_std_vals`` and ``standardize_data``.

    Returns:
        The standardised test DataFrame produced by ``standardize_data``.
    """
    test_std_df = calc_std_vals(test_df, zscore_method)

    # One lookup table instead of a DataFrame.query() per drug. If a drug is
    # listed more than once in the training constants, its first entry wins.
    train_lookup = train_std_df.drop_duplicates(subset='smiles').set_index('smiles')

    for i, row in test_std_df.iterrows():
        smiles = row['smiles']
        if smiles in train_lookup.index:
            test_std_df.loc[i, 'center'] = float(train_lookup.at[smiles, 'center'])
            test_std_df.loc[i, 'scale'] = float(train_lookup.at[smiles, 'scale'])

    test_df = standardize_data(test_df, test_std_df, zscore_method)
    return test_df

In [None]:
def print_best_params(modeldir):
    """Print the (up to) two log lines that follow the first 'Params:' marker.

    Args:
        modeldir: Directory containing ``train.log``.

    Raises:
        OSError: If ``train.log`` cannot be opened.
    """
    log_file = modeldir + "/train.log"
    with open(log_file, 'r') as fi:
        logs = fi.readlines()
    for i, log in enumerate(logs):
        if 'Params:' in log:
            # Slicing tolerates a log that ends right after the marker,
            # where direct indexing would raise IndexError.
            for param_line in logs[i + 1:i + 3]:
                print(param_line.strip())
            break

In [None]:
def get_avg_cv_data(ont, dataset, zscore_method, drug, fold_size=5):
    """Pearson correlation between true AUC and prediction for each CV fold.

    For fold ``i`` the test set is read from
    ``../data/training_files_av/<i>_test_<dataset>_<drug>.txt`` and the
    predictions from
    ``../models/model_<ont>_<dataset>_<drug>_<zscore_method>_<i>/predict.txt``.

    Args:
        ont, dataset, zscore_method, drug: Components of the file names above.
        fold_size: Number of folds (folds are numbered from 1).

    Returns:
        (fold_corr_list, avg_corr). Folds whose prediction file is missing or
        unreadable are skipped, so ``fold_corr_list`` can be shorter than
        ``fold_size``. ``avg_corr`` is the sum of the available correlations
        divided by ``fold_size`` (a skipped fold therefore counts as 0).
    """
    avg_corr = 0.0
    fold_corr_list = []
    for i in range(1, fold_size+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../models/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(i)
        pred_file = modeldir + '/predict.txt'
        try:
            pred = np.loadtxt(pred_file)
        except (OSError, ValueError):
            # Only a missing/unparsable prediction file means "fold not
            # available"; anything else (e.g. KeyboardInterrupt) propagates.
            continue

        corr = stats.pearsonr(test_df['auc'], pred)[0]
        fold_corr_list.append(corr)
        avg_corr += corr

    # NOTE(review): divides by fold_size even when folds were skipped - confirm intended.
    avg_corr /= fold_size
    return fold_corr_list, avg_corr

In [None]:
def get_avg_cv_data_alternate(method, dataset, drug, fold_size=5):
    """Per-fold Pearson correlation for a baseline model.

    For fold ``i`` the test set is read from
    ``../data/training_files_av/<i>_test_<dataset>_<drug>.txt`` and the
    predictions from ``../models/<method>/predict_<dataset>_<drug>_<i>.txt``.

    Args:
        method: Baseline model directory name under ``../models/``.
        dataset, drug: Components of the file names above.
        fold_size: Number of folds (folds are numbered from 1).

    Returns:
        (fold_corr_list, avg_corr). Folds whose prediction file is missing or
        unreadable are skipped, so ``fold_corr_list`` can be shorter than
        ``fold_size``. ``avg_corr`` is the sum of the available correlations
        divided by ``fold_size`` (a skipped fold therefore counts as 0).
    """
    avg_corr = 0.0
    fold_corr_list = []
    for i in range(1, fold_size+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../models/' + method
        pred_file = modeldir + '/predict' + '_' + dataset + '_' + drug + '_' + str(i) + '.txt'
        try:
            pred = np.loadtxt(pred_file)
        except (OSError, ValueError):
            # Only a missing/unparsable prediction file means "fold not
            # available"; anything else (e.g. KeyboardInterrupt) propagates.
            continue

        corr = stats.pearsonr(test_df['auc'], pred)[0]
        fold_corr_list.append(corr)
        avg_corr += corr

    # NOTE(review): divides by fold_size even when folds were skipped - confirm intended.
    avg_corr /= fold_size
    return fold_corr_list, avg_corr

In [None]:
def get_avg_cv_data_mut(ont, dataset, zscore_method, drug, fold_size=5):
    """Per-fold Pearson correlation for the models stored under nest_drugcell.

    For fold ``i`` the test set is read from
    ``../data/training_files_av/<i>_test_<dataset>_<drug>.txt`` and the
    predictions from
    ``../../nest_drugcell/models/model_<ont>_<dataset>_<drug>_<zscore_method>_<i>/predict.txt``.

    Args:
        ont, dataset, zscore_method, drug: Components of the file names above.
        fold_size: Number of folds (folds are numbered from 1).

    Returns:
        (fold_corr_list, avg_corr). Folds whose prediction file is missing or
        unreadable are skipped, so ``fold_corr_list`` can be shorter than
        ``fold_size``. ``avg_corr`` is the sum of the available correlations
        divided by ``fold_size`` (a skipped fold therefore counts as 0).
    """
    avg_corr = 0.0
    fold_corr_list = []
    for i in range(1, fold_size+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../../nest_drugcell/models/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(i)
        pred_file = modeldir + '/predict.txt'
        try:
            pred = np.loadtxt(pred_file)
        except (OSError, ValueError):
            # Only a missing/unparsable prediction file means "fold not
            # available"; anything else (e.g. KeyboardInterrupt) propagates.
            continue

        corr = stats.pearsonr(test_df['auc'], pred)[0]
        fold_corr_list.append(corr)
        avg_corr += corr

    # NOTE(review): divides by fold_size even when folds were skipped - confirm intended.
    avg_corr /= fold_size
    return fold_corr_list, avg_corr

In [None]:
def stratify_data(test_df, col, sens_cutoff=0, res_cutoff=1, is_mad=True):
    """Split samples into a sensitive (low) and a resistant (high) class.

    Args:
        test_df: DataFrame with 'cell_line', 'auc' and 'pred' columns.
        col: Column whose values are thresholded.
        sens_cutoff, res_cutoff: Fixed low / high thresholds, used only when
            ``is_mad`` is false.
        is_mad: When true, thresholds are ``median - MAD`` and ``median + MAD``
            of ``col``, with the MAD scaled for consistency with the normal
            distribution (``scale='normal'``).

    Returns:
        (sens_df, res_df): rows with ``col <= low`` and rows with
        ``col >= high`` (a row satisfying both goes to ``sens_df`` only). Both
        are DataFrames with columns ['cell_line', 'auc', 'pred'] and a fresh
        0..n-1 index; an empty class is an empty DataFrame with those columns,
        so ``df['cell_line']`` is always valid.
    """
    out_columns = ['cell_line', 'auc', 'pred']
    values = test_df[col]

    if is_mad:
        median = np.median(np.array(values))
        mad = stats.median_abs_deviation(np.array(values), scale='normal')
        t_low = median - mad
        t_high = median + mad
    else:
        t_low = sens_cutoff
        t_high = res_cutoff

    # Boolean masks replace the row-by-row construction of one-row frames.
    is_sens = values <= t_low
    is_res = (values >= t_high) & ~is_sens

    sens_df = test_df.loc[is_sens, out_columns].reset_index(drop=True)
    res_df = test_df.loc[is_res, out_columns].reset_index(drop=True)

    return sens_df, res_df

In [None]:
def get_filtered_data(test_df, fraction, is_inverted=False):
    """Keep the true responders/non-responders and split them by prediction rank.

    Steps:
      1. Make cell-line names unique by appending a running occurrence count.
      2. Keep only rows that ``stratify_data`` assigns to the sensitive or the
         resistant class of the true 'auc'.
      3. Sort by 'pred' and take the lowest / highest ``fraction`` of the rows.

    NOTE: ``test_df`` is modified in place (names rewritten, rows dropped,
    rows re-sorted and re-indexed), exactly as callers of the original
    implementation experienced.

    Args:
        test_df: DataFrame with 'cell_line', 'auc' and 'pred' columns.
        fraction: Share of the remaining rows put into each predicted class.
        is_inverted: When true, the lowest predictions are labelled resistant
            and the highest sensitive (instead of the other way round).

    Returns:
        (pred_sens_df, pred_res_df) slices of the filtered, sorted frame.
    """
    # 1. Unique names: "<name><k>" for the k-th occurrence of <name>.
    occurrences = dict()
    unique_names = []
    for raw_name in test_df['cell_line']:
        cell_line_name = str(raw_name)
        occurrences[cell_line_name] = occurrences.get(cell_line_name, 0) + 1
        unique_names.append(cell_line_name + str(occurrences[cell_line_name]))
    test_df['cell_line'] = unique_names

    sens_df, res_df = stratify_data(test_df, 'auc')

    # 2. One set lookup instead of rebuilding two lists for every row. The
    #    len() guard also accepts an empty list standing in for an empty class.
    kept_names = set()
    for class_df in (sens_df, res_df):
        if len(class_df) > 0:
            kept_names.update(class_df['cell_line'])
    is_kept = test_df['cell_line'].astype(str).isin(kept_names)
    test_df.drop(index=test_df.index[~is_kept], inplace=True)

    # 3. Rank by prediction and cut the two tails.
    data_size = len(test_df)
    class_size = int(data_size * fraction)
    test_df.sort_values(by='pred', inplace=True, ignore_index=True)
    if is_inverted:
        pred_res_df = test_df.iloc[ : class_size]
        pred_sens_df = test_df.iloc[data_size - class_size : data_size]
    else:
        pred_sens_df = test_df.iloc[ : class_size]
        pred_res_df = test_df.iloc[data_size - class_size : data_size]

    return pred_sens_df, pred_res_df

In [None]:
def get_filtered_data_mad(test_df, t_low=0, t_high=1, is_mad=True):
    """Stratify samples by true AUC and, among those kept, by prediction.

    Rows that ``stratify_data`` does not place in the true sensitive or true
    resistant class are dropped from ``test_df`` IN PLACE; the remaining rows
    are then stratified on 'pred'.

    Args:
        test_df: DataFrame with 'cell_line', 'auc' and 'pred' columns.
        t_low, t_high: Fixed thresholds for the prediction, used only when
            ``is_mad`` is false.
        is_mad: Whether the prediction thresholds are median +/- MAD.

    Returns:
        (sens_df, res_df, pred_sens_df, pred_res_df) as produced by
        ``stratify_data``.
    """
    sens_df, res_df = stratify_data(test_df, 'auc')

    # One set lookup instead of rebuilding two lists for every row. The len()
    # guard also accepts an empty list standing in for an empty class.
    kept_names = set()
    for class_df in (sens_df, res_df):
        if len(class_df) > 0:
            kept_names.update(class_df['cell_line'])
    is_kept = test_df['cell_line'].astype(str).isin(kept_names)
    test_df.drop(index=test_df.index[~is_kept], inplace=True)

    pred_sens_df, pred_res_df = stratify_data(test_df, 'pred', sens_cutoff=t_low, res_cutoff=t_high, is_mad=is_mad)

    return sens_df, res_df, pred_sens_df, pred_res_df

In [None]:
def get_all_test(ont, dataset, zscore_method, drug, fold_size=5):
    """Concatenate the test sets of all CV folds together with their predictions.

    For fold ``i`` the test set is read from
    ``../data/training_files_av/<i>_test_<dataset>_<drug>.txt`` and the
    predictions from
    ``../models/model_<ont>_<dataset>_<drug>_<zscore_method>_<i>/predict.txt``.
    Folds whose prediction file is missing or unreadable are skipped.

    Args:
        ont, dataset, zscore_method, drug: Components of the file names above.
        fold_size: Number of folds (folds are numbered from 1).

    Returns:
        DataFrame with columns ['cell_line', 'smiles', 'auc', 'dataset', 'pred'].
        Cell-line names are made unique by appending a running occurrence
        count ("<name><k>" for the k-th occurrence of <name>).

    Raises:
        ValueError: If no fold could be loaded.
    """
    all_test = []
    for i in range(1, fold_size+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../models/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(i)
        pred_file = modeldir + '/predict.txt'
        try:
            pred = np.loadtxt(pred_file)
        except (OSError, ValueError):
            # Only a missing/unparsable prediction file means "fold not
            # available"; anything else (e.g. KeyboardInterrupt) propagates.
            continue
        test_df['pred'] = pred
        all_test.append(test_df)

    if not all_test:
        # Same exception type pd.concat([]) raises, but with a usable message.
        raise ValueError('No prediction file could be loaded for drug ' + str(drug))

    all_test_df = pd.concat(all_test, axis=0, ignore_index=True)

    occurrences = dict()
    unique_names = []
    for raw_name in all_test_df['cell_line']:
        cell_line_name = str(raw_name)
        occurrences[cell_line_name] = occurrences.get(cell_line_name, 0) + 1
        unique_names.append(cell_line_name + str(occurrences[cell_line_name]))
    all_test_df['cell_line'] = unique_names

    return all_test_df

In [None]:
def get_avg_cv_data_3class_mad(ont, dataset, zscore_method, drug, t_low=0, t_high=1, is_mad=True, fold_size=5, is_show=True):
    """Pool all CV folds of a drug and stratify them by true AUC and prediction.

    Args:
        ont, dataset, zscore_method, drug: Forwarded to ``get_all_test`` to
            locate the per-fold files.
        t_low, t_high, is_mad: Forwarded to ``get_filtered_data_mad`` to set the
            thresholds on the prediction.
        fold_size: Number of CV folds to pool.
        is_show: When true, also draw the waterfall plot and build the
            contingency table.

    Returns:
        If ``is_show``: (fold_corr_list, avg_corr, cont_table, odds_ratio, fig).
        Otherwise: (fold_corr_list, avg_corr, all_test_df, pred_sens_df,
        pred_res_df).
        ``fold_corr_list`` and ``avg_corr`` are never filled in by this
        function; they are always ``[]`` and ``0.0``.
    """
    avg_corr = 0.0
    fold_corr_list = []
    # Honour the caller's fold_size (it used to be hard-coded to 5 here).
    all_test_df = get_all_test(ont, dataset, zscore_method, drug, fold_size=fold_size)
    sens_df, res_df, pred_sens_df, pred_res_df = get_filtered_data_mad(all_test_df, t_low=t_low, t_high=t_high, is_mad=is_mad)

    if is_show:
        fig = create_true_auc_waterfall(all_test_df, sens_df, res_df, pred_sens_df, pred_res_df, drug)
        cont_table, odds_ratio = create_confusion_matrix(list(sens_df['cell_line']), list(res_df['cell_line']),
                                                     list(pred_sens_df['cell_line']), list(pred_res_df['cell_line']))
        return fold_corr_list, avg_corr, cont_table, odds_ratio, fig
    else:
        return fold_corr_list, avg_corr, all_test_df, pred_sens_df, pred_res_df

In [None]:
def get_avg_cv_data_3class(ont, dataset, zscore_method, drug, fold_size=5, fraction=0.5, is_show=False):
    """Per-fold Pearson correlation restricted to the filtered samples.

    For every fold the test set and predictions are loaded (same file layout
    as ``get_avg_cv_data``: predictions under
    ``../models/model_<ont>_<dataset>_<drug>_<zscore_method>_<i>/predict.txt``),
    reduced with ``get_filtered_data`` and the correlation between 'auc' and
    'pred' is computed on the two predicted classes combined.

    Args:
        ont, dataset, zscore_method, drug: Components of the file names.
        fold_size: Number of folds (folds are numbered from 1).
        fraction: Forwarded to ``get_filtered_data``.
        is_show: Accepted for interface compatibility; not used.

    Returns:
        (fold_corr_list, avg_corr). A NaN correlation is recorded as 0.0.
        Folds whose prediction file is missing or unreadable are skipped;
        ``avg_corr`` is the sum of the available correlations divided by
        ``fold_size``.
    """
    avg_corr = 0.0
    fold_corr_list = []
    for i in range(1, fold_size+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../models/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(i)
        pred_file = modeldir + '/predict.txt'
        try:
            pred = np.loadtxt(pred_file)
        except (OSError, ValueError):
            # Only a missing/unparsable prediction file means "fold not
            # available"; anything else (e.g. KeyboardInterrupt) propagates.
            continue
        test_df['pred'] = pred

        pred_sens_df, pred_res_df = get_filtered_data(test_df, fraction)
        filtered_df = pd.concat([pred_sens_df, pred_res_df], axis=0, ignore_index=True)
        corr = stats.pearsonr(filtered_df['auc'], filtered_df['pred'])[0]
        if math.isnan(corr):
            corr = 0.0
        fold_corr_list.append(corr)
        avg_corr += corr

    avg_corr /= fold_size

    return fold_corr_list, avg_corr

In [None]:
def get_avg_cv_data_3class_alternate(method, dataset, drug, fold_size=5, fraction=0.16):
    """Per-fold Pearson correlation of a baseline model on the filtered samples.

    Predictions for fold ``i`` are read from
    ``../models/<method>/predict_<dataset>_<drug>_<i>.txt``; the test set from
    ``../data/training_files_av/<i>_test_<dataset>_<drug>.txt``. Each fold is
    reduced with ``get_filtered_data`` before the correlation between 'auc'
    and 'pred' is computed on the two predicted classes combined.

    Args:
        method: Baseline model directory name under ``../models/``.
        dataset, drug: Components of the file names.
        fold_size: Number of folds (folds are numbered from 1).
        fraction: Forwarded to ``get_filtered_data``.

    Returns:
        (fold_corr_list, avg_corr). A NaN correlation is recorded as 0.0.
        Folds whose prediction file is missing or unreadable are skipped;
        ``avg_corr`` is the sum of the available correlations divided by
        ``fold_size``.
    """
    avg_corr = 0.0
    fold_corr_list = []
    for i in range(1, fold_size+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../models/' + method
        pred_file = modeldir + '/predict' + '_' + dataset + '_' + drug + '_' + str(i) + '.txt'
        try:
            pred = np.loadtxt(pred_file)
        except (OSError, ValueError):
            # Only a missing/unparsable prediction file means "fold not
            # available"; anything else (e.g. KeyboardInterrupt) propagates.
            continue
        test_df['pred'] = pred

        pred_sens_df, pred_res_df = get_filtered_data(test_df, fraction)
        filtered_df = pd.concat([pred_sens_df, pred_res_df], axis=0, ignore_index=True)
        corr = stats.pearsonr(filtered_df['auc'], filtered_df['pred'])[0]
        if math.isnan(corr):
            corr = 0.0
        fold_corr_list.append(corr)
        avg_corr += corr

    avg_corr /= fold_size
    return fold_corr_list, avg_corr

In [None]:
def get_avg_cv_data_3class_mut(ont, dataset, zscore_method, drug, fold_size=5, fraction=0.16):
    """Per-fold Pearson correlation of the nest_drugcell models on filtered samples.

    Predictions for fold ``i`` are read from
    ``../../nest_drugcell/models/model_<ont>_<dataset>_<drug>_<zscore_method>_<i>/predict.txt``;
    the test set from ``../data/training_files_av/<i>_test_<dataset>_<drug>.txt``.
    Each fold is reduced with ``get_filtered_data`` before the correlation
    between 'auc' and 'pred' is computed on the two predicted classes combined.

    Args:
        ont, dataset, zscore_method, drug: Components of the file names.
        fold_size: Number of folds (folds are numbered from 1).
        fraction: Forwarded to ``get_filtered_data``.

    Returns:
        (fold_corr_list, avg_corr). A NaN correlation is recorded as 0.0.
        Folds whose prediction file is missing or unreadable are skipped;
        ``avg_corr`` is the sum of the available correlations divided by
        ``fold_size``.
    """
    avg_corr = 0.0
    fold_corr_list = []
    for i in range(1, fold_size+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../../nest_drugcell/models/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(i)
        pred_file = modeldir + '/predict.txt'
        try:
            pred = np.loadtxt(pred_file)
        except (OSError, ValueError):
            # Only a missing/unparsable prediction file means "fold not
            # available"; anything else (e.g. KeyboardInterrupt) propagates.
            continue
        test_df['pred'] = pred

        pred_sens_df, pred_res_df = get_filtered_data(test_df, fraction)
        filtered_df = pd.concat([pred_sens_df, pred_res_df], axis=0, ignore_index=True)
        corr = stats.pearsonr(filtered_df['auc'], filtered_df['pred'])[0]
        if math.isnan(corr):
            corr = 0.0
        fold_corr_list.append(corr)
        avg_corr += corr

    avg_corr /= fold_size
    return fold_corr_list, avg_corr

In [None]:
def get_avg_cv_data_3class_cnv(ont, dataset, zscore_method, drug, fold_size=5, fraction=0.16):
    """Per-fold Pearson correlation of the models under ../models/CNV on filtered samples.

    Predictions for fold ``i`` are read from
    ``../models/CNV/model_<ont>_<dataset>_<drug>_<zscore_method>_<i>/predict.txt``;
    the test set from ``../data/training_files_av/<i>_test_<dataset>_<drug>.txt``.
    Each fold is reduced with ``get_filtered_data`` before the correlation
    between 'auc' and 'pred' is computed on the two predicted classes combined.

    Args:
        ont, dataset, zscore_method, drug: Components of the file names.
        fold_size: Number of folds (folds are numbered from 1).
        fraction: Forwarded to ``get_filtered_data``.

    Returns:
        (fold_corr_list, avg_corr). A NaN correlation is recorded as 0.0.
        Folds whose prediction file is missing or unreadable are skipped;
        ``avg_corr`` is the sum of the available correlations divided by
        ``fold_size``.
    """
    avg_corr = 0.0
    fold_corr_list = []
    for i in range(1, fold_size+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../models/CNV/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(i)
        pred_file = modeldir + '/predict.txt'
        try:
            pred = np.loadtxt(pred_file)
        except (OSError, ValueError):
            # Only a missing/unparsable prediction file means "fold not
            # available"; anything else (e.g. KeyboardInterrupt) propagates.
            continue
        test_df['pred'] = pred

        pred_sens_df, pred_res_df = get_filtered_data(test_df, fraction)
        filtered_df = pd.concat([pred_sens_df, pred_res_df], axis=0, ignore_index=True)
        corr = stats.pearsonr(filtered_df['auc'], filtered_df['pred'])[0]
        if math.isnan(corr):
            corr = 0.0
        fold_corr_list.append(corr)
        avg_corr += corr

    avg_corr /= fold_size
    return fold_corr_list, avg_corr

In [None]:
def get_best_avg_corr(drugs, ont, dataset, zscore_method):
    """Scan the class fraction from 0.1 to 0.5 and return the best-scoring one.

    For each fraction (step 0.05) the mean over all drugs of the average
    correlation returned by ``get_avg_cv_data_3class`` is computed, printed
    and finally plotted with ``create_line_curve``.

    Args:
        drugs: Non-empty sequence of drug names.
        ont, dataset, zscore_method: Forwarded to ``get_avg_cv_data_3class``.

    Returns:
        The fraction with the highest mean correlation (the larger fraction
        wins ties), or ``None`` if no fraction produced a comparable value
        (e.g. every mean is NaN).
    """
    f_min = 0.1
    f_max = 0.5
    f_step = 0.05

    # Derive every fraction from its step index instead of repeatedly adding
    # 0.05: accumulated floating-point error yields values such as
    # 0.15000000000000002, which can change int(data_size * fraction) downstream.
    n_steps = int(round((f_max - f_min) / f_step))
    fractions = [round(f_min + k * f_step, 10) for k in range(n_steps + 1)]

    # Start below any possible score so a best fraction is found even when
    # all mean correlations are negative.
    best_avg = float('-inf')
    f_best = None
    param_list = []
    for f in fractions:

        all_avg_corr = 0.0
        for drug in drugs:
            _, avg_corr = get_avg_cv_data_3class(ont, dataset, zscore_method, drug, fraction=f)
            all_avg_corr += avg_corr

        all_avg_corr /= len(drugs)
        if best_avg <= all_avg_corr:
            best_avg = all_avg_corr
            f_best = f

        param_list.append((f, all_avg_corr))
        print(f, all_avg_corr)

    create_line_curve([p[0] for p in param_list], [p[1] for p in param_list], 'Cut_off', 'Avg. Correlation', (f_min, f_max), (0.0, 0.3))
    return f_best

In [None]:
# Run configuration: these three values are used by the cells below to build file and model-directory names.
dataset = 'av'
zscore_method = 'auc'
ont = 'ctg'

# Drug names to evaluate, one per line of the file (read into a plain list).
drugs = list(pd.read_csv('../data/training_files_av/drugname_av.txt', header=None, names=['D'])['D'])

In [None]:
# Per-drug cross-validation performance of the VNN model.
# Drugs whose average correlation is NaN are removed from `drugs`, so every
# later cell that indexes its results by position in `drugs` stays aligned
# with this frame: row k of vnn_results_df describes drugs[k].
to_remove = []
kept_drugs = []
vnn_rows = []
# Iterate over a snapshot: removing from a list while iterating over it skips
# the element that follows each removal.
for drug in list(drugs):

    corr_list, avg_corr = get_avg_cv_data(ont, dataset, zscore_method, drug)
    if math.isnan(avg_corr):
        to_remove.append(drug)
        print(drug)
        continue
    print(drug, corr_list, avg_corr)

    vnn_row = {'Drug': drug, 'Average': avg_corr}
    # Folds that could not be loaded are simply left as NaN.
    for k, corr in enumerate(corr_list):
        vnn_row['Fold_'+str(k+1)] = corr
    vnn_rows.append(vnn_row)
    kept_drugs.append(drug)

# Update the list in place so existing references to `drugs` see the change.
drugs[:] = kept_drugs
vnn_results_df = pd.DataFrame(vnn_rows, columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

In [None]:
# Per-drug performance of the VNN model on the filtered samples
# (get_avg_cv_data_3class); row i belongs to drugs[i].
vnn_results_3class_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

f_best=0.5

failed = 0
for i, drug in enumerate(drugs):
    try:
        corr_list, avg_corr = get_avg_cv_data_3class(ont, dataset, zscore_method, drug, fraction=f_best)
    except Exception as e:
        # Keep going, but say which drug failed and why; its row stays NaN.
        failed += 1
        print(drug, 'failed:', repr(e))
        continue
    print(drug, corr_list, avg_corr)

    # Assign with one .loc[row, column] call; the chained form .loc[i][col] = ...
    # writes into a temporary row object and is not guaranteed to reach the frame.
    vnn_results_3class_df.loc[i, 'Drug'] = drug
    vnn_results_3class_df.loc[i, 'Average'] = avg_corr
    # Folds that could not be loaded are left as NaN.
    for k, corr in enumerate(corr_list):
        vnn_results_3class_df.loc[i, 'Fold_'+str(k+1)] = corr

print('Failed drugs:', failed)

In [None]:
# Per-drug cross-validation performance of the elastic-net baseline; row i belongs to drugs[i].
method = 'elastic_net'

elasticnet_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):
    corr_list, avg_corr = get_avg_cv_data_alternate(method, dataset, drug)
    print(drug, corr_list, avg_corr)

    # Assign with one .loc[row, column] call; the chained form .loc[i][col] = ...
    # writes into a temporary row object and is not guaranteed to reach the frame.
    elasticnet_df.loc[i, 'Drug'] = drug
    elasticnet_df.loc[i, 'Average'] = avg_corr
    # Folds that could not be loaded are left as NaN.
    for k, corr in enumerate(corr_list):
        elasticnet_df.loc[i, 'Fold_'+str(k+1)] = corr

In [None]:
# Elastic-net baseline evaluated on the filtered samples; row i belongs to drugs[i].
method = 'elastic_net'

elasticnet_3class_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):
    corr_list, avg_corr = get_avg_cv_data_3class_alternate(method, dataset, drug, fraction=f_best)
    print(drug, corr_list, avg_corr)

    # Assign with one .loc[row, column] call; the chained form .loc[i][col] = ...
    # writes into a temporary row object and is not guaranteed to reach the frame.
    elasticnet_3class_df.loc[i, 'Drug'] = drug
    elasticnet_3class_df.loc[i, 'Average'] = avg_corr
    # Folds that could not be loaded are left as NaN.
    for k, corr in enumerate(corr_list):
        elasticnet_3class_df.loc[i, 'Fold_'+str(k+1)] = corr

In [None]:
# Per-drug cross-validation performance of the random-forest baseline; row i belongs to drugs[i].
method = 'random_forest'

randomforest_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):
    corr_list, avg_corr = get_avg_cv_data_alternate(method, dataset, drug)
    print(drug, corr_list, avg_corr)

    # Assign with one .loc[row, column] call; the chained form .loc[i][col] = ...
    # writes into a temporary row object and is not guaranteed to reach the frame.
    randomforest_df.loc[i, 'Drug'] = drug
    randomforest_df.loc[i, 'Average'] = avg_corr
    # Folds that could not be loaded are left as NaN.
    for k, corr in enumerate(corr_list):
        randomforest_df.loc[i, 'Fold_'+str(k+1)] = corr

In [None]:
# Random-forest baseline evaluated on the filtered samples; row i belongs to drugs[i].
method = 'random_forest'

randomforest_3class_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):
    corr_list, avg_corr = get_avg_cv_data_3class_alternate(method, dataset, drug, fraction=f_best)
    print(drug, corr_list, avg_corr)

    # Assign with one .loc[row, column] call; the chained form .loc[i][col] = ...
    # writes into a temporary row object and is not guaranteed to reach the frame.
    randomforest_3class_df.loc[i, 'Drug'] = drug
    randomforest_3class_df.loc[i, 'Average'] = avg_corr
    # Folds that could not be loaded are left as NaN.
    for k, corr in enumerate(corr_list):
        randomforest_3class_df.loc[i, 'Fold_'+str(k+1)] = corr

In [None]:
# Per-drug cross-validation performance of the MLP baseline; row i belongs to drugs[i].
method = 'mlp'

mlp_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):
    corr_list, avg_corr = get_avg_cv_data_alternate(method, dataset, drug)
    print(drug, corr_list, avg_corr)

    # Assign with one .loc[row, column] call; the chained form .loc[i][col] = ...
    # writes into a temporary row object and is not guaranteed to reach the frame.
    mlp_df.loc[i, 'Drug'] = drug
    mlp_df.loc[i, 'Average'] = avg_corr
    # Folds that could not be loaded are left as NaN.
    for k, corr in enumerate(corr_list):
        mlp_df.loc[i, 'Fold_'+str(k+1)] = corr

In [None]:
# MLP baseline evaluated on the filtered samples; row i belongs to drugs[i].
method = 'mlp'

mlp_3class_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):
    corr_list, avg_corr = get_avg_cv_data_3class_alternate(method, dataset, drug, fraction=f_best)
    print(drug, corr_list, avg_corr)

    # Assign with one .loc[row, column] call; the chained form .loc[i][col] = ...
    # writes into a temporary row object and is not guaranteed to reach the frame.
    mlp_3class_df.loc[i, 'Drug'] = drug
    mlp_3class_df.loc[i, 'Average'] = avg_corr
    # Folds that could not be loaded are left as NaN.
    for k, corr in enumerate(corr_list):
        mlp_3class_df.loc[i, 'Fold_'+str(k+1)] = corr

In [None]:
# Models loaded by get_avg_cv_data_3class_mut, evaluated on the filtered samples; row i belongs to drugs[i].
mut_vnn_results_3class_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):
    corr_list, avg_corr = get_avg_cv_data_3class_mut(ont, dataset, zscore_method, drug, fraction=f_best)
    print(drug, corr_list, avg_corr)

    # Assign with one .loc[row, column] call; the chained form .loc[i][col] = ...
    # writes into a temporary row object and is not guaranteed to reach the frame.
    mut_vnn_results_3class_df.loc[i, 'Drug'] = drug
    mut_vnn_results_3class_df.loc[i, 'Average'] = avg_corr
    # Folds that could not be loaded are left as NaN.
    for k, corr in enumerate(corr_list):
        mut_vnn_results_3class_df.loc[i, 'Fold_'+str(k+1)] = corr

In [None]:
# Per-drug average correlation: mut_vnn_results_3class_df (x) vs. vnn_results_3class_df (y), axis labels as given below.
vnn_comp_fig = create_scatter_plot(list(mut_vnn_results_3class_df['Average']), list(vnn_results_3class_df['Average']), 
                                   'Pearson Correlation (Mutation Only)', 'Pearson Correlation (Mutation + Copy Number)')

In [None]:
# Export the scatter plot currently held in vnn_comp_fig as SVG.
vnn_comp_fig.savefig('../plots/vnn_comparison_scatterplot_2022-03-02.svg')

In [None]:
# Models loaded by get_avg_cv_data_3class_cnv, evaluated on the filtered samples; row i belongs to drugs[i].
cnv_vnn_results_3class_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):
    corr_list, avg_corr = get_avg_cv_data_3class_cnv(ont, dataset, zscore_method, drug, fraction=f_best)
    print(drug, corr_list, avg_corr)

    # Assign with one .loc[row, column] call; the chained form .loc[i][col] = ...
    # writes into a temporary row object and is not guaranteed to reach the frame.
    cnv_vnn_results_3class_df.loc[i, 'Drug'] = drug
    cnv_vnn_results_3class_df.loc[i, 'Average'] = avg_corr
    # Folds that could not be loaded are left as NaN.
    for k, corr in enumerate(corr_list):
        cnv_vnn_results_3class_df.loc[i, 'Fold_'+str(k+1)] = corr

In [None]:
# Per-drug average correlation: cnv_vnn_results_3class_df (x) vs. vnn_results_3class_df (y).
# Rebinds vnn_comp_fig, which an earlier cell also assigns.
vnn_comp_fig = create_scatter_plot(list(cnv_vnn_results_3class_df['Average']), list(vnn_results_3class_df['Average']), 
                                   'Pearson Correlation (CNV Only)', 'Pearson Correlation (Mutation + CNV)')

In [None]:
# Export the scatter plot currently held in vnn_comp_fig as SVG.
vnn_comp_fig.savefig('../plots/figure2/cnv_comparison_scatterplot_April_03.svg')

In [None]:
# Side-by-side table of the per-drug average correlation of every model.
# NOTE(review): the columns are aligned on the row index, so all five source
# frames must share the same index for a row to describe a single drug - confirm.
performance_df = pd.concat([vnn_results_df['Drug'], vnn_results_df['Average'], mlp_df['Average'], 
                            randomforest_df['Average'], elasticnet_df['Average']], axis=1, ignore_index=True)

# ignore_index=True above replaced the column labels by 0..4; name them here.
performance_df.columns = ['Drug', 'DCoDR', 'Multi layer perceptron', 'Random Forest', 'ElasticNet']
# Order drugs by ascending DCoDR score.
performance_df.sort_values(by=['DCoDR'], inplace=True, ignore_index=True)

In [None]:
# Scatter plot of every model's per-drug score, one row of points per drug.
x_title = 'Pearson Correlation'
y_title = 'Drug'
all_models_fig = all_models_scatterplot(performance_df, x_title, y_title)

In [None]:
# Export the all-models comparison figure as SVG.
all_models_fig.savefig('../plots/figure2/baseline_performance_comparison_alldata_May_2.svg')

In [None]:
# Count the drugs for which DCoDR scores at least as high as each baseline.
performance_df.columns = ['Drug', 'DCoDR', 'Multi layer perceptron', 'Random Forest', 'ElasticNet']

# One set of drug names per baseline that DCoDR matches or beats.
mlp = set()
rf = set()
eln = set()

for _,row in performance_df.iterrows():
    
    # Ties count in DCoDR's favour; comparisons involving NaN are False.
    if row['DCoDR'] >= row['Multi layer perceptron']:
        mlp.add(row['Drug'])
        
    if row['DCoDR'] >= row['Random Forest']:
        rf.add(row['Drug'])
        
    if row['DCoDR'] >= row['ElasticNet']:
        eln.add(row['Drug'])
        
print('mlp', len(mlp))

print('rf', len(rf))

print('eln', len(eln))

# Drugs on which DCoDR matches or beats all three baselines.
print('all', len(mlp.intersection(rf).intersection(eln)))

print('total', len(performance_df))

In [None]:
# One column of per-drug average correlations per model, for the box plot.
# The four Series are aligned on their row index when the frame is built.
boxplot_df = pd.DataFrame({
    "D-CoDR": vnn_results_df['Average'], 
    "ANN": mlp_df['Average'], 
    "RandomForest": randomforest_df['Average'], 
    "ElasticNet": elasticnet_df['Average']
})

In [None]:
# Plot the frame built in the previous cell. The name used here before,
# boxplot_3class_df, is not defined anywhere above this cell, so the call
# raised NameError on a fresh kernel (Restart & Run All).
# NOTE(review): if the 3-class comparison was intended for this figure, build
# boxplot_3class_df before this cell instead.
boxplot = get_boxplot(boxplot_df, "", "Pearson Correlation")

In [None]:
# Export the box plot as SVG.
boxplot.savefig('../plots/figure2/boxplot_comparison_May_02.svg')

In [None]:
# Independent two-sample t-test: ElasticNet vs. DCoDR per-drug scores, NaNs omitted.
# NOTE(review): no `alternative` is passed here (SciPy's default is two-sided), while the
# Random Forest and MLP tests in the following cells use alternative='less' - confirm intended.
stats.ttest_ind(list(performance_df['ElasticNet']), list(performance_df['DCoDR']), nan_policy='omit')

In [None]:
# One-sided independent t-test: is the mean Random Forest score less than the mean DCoDR score?
stats.ttest_ind(list(performance_df['Random Forest']), list(performance_df['DCoDR']), alternative='less')

In [None]:
# One-sided independent t-test: is the mean MLP score less than the mean DCoDR score?
stats.ttest_ind(list(performance_df['Multi layer perceptron']), list(performance_df['DCoDR']), alternative='less')

In [None]:
# Bar chart of each model's average correlation for Palbociclib, with 95%
# confidence intervals computed across the cross-validation folds.

def _palbo_fold_values(results_df):
    """Per-fold correlations for Palbociclib as a float array.

    'Average' is dropped along with 'Drug': it is the mean of the folds, not an
    additional observation, and would distort both the sample size and the SEM.
    """
    fold_row = results_df.query('Drug == "Palbociclib"').drop(['Drug', 'Average'], axis=1).iloc[0]
    return np.array(fold_row, dtype=float)

# Error-bar half-widths, in the same order as the model columns of performance_df.
errors = []

# The confidence level is passed positionally: its keyword name changed from
# `alpha` to `confidence` between SciPy versions.
vnn_data = _palbo_fold_values(vnn_results_df)
vnn_ci = stats.t.interval(0.95, df=len(vnn_data)-1, loc=np.mean(vnn_data), scale=stats.sem(vnn_data))
# errorbar() draws +/- yerr around the bar top, so store half of the interval width.
errors.append((vnn_ci[1]-vnn_ci[0]) / 2)

mlp_data = _palbo_fold_values(mlp_df)
mlp_ci = stats.t.interval(0.95, df=len(mlp_data)-1, loc=np.mean(mlp_data), scale=stats.sem(mlp_data))
errors.append((mlp_ci[1]-mlp_ci[0]) / 2)

rdf_data = _palbo_fold_values(randomforest_df)
rdf_ci = stats.t.interval(0.95, df=len(rdf_data)-1, loc=np.mean(rdf_data), scale=stats.sem(rdf_data))
errors.append((rdf_ci[1]-rdf_ci[0]) / 2)

eln_data = _palbo_fold_values(elasticnet_df)
eln_ci = stats.t.interval(0.95, df=len(eln_data)-1, loc=np.mean(eln_data), scale=stats.sem(eln_data))
errors.append((eln_ci[1]-eln_ci[0]) / 2)

x = list(performance_df.columns)
x.remove('Drug')
data = performance_df.query('Drug == "Palbociclib"').drop(['Drug'], axis=1)

palbo_barchart_err = create_bar_chart_with_errors(x, data.iloc[0], errors, 'Models', 'Pearson Correlation')

In [None]:
# Export the Palbociclib bar chart (with error bars) as SVG.
palbo_barchart_err.savefig('../plots/figure2/palbo_barchart_error_May_02.svg')

In [None]:
# One-sided one-sample t-test: are the MLP's per-fold Palbociclib scores lower
# than D-CoDR's average score for the same drug?
# 'Average' is dropped together with 'Drug' so the sample holds fold scores
# only (assumes the remaining columns are the folds - TODO confirm); the
# population mean is read with .iloc[0] because float(Series) is deprecated.
stats.ttest_1samp(
    mlp_df.query('Drug == "Palbociclib"').drop(['Drug', 'Average'], axis=1).iloc[0].astype(float).tolist(),
    float(vnn_results_df.query('Drug == "Palbociclib"')['Average'].iloc[0]),
    alternative='less',
)

In [None]:
# One-sided one-sample t-test: are Random Forest's per-fold Palbociclib scores
# lower than D-CoDR's average score for the same drug?
# 'Average' is dropped together with 'Drug' so the sample holds fold scores
# only (assumes the remaining columns are the folds - TODO confirm); the
# population mean is read with .iloc[0] because float(Series) is deprecated.
stats.ttest_1samp(
    randomforest_df.query('Drug == "Palbociclib"').drop(['Drug', 'Average'], axis=1).iloc[0].astype(float).tolist(),
    float(vnn_results_df.query('Drug == "Palbociclib"')['Average'].iloc[0]),
    alternative='less',
)

In [None]:
# One-sided one-sample t-test: are ElasticNet's per-fold Palbociclib scores
# lower than D-CoDR's average score for the same drug?
# 'Average' is dropped together with 'Drug' so the sample holds fold scores
# only (assumes the remaining columns are the folds - TODO confirm); the
# population mean is read with .iloc[0] because float(Series) is deprecated.
stats.ttest_1samp(
    elasticnet_df.query('Drug == "Palbociclib"').drop(['Drug', 'Average'], axis=1).iloc[0].astype(float).tolist(),
    float(vnn_results_df.query('Drug == "Palbociclib"')['Average'].iloc[0]),
    alternative='less',
)

In [None]:
# Side-by-side table: drug name followed by each model's 'Average' score from
# the 3-class result frames, ordered by ascending D-CoDR score.
performance_3class_df = pd.concat(
    [vnn_results_3class_df['Drug']] + [
        results['Average']
        for results in (vnn_results_3class_df, mlp_3class_df, randomforest_3class_df, elasticnet_3class_df)
    ],
    axis=1,
    ignore_index=True,
)

performance_3class_df.columns = ['Drug', 'DCoDR', 'Multi layer perceptron', 'Random Forest', 'ElasticNet']
performance_3class_df.sort_values(by=['DCoDR'], inplace=True, ignore_index=True)

In [None]:
# Draw the per-drug comparison of all models for the 3-class results;
# all_models_scatterplot is a helper defined elsewhere in this notebook.
x_title, y_title = 'Pearson Correlation', 'Drug'
all_models_3class_fig = all_models_scatterplot(performance_3class_df, x_title, y_title)

In [None]:
# Export the 3-class all-models comparison figure as SVG.
all_models_3class_fig.savefig('../plots/figure2/baseline_performance_comparison_May_01.svg')

In [None]:
performance_3class_df.columns = ['Drug', 'DCoDR', 'Multi layer perceptron', 'Random Forest', 'ElasticNet']

# For each baseline, collect the drugs on which D-CoDR scores at least as high
# as that baseline (ties count for D-CoDR; NaN comparisons are False).
mlp = set(performance_3class_df.loc[
    performance_3class_df['DCoDR'] >= performance_3class_df['Multi layer perceptron'], 'Drug'])
rf = set(performance_3class_df.loc[
    performance_3class_df['DCoDR'] >= performance_3class_df['Random Forest'], 'Drug'])
eln = set(performance_3class_df.loc[
    performance_3class_df['DCoDR'] >= performance_3class_df['ElasticNet'], 'Drug'])

# Per-baseline counts, the number of drugs where D-CoDR matches or beats every
# baseline at once, and the total number of drugs.
print('mlp', len(mlp))
print('rf', len(rf))
print('eln', len(eln))
print('all', len(mlp & rf & eln))
print('total', len(performance_3class_df))

In [None]:
# One column per model holding that model's per-drug 'Average' score (3-class results).
boxplot_3class_df = pd.DataFrame({
    label: results['Average']
    for label, results in [
        ("D-CoDR", vnn_results_3class_df),
        ("ANN", mlp_3class_df),
        ("RandomForest", randomforest_3class_df),
        ("ElasticNet", elasticnet_3class_df),
    ]
})

In [None]:
# Box plot of the 3-class per-drug scores; note this rebinds `boxplot`,
# replacing the figure produced for the full data set earlier in the notebook.
boxplot = get_boxplot(boxplot_3class_df, "", "Pearson Correlation")

In [None]:
# Export the 3-class box plot as SVG.
boxplot.savefig('../plots/figure2/boxplot_comparison_May_01.svg')

In [None]:
# One-sided two-sample t-test (3-class results): are ElasticNet's per-drug scores lower than D-CoDR's?
stats.ttest_ind(
    performance_3class_df['ElasticNet'].tolist(),
    performance_3class_df['DCoDR'].tolist(),
    alternative='less',
)

In [None]:
# One-sided two-sample t-test (3-class results): are Random Forest's per-drug scores lower than D-CoDR's?
stats.ttest_ind(
    performance_3class_df['Random Forest'].tolist(),
    performance_3class_df['DCoDR'].tolist(),
    alternative='less',
)

In [None]:
# One-sided two-sample t-test (3-class results): are the MLP's per-drug scores lower than D-CoDR's?
stats.ttest_ind(
    performance_3class_df['Multi layer perceptron'].tolist(),
    performance_3class_df['DCoDR'].tolist(),
    alternative='less',
)

In [None]:
def _fold_scores_ci(results_df, drug, confidence=0.95):
    """Return (fold_scores, ci) for one drug's cross-validation scores.

    fold_scores is a float array of the drug's row without the 'Drug' label and
    without the derived 'Average' column: keeping the mean of the folds in the
    sample would leave the mean unchanged but understate the standard error.
    (Assumes every remaining column is a per-fold score - TODO confirm.)
    ci is the two-sided Student-t confidence interval of the mean.
    """
    drug_row = results_df[results_df['Drug'] == drug].drop(['Drug', 'Average'], axis=1).iloc[0]
    fold_scores = np.asarray(drug_row, dtype=float)
    # The confidence level is passed positionally: the keyword was renamed from
    # `alpha` to `confidence` in SciPy 1.9 and `alpha` was removed in 1.11.
    ci = stats.t.interval(confidence, df=len(fold_scores) - 1,
                          loc=np.mean(fold_scores), scale=stats.sem(fold_scores))
    return fold_scores, ci


vnn_data, vnn_ci = _fold_scores_ci(vnn_results_3class_df, 'Palbociclib')
mlp_data, mlp_ci = _fold_scores_ci(mlp_3class_df, 'Palbociclib')
rdf_data, rdf_ci = _fold_scores_ci(randomforest_3class_df, 'Palbociclib')
eln_data, eln_ci = _fold_scores_ci(elasticnet_3class_df, 'Palbociclib')

# Full width (upper - lower) of each interval, in the same model order as the
# performance_3class_df columns used for the bars below.
# NOTE(review): if create_bar_chart_with_errors draws these as +/- error bars,
# the half-width is probably what is wanted - confirm against the helper.
errors = [ci[1] - ci[0] for ci in (vnn_ci, mlp_ci, rdf_ci, eln_ci)]

x = list(performance_3class_df.columns)
x.remove('Drug')
data = performance_3class_df.query('Drug == "Palbociclib"').drop(['Drug'], axis=1)

palbo_barchart_err = create_bar_chart_with_errors(x, data.iloc[0], errors, 'Models', 'Pearson Correlation')

In [None]:
# Export the 3-class Palbociclib bar chart (with error bars) as SVG.
palbo_barchart_err.savefig('../plots/figure2/palbo_barchart_error_May_01.svg')

In [None]:
# One-sided one-sample t-test (3-class results): are the MLP's per-fold
# Palbociclib scores lower than D-CoDR's average score for the same drug?
# 'Average' is dropped together with 'Drug' so the sample holds fold scores
# only (assumes the remaining columns are the folds - TODO confirm); the
# population mean is read with .iloc[0] because float(Series) is deprecated.
stats.ttest_1samp(
    mlp_3class_df.query('Drug == "Palbociclib"').drop(['Drug', 'Average'], axis=1).iloc[0].astype(float).tolist(),
    float(vnn_results_3class_df.query('Drug == "Palbociclib"')['Average'].iloc[0]),
    alternative='less',
)

In [None]:
# One-sided one-sample t-test (3-class results): are Random Forest's per-fold
# Palbociclib scores lower than D-CoDR's average score for the same drug?
# 'Average' is dropped together with 'Drug' so the sample holds fold scores
# only (assumes the remaining columns are the folds - TODO confirm); the
# population mean is read with .iloc[0] because float(Series) is deprecated.
stats.ttest_1samp(
    randomforest_3class_df.query('Drug == "Palbociclib"').drop(['Drug', 'Average'], axis=1).iloc[0].astype(float).tolist(),
    float(vnn_results_3class_df.query('Drug == "Palbociclib"')['Average'].iloc[0]),
    alternative='less',
)

In [None]:
# One-sided one-sample t-test (3-class results): are ElasticNet's per-fold
# Palbociclib scores lower than D-CoDR's average score for the same drug?
# 'Average' is dropped together with 'Drug' so the sample holds fold scores
# only (assumes the remaining columns are the folds - TODO confirm); the
# population mean is read with .iloc[0] because float(Series) is deprecated.
stats.ttest_1samp(
    elasticnet_3class_df.query('Drug == "Palbociclib"').drop(['Drug', 'Average'], axis=1).iloc[0].astype(float).tolist(),
    float(vnn_results_3class_df.query('Drug == "Palbociclib"')['Average'].iloc[0]),
    alternative='less',
)

In [None]:
# Draws two ROC curves for Palbociclib on one set of axes: first with the
# resistant label (1) as the target, then with predictions and labels negated
# so the sensitive label becomes the target. On each curve two operating points
# are marked: the first threshold below the median prediction and the first
# threshold below median + MAD (mirrored for the negated pass).
drugs = ['Palbociclib']
drug = 'Palbociclib'

# Square axes limited to [0, 1] with the chance diagonal.
fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(111)
ax.grid(False)
ax.set_xlim([0,1])
ax.set_ylim([0,1])
plt.plot([0,1],[0,1], 'k--')
plt.ylabel("Sensitivity")
plt.xlabel("1 - Specificity")
plt.title('Palbociclib ROC Curves')

all_test_df = get_all_test(ont, dataset, zscore_method, drug, fold_size=5)
sens_df, res_df = stratify_data(all_test_df, 'auc')

# Label each row: -1 if its cell line is in sens_df, 1 if in res_df; rows in
# neither group are removed from all_test_df.
all_test_df['true'] = 0
for i, row in all_test_df.iterrows():
    if str(row['cell_line']) in list(sens_df['cell_line']):
        all_test_df.at[i, 'true'] = -1
    elif str(row['cell_line']) in list(res_df['cell_line']):
        all_test_df.at[i, 'true'] = 1
    else:
        all_test_df.drop(i, inplace=True)

# Centre and normal-scaled MAD of the remaining predictions; used as cutoffs below.
median = np.median(all_test_df['pred'])
mad = stats.median_abs_deviation(np.array(all_test_df['pred']), scale='normal')

fpr, tpr, threshold_res = metrics.roc_curve(all_test_df['true'], all_test_df['pred'])
print("AUC:", metrics.roc_auc_score(all_test_df['true'], all_test_df['pred']))
plt.plot(fpr, tpr, label='Predicted Drug Response')

# roc_curve returns thresholds in decreasing order, so this picks the index of
# the first threshold strictly below the median (stays 0 if there is none).
med_i_res = 0
for i, t in enumerate(threshold_res):
    if t >= median:
        continue
    med_i_res = i
    break
# Mark that operating point and drop guide lines to both axes.
plt.plot(fpr[med_i_res], tpr[med_i_res], marker='.', markersize=20)
plt.plot([fpr[med_i_res], fpr[med_i_res]], [0, tpr[med_i_res]], 'k-')
plt.plot([0, fpr[med_i_res]], [tpr[med_i_res], tpr[med_i_res]], 'k-')

# Same search for the stricter cutoff median + MAD.
std_i_res = 0
for i, t in enumerate(threshold_res):
    if t >= median + mad:
        continue
    std_i_res = i
    break
#std_i_res = np.where((tpr < 0.41) & (tpr > 0.39))[0][0]

plt.plot(fpr[std_i_res], tpr[std_i_res], marker='.', markersize=20)
plt.plot([fpr[std_i_res], fpr[std_i_res]], [0, tpr[std_i_res]], 'k-')
plt.plot([0, fpr[std_i_res]], [tpr[std_i_res], tpr[std_i_res]], 'k-')

# Add ticks at the marked points and show all ticks with two decimals.
ax.set_xticks(list(ax.get_xticks()) + [fpr[med_i_res], fpr[std_i_res]])
ax.set_yticks(list(ax.get_yticks()) + [tpr[med_i_res], tpr[std_i_res]])
ax.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
#plt.show()

# ---- Second pass: sensitive curve, drawn on the same axes because the figure
# set-up below is commented out. The data is reloaded and relabelled from scratch.
drugs = ['Palbociclib']
drug = 'Palbociclib'

#fig = plt.figure(figsize=(12, 12))
#ax = fig.add_subplot(111)
#ax.grid(False)
#ax.set_xlim([0,1])
#ax.set_ylim([0,1])
#plt.plot([0,1],[0,1], 'k--')
#plt.ylabel("Sensitivity")
#plt.xlabel("1 - Specificity")
#plt.title('ROC Curve')

all_test_df = get_all_test(ont, dataset, zscore_method, drug, fold_size=5)
sens_df, res_df = stratify_data(all_test_df, 'auc')
all_test_df['true'] = 0
for i, row in all_test_df.iterrows():
    if str(row['cell_line']) in list(sens_df['cell_line']):
        all_test_df.at[i, 'true'] = -1
    elif str(row['cell_line']) in list(res_df['cell_line']):
        all_test_df.at[i, 'true'] = 1
    else:
        all_test_df.drop(i, inplace=True)
# Computed on the un-negated predictions; the sign is flipped further down.
median = np.median(all_test_df['pred'])
mad = stats.median_abs_deviation(np.array(all_test_df['pred']), scale='normal')
        
# Negate scores and labels so that sensitive rows carry label 1 and the most
# negative predictions rank highest. all_test_df stays negated after this cell.
all_test_df['pred'] = all_test_df['pred'] * -1.0
all_test_df['true'] = all_test_df['true'] * -1
fpr, tpr, threshold_sens = metrics.roc_curve(all_test_df['true'], all_test_df['pred'])
print("AUC:", metrics.roc_auc_score(all_test_df['true'], all_test_df['pred']))
plt.plot(fpr, tpr, label='Predicted Drug Response')

# `median` is negated in place to live in the negated score space; it is left
# negated for later cells.
med_i_sens = 0
median *= -1
for i, t in enumerate(threshold_sens):
    if t >= median:
        continue
    med_i_sens = i
    break
plt.plot(fpr[med_i_sens], tpr[med_i_sens], marker='.', markersize=20)
plt.plot([fpr[med_i_sens], fpr[med_i_sens]], [0, tpr[med_i_sens]], 'k-')
plt.plot([0, fpr[med_i_sens]], [tpr[med_i_sens], tpr[med_i_sens]], 'k-')

# With both values negated, median - mad equals -(original median - original MAD),
# i.e. the lower cutoff expressed in the negated score space.
std_i_sens = 0
mad *= -1
for i, t in enumerate(threshold_sens):
    if t >= median - mad:
        continue
    std_i_sens = i
    break
#std_i_sens = np.where((tpr < 0.41) & (tpr > 0.39))[0][0]

plt.plot(fpr[std_i_sens], tpr[std_i_sens], marker='.', markersize=20)
plt.plot([fpr[std_i_sens], fpr[std_i_sens]], [0, tpr[std_i_sens]], 'k-')
plt.plot([0, fpr[std_i_sens]], [tpr[std_i_sens], tpr[std_i_sens]], 'k-')

ax.set_xticks(list(ax.get_xticks()) + [fpr[std_i_sens], fpr[med_i_sens]])
ax.set_yticks(list(ax.get_yticks()) + [tpr[std_i_sens], tpr[med_i_sens]])
#ax.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))
#ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
plt.show()

In [None]:
# Show the prediction threshold selected at the median + MAD point of the resistant ROC curve.
threshold_res[std_i_res]

In [None]:
# Flip the sign of predictions and labels (they were negated for the sensitive
# ROC curve). NOTE: this cell is not idempotent - every run flips the sign again.
all_test_df['pred'] = -1.0 * all_test_df['pred']
all_test_df['true'] = -1 * all_test_df['true']

# Predictions in ascending order, used to locate thresholds by position.
pred_list = all_test_df['pred'].sort_values().tolist()

In [None]:
# Position of the resistant threshold in the ascending prediction list,
# displayed as a fraction of all retained cell lines.
# NOTE(review): list.index needs an exact match and raises ValueError otherwise,
# e.g. if std_i_res stayed 0 and points at roc_curve's artificial first threshold.
t_high_idx = pred_list.index(threshold_res[std_i_res])
t_high_idx/len(all_test_df)

In [None]:
# Same for the sensitive threshold; it is multiplied by -1 because that ROC
# curve was computed on negated predictions.
# NOTE(review): list.index needs an exact match and raises ValueError otherwise.
t_low_idx = pred_list.index(threshold_sens[std_i_sens] * -1)
t_low_idx/len(all_test_df)

In [None]:
# Export the combined ROC figure as SVG. `fig` is whichever figure was created
# most recently, so run this before any later cell rebinds `fig`.
fig.savefig('../plots/figure2/roc_curve_common_May_02.svg')

In [None]:
# ROC curve for Palbociclib with the sensitive class as target, on its own
# figure. Two operating points are marked: the first threshold below the
# (negated) median and the first below the mirrored median - MAD cutoff.
drugs = ['Palbociclib']
drug = 'Palbociclib'

# Square axes limited to [0, 1] with the chance diagonal.
fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(111)
ax.grid(False)
ax.set_xlim([0,1])
ax.set_ylim([0,1])
plt.plot([0,1],[0,1], 'k--')
plt.ylabel("Sensitivity")
plt.xlabel("1 - Specificity")
plt.title('ROC Curve for Sensitive')

all_test_df = get_all_test(ont, dataset, zscore_method, drug, fold_size=5)
sens_df, res_df = stratify_data(all_test_df, 'auc')
# Label each row: -1 if its cell line is in sens_df, 1 if in res_df; rows in
# neither group are removed from all_test_df.
all_test_df['true'] = 0
for i, row in all_test_df.iterrows():
    if str(row['cell_line']) in list(sens_df['cell_line']):
        all_test_df.at[i, 'true'] = -1
    elif str(row['cell_line']) in list(res_df['cell_line']):
        all_test_df.at[i, 'true'] = 1
    else:
        all_test_df.drop(i, inplace=True)
# Computed on the un-negated predictions; the sign is flipped further down.
median = np.median(all_test_df['pred'])
mad = stats.median_abs_deviation(np.array(all_test_df['pred']), scale='normal')
        
# Negate scores and labels so that sensitive rows carry label 1 and the most
# negative predictions rank highest. all_test_df stays negated after this cell.
all_test_df['pred'] = all_test_df['pred'] * -1.0
all_test_df['true'] = all_test_df['true'] * -1
fpr, tpr, threshold_sens = metrics.roc_curve(all_test_df['true'], all_test_df['pred'])
print("AUC:", metrics.roc_auc_score(all_test_df['true'], all_test_df['pred']))
plt.plot(fpr, tpr, label='Predicted Drug Response')

# `median` is negated in place to live in the negated score space (and is left
# negated for later cells). roc_curve thresholds are in decreasing order, so the
# loop picks the first threshold strictly below it (index stays 0 if none).
med_i_sens = 0
median *= -1
for i, t in enumerate(threshold_sens):
    if t >= median:
        continue
    med_i_sens = i
    break
# Mark that operating point and drop guide lines to both axes.
plt.plot(fpr[med_i_sens], tpr[med_i_sens], marker='.', markersize=20)
plt.plot([fpr[med_i_sens], fpr[med_i_sens]], [0, tpr[med_i_sens]], 'k-')
plt.plot([0, fpr[med_i_sens]], [tpr[med_i_sens], tpr[med_i_sens]], 'k-')

# With both values negated, median - mad equals -(original median - original MAD),
# i.e. the lower cutoff expressed in the negated score space.
std_i_sens = 0
mad *= -1
for i, t in enumerate(threshold_sens):
    if t >= median - mad:
        continue
    std_i_sens = i
    break
#std_i_sens = np.where((tpr < 0.41) & (tpr > 0.39))[0][0]

plt.plot(fpr[std_i_sens], tpr[std_i_sens], marker='.', markersize=20)
plt.plot([fpr[std_i_sens], fpr[std_i_sens]], [0, tpr[std_i_sens]], 'k-')
plt.plot([0, fpr[std_i_sens]], [tpr[std_i_sens], tpr[std_i_sens]], 'k-')

# Add ticks at the marked points and show all ticks with two decimals.
ax.set_xticks(list(ax.get_xticks()) + [fpr[std_i_sens], fpr[med_i_sens]])
ax.set_yticks(list(ax.get_yticks()) + [tpr[std_i_sens], tpr[med_i_sens]])
ax.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
plt.show()

In [None]:
# Export the sensitive-class ROC figure as SVG.
fig.savefig('../plots/figure2/roc_curve_sensitive_May_02.svg')

In [None]:
drugs = ['Palbociclib']

# Class cutoffs on the original prediction scale: t_high is the threshold picked
# on the resistant ROC curve; t_low is the one picked on the sensitive curve,
# multiplied by -1 because that curve was computed on negated predictions.
t_high = threshold_res[std_i_res]
t_low = threshold_sens[std_i_sens] * -1
print(t_low, t_high)
    
# Print the contingency table and odds ratio per drug; waterfall_plot keeps the
# figure of the last drug for the save cell that follows.
for i, drug in enumerate(drugs):
    _, _, cont_table, odds_ratio, waterfall_plot  = get_avg_cv_data_3class_mad(ont, dataset, zscore_method, drug, t_low, t_high, is_mad=False)
    print(tabulate(cont_table, headers=['', 'True Sensitive', 'True Resistant'], tablefmt='grid'))
    print('Odds Ratio: ', odds_ratio)

In [None]:
# Export the waterfall plot as SVG; `drug` is the loop variable left over from the previous cell.
waterfall_plot.savefig('../plots/figure2/waterfall_' + drug + '_May_02.svg')

In [None]:
drugs = ['Palbociclib']

# `median` is left negated by the ROC cells (median *= -1); make it positive again.
# NOTE(review): this only restores the original value if the true median
# prediction is positive - confirm, or recompute the median here instead.
if median < 0:
    median *= -1
    
# Same analysis as the MAD-threshold cell, but with the median as both the
# lower and the upper cutoff.
for i, drug in enumerate(drugs):
    _, _, cont_table, odds_ratio, waterfall_plot  = get_avg_cv_data_3class_mad(ont, dataset, zscore_method, drug, median, median, is_mad=False)
    print(tabulate(cont_table, headers=['', 'True Sensitive', 'True Resistant'], tablefmt='grid'))
    print('Odds Ratio: ', odds_ratio)

In [None]:
# Export the median-cutoff waterfall plot as SVG; `drug` is the loop variable left over from the previous cell.
waterfall_plot.savefig('../plots/figure2/waterfall_pred_50_' + drug + '_April_29.svg')

In [None]:
# Sweep the fraction f of cell lines called sensitive (lowest predictions) and
# resistant (highest predictions), and record the false/true positive rate of
# those calls at each f, with "resistant" (1) as the positive class.
drugs = ['Palbociclib']
drug = 'Palbociclib'

fpr = []
tpr = []
test_df = get_all_test(ont, dataset, zscore_method, drug, fold_size=5)
sens_df, res_df = stratify_data(test_df, 'auc')

# Sets make the per-row membership checks O(1) instead of rebuilding a list
# for every row.
sens_cells = set(sens_df['cell_line'])
res_cells = set(res_df['cell_line'])

# Label each row: -1 if its cell line is in sens_df, 1 if in res_df; rows in
# neither group are removed.
test_df['true'] = 0
for i, row in test_df.iterrows():
    cell = str(row['cell_line'])
    if cell in sens_cells:
        test_df.at[i, 'true'] = -1
    elif cell in res_cells:
        test_df.at[i, 'true'] = 1
    else:
        test_df.drop(i, inplace=True)

# The ordering does not depend on f, so sort once instead of on every iteration.
test_df.sort_values(by='pred', inplace=True, ignore_index=True)
data_size = len(test_df)

# Ground truth for every sorted row: 0 = sensitive, 1 = anything else.
truth = [0 if str(cell) in sens_cells else 1 for cell in test_df['cell_line']]

for f in np.arange(0.01, 0.5, 0.01):

    class_size = int(data_size * f)
    if class_size == 0:
        # Too few rows for this fraction: nothing is predicted, and an empty
        # confusion matrix cannot be unpacked into four cells.
        continue

    pred_sens_df = test_df.iloc[ : class_size]
    pred_res_df = test_df.iloc[data_size - class_size : data_size]

    # Bottom class_size rows are called sensitive (0), top class_size resistant (1).
    y_true = truth[:class_size] + truth[data_size - class_size:]
    y_pred = [0] * class_size + [1] * class_size

    # labels pins the 2x2 layout so ravel() always yields tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    fpr.append(fp/(fp+tn))
    tpr.append(tp/(tp+fn))

scatter_df = pd.DataFrame({'fpr': fpr, 'tpr': tpr})
fig, ax = plt.subplots(figsize=(6,6))
ax.set_xlim([0,1])
ax.set_ylim([0,1])
sns.scatterplot(data=scatter_df, x='fpr', y='tpr', s=35, ax=ax)
plt.show()

In [None]:
# True positive rate at the smallest fraction of the sweep above.
tpr[0]

In [None]:
# 3-class cross-validation summary for the 'ctg_go' ontology, passing
# fraction=0.16 to get_avg_cv_data_3class.
f_best = 0.16
drugs = ['Palbociclib']
    
# Print correlations, contingency table and odds ratio for each drug.
for i, drug in enumerate(drugs):
    corr_list, avg_corr, cont_table, odds_ratio, waterfall_plot_go  = get_avg_cv_data_3class('ctg_go', dataset, zscore_method, drug, fraction=f_best, is_show=True)
    print(drug, avg_corr, corr_list)
    print(tabulate(cont_table, headers=['', 'True Sensitive', 'True Resistant'], tablefmt='grid'))
    print('Odds Ratio: ', odds_ratio)

#waterfall_plot_go.savefig('../plots/waterfall_go_' + drug + '_2022-03-02.svg')

In [None]:
# Load the ontology table with ddot (clixo_format=True) and keep a view of the
# keys of its term_2_gene mapping.
ont_file = '../data/training_files_av/ontology_ctg_av.txt'
ontology = Ontology.from_table(ont_file, clixo_format=True)
terms = ontology.term_2_gene.keys()

In [None]:
# Print the per-fold and average values returned by get_avg_cv_data for ML-210.
drugs = ['ML-210']
    
for i, drug in enumerate(drugs):
    
    corr_list, avg_corr = get_avg_cv_data(ont, dataset, zscore_method, drug)
    print(drug, corr_list, avg_corr)

In [None]:
# 3-class cross-validation summary for ML-210, passing fraction=0.5 to
# get_avg_cv_data_3class.
f_best = 0.5
drugs = ['ML-210']
    
# Print correlations, contingency table and odds ratio for each drug.
for i, drug in enumerate(drugs):
    corr_list, avg_corr, cont_table, odds_ratio, waterfall_plot  = get_avg_cv_data_3class(ont, dataset, zscore_method, 
                                                                                          drug, fraction=f_best, is_show=True)
    print(drug, avg_corr, corr_list)
    print(tabulate(cont_table, headers=['', 'True Sensitive', 'True Resistant'], tablefmt='grid'))
    print('Odds Ratio: ', odds_ratio)

In [None]:
#mut_vnn_results_3class_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

# Print the per-fold and average values returned by get_avg_cv_data_mut.
# `drugs` is whatever list the most recently executed cell left behind.
for i, drug in enumerate(drugs):
    corr_list, avg_corr = get_avg_cv_data_mut(ont, dataset, zscore_method, drug)
    print(drug, corr_list, avg_corr)