In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.preprocessing import scale
from sklearn.preprocessing import robust_scale
from sklearn import metrics
from tabulate import tabulate

# Turn off pandas' chained-assignment warning for the whole notebook.
# This only hides the warning; it does not change what a chained assignment does.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Global seaborn styling for every figure created below.
sns.set()
sns.set_style("white")
# Second set_style call: switches to the "ticks" style and enlarges the major ticks.
sns.set_style("ticks", {"xtick.major.size":8, "ytick.major.size":8})
# NOTE(review): the return value of axes_style() is discarded here, so this line
# presumably does not apply "whitegrid" — confirm whether it can be removed.
sns.axes_style("whitegrid")
sns.set_palette("muted")
# Last expression of the cell; its value (the "muted" palette) becomes the cell output.
sns.color_palette("muted")

In [None]:
plt.rcParams['svg.fonttype'] = 'none'

# Three font-size tiers shared by all figures in this notebook.
SMALL_SIZE = 12
MEDIUM_SIZE = 16
BIGGER_SIZE = 22

plt.rcParams.update({
    'font.size': SMALL_SIZE,          # default text size
    'axes.titlesize': SMALL_SIZE,     # axes title
    'axes.labelsize': MEDIUM_SIZE,    # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,    # x tick labels
    'ytick.labelsize': SMALL_SIZE,    # y tick labels
    'legend.fontsize': SMALL_SIZE,    # legend entries
    'figure.titlesize': BIGGER_SIZE,  # figure-level title
})

In [None]:
#Separating data for each drug/cell

def get_pos_map(obj_list, test_df, col):
    """Group the index labels of ``test_df`` by the value stored in column ``col``.

    Args:
        obj_list: Values of interest; each becomes a key of the result.
        test_df: DataFrame to scan row by row.
        col: Name of the column whose value selects the bucket.

    Returns:
        dict mapping every element of ``obj_list`` to the list of index labels
        (in row order) whose ``col`` value equals it. Values of ``col`` that are
        not in ``obj_list`` are ignored; unmatched keys keep an empty list.
    """
    positions = dict((obj, []) for obj in obj_list)
    for label, row in test_df.iterrows():
        bucket = positions.get(row[col])
        if bucket is not None:
            bucket.append(label)
    return positions

In [None]:
#Arrange the obj_list in the descending order of the scores

def sort_scores(obj_list, scores):
    """Pair each object with its score and order the pairs from highest to lowest score.

    Args:
        obj_list: Objects to rank (used as dictionary keys).
        scores: Indexable collection where ``scores[i]`` belongs to the i-th object.

    Returns:
        dict of ``{object: score}`` whose insertion order is descending by score.
    """
    score_by_obj = {obj: scores[idx] for idx, obj in enumerate(obj_list)}
    ranked_pairs = sorted(score_by_obj.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked_pairs)

In [None]:
def create_confusion_matrix(sens, res, pred_sens, pred_res):
    """Print a 2x2 table of predicted vs. true response class, plus the odds ratio when defined.

    Args:
        sens: Names of the truly sensitive samples.
        res: Names of the truly resistant samples.
        pred_sens: Names of the samples predicted sensitive.
        pred_res: Names of the samples predicted resistant.

    Returns:
        None. The table (and, if both off-diagonal cells are non-zero, the odds
        ratio) is written to stdout.
    """
    # Sets give constant-time membership tests; the counts still iterate over the
    # prediction lists, so duplicated predictions are counted as before.
    true_sens = set(sens)
    true_res = set(res)

    # Named after (predicted class, true class) to avoid ambiguous FP/FN labels.
    sens_as_sens = sum(1 for c in pred_sens if c in true_sens)
    sens_as_res = sum(1 for c in pred_sens if c in true_res)
    res_as_sens = sum(1 for c in pred_res if c in true_sens)
    res_as_res = sum(1 for c in pred_res if c in true_res)

    cont_table = [['Pred. Sensitive', sens_as_sens, sens_as_res],
                  ['Pred. Resistant', res_as_sens, res_as_res]]
    print(tabulate(cont_table, headers=['', 'True Sensitive', 'True Resistant'], tablefmt='grid'))
    # The odds ratio is undefined when either off-diagonal cell is empty.
    if sens_as_res > 0 and res_as_sens > 0:
        print('Odds Ratio: ', (sens_as_sens * res_as_res) / (sens_as_res * res_as_sens))

In [None]:
def create_true_auc_waterfall(pred_sens_df, pred_res_df, median_auc, stddev_auc):
    """Print the confusion matrix and draw a waterfall of true AUC for the extreme samples.

    A sample is truly sensitive when ``auc <= median_auc - stddev_auc`` and truly
    resistant when ``auc >= median_auc + stddev_auc``; samples in between are
    left out of both the matrix and the plot. Bars are sorted by descending AUC
    and coloured by the predicted class (blue = predicted sensitive,
    red = predicted resistant).

    Args:
        pred_sens_df: Rows predicted sensitive; needs 'cell_line' and 'auc' columns.
        pred_res_df: Rows predicted resistant; same columns.
        median_auc: Centre of the true-AUC distribution.
        stddev_auc: Half-width of the band treated as neither sensitive nor resistant.

    Returns:
        The matplotlib Figure holding the waterfall plot.
    """
    all_data = pd.concat([pred_sens_df, pred_res_df], axis=0, ignore_index=True)

    # Boolean masks replace the per-row DataFrame building; they also cope with
    # an empty sensitive or resistant group, which used to crash in pd.concat.
    is_sens = all_data['auc'] <= (median_auc - stddev_auc)
    # "~is_sens" mirrors the original if/elif: a row is never in both groups.
    is_res = ~is_sens & (all_data['auc'] >= (median_auc + stddev_auc))
    sens_cells = list(all_data.loc[is_sens, 'cell_line'])
    res_cells = list(all_data.loc[is_res, 'cell_line'])

    pred_sens_cells = list(pred_sens_df['cell_line'])
    pred_res_cells = list(pred_res_df['cell_line'])
    create_confusion_matrix(sens_cells, res_cells, pred_sens_cells, pred_res_cells)

    fig, ax = plt.subplots(figsize=(8, 8))

    # Membership sets are built once instead of on every loop iteration.
    extreme_cells = set(sens_cells) | set(res_cells)
    pred_sens_set = set(pred_sens_cells)
    pred_res_set = set(pred_res_cells)

    ranked = all_data.sort_values(by='auc', ignore_index=True, ascending=False)

    data_list = []
    for cell, auc in zip(ranked['cell_line'], ranked['auc']):
        cell_name = str(cell)
        if cell_name not in extreme_cells:
            continue
        if cell_name in pred_sens_set:
            data_list.append((cell_name, auc, 'blue'))
        elif cell_name in pred_res_set:
            data_list.append((cell_name, auc, 'red'))

    ax.bar([d[0] for d in data_list], [d[1] for d in data_list], color=[d[2] for d in data_list], width=1, edgecolor='none')
    ax.set_xticks([])
    top_auc = np.max(all_data['auc'])
    # With no rows (or only NaN AUCs) there is no finite upper limit to set.
    if np.isfinite(top_auc):
        ax.set_ylim((0, top_auc + 0.02))
    ax.set_xlabel('Cell lines')
    ax.set_ylabel('True AUC')
    plt.show()
    return fig

In [None]:
def plot_drug_performance(drug_corr_map, cut_off):
    """Bar-plot one score per drug, red at or above ``cut_off`` and blue below it.

    Args:
        drug_corr_map: Mapping of drug name to score; bars follow its iteration order
            within each colour group.
        cut_off: Threshold separating red (``>=``) from blue bars.

    Returns:
        The matplotlib Figure. The fraction of red drugs is also printed.
    """
    fig, ax = plt.subplots(figsize=(8, 8))

    above = [(drug, corr) for drug, corr in drug_corr_map.items() if corr >= cut_off]
    below = [(drug, corr) for drug, corr in drug_corr_map.items() if not corr >= cut_off]

    ratio = float(len(above)) / float(len(drug_corr_map.keys()))
    print('Red ratio = ' + str(ratio))

    for group, colour in ((above, 'red'), (below, 'blue')):
        names = [drug for drug, _ in group]
        values = [corr for _, corr in group]
        ax.bar(names, values, color=colour, width=1.0, alpha=0.9)

    ax.set_xticks([])
    ax.set_xlabel('Drugs')
    ax.set_ylabel('Performance\nSpearman ρ (Predicted vs. Actual)')
    plt.show()
    return fig

In [None]:
def create_drug_performance_plot(drugs, drug_corr_list, top=False, cut_off=0.1):
    """Rank drugs by score and draw the per-drug performance bar plot.

    Args:
        drugs: Drug names.
        drug_corr_list: Scores aligned with ``drugs``.
        top: When true, delegate to ``plot_top_drug_performance``.
        cut_off: Threshold forwarded to the plotting function.

    Returns:
        The Figure produced by the selected plotting function.
    """
    ranked_scores = sort_scores(drugs, drug_corr_list)

    if top:
        # NOTE(review): plot_top_drug_performance is not defined in the visible part
        # of this notebook — confirm it exists before calling with top=True.
        return plot_top_drug_performance(ranked_scores, cut_off)

    figure = plot_drug_performance(ranked_scores, cut_off)
    print('Median spearman rho:', np.median(list(ranked_scores.values())))
    return figure

In [None]:
def create_scatter_plot(X, Y, x_title, y_title):
    """Scatter ``Y`` against ``X`` on equal axes with an identity line, and print a t-test p-value.

    Positions where either value is NaN are removed from both sequences first.

    Args:
        X: Sequence of numbers for the x axis.
        Y: Sequence of numbers for the y axis, aligned with ``X``.
        x_title: x-axis label (also used as the DataFrame column name).
        y_title: y-axis label (also used as the DataFrame column name).

    Returns:
        The matplotlib Figure.
    """
    # A set keeps the "is this position dropped?" check constant-time.
    nan_positions = {i for i, x in enumerate(X) if math.isnan(x)}
    nan_positions.update(i for i, y in enumerate(Y) if math.isnan(y))
    X = [x for i, x in enumerate(X) if i not in nan_positions]
    Y = [y for i, y in enumerate(Y) if i not in nan_positions]

    scatter_df = pd.DataFrame({x_title: X, y_title: Y})

    fig, ax = plt.subplots(figsize=(6,6))
    sns.scatterplot(data=scatter_df, x=x_title, y=y_title, s=35, ax=ax)

    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    # Use one common range for both axes so the identity line runs corner to corner.
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]
    # Draw on this Axes explicitly rather than on whatever pyplot considers current.
    ax.plot(lims, lims, 'k--', alpha=0.75, zorder=0)
    ax.set_aspect('equal')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')

    # NOTE(review): this is an independent two-sample t-test; the call sites pass
    # per-drug averages from two models, which look paired — confirm the intended test.
    print('t-test p-value:', stats.ttest_ind(X, Y)[1])

    return fig

In [None]:
def get_boxplot(df, x_title, y_title):
    """Draw one box per column of ``df`` (outliers hidden) and return the Figure.

    Args:
        df: DataFrame whose columns are the groups to compare.
        x_title: x-axis label.
        y_title: y-axis label.
    """
    fig, ax = plt.subplots(figsize=(9,9))

    column_names = list(df.columns)
    ax.boxplot(df, showfliers=False, widths=0.5, patch_artist=True)
    ax.set_xticklabels(column_names)

    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)

    plt.show()
    return fig

In [None]:
def create_bar_chart(x, data, x_title, y_title):
    """Draw a bar chart with one bar per entry of ``x`` and return the Figure.

    Args:
        x: Category labels, shown as x tick labels.
        data: Bar heights aligned with ``x``.
        x_title: x-axis label.
        y_title: y-axis label.
    """
    fig = plt.figure(figsize=(9, 9))
    ax = fig.add_subplot(111)

    # Bars sit at integer positions 0..n-1; the labels are attached afterwards.
    x_pos = [position for position, _ in enumerate(x)]
    ax.bar(x_pos, data, align='center', alpha=1.0)
    plt.xticks(x_pos, x)

    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    plt.show()
    return fig

In [None]:
def get_violinplot(df, x_title, y_title):
    """Draw one violin per column of ``df`` (medians marked) and return the Figure.

    Args:
        df: DataFrame whose columns are the groups to compare.
        x_title: x-axis label.
        y_title: y-axis label.
    """
    fig, ax = plt.subplots(figsize=(6,6))
    ax.violinplot(df, showmedians=True)

    column_names = list(df.columns)
    # violinplot draws at positions 1..n by default but leaves the tick locations
    # alone, so pin the ticks there before labelling; otherwise the labels land on
    # whatever ticks the automatic locator picked.
    ax.set_xticks(list(range(1, len(column_names) + 1)))
    ax.set_xticklabels(column_names, rotation=45)
    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)

    plt.show()

    return fig

In [None]:
def create_line_curve(X, Y, x_title, y_title, x_lim, y_lim):
    """Plot ``Y`` against ``X`` as a line within fixed axis limits and return the Figure.

    Args:
        X: x values.
        Y: y values aligned with ``X``.
        x_title: x-axis label.
        y_title: y-axis label.
        x_lim: Limits passed to ``set_xlim``.
        y_lim: Limits passed to ``set_ylim``.
    """
    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(111)

    ax.plot(X, Y)
    ax.grid(False)

    # Limits first, then labels, one axis at a time.
    ax.set_xlim(x_lim)
    ax.set_xlabel(x_title)
    ax.set_ylim(y_lim)
    ax.set_ylabel(y_title)

    plt.show()
    return fig

In [None]:
def all_models_scatterplot(performance_df, x_title, y_title):
    """Overlay each model's per-drug score on one Axes, with drugs on the y axis.

    Args:
        performance_df: DataFrame with a 'Drug' column and one score column per model
            ('Random Forest', 'Multi layer perceptron', 'ElasticNet', 'DCoDR').
        x_title: x-axis label.
        y_title: y-axis label.

    Returns:
        The matplotlib Figure (not shown explicitly by this function).
    """
    fig, ax = plt.subplots(figsize=(6, 14))

    # (score column, extra marker options); drawn in this order so DCoDR ends up on top.
    series_specs = [
        ('Random Forest', {'color': 'DarkGreen'}),
        ('Multi layer perceptron', {'color': 'Green'}),
        ('ElasticNet', {'color': 'LightGreen'}),
        ('DCoDR', {'color': 'red', 's': 30}),
    ]
    for model_name, marker_options in series_specs:
        performance_df.plot(kind='scatter', x=model_name, y='Drug', label=model_name, ax=ax, **marker_options)

    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    return fig

In [None]:
def calc_std_vals(df, zscore_method):
    """Compute a per-drug centre and scale of 'auc' for later standardisation.

    Args:
        df: DataFrame with at least 'smiles' and 'auc' columns.
        zscore_method: 'zscore' (mean / sample standard deviation), 'robustz'
            (median / inter-quartile range); any other value yields centre 0.0
            and scale 1.0, i.e. no rescaling.

    Returns:
        DataFrame with columns ['smiles', 'center', 'scale'], one row per distinct
        'smiles' value (empty, with the same columns, when ``df`` has no rows).
        A scale that is NaN or 0.0 is replaced by 1.0 so the later division is safe.
    """
    records = []
    # Group by the bare column name: grouping by the list ['smiles'] makes newer
    # pandas yield 1-tuples as group keys, which would no longer match 'smiles'
    # values when the result is merged back.
    for name, group in df.groupby('smiles')['auc']:
        if zscore_method == 'zscore':
            center = group.mean()
            spread = group.std()
        elif zscore_method == 'robustz':
            center = group.median()
            spread = group.quantile(0.75) - group.quantile(0.25)
        else:
            center = 0.0
            spread = 1.0
        # Single-sample groups give NaN std; constant groups give 0.0.
        if math.isnan(spread) or spread == 0.0:
            spread = 1.0
        records.append((name, center, spread))

    # Build the frame once; this also returns a well-formed empty frame for empty input.
    return pd.DataFrame(records, columns=['smiles', 'center', 'scale'])

In [None]:
def standardize_data(df, std_df, zscore_method):
    """Standardise 'auc' per drug using the centre/scale table ``std_df``.

    Args:
        df: DataFrame with 'cell_line', 'smiles' and 'auc' columns.
        std_df: DataFrame with 'smiles', 'center' and 'scale' columns.
        zscore_method: Name given to the output column holding
            ``(auc - center) / scale``.

    Returns:
        New DataFrame with columns ['cell_line', 'smiles', zscore_method]; rows
        without a matching 'smiles' in ``std_df`` get NaN (left join).
    """
    joined = df.merge(std_df, how="left", on=['smiles'], sort=False)
    standardized = (joined['auc'] - joined['center']) / joined['scale']
    joined[zscore_method] = standardized
    return joined.loc[:, ['cell_line', 'smiles', zscore_method]]

In [None]:
def normalize_auc(train_std_df, test_df, zscore_method):
    """Standardise test AUCs, preferring the training centre/scale for drugs seen in training.

    Centre and scale are first computed from ``test_df`` itself; for every drug
    ('smiles') that also appears in ``train_std_df`` they are replaced by the
    training values, so test data is normalised consistently with training.

    Args:
        train_std_df: DataFrame with 'smiles', 'center' and 'scale' from the training set.
        test_df: DataFrame with 'cell_line', 'smiles' and 'auc' columns.
        zscore_method: Standardisation method name, forwarded to ``calc_std_vals``
            and used as the output column name.

    Returns:
        DataFrame with columns ['cell_line', 'smiles', zscore_method].
    """
    test_std_df = calc_std_vals(test_df, zscore_method)
    for i, row in test_std_df.iterrows():
        train_entry = train_std_df[train_std_df['smiles'] == row['smiles']]
        if not train_entry.empty:
            # Take the first match explicitly: float() on a Series is deprecated
            # and fails outright when a drug has more than one training row.
            test_std_df.loc[i, 'center'] = float(train_entry['center'].iloc[0])
            test_std_df.loc[i, 'scale'] = float(train_entry['scale'].iloc[0])

    test_df = standardize_data(test_df, test_std_df, zscore_method)
    return test_df

In [None]:
def print_best_params(modeldir):
    """Print the (up to) two lines that follow the first 'Params:' line of the training log.

    Args:
        modeldir: Directory containing 'train.log'.

    Raises:
        OSError: If the log file cannot be opened.
    """
    log_file = modeldir + "/train.log"
    with open(log_file, 'r') as fi:
        logs = fi.readlines()
    for i, log in enumerate(logs):
        if 'Params:' in log:
            # Slicing tolerates a log that ends right after 'Params:' (or one line
            # later) instead of raising IndexError.
            for line in logs[i+1:i+3]:
                print(line.strip())
            break

In [None]:
def get_avg_cv_data(ont, dataset, zscore_method, drug, fold_size=5):
    """Pearson correlation between true and predicted AUC for each CV fold, plus their mean.

    Folds whose prediction file is missing or unreadable are skipped.

    Args:
        ont: Ontology tag used in the model directory name.
        dataset: Dataset tag used in file and directory names.
        zscore_method: Normalisation tag used in the model directory name.
        drug: Drug name used in file and directory names.
        fold_size: Number of cross-validation folds (numbered from 1).

    Returns:
        (fold_corr_list, avg_corr): correlations of the folds that were loaded, in
        fold order, and their mean. ``avg_corr`` is NaN when no fold was loaded.
    """
    fold_corr_list = []
    for i in range(1, fold_size+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../models/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(i)
        pred_file = modeldir + '/predict.txt'
        try:
            pred = np.loadtxt(pred_file)
        except (OSError, ValueError):
            # Missing or malformed predictions: skip the fold, but let anything
            # else (e.g. KeyboardInterrupt) propagate.
            continue

        fold_corr_list.append(stats.pearsonr(test_df['auc'], pred)[0])

    # Average over the folds that were actually scored; dividing by fold_size would
    # count every skipped fold as a correlation of zero.
    if fold_corr_list:
        avg_corr = sum(fold_corr_list) / len(fold_corr_list)
    else:
        avg_corr = float('nan')
    return fold_corr_list, avg_corr

In [None]:
def get_filtered_data(test_df, fraction):
    """Keep the lowest- and highest-predicted ``fraction`` of samples and plot their true AUC.

    Rows are ranked by 'pred'; the first ``int(n * fraction)`` rows are treated as
    predicted sensitive and the last ``int(n * fraction)`` as predicted resistant.
    Cell-line names get an occurrence counter appended ("X" -> "X1", "X2", ...)
    so repeated names stay distinct. The input frame is left untouched.

    Args:
        test_df: DataFrame with 'cell_line', 'auc' and 'pred' columns.
        fraction: Share of rows taken from each end of the ranking.

    Returns:
        (filtered_df, fig): the two selected groups concatenated (sensitive first),
        and the Figure returned by ``create_true_auc_waterfall``.
    """
    stddev_auc = np.std(np.array(test_df['auc']))
    median_auc = np.median(np.array(test_df['auc']))

    # Work on a sorted copy so the caller's DataFrame is neither reordered nor renamed.
    ranked_df = test_df.sort_values(by='pred', ignore_index=True)

    # Append a running per-name counter, counted in ranking order.
    names = ranked_df['cell_line'].map(str)
    occurrence = names.groupby(names).cumcount() + 1
    ranked_df['cell_line'] = names + occurrence.map(str)

    data_size = len(ranked_df)
    class_size = int(data_size * fraction)

    pred_sens_df = ranked_df.iloc[ : class_size]
    pred_res_df = ranked_df.iloc[data_size - class_size : data_size]
    filtered_df = pd.concat([pred_sens_df, pred_res_df], axis=0, ignore_index=True)

    # Median and SD come from the full test set, not only from the selected rows.
    fig = create_true_auc_waterfall(pred_sens_df, pred_res_df, median_auc, stddev_auc)

    return filtered_df, fig

In [None]:
def get_avg_cv_data_3class(ont, dataset, zscore_method, drug, fold_size=5, fraction=0.16):
    """Per-fold Pearson correlation on the extreme-prediction subset, plus a pooled waterfall plot.

    For every fold the predictions are attached to the test set, the top and
    bottom ``fraction`` of samples by prediction are selected through
    ``get_filtered_data``, and true vs. predicted AUC are correlated on that
    subset. All loaded folds are then pooled and filtered once more to produce
    the waterfall figure. Folds with missing or unreadable predictions are skipped.

    Args:
        ont: Ontology tag used in the model directory name.
        dataset: Dataset tag used in file and directory names.
        zscore_method: Normalisation tag used in the model directory name.
        drug: Drug name used in file and directory names.
        fold_size: Number of cross-validation folds (numbered from 1).
        fraction: Share of samples taken from each end of the prediction ranking.

    Returns:
        (fold_corr_list, avg_corr, waterfall_plot). ``avg_corr`` is the mean over
        the loaded folds (NaN if none); ``waterfall_plot`` is None if no fold loaded.
    """
    all_test = []
    fold_corr_list = []
    for i in range(1, fold_size+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../models/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(i)
        pred_file = modeldir + '/predict.txt'
        try:
            pred = np.loadtxt(pred_file)
        except (OSError, ValueError):
            # Missing or malformed predictions: skip this fold only.
            continue

        test_df['pred'] = pred
        filtered_df, _ = get_filtered_data(test_df, fraction)

        fold_corr_list.append(stats.pearsonr(filtered_df['auc'], filtered_df['pred'])[0])
        all_test.append(test_df)

    # pd.concat refuses an empty list, so only pool when something was loaded.
    waterfall_plot = None
    if all_test:
        all_test_df = pd.concat(all_test, axis=0, ignore_index=True)
        _, waterfall_plot = get_filtered_data(all_test_df, fraction)

    # Mean over the folds actually scored; dividing by fold_size would treat
    # skipped folds as zero correlation.
    if fold_corr_list:
        avg_corr = sum(fold_corr_list) / len(fold_corr_list)
    else:
        avg_corr = float('nan')
    return fold_corr_list, avg_corr, waterfall_plot

In [None]:
def get_avg_cv_data_3class_alternate(method, dataset, drug, fold_size=5, fraction=0.16):
    """Per-fold Pearson correlation on the extreme-prediction subset for a baseline model.

    Same procedure as the 3-class evaluation of the main model, but predictions
    are read from '../models/<method>/predict_<dataset>_<drug>_<fold>.txt' and an
    undefined (NaN) fold correlation is recorded as 0.0. Folds with missing or
    unreadable predictions are skipped.

    Args:
        method: Baseline model directory name under '../models/'.
        dataset: Dataset tag used in file names.
        drug: Drug name used in file names.
        fold_size: Number of cross-validation folds (numbered from 1).
        fraction: Share of samples taken from each end of the prediction ranking.

    Returns:
        (fold_corr_list, avg_corr): correlations of the loaded folds and their
        mean (NaN when no fold was loaded).
    """
    fold_corr_list = []
    for i in range(1, fold_size+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../models/' + method
        pred_file = modeldir + '/predict' + '_' + dataset + '_' + drug + '_' + str(i) + '.txt'
        try:
            pred = np.loadtxt(pred_file)
        except (OSError, ValueError):
            # Missing or malformed predictions: skip this fold only.
            continue

        test_df['pred'] = pred
        filtered_df, _ = get_filtered_data(test_df, fraction)

        corr = stats.pearsonr(filtered_df['auc'], filtered_df['pred'])[0]
        # A constant prediction vector has no defined correlation; score it as 0.
        if math.isnan(corr):
            corr = 0.0
        fold_corr_list.append(corr)

    # Mean over the folds actually scored; dividing by fold_size would treat
    # skipped folds as zero correlation.
    if fold_corr_list:
        avg_corr = sum(fold_corr_list) / len(fold_corr_list)
    else:
        avg_corr = float('nan')
    return fold_corr_list, avg_corr

In [None]:
def get_avg_cv_data_3class_mut(ont, dataset, zscore_method, drug, fold_size=5, fraction=0.16):
    """Per-fold Pearson correlation on the extreme-prediction subset for the '../../nest_drugcell' models.

    Identical in procedure to the 3-class evaluation of the main model, except
    that predictions are read from '../../nest_drugcell/models/...'. The pooled
    filtering step is still run for its printed table and plot, but its result
    is not returned. Folds with missing or unreadable predictions are skipped.

    Args:
        ont: Ontology tag used in the model directory name.
        dataset: Dataset tag used in file and directory names.
        zscore_method: Normalisation tag used in the model directory name.
        drug: Drug name used in file and directory names.
        fold_size: Number of cross-validation folds (numbered from 1).
        fraction: Share of samples taken from each end of the prediction ranking.

    Returns:
        (fold_corr_list, avg_corr): correlations of the loaded folds and their
        mean (NaN when no fold was loaded).
    """
    all_test = []
    fold_corr_list = []
    for i in range(1, fold_size+1):

        test_file = '../data/training_files_av/' + str(i) + '_test_' + dataset + '_' + drug + '.txt'
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

        modeldir = '../../nest_drugcell/models/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(i)
        pred_file = modeldir + '/predict.txt'
        try:
            pred = np.loadtxt(pred_file)
        except (OSError, ValueError):
            # Missing or malformed predictions: skip this fold only.
            continue

        test_df['pred'] = pred
        filtered_df, _ = get_filtered_data(test_df, fraction)

        fold_corr_list.append(stats.pearsonr(filtered_df['auc'], filtered_df['pred'])[0])
        all_test.append(test_df)

    # pd.concat refuses an empty list, so only pool when something was loaded.
    if all_test:
        all_test_df = pd.concat(all_test, axis=0, ignore_index=True)
        get_filtered_data(all_test_df, fraction)

    # Mean over the folds actually scored; dividing by fold_size would treat
    # skipped folds as zero correlation.
    if fold_corr_list:
        avg_corr = sum(fold_corr_list) / len(fold_corr_list)
    else:
        avg_corr = float('nan')
    return fold_corr_list, avg_corr

In [None]:
def get_best_avg_corr(drugs, ont, dataset, zscore_method):
    """Scan the selection fraction from 0.1 to 0.5 and return the one with the best mean correlation.

    For each candidate fraction the 3-class average correlation is computed for
    every drug and averaged across drugs; the scan is printed and plotted.

    Args:
        drugs: Non-empty collection of drug names.
        ont: Ontology tag forwarded to ``get_avg_cv_data_3class``.
        dataset: Dataset tag forwarded to ``get_avg_cv_data_3class``.
        zscore_method: Normalisation tag forwarded to ``get_avg_cv_data_3class``.

    Returns:
        The fraction with the highest across-drug average (the largest such
        fraction on ties), or None if every average was NaN.

    Raises:
        ValueError: If ``drugs`` is empty.
    """
    if len(drugs) == 0:
        raise ValueError('drugs must contain at least one drug name')

    f_min = 0.1
    f_max = 0.5
    f_step = 0.05
    # Derive each candidate from an integer step count; repeatedly adding 0.05 to a
    # float drifts and makes the "<= f_max" end condition fragile.
    n_steps = int(round((f_max - f_min) / f_step))
    fractions = [round(f_min + k * f_step, 10) for k in range(n_steps + 1)]

    # Start below any real correlation so a best fraction is found even when all
    # averages are negative.
    best_avg = float('-inf')
    f_best = None
    param_list = []
    for f in fractions:

        all_avg_corr = 0.0
        for drug in drugs:
            # get_avg_cv_data_3class returns three values; the figure is not needed here.
            _, avg_corr, _ = get_avg_cv_data_3class(ont, dataset, zscore_method, drug, fraction=f)
            all_avg_corr += avg_corr

        all_avg_corr /= len(drugs)
        if best_avg <= all_avg_corr:
            best_avg = all_avg_corr
            f_best = f

        param_list.append((f, all_avg_corr))
        print(f, all_avg_corr)

    create_line_curve([p[0] for p in param_list], [p[1] for p in param_list], 'Cut_off', 'Avg. Correlation', (f_min, f_max), (0.0, 0.3))
    return f_best

In [None]:
# Run configuration shared by the cells below; these tags are spliced into the
# test-file and model-directory paths by the get_avg_cv_data* functions.
dataset = 'av'
zscore_method = 'auc'
ont = 'ctg'

# Drug names are read from a header-less file; the column is named 'D' only so it can be extracted.
drugs = list(pd.read_csv('../data/training_files_av/drugname_av.txt', header=None, names=['D'])['D'])
#drugs = ['Palbociclib', 'Trametinib']

In [None]:
# Per-drug cross-validation results of the main model on the full test sets.
vnn_results_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

# Drugs whose average correlation is NaN are collected here and dropped from `drugs` below.
to_remove = []
for i, drug in enumerate(drugs):

    corr_list, avg_corr = get_avg_cv_data(ont, dataset, zscore_method, drug)
    if math.isnan(avg_corr):
        to_remove.append(drug)
        continue
    print(drug, corr_list, avg_corr)

    # Assign with a single .loc[row, column]; the chained form .loc[i][col] = v
    # writes to an intermediate object and is not guaranteed to reach the frame.
    vnn_results_df.loc[i, 'Drug'] = drug
    vnn_results_df.loc[i, 'Average'] = avg_corr
    # corr_list holds one entry per fold that was scored and may be shorter than 5.
    for k, corr in enumerate(corr_list[:5]):
        vnn_results_df.loc[i, 'Fold_'+str(k+1)] = corr

# NOTE: rebinds the notebook-wide `drugs`; rows of the removed drugs stay empty in vnn_results_df.
drugs = [d for d in drugs if d not in to_remove]

In [None]:
# Per-drug results of the main model on the extreme-prediction (3-class) subset.
vnn_results_3class_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

#f_best = get_best_avg_corr(drugs, ont, dataset, zscore_method)
f_best=0.16

for i, drug in enumerate(drugs):
    corr_list, avg_corr, _ = get_avg_cv_data_3class(ont, dataset, zscore_method, drug, fraction=f_best)
    print(drug, corr_list, avg_corr)

    # Assign with a single .loc[row, column]; the chained form .loc[i][col] = v
    # writes to an intermediate object and is not guaranteed to reach the frame.
    vnn_results_3class_df.loc[i, 'Drug'] = drug
    vnn_results_3class_df.loc[i, 'Average'] = avg_corr
    # corr_list holds one entry per fold that was scored and may be shorter than 5.
    for k, corr in enumerate(corr_list[:5]):
        vnn_results_3class_df.loc[i, 'Fold_'+str(k+1)] = corr

In [None]:
# Per-drug 3-class results of the ElasticNet baseline.
method = 'elastic_net'

elasticnet_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):
    corr_list, avg_corr = get_avg_cv_data_3class_alternate(method, dataset, drug, fraction=f_best)
    print(drug, corr_list, avg_corr)

    # Assign with a single .loc[row, column]; the chained form .loc[i][col] = v
    # writes to an intermediate object and is not guaranteed to reach the frame.
    elasticnet_df.loc[i, 'Drug'] = drug
    elasticnet_df.loc[i, 'Average'] = avg_corr
    # corr_list holds one entry per fold that was scored and may be shorter than 5.
    for k, corr in enumerate(corr_list[:5]):
        elasticnet_df.loc[i, 'Fold_'+str(k+1)] = corr

In [None]:
# Per-drug average correlation: main model (x) against the ElasticNet baseline (y).
vnn_en_fig = create_scatter_plot(list(vnn_results_3class_df['Average']), list(elasticnet_df['Average']), 'D-CoDR', 'ElasticNet')

In [None]:
# Per-drug 3-class results of the random-forest baseline.
method = 'random_forest'

randomforest_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):
    corr_list, avg_corr = get_avg_cv_data_3class_alternate(method, dataset, drug, fraction=f_best)
    print(drug, corr_list, avg_corr)

    # Assign with a single .loc[row, column]; the chained form .loc[i][col] = v
    # writes to an intermediate object and is not guaranteed to reach the frame.
    randomforest_df.loc[i, 'Drug'] = drug
    randomforest_df.loc[i, 'Average'] = avg_corr
    # corr_list holds one entry per fold that was scored and may be shorter than 5.
    for k, corr in enumerate(corr_list[:5]):
        randomforest_df.loc[i, 'Fold_'+str(k+1)] = corr

In [None]:
# Per-drug average correlation: main model (x) against the random-forest baseline (y).
vnn_rf_fig = create_scatter_plot(list(vnn_results_3class_df['Average']), list(randomforest_df['Average']), 'D-CoDR', 'Random Forest')

In [None]:
# Per-drug 3-class results of the multi-layer-perceptron baseline.
method = 'mlp'

mlp_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):
    corr_list, avg_corr = get_avg_cv_data_3class_alternate(method, dataset, drug, fraction=f_best)
    print(drug, corr_list, avg_corr)

    # Assign with a single .loc[row, column]; the chained form .loc[i][col] = v
    # writes to an intermediate object and is not guaranteed to reach the frame.
    mlp_df.loc[i, 'Drug'] = drug
    mlp_df.loc[i, 'Average'] = avg_corr
    # corr_list holds one entry per fold that was scored and may be shorter than 5.
    for k, corr in enumerate(corr_list[:5]):
        mlp_df.loc[i, 'Fold_'+str(k+1)] = corr

In [None]:
# Per-drug average correlation: main model (x) against the multi-layer-perceptron baseline (y).
vnn_mlp_fig = create_scatter_plot(list(vnn_results_3class_df['Average']), list(mlp_df['Average']), 'D-CoDR', 'Multi-layer Perceptron')

In [None]:
# Per-drug 3-class results of the models stored under '../../nest_drugcell'.
mut_vnn_results_3class_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):
    corr_list, avg_corr = get_avg_cv_data_3class_mut(ont, dataset, zscore_method, drug, fraction=f_best)
    print(drug, corr_list, avg_corr)

    # Assign with a single .loc[row, column]; the chained form .loc[i][col] = v
    # writes to an intermediate object and is not guaranteed to reach the frame.
    mut_vnn_results_3class_df.loc[i, 'Drug'] = drug
    mut_vnn_results_3class_df.loc[i, 'Average'] = avg_corr
    # corr_list holds one entry per fold that was scored and may be shorter than 5.
    for k, corr in enumerate(corr_list[:5]):
        mut_vnn_results_3class_df.loc[i, 'Fold_'+str(k+1)] = corr

In [None]:
# Per-drug average correlation of the two model variants; axis titles name the input features of each.
vnn_comp_fig = create_scatter_plot(list(mut_vnn_results_3class_df['Average']), list(vnn_results_3class_df['Average']), 
                                   'Pearson Correlation (Mutation Only)', 'Pearson Correlation (Mutation + Copy Number)')

In [None]:
# Export scatter plots as SVG; only the model-variant comparison is currently saved,
# the three baseline exports are disabled.
#vnn_en_fig.savefig('../plots/elasticnet_scatterplot_2022-02-26.svg')
#vnn_rf_fig.savefig('../plots/randomforest_scatterplot_2022-02-26.svg')
#vnn_mlp_fig.savefig('../plots/mlp_scatterplot_2022-02-26.svg')
vnn_comp_fig.savefig('../plots/vnn_comparison_scatterplot_2022-03-02.svg')

In [None]:
# One row per drug: the drug name followed by each model's average correlation.
performance_df = pd.concat(
    [
        vnn_results_3class_df[['Drug', 'Average']],
        mlp_df['Average'],
        randomforest_df['Average'],
        elasticnet_df['Average'],
    ],
    axis=1,
    ignore_index=True,
)

# ignore_index leaves positional column labels, so name the columns explicitly.
performance_df.columns = ['Drug', 'DCoDR', 'Multi layer perceptron', 'Random Forest', 'ElasticNet']
# Order drugs by the main model's score.
performance_df = performance_df.sort_values(by=['DCoDR'], ignore_index=True)

In [None]:
# Scores of all four models per drug, one row of markers per drug.
x_title = 'Pearson Correlation'
y_title = 'Drug'
all_models_fig = all_models_scatterplot(performance_df, x_title, y_title)

In [None]:
# Export the all-models comparison as SVG.
all_models_fig.savefig('../plots/baseline_performance_comparison_2022-03-02.svg')

In [None]:
# One column of per-drug average correlations per model; column names become the box labels.
boxplot_df = pd.DataFrame({
    "D-CoDR": vnn_results_3class_df['Average'], 
    "ANN": mlp_df['Average'], 
    "RandomForest": randomforest_df['Average'], 
    "ElasticNet": elasticnet_df['Average']
})

In [None]:
# Distribution of per-drug average correlation for each model (x label intentionally empty).
boxplot = get_boxplot(boxplot_df, "", "Pearson Correlation")

In [None]:
# Export the model-comparison box plot as SVG.
boxplot.savefig('../plots/boxplot_comparison_2022-02-28.svg')

In [None]:
# Waterfall plot(s) for selected drugs using the 'ont' models.
f_best = 0.16
# NOTE: this rebinds the notebook-wide `drugs` list used by the earlier cells.
drugs = ['Palbociclib']

for drug in drugs:
    corr_list, avg_corr, waterfall_plot  = get_avg_cv_data_3class(ont, dataset, zscore_method, drug, fraction=f_best)
    print(drug, avg_corr, corr_list)

    # Save inside the loop so every listed drug gets its own file; saving after the
    # loop would keep only the last drug's figure and rely on the leaked loop variable.
    if waterfall_plot is not None:
        waterfall_plot.savefig('../plots/waterfall_' + drug + '_2022-03-02.svg')

In [None]:
# Waterfall plot(s) for selected drugs using the 'ctg_go' models.
f_best = 0.16
# NOTE: this rebinds the notebook-wide `drugs` list used by the earlier cells.
drugs = ['Palbociclib']

for drug in drugs:
    corr_list, avg_corr, waterfall_plot_go  = get_avg_cv_data_3class('ctg_go', dataset, zscore_method, drug, fraction=f_best)
    print(drug, avg_corr, corr_list)

    # Save inside the loop so every listed drug gets its own file; saving after the
    # loop would keep only the last drug's figure and rely on the leaked loop variable.
    if waterfall_plot_go is not None:
        waterfall_plot_go.savefig('../plots/waterfall_go_' + drug + '_2022-03-02.svg')