In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import json
# import own functions
from src.config.common import *
from src.prediction_functions.MatrixMultiplicationMemoryEffectiveChunks import estimation_with_zscore_calculation_in_chunks

# Benchmarking created models against other perturbations within LINCS
- e.g. benchmark the shRNA model against compound perturbation data
- if a model cannot predict a receptor properly (ROCAUC > 0.6 for positive perturbations, < 0.4 for negative ones) for a perturbation type, that receptor will be excluded

## Functions

In [3]:
# Perturbation-type codes. They are interpolated into the file names of the signature,
# design and coefficient matrices, and every code is used once as the model and
# benchmarked against the data of all the other codes.
# NOTE(review): presumably the LINCS abbreviations (xpr = CRISPR knockout, sh = shRNA,
# oe = overexpression, lig = ligand, cp = compound) — confirm against the data docs.
perturbations = ['xpr', 'sh', 'oe', 'lig', 'cp']

In [4]:
def read_in_data(data_pert_type, model_pert_type, genes_filename='lm'):
    """Load the signatures of one perturbation type and the coefficients of one model.

    Parameters
    ----------
    data_pert_type : str
        Perturbation code that selects the consensus signature file.
    model_pert_type : str
        Perturbation code that selects the coefficient matrix file.
    genes_filename : str, default 'lm'
        Prefix of the signature file name.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(signature, coefficient_matrix)``; both are read with the first CSV
        column as index.
    """
    def _announce_and_load(message, csv_path):
        # Log the path first so a failing read can be traced in the cell output.
        print(message, csv_path)
        return pd.read_csv(csv_path, index_col=0)

    signature = _announce_and_load(
        'Read in signatures ',
        f'data/lincs_consensus/{genes_filename}_{data_pert_type}_pert_cell_liana.csv',
    )
    coefficient_matrix = _announce_and_load(
        'Read in coefficients matrix',
        f'data/coefficient_matrix/{model_pert_type}_pert_coef_liana.csv',
    )
    return signature, coefficient_matrix


In [5]:
def create_negativ_and_positive_binary(data_pert_type):
    """Split the signed design matrix of a perturbation type into two 0/1 matrices.

    The design matrix is read from
    ``data/design_matrices/{data_pert_type}_pert_binary_liana.csv``.

    Parameters
    ----------
    data_pert_type : str
        Perturbation code that selects the design matrix file.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(negative_binary, positive_binary)``.  The first marks the entries that
        were -1 with 1, the second keeps the entries that were 1; all other
        entries are 0 and columns without a marked entry are dropped.  A matrix
        is an empty DataFrame when the design matrix has no entry of that sign.

    Raises
    ------
    AssertionError
        If an intermediate matrix holds values other than the expected pair.
    """
    print("Create positive value matrices for ROC curve calculation")
    design = pd.read_csv(f'data/design_matrices/{data_pert_type}_pert_binary_liana.csv', index_col=0)

    if (design == -1).any().any():
        only_negative = design.replace({1: 0})
        assert set(np.unique(only_negative.values)) == {-1, 0}
        # keep only the columns that contain at least one -1
        has_negative_entry = (only_negative.sum() != 0).values
        # flip the sign so the perturbed samples become the positive class
        negative_binary = only_negative.loc[:, has_negative_entry].replace({-1: 1})
        assert set(np.unique(negative_binary.values)) == {0, 1}
    else:
        print('There are only positive perturbations, no negative matrix')
        negative_binary = pd.DataFrame()

    if (design == 1).any().any():
        only_positive = design.replace({-1: 0})
        assert set(np.unique(only_positive.values)) == {0, 1}
        # keep only the columns that contain at least one 1
        has_positive_entry = (only_positive.sum() != 0).values
        positive_binary = only_positive.loc[:, has_positive_entry]
    else:
        print('There are only negative perturbations, no positive matrix')
        positive_binary = pd.DataFrame()

    return negative_binary, positive_binary

In [6]:
def calculate_rocaucs(binary, signature, activities):
    """Compute one ROC curve and its AUC per receptor.

    Parameters
    ----------
    binary : pandas.DataFrame
        0/1 label matrix.  Its index selects the rows of ``activities`` that are
        evaluated; its columns are receptors.
    signature : pandas.DataFrame
        Not used by this function; kept so existing callers keep working.
    activities : pandas.DataFrame
        Scores used to rank the samples, with receptors as columns.  Must
        contain every index label of ``binary``.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``(fpr, tpr, roc_auc)`` keyed by receptor, in the column order of
        ``binary``.  ``fpr`` and ``tpr`` hold plain lists, ``roc_auc`` the area
        under each curve.
    """
    print('Calculate ROC curves')
    # Receptors present in both frames, in the column order of `binary`.
    # Iterating a set of strings is not reproducible between interpreter runs,
    # which made the key order of the returned (and later saved) dictionaries
    # change from run to run.
    scored_receptors = set(activities.columns)
    receptors = [
        receptor for receptor in dict.fromkeys(binary.columns)
        if receptor in scored_receptors
    ]
    filtered_activities = activities.loc[binary.index, receptors]
    filtered_binary = binary.loc[filtered_activities.index, receptors]

    fpr, tpr, roc_auc = {}, {}, {}
    for receptor in receptors:
        false_positive_rate, true_positive_rate, _ = roc_curve(
            filtered_binary.loc[:, receptor], filtered_activities.loc[:, receptor]
        )
        # store lists instead of arrays so the results can be dumped to JSON
        fpr[receptor] = false_positive_rate.tolist()
        tpr[receptor] = true_positive_rate.tolist()
        roc_auc[receptor] = auc(fpr[receptor], tpr[receptor])
    return fpr, tpr, roc_auc


In [7]:
def calculate_roccurves(data_pert_type, model_pert_type):
    """Benchmark the model of one perturbation type on the data of another one.

    Loads the signatures and the coefficient matrix, estimates the activities
    with 1000 permutations and computes ROC curves separately for the negative
    and the positive label matrix.

    Parameters
    ----------
    data_pert_type : str
        Perturbation code of the data set that is predicted.
    model_pert_type : str
        Perturbation code of the model that is applied.

    Returns
    -------
    dict
        Up to two entries, ``'negative_results'`` and ``'positive_results'``;
        each maps ``'fpr'``, ``'tpr'`` and ``'rocauc'`` to a per-receptor dict.
        An entry is missing when the corresponding label matrix is empty.
    """
    print('Calculate ROC curves for receptor activity estimation.')
    print('Model: ', model_pert_type, ' data: ', data_pert_type)
    signature, coefficient_matrix = read_in_data(data_pert_type, model_pert_type)
    activities = estimation_with_zscore_calculation_in_chunks(signature, coefficient_matrix.T, number_of_permutation=1000)
    # NOTE(review): 978 is presumably the number of landmark genes, so this guards
    # against activities that still have genes as columns — confirm.
    assert activities.shape[1] != 978
    negative_binary, positive_binary = create_negativ_and_positive_binary(data_pert_type)

    results = {}
    label_matrices = (
        ('negative_results', negative_binary),
        ('positive_results', positive_binary),
    )
    for result_key, label_matrix in label_matrices:
        if len(label_matrix) == 0:
            # no perturbation of this sign in the data set
            continue
        fpr, tpr, rocauc = calculate_rocaucs(label_matrix, signature, activities)
        results[result_key] = {'fpr': fpr, 'tpr': tpr, 'rocauc': rocauc}
    print('---- Done ----')
    return results

## Read in data, estimate receptor activities and save results to results_liana_zscore2.json

In [9]:
# Apply every model to the data sets of all *other* perturbation types.
# Result layout: all_results[model][data] -> dict returned by calculate_roccurves.
all_results = {}
for model in perturbations:
    dataset_results = {}
    for data in perturbations:
        # a model is never benchmarked on its own perturbation type
        if data != model:
            results = calculate_roccurves(model_pert_type=model, data_pert_type=data)
            dataset_results[data] = results
    all_results[model] = dataset_results

Calculate ROC curves for receptor activity estimation.
Model:  xpr  data:  sh
Read in signatures  data/lincs_consensus/lm_sh_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/xpr_pert_coef_liana.csv
Number of samples: 2592
Number of chunks: 26
Number of permutations: 1000


100%|██████████| 26/26 [01:42<00:00,  3.95s/it]


Create positive value matrices for ROC curve calculation
There are only negative perturbations, no positive matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  xpr  data:  oe
Read in signatures  data/lincs_consensus/lm_oe_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/xpr_pert_coef_liana.csv
Number of samples: 1670
Number of chunks: 17
Number of permutations: 1000


100%|██████████| 17/17 [00:59<00:00,  3.49s/it]


Create positive value matrices for ROC curve calculation
There are only positive perturbations, no negative matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  xpr  data:  lig
Read in signatures  data/lincs_consensus/lm_lig_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/xpr_pert_coef_liana.csv
Number of samples: 1870
Number of chunks: 19
Number of permutations: 1000


100%|██████████| 19/19 [01:04<00:00,  3.38s/it]


Create positive value matrices for ROC curve calculation
There are only positive perturbations, no negative matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  xpr  data:  cp
Read in signatures  data/lincs_consensus/lm_cp_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/xpr_pert_coef_liana.csv
Number of samples: 26907
Number of chunks: 270
Number of permutations: 1000


100%|██████████| 270/270 [16:47<00:00,  3.73s/it]


Create positive value matrices for ROC curve calculation
Calculate ROC curves
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  sh  data:  xpr
Read in signatures  data/lincs_consensus/lm_xpr_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/sh_pert_coef_liana.csv
Number of samples: 5950
Number of chunks: 60
Number of permutations: 1000


100%|██████████| 60/60 [03:15<00:00,  3.26s/it]


Create positive value matrices for ROC curve calculation
There are only negative perturbations, no positive matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  sh  data:  oe
Read in signatures  data/lincs_consensus/lm_oe_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/sh_pert_coef_liana.csv
Number of samples: 1670
Number of chunks: 17
Number of permutations: 1000


100%|██████████| 17/17 [00:57<00:00,  3.37s/it]


Create positive value matrices for ROC curve calculation
There are only positive perturbations, no negative matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  sh  data:  lig
Read in signatures  data/lincs_consensus/lm_lig_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/sh_pert_coef_liana.csv
Number of samples: 1870
Number of chunks: 19
Number of permutations: 1000


100%|██████████| 19/19 [01:01<00:00,  3.22s/it]


Create positive value matrices for ROC curve calculation
There are only positive perturbations, no negative matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  sh  data:  cp
Read in signatures  data/lincs_consensus/lm_cp_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/sh_pert_coef_liana.csv
Number of samples: 26907
Number of chunks: 270
Number of permutations: 1000


100%|██████████| 270/270 [15:17<00:00,  3.40s/it]


Create positive value matrices for ROC curve calculation
Calculate ROC curves
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  oe  data:  xpr
Read in signatures  data/lincs_consensus/lm_xpr_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/oe_pert_coef_liana.csv
Number of samples: 5950
Number of chunks: 60
Number of permutations: 1000


100%|██████████| 60/60 [02:51<00:00,  2.86s/it]


Create positive value matrices for ROC curve calculation
There are only negative perturbations, no positive matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  oe  data:  sh
Read in signatures  data/lincs_consensus/lm_sh_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/oe_pert_coef_liana.csv
Number of samples: 2592
Number of chunks: 26
Number of permutations: 1000


100%|██████████| 26/26 [01:21<00:00,  3.12s/it]


Create positive value matrices for ROC curve calculation
There are only negative perturbations, no positive matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  oe  data:  lig
Read in signatures  data/lincs_consensus/lm_lig_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/oe_pert_coef_liana.csv
Number of samples: 1870
Number of chunks: 19
Number of permutations: 1000


100%|██████████| 19/19 [00:59<00:00,  3.14s/it]


Create positive value matrices for ROC curve calculation
There are only positive perturbations, no negative matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  oe  data:  cp
Read in signatures  data/lincs_consensus/lm_cp_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/oe_pert_coef_liana.csv
Number of samples: 26907
Number of chunks: 270
Number of permutations: 1000


100%|██████████| 270/270 [11:15<00:00,  2.50s/it]


Create positive value matrices for ROC curve calculation
Calculate ROC curves
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  lig  data:  xpr
Read in signatures  data/lincs_consensus/lm_xpr_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/lig_pert_coef_liana.csv
Number of samples: 5950
Number of chunks: 60
Number of permutations: 1000


100%|██████████| 60/60 [02:30<00:00,  2.51s/it]


Create positive value matrices for ROC curve calculation
There are only negative perturbations, no positive matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  lig  data:  sh
Read in signatures  data/lincs_consensus/lm_sh_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/lig_pert_coef_liana.csv
Number of samples: 2592
Number of chunks: 26
Number of permutations: 1000


100%|██████████| 26/26 [01:06<00:00,  2.56s/it]


Create positive value matrices for ROC curve calculation
There are only negative perturbations, no positive matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  lig  data:  oe
Read in signatures  data/lincs_consensus/lm_oe_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/lig_pert_coef_liana.csv
Number of samples: 1670
Number of chunks: 17
Number of permutations: 1000


100%|██████████| 17/17 [00:42<00:00,  2.48s/it]


Create positive value matrices for ROC curve calculation
There are only positive perturbations, no negative matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  lig  data:  cp
Read in signatures  data/lincs_consensus/lm_cp_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/lig_pert_coef_liana.csv
Number of samples: 26907
Number of chunks: 270
Number of permutations: 1000


100%|██████████| 270/270 [13:10<00:00,  2.93s/it]


Create positive value matrices for ROC curve calculation
Calculate ROC curves
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  cp  data:  xpr
Read in signatures  data/lincs_consensus/lm_xpr_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/cp_pert_coef_liana.csv
Number of samples: 5950
Number of chunks: 60
Number of permutations: 1000


100%|██████████| 60/60 [03:14<00:00,  3.24s/it]


Create positive value matrices for ROC curve calculation
There are only negative perturbations, no positive matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  cp  data:  sh
Read in signatures  data/lincs_consensus/lm_sh_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/cp_pert_coef_liana.csv
Number of samples: 2592
Number of chunks: 26
Number of permutations: 1000


100%|██████████| 26/26 [01:25<00:00,  3.28s/it]


Create positive value matrices for ROC curve calculation
There are only negative perturbations, no positive matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  cp  data:  oe
Read in signatures  data/lincs_consensus/lm_oe_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/cp_pert_coef_liana.csv
Number of samples: 1670
Number of chunks: 17
Number of permutations: 1000


100%|██████████| 17/17 [00:55<00:00,  3.25s/it]


Create positive value matrices for ROC curve calculation
There are only positive perturbations, no negative matrix
Calculate ROC curves
---- Done ----
Calculate ROC curves for receptor activity estimation.
Model:  cp  data:  lig
Read in signatures  data/lincs_consensus/lm_lig_pert_cell_liana.csv
Read in coefficients matrix data/coefficient_matrix/cp_pert_coef_liana.csv
Number of samples: 1870
Number of chunks: 19
Number of permutations: 1000


100%|██████████| 19/19 [01:00<00:00,  3.17s/it]

Create positive value matrices for ROC curve calculation
There are only positive perturbations, no negative matrix
Calculate ROC curves
---- Done ----





In [17]:
# Persist the benchmark so the plots below can be redone without recomputing it.
with open("results/benchmark_LINCS/results_liana_zscore2.json", "w") as f:
    json.dump(all_results, f)

# Read in results and show plots

In [18]:
# Reload the stored benchmark: all_results[model][data][sign] -> fpr / tpr / rocauc.
with open('results/benchmark_LINCS/results_liana_zscore2.json') as f:
    all_results = json.load(f)

In [19]:
# Set of every receptor that has a ROC AUC for any model / data set / sign.
all_receptor = {
    receptor
    for model_name in perturbations
    for per_sign in all_results[model_name].values()
    for sign_results in per_sign.values()
    for receptor in sign_results['rocauc'].keys()
}

In [20]:
# Empty frame with the key columns 'model', 'data', 'results' followed by one column
# per receptor (the receptor order follows the iteration order of the set).
# NOTE(review): nothing in the visible notebook fills or reads this frame — confirm it is still needed.
rocauc_count_df = pd.DataFrame(columns = ['model', 'data', 'results'] + list(all_receptor))

In [21]:
# Turn every ROC AUC into a 0/1 flag with the same nesting as all_results:
# negative perturbations pass with ROCAUC < 0.4, positive ones with ROCAUC > 0.6.
model_dict = {}
for model in perturbations:
    data_dict = {}
    for data, per_sign in all_results[model].items():
        res_dict = {}
        for results, sign_results in per_sign.items():
            rocaucs = sign_results['rocauc']
            if results == 'negative_results':
                res_dict[results] = {k: int(v < 0.4) for k, v in rocaucs.items()}
            elif results == 'positive_results':
                res_dict[results] = {k: int(v > 0.6) for k, v in rocaucs.items()}
        data_dict[data] = res_dict
    model_dict[model] = data_dict

In [22]:
# Long format: one row per (model, data, results, receptor) with its 0/1 flag.
flat_records = []
for model_key, per_data in model_dict.items():
    for data_key, per_sign in per_data.items():
        for sign_key, flags in per_sign.items():
            for receptor_key, flag in flags.items():
                flat_records.append((model_key, data_key, sign_key, receptor_key, flag))

flatten_results = pd.DataFrame.from_records(
    flat_records,
    columns=['model', 'data', 'results', 'receptors', 'value'],
)


In [23]:
# Number of distinct receptors that pass the threshold in at least one benchmark.
len(flatten_results.loc[flatten_results['value'] > 0, 'receptors'].unique())

255

# Plots

In [24]:
def calculate_table_for_plot(model):
    """Collect the ROC AUC values of one model into a long-format table for plotting.

    Reads the module-level ``all_results`` dictionary and prints the number of
    receptors per data set and sign.

    Parameters
    ----------
    model : str
        Key of the model in ``all_results``.

    Returns
    -------
    pandas.DataFrame
        One row per receptor, data set and sign with the columns ``receptors``,
        ``ROCAUC`` (float), ``data_type`` and ``positive_or_negative_values``
        (``'negative'`` or ``'positive'``).  Rows are ordered by data set, with
        the negative block before the positive one, and the index is a fresh
        0..n-1 range.
    """
    columns = ['receptors', 'ROCAUC', 'data_type', 'positive_or_negative_values']
    sign_labels = (('negative_results', 'negative'), ('positive_results', 'positive'))

    # Collect the pieces and concatenate once: growing a frame with pd.concat inside
    # the loop, starting from an empty frame, is deprecated in recent pandas and
    # leaves repeated index labels in the result.
    frames = []
    for data, per_sign in all_results[model].items():
        for result_key, label in sign_labels:
            if result_key not in per_sign:
                continue
            rocaucs = per_sign[result_key]['rocauc']
            if not rocaucs:
                # nothing to add; also keeps empty frames out of the concat
                continue
            frames.append(pd.DataFrame(
                {
                    'receptors': list(rocaucs.keys()),
                    'ROCAUC': list(rocaucs.values()),
                    'data_type': data,
                    'positive_or_negative_values': label,
                },
                columns=columns,
            ))

    if frames:
        values = pd.concat(frames, ignore_index=True)
    else:
        values = pd.DataFrame(columns=columns)
    values['ROCAUC'] = values['ROCAUC'].astype('float')
    print(values.groupby(['data_type', 'positive_or_negative_values']).count()['receptors'])
    return values

In [25]:
def plot_ROCAUC_barplot(model):
    """Show a bar plot of the mean ROC AUC of one model per data set and sign.

    Bars are grouped by data set and coloured by sign; the horizontal line at
    0.5 marks the ROC AUC of a random predictor.

    Parameters
    ----------
    model : str
        Key of the model in ``all_results``.
    """
    table = calculate_table_for_plot(model)
    # NOTE(review): `ci` was deprecated in seaborn 0.12 in favour of `errorbar` —
    # revisit when the seaborn version is upgraded.
    bar_options = dict(
        x='data_type',
        y='ROCAUC',
        hue='positive_or_negative_values',
        estimator=np.mean,
        ci='sd',
        capsize=.1,
        palette='Pastel2',
    )
    ax = sns.barplot(data=table, **bar_options)
    ax.axhline(0.5)
    ax.set_title('ROCAUC values of model: '+model.upper()+' for different datasets\nand negative or positive values')
    plt.show()

In [27]:
def plot_ROCAUC_violinplot(model, save = False):
    """Show split violins of the ROC AUC distribution of one model per data set.

    Each violin is split by sign and the horizontal line at 0.5 marks the ROC
    AUC of a random predictor.

    Parameters
    ----------
    model : str
        Key of the model in ``all_results``; also used as the output file name.
    save : bool, default False
        When true, the figure is written as PNG and PDF to
        ``figures/benchmark/against_different_perturbation_types/``.
    """
    fig, ax = plt.subplots(figsize = (8,6))
    table = calculate_table_for_plot(model)
    ax = sns.violinplot(
        data=table,
        x='data_type',
        y='ROCAUC',
        hue='positive_or_negative_values',
        split=True,
        palette='Pastel2',
        inner='quart',
        ax=ax,
    )
    ax.axhline(0.5)
    ax.set_title('ROCAUC values of model: '+model.upper()+' for different datasets\nand negative or positive values\nlandmark genes')
    ax.set_xlabel('Data types', size = 16)
    ax.set_ylabel('ROCAUC', size = 16)
    ax.tick_params(axis='both', which='major', labelsize=14)
    if save:
        for extension in ('png', 'pdf'):
            plt.savefig(f'figures/benchmark/against_different_perturbation_types/{model}.{extension}', bbox_inches = 'tight')
    plt.show()
    # reset the current figure so the next call starts from a clean state
    plt.clf()

In [1]:
# for model in perturbations:
#     plot_ROCAUC_violinplot(model, save = False)