# Selecting high quality receptors
- Uses matrix multiplications to filter out receptors
- The filtered consensus signature and design matrix are saved to `lincs_consensus/hq_newfilter/inf_all_pert_cell_liana_transp.csv` and `design_matrices/hq_newfilter/all_pert_binary_liana.csv`, respectively
- Filters out shRNA and CRISPR

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from src.config import *
from src.prediction_functions import *
from src.model_creation import *

In [3]:
# Load the benchmark results; later cells index this object as
# all_results[model][data][result_set]['rocauc'].
results_path = "results/benchmark_LINCS/results_liana_zscore2.json"
with open(results_path, "r") as results_file:
    all_results = json.load(results_file)

# Looking for HQ receptors

In [4]:
# Perturbation-type keys; they index the benchmark results and name the
# per-type input files read further down.
perturbations = [
    "cp",
    "lig",
    "oe",
    "xpr",
    "sh",
]

In [5]:
# Gather the name of every receptor that has a ROC AUC entry in any result set
all_receptor = set()
for model in perturbations:
    for data, result_sets in all_results[model].items():
        for results, metrics in result_sets.items():
            all_receptor.update(metrics['rocauc'].keys())

In [6]:
# Empty frame: three identifier columns followed by one column per receptor
count_columns = ['model', 'data', 'results', *all_receptor]
rocauc_count_df = pd.DataFrame(columns=count_columns)

In [7]:
# Binarise the ROC AUCs per model / data / result set:
#   negative_results -> 1 when the ROC AUC is below 0.4, else 0
#   positive_results -> 1 when the ROC AUC is above 0.6, else 0
# Result sets with any other name are left out of model_dict.
model_dict = {}
for model in perturbations:
    model_dict[model] = {}
    for data, result_sets in all_results[model].items():
        res_dict = {}
        for results, metrics in result_sets.items():
            rocaucs = metrics['rocauc']
            if results == 'negative_results':
                res_dict[results] = {
                    receptor: int(auc < 0.4) for receptor, auc in rocaucs.items()
                }
            elif results == 'positive_results':
                res_dict[results] = {
                    receptor: int(auc > 0.6) for receptor, auc in rocaucs.items()
                }
        model_dict[model][data] = res_dict

In [8]:
# Flatten model_dict into one row per (model, data, result set, receptor)
records = []
for model_name, per_data in model_dict.items():
    for data_name, per_result in per_data.items():
        for result_name, per_receptor in per_result.items():
            for receptor_name, flag in per_receptor.items():
                records.append(
                    (model_name, data_name, result_name, receptor_name, flag)
                )

flatten_results = pd.DataFrame.from_records(
    records,
    columns=['model', 'data', 'results', 'receptors', 'value'],
)

In [9]:
# Exclude receptors validated across the sh/xpr pair: rows with model 'xpr' on
# data 'sh' (and vice versa) get their value zeroed by change_value below.
# Work on a copy of the flattened results.
flatten_results_mod = flatten_results.copy()
def change_value(row):
    """Zero the ``value`` of rows that pair the 'sh' and 'xpr' perturbations.

    Parameters
    ----------
    row : pandas.Series
        One row of the flattened results; its ``model``, ``data`` and
        ``value`` entries are used.

    Returns
    -------
    pandas.Series
        The same ``row`` object, with ``value`` set to 0 when (model, data) is
        ('xpr', 'sh') or ('sh', 'xpr'); otherwise unchanged.
    """
    # Label-based access is used on purpose: attribute access (row.data) only
    # works while the label does not collide with a Series attribute.
    pair = (row['model'], row['data'])
    if pair in (('xpr', 'sh'), ('sh', 'xpr')):
        row['value'] = 0
    return row
# Run change_value on every row (axis = 1); the returned rows become the new flatten_results
flatten_results = flatten_results_mod.apply(change_value, axis = 1)


In [10]:
def fill_dataframe(result_row):
    """Count one flagged receptor in the module-level tally frames.

    When the row's ``value`` equals 1, adds 1 to ``receptor_df_model`` at
    (receptor, model) and to ``receptor_df_data`` at (receptor, data). Both
    frames are globals that must exist before this function is called. Rows
    with any other ``value`` are ignored.

    Parameters
    ----------
    result_row : pandas.Series
        One row of the flattened results; its ``model``, ``data``,
        ``receptors`` and ``value`` entries are used.

    Returns
    -------
    None
    """
    # Label-based access is used on purpose: attribute access (result_row.data)
    # only works while the label does not collide with a Series attribute.
    if result_row['value'] != 1:
        return
    receptor = result_row['receptors']
    receptor_df_model.loc[receptor, result_row['model']] += 1
    receptor_df_data.loc[receptor, result_row['data']] += 1

In [11]:
# Two zero-initialised tallies (receptor x perturbation type): one counted by
# the model column and one by the data column of each flagged row.
receptor_index = flatten_results['receptors'].unique()
pert_columns = list(set(flatten_results['model']) | set(flatten_results['data']))
receptor_df_model = pd.DataFrame(0, index=receptor_index, columns=pert_columns)
receptor_df_data = pd.DataFrame(0, index=receptor_index, columns=pert_columns)

# fill_dataframe updates both tallies in place; its return value is discarded
_x = flatten_results.apply(fill_dataframe, axis=1)

# Combined count per receptor and perturbation type
receptor_df = receptor_df_model + receptor_df_data

In [12]:
# Preview the first rows of the combined per-receptor counts
receptor_df.head()

Unnamed: 0,cp,sh,xpr,oe,lig
CALCRL,2,0,1,0,1
IGF1R,3,2,0,2,1
TEK,1,1,0,0,0
GCGR,2,0,2,0,0
AXL,1,0,0,0,1


# Create merged signature

In [13]:
def read_in_data_for_model_creation(pert_type, genes_filename = 'lm'):
    """Load the consensus signature CSV for one perturbation type.

    Reads ``data/lincs_consensus/{genes_filename}_{pert_type}_pert_cell_liana.csv``
    (relative to the working directory), using the first column as the index,
    and prints a short progress message.

    Parameters
    ----------
    pert_type : str
        Perturbation-type part of the file name.
    genes_filename : str, default 'lm'
        Prefix part of the file name.

    Returns
    -------
    pandas.DataFrame
        The parsed signature table.
    """
    print('Read in signature')
    signature_path = f'data/lincs_consensus/{genes_filename}_{pert_type}_pert_cell_liana.csv'
    return pd.read_csv(signature_path, index_col=0)

In [14]:
# Load the 'inf' signature of every perturbation type and tag each row label
# with its perturbation type.
data = {}
for pert_type in perturbations:
    print(pert_type, end = ': ')
    signature = read_in_data_for_model_creation(pert_type, genes_filename = 'inf')
    signature.index = signature.index + '_' + pert_type
    data[pert_type] = signature

cp: Read in signature
lig: Read in signature
oe: Read in signature
xpr: Read in signature
sh: Read in signature


In [15]:
# Drop the trailing '_cp' from cp row labels that already contain '_cp'.
# The pattern is a regular expression, so regex=True is passed explicitly:
# pandas 2.0 changed the default of str.replace to literal matching, under
# which this pattern would never match and the labels would stay unchanged.
data['cp'].index = data['cp'].index.str.replace(r'^(.*)_cp(.*)_cp$', r'\1_cp\2', regex=True)


  data['cp'].index = data['cp'].index.str.replace(r'^(.*)_cp(.*)_cp$', r'\1_cp\2')


In [21]:
# Load the existing high-quality design matrix (first CSV column as index)
hq_design_matrix_path = 'data/design_matrices/high_quality/all_pert_binary_liana.csv'
design_matrix = pd.read_csv(hq_design_matrix_path, index_col=0)

In [29]:
# Keep only the signature rows whose label also appears in design_matrix
known_labels = design_matrix.index
filtered_data = {}
for pert_type in perturbations:
    signature = data[pert_type]
    keep_rows = signature.index.isin(known_labels)
    filtered_data[pert_type] = signature[keep_rows]


In [38]:
# Load the binary design matrix of every perturbation type and tag each row
# label with its perturbation type.
design_matrices = {}
for pert_type in perturbations:
    print(pert_type, end =': ')
    print('Read in signature')
    matrix = pd.read_csv(
        f'{DATA_DIR}/design_matrices/{pert_type}_pert_binary_liana.csv',
        index_col=0,
    )
    matrix.index = matrix.index + '_' + pert_type
    design_matrices[pert_type] = matrix


cp: Read in signature
lig: Read in signature
oe: Read in signature
xpr: Read in signature
sh: Read in signature


In [39]:
def merge_signatures(data, design_matrices):
    """Stack the signature rows selected by each design matrix.

    For every key of ``data``, the rows of ``data[key]`` whose labels are the
    index of ``design_matrices[key]`` are selected (in that order); the
    selections are stacked in the iteration order of ``data``.

    Parameters
    ----------
    data : dict of pandas.DataFrame
        Signature tables keyed by perturbation type.
    design_matrices : dict of pandas.DataFrame
        Design matrices under the same keys; only their index is used.

    Returns
    -------
    pandas.DataFrame
        Float-valued frame of the stacked rows with an unnamed row index.
        An empty ``data`` yields an empty frame.

    Raises
    ------
    KeyError
        If a key of ``data`` is missing from ``design_matrices`` or a
        design-matrix label is missing from the matching signature table.
    """
    if not data:
        return pd.DataFrame(dtype='float')
    selected = [data[key].loc[design_matrices[key].index] for key in data]
    # One concat over all pieces: avoids re-copying the accumulated frame on
    # every iteration and avoids concatenating onto an empty placeholder.
    data_all = pd.concat(selected).astype('float')
    # Keep the row index unnamed whatever the input index names are
    # (Index.rename returns a new Index, so no input is modified).
    data_all.index = data_all.index.rename(None)
    return data_all

In [40]:
def merge_design_matrces(gene_list, pert_list, design_matrices):
    """Combine several design matrices into one integer matrix.

    Parameters
    ----------
    gene_list : list
        Column labels of the combined matrix.
    pert_list : list
        Row labels of the combined matrix.
    design_matrices : dict of pandas.DataFrame
        Matrices to copy in; each one's index and columns must be contained
        in ``pert_list`` and ``gene_list``.

    Returns
    -------
    pandas.DataFrame
        Integer frame of shape (len(pert_list), len(gene_list)). Cells not
        covered by any design matrix, and missing values inside one, are 0.
    """
    # Start from numeric zeros rather than an all-NaN object frame, so the
    # later fillna never has to downcast object-dtype columns.
    df = pd.DataFrame(0.0, index=pert_list, columns=gene_list)
    for matrix in design_matrices.values():
        df.loc[matrix.index, matrix.columns] = matrix
    # Missing entries copied in from an individual design matrix count as 0.
    return df.fillna(0).astype('int')

In [41]:
def merge_data(design_matrices, data):
    """Merge per-perturbation design matrices and signatures.

    Parameters
    ----------
    design_matrices : dict of pandas.DataFrame
        Design matrices keyed by perturbation type.
    data : dict of pandas.DataFrame
        Signature tables under the same keys.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(dms, data_all)``: the combined design matrix from
        ``merge_design_matrces`` and the stacked signatures from
        ``merge_signatures``.
    """
    # Receptor list: all design-matrix columns, de-duplicated in first-seen
    # order so the combined column order is the same on every run (a set
    # would order string labels differently between interpreter sessions).
    gene_list = list(dict.fromkeys(
        gene
        for matrix in design_matrices.values()
        for gene in matrix.columns
    ))
    # Perturbations (cell_pert_list): all design-matrix row labels, in order.
    pert_list = [
        pert
        for matrix in design_matrices.values()
        for pert in matrix.index
    ]
    print('Merge design matrices')
    dms = merge_design_matrces(gene_list, pert_list, design_matrices)
    print('Merge signatures')
    data_all = merge_signatures(data, design_matrices)
    return dms, data_all


In [42]:
# Merge the per-perturbation design matrices and signatures into two combined frames
designmatrix, gexdata = merge_data(design_matrices, data)

Merge design matrices
Merge signatures


In [46]:
# Write the merged signatures, transposed, to the high-quality consensus file
transposed_output_path = 'data/lincs_consensus/high_quality/inf_pert_cell_liana_transp.csv'
gexdata.transpose().to_csv(transposed_output_path)