In [2]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from src.config import *
from tqdm import tqdm

# Filter LINCS raw data for receptors and ligands
- Read in receptor–ligand interactions (queried from OmniPath)
- Filter and save the subset of GCTX files that contains only receptors and ligands (for all perturbation types)

## Read in data

In [2]:
# Ligand-receptor interaction table; the first CSV column becomes the row index
lr_matrix = pd.read_csv(LIG_REC_MATRIX, index_col=0)

In [3]:
# Row labels are taken as the ligand set, column labels as the receptor set
ligands, receptors = set(lr_matrix.index), set(lr_matrix.columns)

In [4]:
# Signature metadata table read from the LINCS data directory
sig_info = pd.read_table(
    LINCS_DATA_DIR + 'siginfo_beta.txt',
    low_memory=False,
)

In [5]:
# Keep only the metadata columns listed here; everything else is dropped
cols = [
    'sig_id',
    'pert_type',
    'pert_id',
    'cmap_name',
    'pert_idose',
    'pert_itime',
    'cell_iname',
]
sig_info = sig_info.loc[:, cols]

# Functions

In [8]:
def create_siginfo_for_all_type_ligand_receptor_perturbations(pert_type, save = False, name = 'notprovided'):
    """Subset the signature metadata to ligand/receptor perturbations of one type.

    Keeps the rows of the notebook-level ``sig_info`` frame whose ``pert_type``
    equals ``pert_type`` and whose ``cmap_name`` is in ``receptors | ligands``,
    then adds two 0/1 indicator columns, ``ligand`` and ``receptor``, telling
    which of the two sets the ``cmap_name`` belongs to (both can be 1).

    Parameters
    ----------
    pert_type : str
        Value matched exactly against ``sig_info['pert_type']``.
    save : bool, default False
        If True, the result is written to
        ``{DATA_DIR}filtered_lincs_meta/filtered_{name}_info_of_receptor_ligand_pert.csv``
        and nothing is returned. If False, the frame is returned instead.
    name : str, default 'notprovided'
        Label inserted into the output file name; only used when ``save`` is True.

    Returns
    -------
    pandas.DataFrame or None
        The filtered, annotated metadata when ``save`` is False, otherwise None.
    """
    keep = (sig_info['pert_type'] == pert_type) & sig_info['cmap_name'].isin(receptors | ligands)
    # Explicit copy: the indicator columns are added to an independent frame,
    # not to a slice of sig_info (avoids pandas' SettingWithCopyWarning).
    sig_info_pert = sig_info.loc[keep].copy()
    # Vectorized membership tests replace the per-row apply/lambda.
    sig_info_pert['ligand'] = sig_info_pert['cmap_name'].isin(ligands).astype('int64')
    sig_info_pert['receptor'] = sig_info_pert['cmap_name'].isin(receptors).astype('int64')
    if save:
        sig_info_pert.to_csv(f'{DATA_DIR}filtered_lincs_meta/filtered_{name}_info_of_receptor_ligand_pert.csv')
        return None
    return sig_info_pert


# Filtering

### XPR

In [9]:
# pert_type 'trt_xpr'; the result is written to disk under the 'xpr' label
create_siginfo_for_all_type_ligand_receptor_perturbations(
    'trt_xpr',
    save=True,
    name='xpr',
)


### SH

In [10]:
# pert_type 'trt_sh.cgs'; the result is written to disk under the 'sh' label
create_siginfo_for_all_type_ligand_receptor_perturbations(
    'trt_sh.cgs',
    save=True,
    name='sh',
)


### OE

In [11]:
# pert_type 'trt_oe'; the result is written to disk under the 'oe' label
create_siginfo_for_all_type_ligand_receptor_perturbations(
    'trt_oe',
    save=True,
    name='oe',
)


### Ligs

In [12]:
# save=False: keep the frame in memory so it can be filtered further before it is written
lig_siginfo = create_siginfo_for_all_type_ligand_receptor_perturbations(
    'trt_lig',
    save=False,
    name='lig',
)

In [13]:
# Remove receptors (e.g. EGFR) from the ligand perturbation signatures, then save
lig_siginfo = lig_siginfo.loc[~lig_siginfo['cmap_name'].isin(receptors)]
lig_siginfo.to_csv(
    'data/filtered_lincs_meta/filtered_lig_info_of_receptor_ligand_pert.csv'
)

### Compound

In [14]:
# Prerequisite: run notebooks/create_compound_target_signed_metadata.ipynb before this section

In [15]:
# Signed compound -> receptor table; its pert_id column selects the compound signatures below
cmp_perturbations = pd.read_csv(
    'data/filtered_lincs_meta/filtered_coumpound_info_to_receptor_perturbation_signatures_signed.csv',
    index_col=0,
)

In [16]:
# Compound signatures: rows of sig_info with pert_type 'trt_cp' whose pert_id
# appears in cmp_perturbations. Every kept row is flagged ligand=0, receptor=1.
pert_type = 'trt_cp'
sig_info_CP = (
    sig_info.loc[
        (sig_info['pert_type'] == pert_type)
        & sig_info['pert_id'].isin(set(cmp_perturbations.pert_id))
    ]
    # assign() returns a new frame, so the constant columns are never written
    # onto a slice of sig_info (avoids pandas' SettingWithCopyWarning).
    .assign(ligand=0, receptor=1)
)
# Plain string literal: the original f-string had no placeholders.
sig_info_CP.to_csv('data/filtered_lincs_meta/filtered_cp_info_of_receptor_ligand_pert.csv')


# Save signatures based on filtered siginfo

In [4]:
from cmapPy.pandasGEXpress.parse import parse
from cmapPy.pandasGEXpress.write_gctx import write
from glob import glob


In [5]:
# gene_id (as string) -> gene_symbol for rows whose feature_space is 'landmark'
geneinfo = pd.read_table(LINCS_DATA_DIR + 'geneinfo_beta.txt')
landmark_genes = (
    geneinfo.loc[geneinfo['feature_space'] == 'landmark']
    .set_index('gene_id')['gene_symbol']
)
landmark_genes.index = landmark_genes.index.astype(str)


In [23]:
# gene_id -> gene_symbol for rows whose feature_space is 'landmark' or 'best inferred'
geneinfo = pd.read_table(LINCS_DATA_DIR + 'geneinfo_beta.txt')
inf_genes = (
    geneinfo.loc[geneinfo['feature_space'].isin(['landmark', 'best inferred'])]
    .set_index('gene_id')['gene_symbol']
)
inf_genes.index = inf_genes.index.astype(str)
# Same mapping as a plain dict, keyed by the gene ids converted to float
inf_map = pd.Series(
    inf_genes.values,
    index=inf_genes.index.astype(float),
).to_dict()

In [8]:
# Short perturbation label -> pert_type string.
# NOTE(review): 'lig' maps to 'trt_misc' and 'sh' to 'trt_sh' here, while the
# metadata filtering cells query 'trt_lig' and 'trt_sh.cgs' — confirm this is intended.
pert_types = dict(
    lig='trt_misc',
    oe='trt_oe',
    sh='trt_sh',
    xpr='trt_xpr',
    cp='trt_cp',
)

In [81]:
# Before saving for cp, run calculate_consensus_for_cp_inferred.py

import glob
import os  # imported explicitly; no visible cell imports it by name
from tqdm import tqdm

folder_path = 'data/lincs_consensus/inferred_genes_signatures/'
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

# Collect the per-file frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
consensus_frames = []
for file_path in tqdm(csv_files):
    # Transpose so the columns of each file become the rows of the combined table
    df = pd.read_csv(file_path, index_col = 0).T
    filename = os.path.basename(file_path)
    # The fourth '_'-separated token of the file name is used as the drug name
    # NOTE(review): breaks if a name earlier in the file name contains '_' — confirm naming scheme
    drugname = filename.split('_')[3]
    consensus_frames.append(df.rename(index=lambda x: f"{drugname}_{x}_cp"))

# pd.concat raises on an empty list; fall back to an empty frame as before
all_consensus_inf_cp = pd.concat(consensus_frames) if consensus_frames else pd.DataFrame()

output_filename = 'inf_cp_pert_cell_liana.csv'
# Written one level above folder_path
all_consensus_inf_cp.to_csv(folder_path + '../' + output_filename)
print(f"Concatenated DataFrame saved to {output_filename}")


  0%|          | 0/1184 [00:00<?, ?it/s]

100%|██████████| 1184/1184 [16:29<00:00,  1.20it/s]


Concatenated DataFrame saved to inf_cp_pert_cell_liana.csv
