In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
from src.config import *
from tqdm import tqdm
import json


In [4]:
from src.model_creation.CreateModelFunctions import fit_linear_model_and_get_coefficients_by_receptor as fit_model
from src.prediction_functions.MatrixMultiplicationMemoryEffectiveChunks import estimation_with_zscore_calculation_in_chunks as estimate_receptor_activity


In [6]:
# Load the two input tables; in both files the first CSV column becomes the row index.
# The split cell selects rows from both frames with the same labels, so the two
# indexes are expected to share labels.
# NOTE(review): presumably signatures (lincs_consensus) and the matching binary
# design matrix (prior_knowledge) -- inferred from the file paths, please confirm.
lincs_consensus = pd.read_csv(
    'data/lincs_consensus/high_quality/lm_all_pert_cell_liana.csv',
    index_col=0,
)
prior_knowledge = pd.read_csv(
    'data/design_matrices/high_quality/all_pert_binary_liana.csv',
    index_col=0,
)

# Split receptor activities

In [7]:
# Draw 5 random half/half partitions of the row labels of `prior_knowledge`.
# For every partition, the same labels are used to slice both `prior_knowledge`
# and `lincs_consensus`, so each stored pair stays row-aligned.
#
# Result layout (keys are the split numbers 0..4):
#   train_sets[k] = (prior_knowledge rows, lincs_consensus rows) for the first half
#   test_sets[k]  = (prior_knowledge rows, lincs_consensus rows) for the second half
np.random.seed(0)  # 42 -- NOTE(review): looks like a previously used seed; confirm

train_sets = {}
test_sets = {}

for split_id in range(5):
    # New random ordering of the labels; successive draws continue the seeded stream.
    permuted_labels = np.random.permutation(prior_knowledge.index)

    # Integer division: with an odd number of rows the extra one lands in the second half.
    half = len(permuted_labels) // 2
    first_half = permuted_labels[:half]
    second_half = permuted_labels[half:]

    # .copy() keeps the stored subsets independent of the source frames.
    train_sets[split_id] = (
        prior_knowledge.loc[first_half].copy(),
        lincs_consensus.loc[first_half].copy(),
    )
    test_sets[split_id] = (
        prior_knowledge.loc[second_half].copy(),
        lincs_consensus.loc[second_half].copy(),
    )

In [22]:
# Persist which row labels went into each half of every split, so the exact
# partitions can be recovered later without re-running the random draw.
#
# The split keys are taken from the dicts themselves instead of a second
# hard-coded `range(5)`, so this cell cannot drift out of sync with the cell
# that builds `train_sets` / `test_sets`.
#
# Gotcha: JSON object keys are always strings, so the integer split keys are
# written as "0", "1", ... and come back as strings when the files are loaded.
train_indices_dict = {
    split_id: subsets[0].index.tolist() for split_id, subsets in train_sets.items()
}
test_indices_dict = {
    split_id: subsets[0].index.tolist() for split_id, subsets in test_sets.items()
}

with open('results/confidence/train_indices_229_s0.json', 'w') as f:
    json.dump(train_indices_dict, f, indent=4)
with open('results/confidence/test_indices_229_s0.json', 'w') as f:
    json.dump(test_indices_dict, f, indent=4)


In [23]:
# Fit one model per random split, using only the training half of that split.
# Slow cell: roughly 100-110 min for the 5 fits.
coeff_matrices = {}
for split_id, (design_matrix, signatures) in tqdm(train_sets.items()):
    # Each train_sets entry is stored as (design matrix, signatures);
    # fit_model receives them in the opposite order.
    coeff_matrices[split_id] = fit_model(signatures, design_matrix)

100%|██████████| 5/5 [1:40:36<00:00, 1207.22s/it]


In [24]:
# Write the fitted coefficient matrix of every split to its own CSV file,
# with the split number as the file-name suffix.
for i, coeff_matrix in coeff_matrices.items():
    coeff_matrix.to_csv(f'results/confidence/coeff_matrix_split_229_s0_{i}.csv')

In [25]:
# Apply each split's fitted coefficients to the held-out half of the same split.
# Runtime: about 10 min in total for the 5 splits.
receptor_activities = {}
for split_id, coeff_matrix in tqdm(coeff_matrices.items()):
    # Second element of a test_sets entry is the lincs_consensus subset.
    held_out_signatures = test_sets[split_id][1]
    # The estimator is given the transposed coefficient matrix, cast to float.
    coefficients = coeff_matrix.T.astype(float)
    receptor_activities[split_id] = estimate_receptor_activity(
        held_out_signatures,
        coefficients,
        number_of_permutation=1000,
        chunk_size=300,
    )

  0%|          | 0/5 [00:00<?, ?it/s]

Number of samples: 7232
Number of chunks: 25
Number of permutations: 1000


100%|██████████| 25/25 [01:43<00:00,  4.15s/it]
 20%|██        | 1/5 [01:44<06:57, 104.27s/it]

Number of samples: 7232
Number of chunks: 25
Number of permutations: 1000


100%|██████████| 25/25 [01:38<00:00,  3.92s/it]
 40%|████      | 2/5 [03:22<05:02, 100.81s/it]

Number of samples: 7232
Number of chunks: 25
Number of permutations: 1000


100%|██████████| 25/25 [01:42<00:00,  4.11s/it]
 60%|██████    | 3/5 [05:05<03:23, 101.90s/it]

Number of samples: 7232
Number of chunks: 25
Number of permutations: 1000


100%|██████████| 25/25 [01:38<00:00,  3.93s/it]
 80%|████████  | 4/5 [06:44<01:40, 100.63s/it]

Number of samples: 7232
Number of chunks: 25
Number of permutations: 1000


100%|██████████| 25/25 [01:37<00:00,  3.92s/it]
100%|██████████| 5/5 [08:22<00:00, 100.58s/it]


In [26]:
# Write the estimated activities of every split to its own CSV file,
# with the split number as the file-name suffix.
for i, activities in receptor_activities.items():
    activities.to_csv(f'results/confidence/receptor_activities_split_229_s0_{i}.csv')