In [1]:
import os
import csv
import re
import pandas as pd
import pyarrow.parquet as pq
from joblib import Parallel, delayed
from tqdm import tqdm

In [3]:
# Project root: two directory levels above the notebook's working directory
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))


def get_data_path(folders, fname):
    """Build a normalized path to a file inside the repo.

    Parameters
    ----------
    folders : iterable of str
        Directory names, in order, below ``BASE_DIR``.
    fname : str
        File name appended after ``folders``. An empty string yields the
        directory path itself (``os.path.normpath`` strips the trailing separator).

    Returns
    -------
    str
        ``BASE_DIR`` joined with ``folders`` and ``fname``, normalized.
    """
    return os.path.normpath(os.path.join(BASE_DIR, *folders, fname))


# Expression inputs (raw and z-scored)
expression_files_path = get_data_path(['output', 'processed_DepMap22Q4'], 'expression_data.csv')
zexpression_files_path = get_data_path(['output', 'processed_DepMap22Q4'], 'zexpression_data.csv')

# Interaction-score input
combined_interaction_score_path = get_data_path(['input', 'PPI'], 'combined_interaction_scores.parquet')

In [4]:
# DepMap 22Q4 data, downloaded from https://ndownloader.figshare.com/files/34990036
# Both tables are read the same way: first CSV column becomes the row index.
gene_expression, gene_z_expression = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (expression_files_path, zexpression_files_path)
)

In [5]:
# Remove the 'cell_name' column from both tables so only the remaining columns are kept
gene_expression = gene_expression.drop(columns=['cell_name'])
gene_z_expression = gene_z_expression.drop(columns=['cell_name'])

In [6]:
# Transpose so the former column labels become the row index,
# cast those labels to int64, and order the rows by label.
gene_expression = gene_expression.T
gene_expression = gene_expression.set_axis(
    gene_expression.index.astype('int64'), axis=0
).sort_index()
gene_expression.head(3)

Unnamed: 0,ACH-001113,ACH-001289,ACH-001339,ACH-001538,ACH-000242,ACH-000708,ACH-000327,ACH-000233,ACH-000461,ACH-000705,...,ACH-001578,ACH-000036,ACH-000973,ACH-001128,ACH-000750,ACH-000285,ACH-002669,ACH-001858,ACH-001997,ACH-000052
1,3.970854,2.114367,2.627607,0.0,0.097611,0.226509,0.084064,5.28281,0.097611,2.871844,...,2.134221,2.560715,4.986866,4.304511,0.0,4.86839,0.014355,0.176323,0.0,4.868884
2,0.0,0.831877,5.910972,0.163499,0.226509,0.163499,0.111031,0.042644,0.214125,0.15056,...,1.238787,0.189034,0.014355,0.0,0.201634,0.163499,0.0,0.097611,0.0,1.996389
9,1.989139,1.790772,2.784504,2.182692,3.68818,3.042644,1.691534,2.891419,1.157044,1.718088,...,2.104337,2.204767,1.819668,1.469886,1.794936,3.104337,2.695994,2.408712,3.68818,1.718088


In [7]:
# Transpose so the former column labels become the row index,
# cast those labels to int64, and order the rows by label.
gene_z_expression = gene_z_expression.T
gene_z_expression = gene_z_expression.set_axis(
    gene_z_expression.index.astype('int64'), axis=0
).sort_index()
gene_z_expression.head(3)

Unnamed: 0,ACH-001113,ACH-001289,ACH-001339,ACH-001538,ACH-000242,ACH-000708,ACH-000327,ACH-000233,ACH-000461,ACH-000705,...,ACH-001578,ACH-000036,ACH-000973,ACH-001128,ACH-000750,ACH-000285,ACH-002669,ACH-001858,ACH-001997,ACH-000052
1,0.581077,-0.342639,-0.087271,-1.394666,-1.346099,-1.281964,-1.352839,1.233856,-1.346099,0.034252,...,-0.33276,-0.120553,1.086606,0.747092,-1.394666,1.027657,-1.387523,-1.306935,-1.394666,1.027902
2,-0.482406,-0.049706,2.592176,-0.397362,-0.364588,-0.397362,-0.424653,-0.460225,-0.371029,-0.404093,...,0.161947,-0.38408,-0.474939,-0.482406,-0.377526,-0.397362,-0.482406,-0.431634,-0.482406,0.556012
9,-0.324088,-0.53733,0.53092,-0.11602,1.502362,0.808418,-0.644009,0.645853,-1.21858,-0.615465,...,-0.200251,-0.09229,-0.506267,-0.882279,-0.532854,0.874737,0.435773,0.126948,1.502362,-0.615465


In [8]:
# Load combined_interaction_scores.parquet into a pandas DataFrame ordered by its index
combined_interaction_scores = (
    pq.read_table(combined_interaction_score_path)
    .to_pandas()
    .sort_index()
)

In [9]:
# Preview the first three rows of the interaction-score table
combined_interaction_scores[:3]

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


In [10]:
# Restrict both tables to the row labels they have in common, each sorted by label
in_scores = gene_expression.index.isin(combined_interaction_scores.index)
filtered_gene_expression = gene_expression.loc[in_scores].sort_index()

in_expression = combined_interaction_scores.index.isin(gene_expression.index)
filtered_combined_interaction_scores = combined_interaction_scores.loc[in_expression].sort_index()

In [None]:
def weighted_PPI(pair):
    """Score-weighted mean of ``filtered_gene_expression`` for one pair column.

    Each row of ``filtered_gene_expression`` is scaled by the matching entry of
    ``filtered_combined_interaction_scores[pair]``; the scaled rows are summed per
    column and divided by the sum of the scores. Returns a Series named ``pair``.
    """
    pair_weights = filtered_combined_interaction_scores[pair]
    weighted_totals = filtered_gene_expression.mul(pair_weights, axis=0).sum()
    weighted_mean = weighted_totals.div(pair_weights.sum(), axis=0)
    weighted_mean.name = pair
    return weighted_mean

# Spot-check on a single pair column
test_pair = 'SMARCA2_SMARCA4'
weighted_PPI(test_pair)

ACH-001113    4.336497
ACH-001289    4.219004
ACH-001339    3.970107
ACH-001538    3.967633
ACH-000242    4.258391
                ...   
ACH-000285    4.719824
ACH-002669    3.771905
ACH-001858    3.901608
ACH-001997    3.939896
ACH-000052    4.575195
Name: SMARCA2_SMARCA4, Length: 1408, dtype: float64

In [13]:
# Restrict the z-scored table and the score table to the row labels they share, each sorted by label
z_in_scores = gene_z_expression.index.isin(combined_interaction_scores.index)
filtered_gene_z_expression = gene_z_expression.loc[z_in_scores].sort_index()

scores_in_z = combined_interaction_scores.index.isin(gene_z_expression.index)
zfiltered_combined_interaction_scores = combined_interaction_scores.loc[scores_in_z].sort_index()

In [19]:
def weighted_zPPI(pair):
    """Score-weighted mean of ``filtered_gene_z_expression`` for one pair column.

    Each row of ``filtered_gene_z_expression`` is scaled by the matching entry of
    ``zfiltered_combined_interaction_scores[pair]``; the scaled rows are summed per
    column and divided by the sum of the scores. Returns a Series named ``pair``.
    """
    pair_weights = zfiltered_combined_interaction_scores[pair]
    weighted_totals = filtered_gene_z_expression.mul(pair_weights, axis=0).sum()
    weighted_mean = weighted_totals.div(pair_weights.sum(), axis=0)
    weighted_mean.name = pair
    return weighted_mean

# Spot-check on a single pair column
test_pair = 'SMARCA2_SMARCA4'
weighted_zPPI(test_pair)

ACH-001113    0.260639
ACH-001289    0.159970
ACH-001339   -0.185762
ACH-001538   -0.224950
ACH-000242    0.150543
                ...   
ACH-000285    0.829550
ACH-002669   -0.432918
ACH-001858   -0.281732
ACH-001997   -0.244848
ACH-000052    0.562371
Name: SMARCA2_SMARCA4, Length: 1408, dtype: float64

In [20]:
# Trial run on the first ten pair columns, one result column per pair
preview_pairs = filtered_combined_interaction_scores.columns[:10]
combined_weighted_PPI = pd.concat(
    [weighted_PPI(pair_name) for pair_name in preview_pairs],
    axis=1,
)
display(combined_weighted_PPI)

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B
ACH-001113,4.336497,4.044500,4.420613,7.301168,4.787432,5.325845,4.200424,5.137395,4.688187,4.143348
ACH-001289,4.219004,4.212694,4.171410,7.427948,4.636875,5.485654,4.067857,5.074649,4.631699,4.192616
ACH-001339,3.970107,4.048964,3.868243,7.335901,4.292613,5.365455,3.759007,5.080747,4.252093,3.878671
ACH-001538,3.967633,4.027167,3.897304,7.441265,4.439388,4.939608,3.816576,4.997209,4.206917,3.952240
ACH-000242,4.258391,4.092167,4.074723,7.400517,4.700591,5.258796,4.077576,5.039962,4.528516,4.032838
...,...,...,...,...,...,...,...,...,...,...
ACH-000285,4.719824,3.929269,4.699234,7.677712,5.067211,5.936168,4.351072,5.064166,5.010605,4.061917
ACH-002669,3.771905,3.891854,3.739119,7.318773,4.316186,5.056997,3.582378,4.823407,4.044260,3.746250
ACH-001858,3.901608,4.238788,3.743686,7.249076,4.307132,4.833525,3.771085,5.057669,4.152531,3.968048
ACH-001997,3.939896,4.004480,3.843355,7.277523,4.375051,4.821291,3.781947,4.887580,4.234103,3.798498


In [21]:
# Trial run on the first ten pair columns, one result column per pair
preview_z_pairs = zfiltered_combined_interaction_scores.columns[:10]
combined_weighted_zPPI = pd.concat(
    [weighted_zPPI(pair_name) for pair_name in preview_z_pairs],
    axis=1,
)
display(combined_weighted_zPPI)

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B
ACH-001113,0.260639,0.140749,0.463402,0.049202,0.454632,0.271649,0.286881,0.171926,0.336901,0.188727
ACH-001289,0.159970,0.384904,0.264303,0.238025,0.334362,0.465294,0.217525,0.234738,0.310358,0.325805
ACH-001339,-0.185762,0.082123,-0.122506,0.106021,-0.214212,0.315783,-0.188189,0.084465,-0.147390,-0.096897
ACH-001538,-0.224950,0.023717,-0.100088,0.273790,-0.060387,-0.134705,-0.147596,0.000084,-0.228356,-0.007711
ACH-000242,0.150543,0.038006,0.050064,0.213166,0.265143,0.193010,0.158141,0.074127,0.172930,0.082046
...,...,...,...,...,...,...,...,...,...,...
ACH-000285,0.829550,0.120924,0.935646,0.637427,0.871069,0.992293,0.652207,0.299388,0.858271,0.238731
ACH-002669,-0.432918,-0.209585,-0.304904,0.123348,-0.210403,-0.019606,-0.373483,-0.180583,-0.401422,-0.258670
ACH-001858,-0.281732,0.281073,-0.308422,-0.016836,-0.229991,-0.251729,-0.195001,0.036124,-0.299534,-0.016027
ACH-001997,-0.244848,0.004224,-0.236817,0.041353,-0.224830,-0.269522,-0.187043,-0.139977,-0.192973,-0.207373


In [22]:
def weighted_PPI(pair, expression=None, scores=None):
    """Interaction-score-weighted mean expression for one pair column.

    Every gene (row) with a non-missing score in ``scores[pair]`` that also has a
    row in ``expression`` contributes its expression row, scaled by its score.
    The scaled rows are summed per cell line (column) and divided by the total
    weight that actually contributed to that column.

    Parameters
    ----------
    pair : str
        Column label in ``scores``.
    expression : pandas.DataFrame, optional
        Expression table with genes on the rows and cell lines on the columns.
        Defaults to the notebook-level ``filtered_gene_expression``.
    scores : pandas.DataFrame, optional
        Score table with genes on the rows and one column per pair.
        Defaults to the notebook-level ``filtered_combined_interaction_scores``.

    Returns
    -------
    pandas.Series
        Named ``pair`` and indexed by the columns of ``expression``. An empty
        float Series is returned when no gene has both a score and an
        expression row. Entries are NaN where the total weight is zero.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working
    if expression is None:
        expression = filtered_gene_expression
    if scores is None:
        scores = filtered_combined_interaction_scores

    weights = scores[pair].dropna()
    valid_genes = weights.index.intersection(expression.index)

    if len(valid_genes) == 0:
        return pd.Series(dtype=float, name=pair)

    weights = weights.loc[valid_genes]
    df = expression.loc[valid_genes]

    # Multiply each gene's row by its weight; sum() skips missing expression values
    weighted_sum = df.mul(weights, axis=0).sum(axis=0)

    # The denominator must only count weights whose expression value was present,
    # otherwise columns with missing values are averaged over weight they never received.
    observed = df.notna()
    if observed.to_numpy().all():
        total_weight = weights.sum()
        if total_weight == 0:
            total_weight = float('nan')  # avoid +/-inf from division by zero
    else:
        total_weight = observed.astype(float).mul(weights, axis=0).sum(axis=0)
        total_weight = total_weight.where(total_weight != 0)  # zero weight -> NaN

    # Weighted average across genes (rows), for each cell line (column)
    weighted_avg = weighted_sum / total_weight
    weighted_avg.name = pair
    return weighted_avg

In [23]:
# Spot-check the function on a single pair column; the name is printed
# first and the returned Series is shown as the cell output.
test_pair = 'SMARCA2_SMARCA4'
print(test_pair)
weighted_PPI(test_pair)

SMARCA2_SMARCA4


ACH-001113    4.336497
ACH-001289    4.219004
ACH-001339    3.970107
ACH-001538    3.967633
ACH-000242    4.258391
                ...   
ACH-000285    4.719824
ACH-002669    3.771905
ACH-001858    3.901608
ACH-001997    3.939896
ACH-000052    4.575195
Name: SMARCA2_SMARCA4, Length: 1408, dtype: float64

In [24]:
# Spot-check the function on a second pair column; the name is printed
# first and the returned Series is shown as the cell output.
test_pair = 'ZNF138_ZNF141'
print(test_pair)
weighted_PPI(test_pair)

ZNF138_ZNF141


ACH-001113    5.749534
ACH-001289    3.744161
ACH-001339    4.016140
ACH-001538    5.115200
ACH-000242    4.025915
                ...   
ACH-000285    4.712596
ACH-002669    3.465974
ACH-001858    4.912650
ACH-001997    4.523562
ACH-000052    4.750070
Name: ZNF138_ZNF141, Length: 1408, dtype: float64

In [25]:
pairs = filtered_combined_interaction_scores.columns.tolist()

# One delayed call per pair column; the progress bar advances as joblib
# draws tasks from the generator.
tasks = (delayed(weighted_PPI)(pair) for pair in tqdm(pairs))
results = Parallel(n_jobs=-1, backend="loky")(tasks)

# Empty Series (if any) are kept here. To drop them before combining:
# results = [r for r in results if not r.empty]

# Assemble the per-pair Series side by side, one column per pair
weighted_ppi_df = pd.concat(results, axis=1)

100%|██████████| 34047/34047 [14:19<00:00, 39.63it/s] 


In [26]:
# Print the dimensions of the combined result, then show its first rows
print(weighted_ppi_df.shape)
weighted_ppi_df.head()

(1408, 34047)


Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
ACH-001113,4.336497,4.0445,4.420613,7.301168,4.787432,5.325845,4.200424,5.137395,4.688187,4.143348,...,5.749534,7.713825,5.261166,5.749534,1.385227,6.852173,5.749534,4.66294,2.280956,1.157724
ACH-001289,4.219004,4.212694,4.17141,7.427948,4.636875,5.485654,4.067857,5.074649,4.631699,4.192616,...,3.744161,7.933846,4.853672,3.744161,1.109011,7.424376,3.744161,2.284351,2.114367,2.961218
ACH-001339,3.970107,4.048964,3.868243,7.335901,4.292613,5.365455,3.759007,5.080747,4.252093,3.878671,...,4.01614,7.62458,4.315301,4.01614,0.812892,6.858447,4.01614,4.987349,1.104337,1.320109
ACH-001538,3.967633,4.027167,3.897304,7.441265,4.439388,4.939608,3.816576,4.997209,4.206917,3.95224,...,5.1152,8.307556,4.834721,5.1152,0.872833,7.917725,5.1152,3.816702,1.682573,1.320296
ACH-000242,4.258391,4.092167,4.074723,7.400517,4.700591,5.258796,4.077576,5.039962,4.528516,4.032838,...,4.025915,7.938582,5.02486,4.025915,1.811601,6.957827,4.025915,5.388743,1.584963,1.642437


In [27]:
def weighted_zPPI(pair, expression=None, scores=None):
    """Interaction-score-weighted mean z-scored expression for one pair column.

    Every gene (row) with a non-missing score in ``scores[pair]`` that also has a
    row in ``expression`` contributes its expression row, scaled by its score.
    The scaled rows are summed per cell line (column) and divided by the total
    weight that actually contributed to that column.

    Parameters
    ----------
    pair : str
        Column label in ``scores``.
    expression : pandas.DataFrame, optional
        Expression table with genes on the rows and cell lines on the columns.
        Defaults to the notebook-level ``filtered_gene_z_expression``.
    scores : pandas.DataFrame, optional
        Score table with genes on the rows and one column per pair.
        Defaults to the notebook-level ``zfiltered_combined_interaction_scores``.

    Returns
    -------
    pandas.Series
        Named ``pair`` and indexed by the columns of ``expression``. An empty
        float Series is returned when no gene has both a score and an
        expression row. Entries are NaN where the total weight is zero.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working
    if expression is None:
        expression = filtered_gene_z_expression
    if scores is None:
        scores = zfiltered_combined_interaction_scores

    weights = scores[pair].dropna()
    valid_genes = weights.index.intersection(expression.index)

    if len(valid_genes) == 0:
        return pd.Series(dtype=float, name=pair)

    weights = weights.loc[valid_genes]
    df = expression.loc[valid_genes]

    # Multiply each gene's row by its weight; sum() skips missing expression values
    weighted_sum = df.mul(weights, axis=0).sum(axis=0)

    # The denominator must only count weights whose expression value was present,
    # otherwise columns with missing values are averaged over weight they never received.
    observed = df.notna()
    if observed.to_numpy().all():
        total_weight = weights.sum()
        if total_weight == 0:
            total_weight = float('nan')  # avoid +/-inf from division by zero
    else:
        total_weight = observed.astype(float).mul(weights, axis=0).sum(axis=0)
        total_weight = total_weight.where(total_weight != 0)  # zero weight -> NaN

    # Weighted average across genes (rows), for each cell line (column)
    weighted_avg = weighted_sum / total_weight
    weighted_avg.name = pair
    return weighted_avg

In [None]:
# Spot-check the z-score variant on a single pair column; the name is printed
# first and the returned Series is shown as the cell output.
test_pair = 'ZNF138_ZNF141'
print(test_pair)
weighted_zPPI(test_pair)

In [28]:
pairs = zfiltered_combined_interaction_scores.columns.tolist()

# One delayed call per pair column; the progress bar advances as joblib
# draws tasks from the generator.
tasks = (delayed(weighted_zPPI)(pair) for pair in tqdm(pairs))
results = Parallel(n_jobs=-1, backend="loky")(tasks)

# Empty Series (if any) are kept here. To drop them before combining:
# results = [r for r in results if not r.empty]

# Assemble the per-pair Series side by side, one column per pair
weighted_zppi_df = pd.concat(results, axis=1)

100%|██████████| 34047/34047 [09:39<00:00, 58.73it/s] 


In [29]:
# Print the dimensions of the combined z-score result, then show its first rows
print(weighted_zppi_df.shape)
weighted_zppi_df.head()

(1408, 34047)


Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
ACH-001113,0.260639,0.140749,0.463402,0.049202,0.454632,0.271649,0.286881,0.171926,0.336901,0.188727,...,1.533428,0.039117,0.144185,1.533428,-0.026308,0.326782,1.533428,0.835034,1.296211,-0.252194
ACH-001289,0.15997,0.384904,0.264303,0.238025,0.334362,0.465294,0.217525,0.234738,0.310358,0.325805,...,-0.816642,0.390475,-0.35611,-0.816642,-0.240089,1.197917,-0.816642,-0.761486,1.026181,0.812433
ACH-001339,-0.185762,0.082123,-0.122506,0.106021,-0.214212,0.315783,-0.188189,0.084465,-0.14739,-0.096897,...,-0.497914,-0.171287,-0.948454,-0.497914,-0.633622,0.073807,-0.497914,0.458931,-0.611012,-0.518073
ACH-001538,-0.22495,0.023717,-0.100088,0.27379,-0.060387,-0.134705,-0.147596,8.4e-05,-0.228356,-0.007711,...,0.79006,0.881313,-0.344976,0.79006,-0.572718,1.064455,0.79006,0.290448,0.326272,-0.235211
ACH-000242,0.150543,0.038006,0.050064,0.213166,0.265143,0.19301,0.158141,0.074127,0.17293,0.082046,...,-0.486458,0.268895,-0.15911,-0.486458,0.471965,0.394422,-0.486458,0.631104,0.168051,0.033312


In [30]:
# Directory that receives both result tables (empty file name -> directory path)
output_path = get_data_path(['input', 'PPI'], '')

# Write each result table to its own parquet file in that directory
for frame, parquet_name in (
    (weighted_ppi_df, 'weighted_PPI_expression.parquet'),
    (weighted_zppi_df, 'weighted_zPPI_expression.parquet'),
):
    frame.to_parquet(os.path.join(output_path, parquet_name))