In [1]:
import os
import csv
import re
import pandas as pd
import pyarrow.parquet as pq
from joblib import Parallel, delayed
from tqdm import tqdm

In [3]:
# Set the base directory for the project: two levels above the directory the
# notebook is executed from.
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))


def get_data_path(folders, fname):
    """Build a normalised path to a file inside the repository.

    Parameters
    ----------
    folders : iterable of str
        Directory names, relative to ``BASE_DIR``, that lead to the file.
    fname : str
        Name of the file itself.

    Returns
    -------
    str
        ``BASE_DIR/<folders...>/<fname>`` after ``os.path.normpath``.
    """
    return os.path.normpath(os.path.join(BASE_DIR, *folders, fname))


# Paths to the two expression tables and to the interaction-score table.
expression_files_path = get_data_path(['output', 'processed_DepMap22Q4'], 'expression_data.csv')
zexpression_files_path = get_data_path(['output', 'processed_DepMap22Q4'], 'zexpression_data.csv')

combined_interaction_score_path = get_data_path(['input', 'PPI'], 'combined_interaction_scores.parquet')

In [4]:
# downloaded from https://ndownloader.figshare.com/files/34990036
# DEPMAP 22Q4
# NOTE(review): the paths read here point at output/processed_DepMap22Q4, so
# these look like preprocessed tables derived from that download -- confirm.

# Load both expression tables, using the first CSV column as the row index.
gene_expression = pd.read_csv(expression_files_path, index_col=0)
gene_z_expression = pd.read_csv(zexpression_files_path, index_col=0)

In [5]:
# Drop the 'cell_name' column from both tables; the remaining column labels
# are cast to integers further down.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
gene_expression = gene_expression.drop(columns=['cell_name'], errors='ignore')
gene_z_expression = gene_z_expression.drop(columns=['cell_name'], errors='ignore')

In [6]:
# Flip the table so the former column labels become the row index, cast
# those labels to integers and order the rows by them.
# NOTE: re-running this cell transposes the table back again.
gene_expression = gene_expression.transpose()
gene_expression.index = gene_expression.index.astype('int64')
gene_expression = gene_expression.sort_index(axis=0)
gene_expression.head(3)

Unnamed: 0,ACH-001113,ACH-001289,ACH-001339,ACH-001538,ACH-000242,ACH-000708,ACH-000327,ACH-000233,ACH-000461,ACH-000705,...,ACH-001578,ACH-000036,ACH-000973,ACH-001128,ACH-000750,ACH-000285,ACH-002669,ACH-001858,ACH-001997,ACH-000052
1,3.970854,2.114367,2.627607,0.0,0.097611,0.226509,0.084064,5.28281,0.097611,2.871844,...,2.134221,2.560715,4.986866,4.304511,0.0,4.86839,0.014355,0.176323,0.0,4.868884
2,0.0,0.831877,5.910972,0.163499,0.226509,0.163499,0.111031,0.042644,0.214125,0.15056,...,1.238787,0.189034,0.014355,0.0,0.201634,0.163499,0.0,0.097611,0.0,1.996389
9,1.989139,1.790772,2.784504,2.182692,3.68818,3.042644,1.691534,2.891419,1.157044,1.718088,...,2.104337,2.204767,1.819668,1.469886,1.794936,3.104337,2.695994,2.408712,3.68818,1.718088


In [7]:
# Same reshaping for the z-score table: former column labels become the row
# index, are cast to integers, and the rows are ordered by them.
# NOTE: re-running this cell transposes the table back again.
gene_z_expression = gene_z_expression.transpose()
gene_z_expression.index = gene_z_expression.index.astype('int64')
gene_z_expression = gene_z_expression.sort_index(axis=0)
gene_z_expression.head(3)

Unnamed: 0,ACH-001113,ACH-001289,ACH-001339,ACH-001538,ACH-000242,ACH-000708,ACH-000327,ACH-000233,ACH-000461,ACH-000705,...,ACH-001578,ACH-000036,ACH-000973,ACH-001128,ACH-000750,ACH-000285,ACH-002669,ACH-001858,ACH-001997,ACH-000052
1,0.581077,-0.342639,-0.087271,-1.394666,-1.346099,-1.281964,-1.352839,1.233856,-1.346099,0.034252,...,-0.33276,-0.120553,1.086606,0.747092,-1.394666,1.027657,-1.387523,-1.306935,-1.394666,1.027902
2,-0.482406,-0.049706,2.592176,-0.397362,-0.364588,-0.397362,-0.424653,-0.460225,-0.371029,-0.404093,...,0.161947,-0.38408,-0.474939,-0.482406,-0.377526,-0.397362,-0.482406,-0.431634,-0.482406,0.556012
9,-0.324088,-0.53733,0.53092,-0.11602,1.502362,0.808418,-0.644009,0.645853,-1.21858,-0.615465,...,-0.200251,-0.09229,-0.506267,-0.882279,-0.532854,0.874737,0.435773,0.126948,1.502362,-0.615465


In [8]:
# Load combined_interaction_scores.parquet through pyarrow, convert it to a
# pandas DataFrame and order its rows by index.
combined_interaction_scores = (
    pq.read_table(combined_interaction_score_path)
    .to_pandas()
    .sort_index()
)

In [9]:
# Peek at the first three rows of the score table (one column per pair).
combined_interaction_scores[:3]

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


In [10]:
# filter gene_expression for shared interactors
filtered_gene_expression = gene_expression[gene_expression.index.isin(combined_interaction_scores.index)].sort_index()

filtered_combined_interaction_scores = combined_interaction_scores[combined_interaction_scores.index.isin(gene_expression.index)].sort_index()

In [11]:
# Column name of a single pair, used to spot-check the scoring functions.
test_pair = 'SMARCA2_SMARCA4'

In [12]:
def weighted_PPI(pair):
    """Score-weighted mean of the expression table for one pair.

    Every row of ``filtered_gene_expression`` is scaled by the matching entry
    of the ``pair`` column of ``filtered_combined_interaction_scores``
    (aligned on the row index). The scaled rows are summed per column and the
    totals divided by the sum of the scores. Missing values are skipped by
    both sums (pandas default).

    NOTE: a later cell defines ``weighted_PPI`` again; whichever definition
    was executed last is the one in effect.

    Parameters
    ----------
    pair : str
        Column name in ``filtered_combined_interaction_scores``.

    Returns
    -------
    pandas.Series
        One value per column of ``filtered_gene_expression``, named ``pair``.
    """
    scores = filtered_combined_interaction_scores[pair]
    weighted_total = filtered_gene_expression.mul(scores, axis=0).sum()
    averaged = weighted_total / scores.sum()
    return averaged.rename(pair)

# Spot-check: weighted expression of the test pair, one value per cell line.
weighted_PPI(test_pair)

ACH-001113    4.336497
ACH-001289    4.219004
ACH-001339    3.970107
ACH-001538    3.967633
ACH-000242    4.258391
                ...   
ACH-000285    4.719824
ACH-002669    3.771905
ACH-001858    3.901608
ACH-001997    3.939896
ACH-000052    4.575195
Name: SMARCA2_SMARCA4, Length: 1408, dtype: float64

In [13]:
# filter gene_z_expression for shared interactors
filtered_gene_z_expression = gene_z_expression[gene_z_expression.index.isin(combined_interaction_scores.index)].sort_index()

zfiltered_combined_interaction_scores = combined_interaction_scores[combined_interaction_scores.index.isin(gene_z_expression.index)].sort_index()

In [14]:
def weighted_zPPI(pair):
    """Score-weighted mean of the z-score expression table for one pair.

    Every row of ``filtered_gene_z_expression`` is scaled by the matching
    entry of the ``pair`` column of ``zfiltered_combined_interaction_scores``
    (aligned on the row index). The scaled rows are summed per column and the
    totals divided by the sum of the scores. Missing values are skipped by
    both sums (pandas default).

    Parameters
    ----------
    pair : str
        Column name in ``zfiltered_combined_interaction_scores``.

    Returns
    -------
    pandas.Series
        One value per column of ``filtered_gene_z_expression``, named
        ``pair``.
    """
    scores = zfiltered_combined_interaction_scores[pair]
    weighted_total = filtered_gene_z_expression.mul(scores, axis=0).sum()
    averaged = weighted_total / scores.sum()
    return averaged.rename(pair)

In [15]:
# Pair used to spot-check the z-score variant.
test_pair = 'SMARCA2_SMARCA4'

In [18]:
# Spot-check: weighted z-score expression of the test pair, per cell line.
weighted_zPPI(test_pair)

ACH-001113    0.260639
ACH-001289    0.159970
ACH-001339   -0.185762
ACH-001538   -0.224950
ACH-000242    0.150543
                ...   
ACH-000285    0.829550
ACH-002669   -0.432918
ACH-001858   -0.281732
ACH-001997   -0.244848
ACH-000052    0.562371
Name: SMARCA2_SMARCA4, Length: 1408, dtype: float64

In [15]:
# Preview run: score only the first ten pairs and place the resulting Series
# side by side, one column per pair.
preview_pairs = filtered_combined_interaction_scores.columns[:10]
combined_weighted_PPI = pd.concat(
    [weighted_PPI(name) for name in preview_pairs],
    axis=1,
)
display(combined_weighted_PPI)

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B
ACH-001113,4.344704,4.044500,4.439364,7.329987,4.787432,5.355492,4.203522,5.143092,4.704135,4.144092
ACH-001289,4.227203,4.212694,4.202427,7.449875,4.636875,5.523815,4.071537,5.081449,4.643522,4.193324
ACH-001339,3.966348,4.048964,3.903259,7.365022,4.292613,5.400586,3.754676,5.084008,4.259780,3.879347
ACH-001538,3.968463,4.027167,3.931327,7.475972,4.439388,4.973875,3.816484,5.003503,4.218565,3.953111
ACH-000242,4.256749,4.092167,4.102667,7.429534,4.700591,5.285771,4.074544,5.046582,4.539215,4.033532
...,...,...,...,...,...,...,...,...,...,...
ACH-000285,4.720719,3.929269,4.721901,7.699794,5.067211,5.971230,4.352316,5.070063,5.024424,4.062341
ACH-002669,3.773902,3.891854,3.774883,7.354988,4.316186,5.095974,3.582961,4.830979,4.054889,3.746955
ACH-001858,3.905243,4.238788,3.774531,7.288208,4.307132,4.862605,3.771876,5.065071,4.164565,3.969138
ACH-001997,3.953236,4.004480,3.869224,7.311266,4.375051,4.850399,3.787687,4.894782,4.237743,3.799402


In [16]:
# Preview run for the z-score variant: first ten pairs only, one column per
# pair.
preview_zpairs = zfiltered_combined_interaction_scores.columns[:10]
combined_weighted_zPPI = pd.concat(
    [weighted_zPPI(name) for name in preview_zpairs],
    axis=1,
)
display(combined_weighted_zPPI)

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B
ACH-001113,0.262745,0.140749,0.464002,0.047692,0.454632,0.267187,0.285340,0.172097,0.339382,0.188875
ACH-001289,0.161949,0.384904,0.273417,0.227973,0.334362,0.477291,0.218784,0.237837,0.308073,0.325943
ACH-001339,-0.190086,0.082123,-0.112256,0.107250,-0.214212,0.322264,-0.190653,0.082876,-0.150526,-0.096821
ACH-001538,-0.226509,0.023717,-0.090528,0.280720,-0.060387,-0.127426,-0.146888,0.001286,-0.228228,-0.007430
ACH-000242,0.147426,0.038006,0.055536,0.213707,0.265143,0.185300,0.154412,0.076000,0.172405,0.082145
...,...,...,...,...,...,...,...,...,...,...
ACH-000285,0.826890,0.120924,0.941773,0.628588,0.871069,0.994469,0.651606,0.301603,0.860502,0.238589
ACH-002669,-0.433300,-0.209585,-0.295027,0.132399,-0.210403,-0.004761,-0.372105,-0.177887,-0.402519,-0.258572
ACH-001858,-0.280990,0.281073,-0.302115,-0.005487,-0.229991,-0.253525,-0.194076,0.038208,-0.299683,-0.015527
ACH-001997,-0.240283,0.004224,-0.234044,0.050259,-0.224830,-0.271366,-0.184939,-0.137512,-0.206969,-0.207070


In [14]:
def weighted_PPI(pair):
    """Score-weighted mean expression for one pair, per cell line.

    Only genes with a non-missing score in the ``pair`` column of
    ``filtered_combined_interaction_scores`` that also appear in the index of
    ``filtered_gene_expression`` contribute. Each such gene's expression row
    is multiplied by its score, the rows are summed per cell line (column),
    and the totals are divided by the sum of the scores used.

    Parameters
    ----------
    pair : str
        Column name in ``filtered_combined_interaction_scores``.

    Returns
    -------
    pandas.Series
        Named ``pair`` and indexed by the columns of
        ``filtered_gene_expression``; an empty float Series (still named
        ``pair``) when no gene qualifies.
    """
    scores = filtered_combined_interaction_scores[pair].dropna()
    genes = scores.index.intersection(filtered_gene_expression.index)

    # Nothing to average: hand back an empty, named placeholder.
    if genes.empty:
        return pd.Series(dtype=float, name=pair)

    gene_weights = scores[genes]

    # Scale every selected gene row by its weight, then collapse over genes.
    weighted_total = (
        filtered_gene_expression.loc[genes]
        .mul(gene_weights, axis=0)
        .sum(axis=0)
    )

    average = weighted_total / gene_weights.sum()
    average.name = pair
    return average

In [15]:
# Spot-check the function defined in the previous cell on a single pair.
test_pair = 'SMARCA2_SMARCA4'
print(test_pair)
weighted_PPI(test_pair)

SMARCA2_SMARCA4


ACH-001113    4.336497
ACH-001289    4.219004
ACH-001339    3.970107
ACH-001538    3.967633
ACH-000242    4.258391
                ...   
ACH-000285    4.719824
ACH-002669    3.771905
ACH-001858    3.901608
ACH-001997    3.939896
ACH-000052    4.575195
Name: SMARCA2_SMARCA4, Length: 1408, dtype: float64

In [16]:
# Spot-check on another pair.
test_pair = 'ZNF138_ZNF141'
print(test_pair)
weighted_PPI(test_pair)

ZNF138_ZNF141


ACH-001113    5.749534
ACH-001289    3.744161
ACH-001339    4.016140
ACH-001538    5.115200
ACH-000242    4.025915
                ...   
ACH-000285    4.712596
ACH-002669    3.465974
ACH-001858    4.912650
ACH-001997    4.523562
ACH-000052    4.750070
Name: ZNF138_ZNF141, Length: 1408, dtype: float64

In [19]:
# Score every pair in parallel with joblib (loky backend, n_jobs=-1), using
# tqdm to show progress as the tasks are handed out.
pairs = filtered_combined_interaction_scores.columns.tolist()

jobs = (delayed(weighted_PPI)(name) for name in tqdm(pairs))
results = Parallel(n_jobs=-1, backend="loky")(jobs)

# Optional filter for Series that came back empty (left disabled):
# results = [r for r in results if not r.empty]

# Place the per-pair Series side by side: one column per pair.
weighted_ppi_df = pd.concat(results, axis=1)

100%|██████████| 34047/34047 [06:21<00:00, 89.13it/s] 


In [20]:
# Sanity check: dimensions of the combined table, then its first rows.
print(weighted_ppi_df.shape)
weighted_ppi_df.head()

(1408, 34047)


Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF141_ZNF93,ZIK1_ZNF211,ZNF117_ZNF493
ACH-001113,4.336497,4.0445,4.420613,7.301168,4.787432,5.325845,4.200424,5.137395,4.688187,4.143348,...,5.749534,7.713825,5.261166,5.749534,1.385227,6.852173,5.749534,4.66294,2.280956,1.157724
ACH-001289,4.219004,4.212694,4.17141,7.427948,4.636875,5.485654,4.067857,5.074649,4.631699,4.192616,...,3.744161,7.933846,4.853672,3.744161,1.109011,7.424376,3.744161,2.284351,2.114367,2.961218
ACH-001339,3.970107,4.048964,3.868243,7.335901,4.292613,5.365455,3.759007,5.080747,4.252093,3.878671,...,4.01614,7.62458,4.315301,4.01614,0.812892,6.858447,4.01614,4.987349,1.104337,1.320109
ACH-001538,3.967633,4.027167,3.897304,7.441265,4.439388,4.939608,3.816576,4.997209,4.206917,3.95224,...,5.1152,8.307556,4.834721,5.1152,0.872833,7.917725,5.1152,3.816702,1.682573,1.320296
ACH-000242,4.258391,4.092167,4.074723,7.400517,4.700591,5.258796,4.077576,5.039962,4.528516,4.032838,...,4.025915,7.938582,5.02486,4.025915,1.811601,6.957827,4.025915,5.388743,1.584963,1.642437


In [21]:
# Persist the combined table (index included) as parquet via pyarrow.
# Create the target folder first: to_parquet does not create missing
# directories, so the write would otherwise fail on a fresh checkout.
os.makedirs('./output_files', exist_ok=True)
weighted_ppi_df.to_parquet('./output_files/combined_weighted_PPI_expression_new.parquet',
                           engine='pyarrow', index=True)

In [17]:
# pandarallel is used in this cell and by the ``parallel_apply`` calls below,
# but no import for it is visible in this notebook, so on a fresh kernel this
# line fails with NameError. Import it here so the cell runs standalone.
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [18]:
# Define a wrapper function that applies get_interactor_scores to each item in paralog_pairs
def parallel_weighted_PPI(item):
    return weighted_PPI(item)

# Convert the list of items to a Pandas Series and apply parallel_get_interactor_scores to each item
paralog_pairs = pd.Series(filtered_combined_interaction_scores.columns, index=filtered_combined_interaction_scores.columns)
combined_weighted_PPI_all = paralog_pairs.parallel_apply(parallel_weighted_PPI)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9162), Label(value='0 / 9162'))), …

In [19]:
# Swap rows and columns of the applied result and display it.
# NOTE: re-running this cell transposes the table back again.
combined_weighted_PPI_all = combined_weighted_PPI_all.transpose()
combined_weighted_PPI_all

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF77_ZNF699,ZNF93_ZNF141,ZIK1_ZNF211,ZNF441_ZNF791,ZNF117_ZNF493,ZNF525_ZNF600
ACH-001113,4.344704,4.044500,4.439364,7.329987,4.787432,5.355492,4.203522,5.143092,4.704135,4.144092,...,5.749534,1.385227,6.852173,5.749534,,4.662940,2.280956,,1.193661,
ACH-001289,4.227203,4.212694,4.202427,7.449875,4.636875,5.523815,4.071537,5.081449,4.643522,4.193324,...,3.744161,1.109011,7.424376,3.744161,,2.284351,2.114367,,2.865577,
ACH-001339,3.966348,4.048964,3.903259,7.365022,4.292613,5.400586,3.754676,5.084008,4.259780,3.879347,...,4.016140,0.812892,6.858447,4.016140,,4.987349,1.104337,,1.475660,
ACH-001538,3.968463,4.027167,3.931327,7.475972,4.439388,4.973875,3.816484,5.003503,4.218565,3.953111,...,5.115200,0.872833,7.917725,5.115200,,3.816702,1.682573,,1.379941,
ACH-000242,4.256749,4.092167,4.102667,7.429534,4.700591,5.285771,4.074544,5.046582,4.539215,4.033532,...,4.025915,1.811601,6.957827,4.025915,,5.388743,1.584963,,1.466001,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACH-000285,4.720719,3.929269,4.721901,7.699794,5.067211,5.971230,4.352316,5.070063,5.024424,4.062341,...,4.712596,1.218534,8.787689,4.712596,,2.032626,3.279471,,2.750501,
ACH-002669,3.773902,3.891854,3.774883,7.354988,4.316186,5.095974,3.582961,4.830979,4.054889,3.746955,...,3.465974,1.585094,6.239746,3.465974,,2.739358,1.321928,,0.971009,
ACH-001858,3.905243,4.238788,3.774531,7.288208,4.307132,4.862605,3.771876,5.065071,4.164565,3.969138,...,4.912650,2.187255,7.092627,4.912650,,5.650249,1.263034,,1.722298,
ACH-001997,3.953236,4.004480,3.869224,7.311266,4.375051,4.850399,3.787687,4.894782,4.237743,3.799402,...,4.523562,3.199723,6.822549,4.523562,,2.216516,1.104337,,1.679246,


In [21]:
# Define a wrapper function that applies get_interactor_scores to each item in paralog_pairs
def parallel_weighted_zPPI(item):
    return weighted_zPPI(item)

# Convert the list of items to a Pandas Series and apply parallel_get_interactor_scores to each item
paralog_pairs = pd.Series(zfiltered_combined_interaction_scores.columns, index=zfiltered_combined_interaction_scores.columns)
combined_weighted_zPPI_all = paralog_pairs.parallel_apply(parallel_weighted_zPPI)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9162), Label(value='0 / 9162'))), …

In [22]:
# Swap rows and columns of the applied result and display it.
# NOTE: re-running this cell transposes the table back again.
combined_weighted_zPPI_all = combined_weighted_zPPI_all.transpose()
combined_weighted_zPPI_all

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF77_ZNF699,ZNF93_ZNF141,ZIK1_ZNF211,ZNF441_ZNF791,ZNF117_ZNF493,ZNF525_ZNF600
ACH-001113,0.262745,0.140749,0.464002,0.047692,0.454632,0.267187,0.285340,0.172097,0.339382,0.188875,...,1.533428,-0.026308,0.326782,1.533428,,0.835034,1.296211,,-0.163944,
ACH-001289,0.161949,0.384904,0.273417,0.227973,0.334362,0.477291,0.218784,0.237837,0.308073,0.325943,...,-0.816642,-0.240089,1.197917,-0.816642,,-0.761486,1.026181,,0.701573,
ACH-001339,-0.190086,0.082123,-0.112256,0.107250,-0.214212,0.322264,-0.190653,0.082876,-0.150526,-0.096821,...,-0.497914,-0.633622,0.073807,-0.497914,,0.458931,-0.611012,,-0.375279,
ACH-001538,-0.226509,0.023717,-0.090528,0.280720,-0.060387,-0.127426,-0.146888,0.001286,-0.228228,-0.007430,...,0.790060,-0.572718,1.064455,0.790060,,0.290448,0.326272,,-0.142789,
ACH-000242,0.147426,0.038006,0.055536,0.213707,0.265143,0.185300,0.154412,0.076000,0.172405,0.082145,...,-0.486458,0.471965,0.394422,-0.486458,,0.631104,0.168051,,-0.094396,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACH-000285,0.826890,0.120924,0.941773,0.628588,0.871069,0.994469,0.651606,0.301603,0.860502,0.238589,...,0.318254,-0.098361,1.653131,0.318254,,-0.581084,2.914738,,0.752368,
ACH-002669,-0.433300,-0.209585,-0.295027,0.132399,-0.210403,-0.004761,-0.372105,-0.177887,-0.402519,-0.258572,...,-1.142645,0.084297,-0.867407,-1.142645,,-0.651912,-0.258310,,-0.616963,
ACH-001858,-0.280990,0.281073,-0.302115,-0.005487,-0.229991,-0.253525,-0.194076,0.038208,-0.299683,-0.015527,...,0.552695,0.700436,0.377610,0.552695,,1.003776,-0.353773,,-0.133232,
ACH-001997,-0.240283,0.004224,-0.234044,0.050259,-0.224830,-0.271366,-0.184939,-0.137512,-0.206969,-0.207070,...,0.096728,1.697158,0.033749,0.096728,,-0.559467,-0.611012,,-0.171249,


In [20]:
# Persist the pandarallel-computed table (index included) as parquet.
# Create the target folder first: to_parquet does not create missing
# directories, so the write would otherwise fail on a fresh checkout.
os.makedirs('./output_files', exist_ok=True)
combined_weighted_PPI_all.to_parquet('./output_files/combined_weighted_PPI_expression.parquet',
                                     engine='pyarrow', index=True)

In [23]:
# Persist the z-score variant of the table (index included) as parquet.
# Create the target folder first: to_parquet does not create missing
# directories, so the write would otherwise fail on a fresh checkout.
os.makedirs('./output_files', exist_ok=True)
combined_weighted_zPPI_all.to_parquet('./output_files/combined_weighted_zPPI_expression.parquet',
                                      engine='pyarrow', index=True)