In [None]:
import os
import csv
import re
import pandas as pd
import pyarrow.parquet as pq
from joblib import Parallel, delayed
from tqdm import tqdm

In [12]:
# set the base directory for the project
# NOTE: BASE_DIR is derived from the kernel's working directory (two levels up),
# so the notebook must be started from its own folder for the paths to resolve.
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))


# build paths inside the repo
def get_data_path(folders, fname):
    """Return the normalized path ``BASE_DIR/<folders...>/<fname>``.

    Parameters
    ----------
    folders : iterable of str
        Directory names below ``BASE_DIR``, outermost first.
    fname : str
        File name appended after the folders. An empty string yields the
        directory path itself (``normpath`` strips the trailing separator).

    Returns
    -------
    str
        The joined, normalized path.
    """
    return os.path.normpath(os.path.join(BASE_DIR, *folders, fname))


ranked_essentiality_files_path = get_data_path(['output', 'ranked_essentiality'], 'ranked_essentiality.csv')
ranked_zessentiality_files_path = get_data_path(['output', 'ranked_essentiality'], 'ranked_zessentiality.csv')

combined_interaction_score_path = get_data_path(['input', 'PPI'], 'combined_interaction_scores.parquet')

In [8]:
# Load the ranked essentiality table; the first CSV column becomes the row index.
gene_effect = pd.read_csv(
    filepath_or_buffer=ranked_essentiality_files_path,
    index_col=0,
)

In [9]:
# Cast the column labels to integers, then flip the table so those labels become
# the (sorted) row index and the former row labels become the columns.
# NOTE: this rebinds `gene_effect`; re-run the read cell before re-running this one.
gene_effect = (
    gene_effect
    .set_axis(gene_effect.columns.astype(int), axis=1)
    .transpose()
    .sort_index()
)
gene_effect.head(3)

Unnamed: 0,ACH-000004,ACH-000005,ACH-000007,ACH-000009,ACH-000011,ACH-000012,ACH-000013,ACH-000014,ACH-000015,ACH-000017,...,ACH-002285,ACH-002294,ACH-002295,ACH-002296,ACH-002297,ACH-002298,ACH-002304,ACH-002305,ACH-000779,ACH-001086
1,11172.0,3608.0,9393.0,10907.0,14687.0,5946.0,6845.0,8565.0,3954.0,10169.0,...,4347.0,11909.0,8975.0,7259.0,14336.0,4673.0,9301.0,13377.0,6388.0,7412.0
2,5346.0,14189.0,12274.0,13473.0,11632.0,16815.0,12207.0,12329.0,7173.0,10517.0,...,11139.0,10805.0,16031.0,7139.0,12611.0,12894.0,8725.0,15926.0,16883.0,10395.0
9,15919.0,17197.0,15246.0,10793.0,13188.0,12370.0,14169.0,15968.0,16420.0,13331.0,...,16416.0,13834.0,11591.0,14976.0,16967.0,14952.0,14104.0,10443.0,17769.0,


In [10]:
# Load the z-scored essentiality table; the first CSV column becomes the row index.
gene_z_effect = pd.read_csv(
    filepath_or_buffer=ranked_zessentiality_files_path,
    index_col=0,
)

In [11]:
# Cast the column labels to integers, then flip the table so those labels become
# the (sorted) row index and the former row labels become the columns.
# NOTE: this rebinds `gene_z_effect`; re-run the read cell before re-running this one.
gene_z_effect = (
    gene_z_effect
    .set_axis(gene_z_effect.columns.astype(int), axis=1)
    .transpose()
    .sort_index()
)
gene_z_effect.head(3)

Unnamed: 0,ACH-000004,ACH-000005,ACH-000007,ACH-000009,ACH-000011,ACH-000012,ACH-000013,ACH-000014,ACH-000015,ACH-000017,...,ACH-002285,ACH-002294,ACH-002295,ACH-002296,ACH-002297,ACH-002298,ACH-002304,ACH-002305,ACH-000779,ACH-001086
1,0.591971,-1.651884,0.064232,0.513359,1.634694,-0.958318,-0.69163,-0.181393,-1.549243,0.294432,...,-1.43266,0.810602,-0.059767,-0.568817,1.53057,-1.335952,0.036941,1.246083,-0.827199,-0.52343
2,-1.861594,0.848667,0.261746,0.629223,0.064981,1.653501,0.241211,0.278602,-1.301643,-0.276751,...,-0.086117,-0.188483,1.413216,-1.312063,0.365032,0.451767,-0.825975,1.381035,1.674342,-0.314143
9,0.767615,1.19588,0.542089,-0.950138,-0.147559,-0.421676,0.18118,0.784035,0.935503,-0.099639,...,0.934163,0.068919,-0.682723,0.45161,1.118806,0.443567,0.159398,-1.067425,1.387561,


In [14]:
# Read combined_interaction_scores.parquet into pandas and order the rows by index.
combined_interaction_scores = (
    pq.read_table(combined_interaction_score_path)
    .to_pandas()
    .sort_index()
)

In [15]:
# Preview the first three rows of the interaction-score table.
combined_interaction_scores.head(3)

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# Restrict each table to the row labels that also occur in the other one,
# then order both by index.
filtered_gene_effect = gene_effect.loc[
    gene_effect.index.isin(combined_interaction_scores.index)
].sort_index()

filtered_combined_interaction_scores = combined_interaction_scores.loc[
    combined_interaction_scores.index.isin(gene_effect.index)
].sort_index()

In [None]:
# Same restriction as for the ranked table, applied to the z-scored table:
# keep only the row labels shared with the interaction scores, sorted by index.
zfiltered_gene_effect = gene_z_effect.loc[
    gene_z_effect.index.isin(combined_interaction_scores.index)
].sort_index()

zfiltered_combined_interaction_scores = combined_interaction_scores.loc[
    combined_interaction_scores.index.isin(gene_z_effect.index)
].sort_index()

### weighted PPI - for essentiality

In [None]:
# calculate the weighted PPI for a given pair of genes

def weighted_PPI(pair, essentiality=None, interaction_scores=None):
    """Interaction-score-weighted mean essentiality for ``pair``, per cell line.

    Parameters
    ----------
    pair : str
        Column label in ``interaction_scores`` holding the per-gene weights.
    essentiality : pandas.DataFrame, optional
        Table with genes as rows and cell lines as columns. Defaults to the
        notebook-level ``filtered_gene_effect``.
    interaction_scores : pandas.DataFrame, optional
        Table with genes as rows and pairs as columns. Defaults to the
        notebook-level ``filtered_combined_interaction_scores``.

    Returns
    -------
    pandas.Series
        One value per cell line, named ``pair``. A cell line with no observed
        weighted gene is NaN.
    """
    if essentiality is None:
        essentiality = filtered_gene_effect
    if interaction_scores is None:
        interaction_scores = filtered_combined_interaction_scores

    weights = interaction_scores[pair]

    # Numerator: NaN products (missing weight or missing essentiality) are skipped by sum().
    weighted_sum = essentiality.mul(weights, axis=0).sum()

    # Denominator: count a gene's weight only in the cell lines where its
    # essentiality is observed; otherwise genes dropped from the numerator would
    # still inflate the normaliser and pull the mean toward zero.
    observed_weight = essentiality.notna().astype(float).mul(weights, axis=0).sum()

    # A zero total weight has no defined mean; report NaN instead of 0 or inf.
    ess_of_shared_int = weighted_sum.div(observed_weight.where(observed_weight != 0))
    ess_of_shared_int.name = pair
    return ess_of_shared_int

# Smoke test on a single pair.
test_pair = "SMARCA2_SMARCA4"
weighted_PPI(pair=test_pair)

ACH-000004    5926.852569
ACH-000005    6029.278575
ACH-000007    5910.893796
ACH-000009    6043.326503
ACH-000011    5901.250868
                 ...     
ACH-002298    6100.192506
ACH-002304    6409.746242
ACH-002305    6013.356060
ACH-000779    6781.255992
ACH-001086    6767.771571
Name: SMARCA2_SMARCA4, Length: 1080, dtype: float64

In [20]:
# Smoke test on a second pair.
test_pair = "ZNF138_ZNF141"
weighted_PPI(pair=test_pair)

ACH-000004     2226.0
ACH-000005     5013.0
ACH-000007     9851.0
ACH-000009    11303.0
ACH-000011     4236.0
               ...   
ACH-002298    12731.0
ACH-002304    11844.0
ACH-002305    13255.0
ACH-000779    16295.0
ACH-001086    17316.0
Name: ZNF138_ZNF141, Length: 1080, dtype: float64

In [21]:
# Weighted PPI for the first 10 pair columns, one output column per pair.
combined_weighted_PPI = pd.concat(
    list(map(weighted_PPI, filtered_combined_interaction_scores.columns[:10])),
    axis=1,
)
display(combined_weighted_PPI)

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B
ACH-000004,5926.852569,6595.893908,4949.176686,2680.367854,6625.864159,4058.208952,6325.965019,6114.696289,5216.321529,7317.322601
ACH-000005,6029.278575,6608.201117,4753.371297,2854.380390,6214.974696,4167.269062,6391.299539,6211.652807,5300.470512,7625.962976
ACH-000007,5910.893796,6787.893882,4847.837725,2240.483892,6906.262902,3881.828510,6603.329493,6303.998901,5511.824008,7387.546167
ACH-000009,6043.326503,6498.765056,4861.663830,2427.278798,6268.454063,3964.799738,6451.184157,6096.237306,5275.206458,6851.465262
ACH-000011,5901.250868,6634.480771,4109.543934,2827.552796,6217.913949,3744.371116,6863.127283,6414.806295,5584.393052,7141.114875
...,...,...,...,...,...,...,...,...,...,...
ACH-002298,6100.192506,6828.228492,4747.750829,2376.513167,6074.351024,3980.403783,6947.874676,6169.202382,5605.665840,7054.166834
ACH-002304,6409.746242,6472.441883,5272.835599,2548.029079,6832.877134,4576.597240,6730.827492,6335.552834,5679.771305,7486.704321
ACH-002305,6013.356060,6986.861745,4432.380535,2538.150675,6324.907846,3833.208519,6722.012575,6113.823647,5566.935819,7096.280058
ACH-000779,6781.255992,6898.720084,5195.730286,2740.424832,7067.163817,4588.724481,7632.063752,6872.579672,6395.462264,8452.647668


In [22]:
def weighted_PPI(pair, essentiality=None, interaction_scores=None):
    """Interaction-score-weighted mean essentiality for ``pair``, per cell line.

    Only genes with a non-missing weight in column ``pair`` that are also
    present in ``essentiality`` contribute.

    Parameters
    ----------
    pair : str
        Column label in ``interaction_scores`` holding the per-gene weights.
    essentiality : pandas.DataFrame, optional
        Table with genes as rows and cell lines as columns. Defaults to the
        notebook-level ``filtered_gene_effect``.
    interaction_scores : pandas.DataFrame, optional
        Table with genes as rows and pairs as columns. Defaults to the
        notebook-level ``filtered_combined_interaction_scores``.

    Returns
    -------
    pandas.Series
        One value per cell line, named ``pair``. Empty (float dtype) when no
        weighted gene is found in ``essentiality``; NaN for a cell line in
        which none of the weighted genes has an observed value.
    """
    if essentiality is None:
        essentiality = filtered_gene_effect
    if interaction_scores is None:
        interaction_scores = filtered_combined_interaction_scores

    weights = interaction_scores[pair].dropna()
    valid_genes = weights.index.intersection(essentiality.index)

    if len(valid_genes) == 0:
        return pd.Series(dtype=float, name=pair)

    weights = weights.loc[valid_genes]
    scores = essentiality.loc[valid_genes]

    # Multiply each gene's row by its weight; sum() skips missing essentiality values.
    weighted_sum = scores.mul(weights, axis=0).sum(axis=0)

    # Normalise per cell line by the weight of the genes actually observed there.
    # Dividing by the full weight sum would bias the mean toward zero whenever
    # a gene's essentiality is missing in that cell line.
    observed_weight = scores.notna().astype(float).mul(weights, axis=0).sum(axis=0)

    # A zero total weight has no defined mean; report NaN instead of 0 or inf.
    weighted_avg = weighted_sum / observed_weight.where(observed_weight != 0)

    # Final result: Series with cell lines as index
    weighted_avg.name = pair
    return weighted_avg

In [23]:
# Re-check the pair left in `test_pair` by the earlier cell against the redefined function.
print(test_pair)
weighted_PPI(pair=test_pair)

ZNF138_ZNF141


ACH-000004     2226.0
ACH-000005     5013.0
ACH-000007     9851.0
ACH-000009    11303.0
ACH-000011     4236.0
               ...   
ACH-002298    12731.0
ACH-002304    11844.0
ACH-002305    13255.0
ACH-000779    16295.0
ACH-001086    17316.0
Name: ZNF138_ZNF141, Length: 1080, dtype: float64

In [24]:
pairs = filtered_combined_interaction_scores.columns.tolist()

# One delayed task per pair, dispatched to loky worker processes on all cores;
# tqdm wraps the task source, so the bar advances as tasks are handed out.
tasks = (delayed(weighted_PPI)(pair_name) for pair_name in tqdm(pairs))
results = Parallel(n_jobs=-1, backend="loky")(tasks)

# Optional: remove truly empty Series (if any)
# results = [r for r in results if not r.empty]

# Combine results: one column per pair
weighted_ppi_df = pd.concat(results, axis=1)

  0%|          | 0/34047 [00:00<?, ?it/s]

100%|██████████| 34047/34047 [08:56<00:00, 63.41it/s] 


In [25]:
# Report the table dimensions, then show the first five rows.
print(weighted_ppi_df.shape)
weighted_ppi_df.head(5)

(1080, 34047)


Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
ACH-000004,5926.852569,6595.893908,4949.176686,2680.367854,6625.864159,4058.208952,6325.965019,6114.696289,5216.321529,7317.322601,...,2226.0,3347.346537,6126.02312,2226.0,5845.290698,2639.543929,2226.0,7781.875643,3010.0,10963.708967
ACH-000005,6029.278575,6608.201117,4753.371297,2854.38039,6214.974696,4167.269062,6391.299539,6211.652807,5300.470512,7625.962976,...,5013.0,2587.453621,11267.238758,5013.0,7590.583971,5986.138087,5013.0,9867.048954,1796.0,10015.276685
ACH-000007,5910.893796,6787.893882,4847.837725,2240.483892,6906.262902,3881.82851,6603.329493,6303.998901,5511.824008,7387.546167,...,9851.0,3153.272961,7246.129282,9851.0,7844.162717,1894.34345,9851.0,12048.522911,2819.0,7367.83753
ACH-000009,6043.326503,6498.765056,4861.66383,2427.278798,6268.454063,3964.799738,6451.184157,6096.237306,5275.206458,6851.465262,...,11303.0,2722.278018,6925.943239,11303.0,7135.177804,1367.288877,11303.0,10859.135494,1537.0,13937.653126
ACH-000011,5901.250868,6634.480771,4109.543934,2827.552796,6217.913949,3744.371116,6863.127283,6414.806295,5584.393052,7141.114875,...,4236.0,4341.504743,4852.016724,4236.0,9354.502134,2489.637735,4236.0,9916.617919,5178.0,7646.262286


### weighted PPI - for z-scored essentiality

In [28]:
def weighted_zPPI(pair, essentiality=None, interaction_scores=None):
    """Interaction-score-weighted mean z-scored essentiality for ``pair``, per cell line.

    Parameters
    ----------
    pair : str
        Column label in ``interaction_scores`` holding the per-gene weights.
    essentiality : pandas.DataFrame, optional
        Table with genes as rows and cell lines as columns. Defaults to the
        notebook-level ``zfiltered_gene_effect``.
    interaction_scores : pandas.DataFrame, optional
        Table with genes as rows and pairs as columns. Defaults to the
        notebook-level ``zfiltered_combined_interaction_scores``.

    Returns
    -------
    pandas.Series
        One value per cell line, named ``pair``. A cell line with no observed
        weighted gene is NaN.
    """
    if essentiality is None:
        essentiality = zfiltered_gene_effect
    if interaction_scores is None:
        interaction_scores = zfiltered_combined_interaction_scores

    weights = interaction_scores[pair]

    # Numerator: NaN products (missing weight or missing essentiality) are skipped by sum().
    weighted_sum = essentiality.mul(weights, axis=0).sum()

    # Denominator: count a gene's weight only in the cell lines where its
    # essentiality is observed; otherwise genes dropped from the numerator would
    # still inflate the normaliser and pull the mean toward zero.
    observed_weight = essentiality.notna().astype(float).mul(weights, axis=0).sum()

    # A zero total weight has no defined mean; report NaN instead of 0 or inf.
    ess_of_shared_int = weighted_sum.div(observed_weight.where(observed_weight != 0))
    ess_of_shared_int.name = pair
    return ess_of_shared_int

In [None]:
# Smoke test on a single pair.
test_pair = "SMARCA2_SMARCA4"
weighted_zPPI(pair=test_pair)

ACH-000004   -1.243543
ACH-000005   -0.501138
ACH-000007    0.787615
ACH-000009    1.174400
ACH-000011   -0.708116
                ...   
ACH-002298    1.554793
ACH-002304    1.318513
ACH-002305    1.694377
ACH-000779    2.504176
ACH-001086    2.776151
Name: ZNF138_ZNF141, Length: 1080, dtype: float64

In [32]:
# Smoke test on a second pair.
test_pair = "ZNF138_ZNF141"
weighted_zPPI(pair=test_pair)

ACH-000004   -1.243543
ACH-000005   -0.501138
ACH-000007    0.787615
ACH-000009    1.174400
ACH-000011   -0.708116
                ...   
ACH-002298    1.554793
ACH-002304    1.318513
ACH-002305    1.694377
ACH-000779    2.504176
ACH-001086    2.776151
Name: ZNF138_ZNF141, Length: 1080, dtype: float64

In [33]:
# Weighted z-scored PPI for the first 10 pair columns, one output column per pair.
combined_weighted_zPPI = pd.concat(
    list(map(weighted_zPPI, zfiltered_combined_interaction_scores.columns[:10])),
    axis=1,
)
display(combined_weighted_zPPI)

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B
ACH-000004,0.002921,0.062588,0.026549,0.215656,0.099948,0.010827,-0.066352,-0.002036,-0.036853,0.053870
ACH-000005,0.002188,0.051522,-0.030876,0.140815,-0.039614,0.001075,-0.060091,0.015116,-0.040510,0.113178
ACH-000007,-0.077776,0.050856,-0.001415,-0.271698,0.128201,-0.132441,-0.038151,0.005747,-0.016480,0.057374
ACH-000009,-0.012335,-0.114621,0.118494,-0.207754,-0.049195,-0.018614,-0.074426,-0.082785,-0.091077,-0.110116
ACH-000011,-0.065002,-0.033133,-0.192754,0.010368,-0.028728,-0.145870,0.039826,0.036907,0.000685,0.004256
...,...,...,...,...,...,...,...,...,...,...
ACH-002298,0.004774,0.076027,-0.013109,-0.222693,-0.076184,-0.055891,0.068502,-0.066670,-0.009054,-0.022989
ACH-002304,0.117991,-0.072193,0.226603,-0.089988,0.137949,0.219454,0.052465,0.037204,0.068235,0.132324
ACH-002305,-0.016206,0.170383,-0.061412,-0.048930,0.006389,-0.102302,0.020572,-0.088931,0.005229,-0.026016
ACH-000779,0.509210,0.222218,0.927113,0.388954,0.117962,0.678387,0.638110,0.314814,0.701978,0.476923


In [34]:
def weighted_zPPI(pair, essentiality=None, interaction_scores=None):
    """Interaction-score-weighted mean z-scored essentiality for ``pair``, per cell line.

    Only genes with a non-missing weight in column ``pair`` that are also
    present in ``essentiality`` contribute.

    Parameters
    ----------
    pair : str
        Column label in ``interaction_scores`` holding the per-gene weights.
    essentiality : pandas.DataFrame, optional
        Table with genes as rows and cell lines as columns. Defaults to the
        notebook-level ``zfiltered_gene_effect``.
    interaction_scores : pandas.DataFrame, optional
        Table with genes as rows and pairs as columns. Defaults to the
        notebook-level ``zfiltered_combined_interaction_scores``.

    Returns
    -------
    pandas.Series
        One value per cell line, named ``pair``. Empty (float dtype) when no
        weighted gene is found in ``essentiality``; NaN for a cell line in
        which none of the weighted genes has an observed value.
    """
    if essentiality is None:
        essentiality = zfiltered_gene_effect
    if interaction_scores is None:
        interaction_scores = zfiltered_combined_interaction_scores

    weights = interaction_scores[pair].dropna()
    valid_genes = weights.index.intersection(essentiality.index)

    if len(valid_genes) == 0:
        return pd.Series(dtype=float, name=pair)

    weights = weights.loc[valid_genes]
    scores = essentiality.loc[valid_genes]

    # Multiply each gene's row by its weight; sum() skips missing essentiality values.
    weighted_sum = scores.mul(weights, axis=0).sum(axis=0)

    # Normalise per cell line by the weight of the genes actually observed there.
    # Dividing by the full weight sum would bias the mean toward zero whenever
    # a gene's essentiality is missing in that cell line.
    observed_weight = scores.notna().astype(float).mul(weights, axis=0).sum(axis=0)

    # A zero total weight has no defined mean; report NaN instead of 0 or inf.
    weighted_avg = weighted_sum / observed_weight.where(observed_weight != 0)

    # Final result: Series with cell lines as index
    weighted_avg.name = pair
    return weighted_avg

In [35]:
pairs = zfiltered_combined_interaction_scores.columns.tolist()

# One delayed task per pair, dispatched to loky worker processes on all cores;
# tqdm wraps the task source, so the bar advances as tasks are handed out.
tasks = (delayed(weighted_zPPI)(pair_name) for pair_name in tqdm(pairs))
results = Parallel(n_jobs=-1, backend="loky")(tasks)

# Optional: remove truly empty Series (if any)
# results = [r for r in results if not r.empty]

# Combine results: one column per pair
weighted_zppi_df = pd.concat(results, axis=1)

100%|██████████| 34047/34047 [10:28<00:00, 54.21it/s]  


In [36]:
# Report the table dimensions, then show the first five rows.
print(weighted_zppi_df.shape)
weighted_zppi_df.head(5)

(1080, 34047)


Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
ACH-000004,0.002921,0.062588,0.026549,0.215656,0.099948,0.010827,-0.066352,-0.002036,-0.036853,0.05387,...,-1.243543,0.269186,-0.461901,-1.243543,-0.475436,0.33397,-1.243543,-0.689195,0.333533,0.361571
ACH-000005,0.002188,0.051522,-0.030876,0.140815,-0.039614,0.001075,-0.060091,0.015116,-0.04051,0.113178,...,-0.501138,0.492879,0.916397,-0.501138,-0.049437,1.617679,-0.501138,-0.082386,-0.503758,0.201281
ACH-000007,-0.077776,0.050856,-0.001415,-0.271698,0.128201,-0.132441,-0.038151,0.005747,-0.01648,0.057374,...,0.787615,0.041388,-0.167598,0.787615,0.046068,-0.388806,0.787615,0.512946,0.201801,-0.642559
ACH-000009,-0.012335,-0.114621,0.118494,-0.207754,-0.049195,-0.018614,-0.074426,-0.082785,-0.091077,-0.110116,...,1.1744,-0.140214,-0.254822,1.1744,-0.154658,-0.71373,1.1744,0.104314,-0.682389,1.259191
ACH-000011,-0.065002,-0.033133,-0.192754,0.010368,-0.028728,-0.14587,0.039826,0.036907,0.000685,0.004256,...,-0.708116,0.264895,-0.80575,-0.708116,0.522156,-0.314984,-0.708116,-0.050632,1.828794,-0.498046


In [None]:
# Destination folder for both result tables (the input/PPI folder under BASE_DIR).
output_path = get_data_path(['input', 'PPI'], '')

ppi_target = os.path.join(output_path, 'weighted_PPI_essentiality.parquet')
zppi_target = os.path.join(output_path, 'weighted_zPPI_essentiality.parquet')

# Write the ranked table first, then the z-scored one.
weighted_ppi_df.to_parquet(ppi_target)
weighted_zppi_df.to_parquet(zppi_target)