# Weighted PPI Essentiality Calculation for Paralog Pairs

This notebook calculates weighted essentiality scores for paralog pairs based on their shared protein-protein interactions (PPIs).

## Overview
The analysis combines:
1. **Gene essentiality rankings** from DepMap CRISPR screens across cell lines
2. **Shared interactor confidence scores** from the STRING database (calculated in `03a_IdentifySharedInteractors.ipynb`)

## Methodology
For each paralog pair, we calculate a **weighted average essentiality score** across cell lines:

```
weighted_score = Σ(essentiality_score × interaction_confidence) / Σ(interaction_confidence)
```

Where:
- `essentiality_score`: Ranked gene dependency (essentiality) score of each shared interactor in each cell line
- `interaction_confidence`: Combined STRING confidence score for paralog-interactor interactions
- Both sums run over all shared interactors of the paralog pair

## Key Outputs
- **weighted_PPI_essentiality.parquet**: Weighted essentiality scores (raw rankings)
- **weighted_zPPI_essentiality.parquet**: Weighted essentiality scores (z-score normalized)
- Matrix format: rows = cell lines, columns = paralog pairs

In [1]:
import os
import csv
import re
import pandas as pd
import pyarrow.parquet as pq
from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
# Project root: two directory levels above the current working directory
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))


def get_data_path(folders, fname):
    """
    Build a normalized path to a file inside the repository.

    Args:
        folders: Sequence of directory names, relative to ``BASE_DIR``.
        fname: File name appended after the folders. An empty string yields
            the directory path itself (``normpath`` drops the trailing separator).

    Returns:
        str: ``BASE_DIR/<folders...>/<fname>`` passed through ``os.path.normpath``.
    """
    return os.path.normpath(os.path.join(BASE_DIR, *folders, fname))


# Inputs: ranked essentiality tables (raw and z-score normalized)
ranked_essentiality_files_path = get_data_path(['output', 'ranked_essentiality'], 'ranked_essentiality.csv')
ranked_zessentiality_files_path = get_data_path(['output', 'ranked_essentiality'], 'ranked_zessentiality.csv')

# Input: combined interaction scores of shared interactors
combined_interaction_score_path = get_data_path(['input', 'PPI'], 'combined_interaction_scores.parquet')

In [3]:
# Read the raw (ranked) gene essentiality table.
# The first CSV column is used as the row index.
gene_effect = pd.read_csv(
    filepath_or_buffer=ranked_essentiality_files_path,
    index_col=0,
)
# Quick sanity check on the loaded dimensions
print(gene_effect.shape)

(1080, 19158)


In [4]:
# Reshape the raw essentiality table in one chain:
#   1. cast the column labels (gene Entrez IDs read as strings) to integers
#   2. transpose so that rows = genes and columns = cell lines
#   3. order the rows by gene ID
gene_effect = (
    gene_effect
    .set_axis(gene_effect.columns.astype(int), axis=1)
    .T
    .sort_index()
)
gene_effect.head(3)

Unnamed: 0,ACH-000004,ACH-000005,ACH-000007,ACH-000009,ACH-000011,ACH-000012,ACH-000013,ACH-000014,ACH-000015,ACH-000017,...,ACH-002285,ACH-002294,ACH-002295,ACH-002296,ACH-002297,ACH-002298,ACH-002304,ACH-002305,ACH-000779,ACH-001086
1,11172.0,3608.0,9393.0,10907.0,14687.0,5946.0,6845.0,8565.0,3954.0,10169.0,...,4347.0,11909.0,8975.0,7259.0,14336.0,4673.0,9301.0,13377.0,6495.0,7497.0
2,5346.0,14189.0,12274.0,13473.0,11632.0,16815.0,12207.0,12329.0,7173.0,10517.0,...,11139.0,10805.0,16031.0,7139.0,12611.0,12894.0,8725.0,15926.0,17186.0,10511.0
9,15919.0,17197.0,15246.0,10793.0,13188.0,12370.0,14169.0,15968.0,16420.0,13331.0,...,16416.0,13834.0,11591.0,14976.0,16967.0,14952.0,14104.0,10443.0,18089.0,


In [5]:
# Read the z-score normalized essentiality rankings.
# Same layout as the raw table; the first CSV column is used as the row index.
gene_z_effect = pd.read_csv(
    filepath_or_buffer=ranked_zessentiality_files_path,
    index_col=0,
)
# Quick sanity check on the loaded dimensions
print(gene_z_effect.shape)

(1080, 19158)


In [6]:
# Reshape the z-score table exactly like the raw one:
# integer gene IDs as labels, genes as rows, cell lines as columns, sorted by gene ID
gene_z_effect = (
    gene_z_effect
    .set_axis(gene_z_effect.columns.astype(int), axis=1)
    .T
    .sort_index()
)
gene_z_effect.head(3)

Unnamed: 0,ACH-000004,ACH-000005,ACH-000007,ACH-000009,ACH-000011,ACH-000012,ACH-000013,ACH-000014,ACH-000015,ACH-000017,...,ACH-002285,ACH-002294,ACH-002295,ACH-002296,ACH-002297,ACH-002298,ACH-002304,ACH-002305,ACH-000779,ACH-001086
1,0.59194,-1.651996,0.064182,0.513325,1.634699,-0.958405,-0.691708,-0.181453,-1.549351,0.29439,...,-1.432764,0.810578,-0.059822,-0.56889,1.530572,-1.336053,0.036889,1.246075,-0.795539,-0.498285
2,-1.861455,0.848431,0.26159,0.629017,0.064853,1.653154,0.241059,0.278445,-1.301582,-0.276832,...,-0.086224,-0.188576,1.412902,-1.312001,0.364862,0.451586,-0.82598,1.380725,1.766845,-0.278671
9,0.767406,1.19561,0.541912,-0.950101,-0.147638,-0.421715,0.181054,0.783823,0.93527,-0.099724,...,0.933929,0.06881,-0.682725,0.451446,1.118546,0.443405,0.159275,-1.067372,1.494481,


In [7]:
# Read the shared-interactor confidence matrix produced by the previous analysis.
# Rows = shared interactor genes, columns = paralog pairs; values are the
# combined STRING confidence scores. Rows are sorted by index after loading.
combined_interaction_scores = (
    pq.read_table(combined_interaction_score_path)
    .to_pandas()
    .sort_index()
)
combined_interaction_scores.head(3)

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Restrict both tables to the genes they have in common, so that every gene
# used in the weighted average has both an essentiality row and a weight row.

# Essentiality rows whose gene also appears in the interaction matrix
has_interaction_data = gene_effect.index.isin(combined_interaction_scores.index)
filtered_gene_effect = gene_effect.loc[has_interaction_data].sort_index()

# Interaction rows whose gene also appears in the essentiality table
has_essentiality_data = combined_interaction_scores.index.isin(gene_effect.index)
filtered_combined_interaction_scores = combined_interaction_scores.loc[has_essentiality_data].sort_index()

In [9]:
# Same gene alignment as for the raw data, applied to the z-score table.

# z-score rows whose gene also appears in the interaction matrix
z_has_interaction_data = gene_z_effect.index.isin(combined_interaction_scores.index)
zfiltered_gene_effect = gene_z_effect.loc[z_has_interaction_data].sort_index()

# Interaction rows whose gene also appears in the z-score table
z_has_essentiality_data = combined_interaction_scores.index.isin(gene_z_effect.index)
zfiltered_combined_interaction_scores = combined_interaction_scores.loc[z_has_essentiality_data].sort_index()

## Weighted PPI Essentiality Calculation

Calculate weighted average essentiality scores for each paralog pair across its shared interactors, using each interactor's interaction confidence as its weight.

**Formula**: For each cell line and paralog pair:
```
weighted_essentiality = Σ(gene_essentiality × interaction_confidence) / Σ(interaction_confidence)
```

This gives higher weight to shared interactors with stronger protein-protein interaction evidence.

In [10]:
# Calculate the weighted PPI essentiality for a given paralog pair

def weighted_PPI(pair, gene_effect=None, interaction_scores=None):
    """
    Confidence-weighted mean essentiality of a pair's shared interactors.

    For every cell line the result is ``sum(essentiality * weight) / sum(weight)``.
    Both sums run only over genes that have a weight for ``pair`` and a
    non-missing essentiality value in that cell line. ``DataFrame.sum`` skips
    NaN in the numerator, so the denominator must skip the same genes;
    otherwise cell lines with missing genes are biased towards zero.

    Args:
        pair: Column name in the interaction scores matrix (e.g., "SMARCA2_SMARCA4").
        gene_effect: Optional DataFrame, rows = genes, columns = cell lines.
            Defaults to the notebook-level ``filtered_gene_effect``.
        interaction_scores: Optional DataFrame, rows = genes, columns = paralog
            pairs. Defaults to the notebook-level
            ``filtered_combined_interaction_scores``.

    Returns:
        pandas.Series: Named ``pair`` and indexed by cell line. A cell line is
        NaN when no weighted gene has an observed essentiality value there.
    """
    if gene_effect is None:
        gene_effect = filtered_gene_effect
    if interaction_scores is None:
        interaction_scores = filtered_combined_interaction_scores

    weights = interaction_scores[pair]

    # Per-gene weighted scores; NaN wherever the weight or the essentiality is missing
    weighted = gene_effect.mul(weights, axis=0)
    weighted_sum = weighted.sum()

    # Total weight per cell line, counting only the genes that entered the numerator
    weight_total = weighted.notna().astype(float).mul(weights, axis=0).sum()

    # A zero total weight means "no data": report NaN instead of 0 or inf
    ess_of_shared_int = weighted_sum / weight_total.where(weight_total > 0)
    ess_of_shared_int.name = pair
    return ess_of_shared_int

# Quick check of the function on a single paralog pair
test_pair = "SMARCA2_SMARCA4"
weighted_PPI(pair=test_pair)

ACH-000004    5901.801720
ACH-000005    6003.794805
ACH-000007    5885.910399
ACH-000009    6017.783357
ACH-000011    5876.308229
                 ...     
ACH-002298    6074.409006
ACH-002304    6382.654361
ACH-002305    5987.939589
ACH-000779    6883.630916
ACH-001086    6821.547683
Name: SMARCA2_SMARCA4, Length: 1080, dtype: float64

In [11]:
# Second check on a different paralog pair, to confirm the function
# is not specific to the first example
test_pair = "ZNF138_ZNF141"
weighted_PPI(pair=test_pair)

ACH-000004     2226.0
ACH-000005     5013.0
ACH-000007     9851.0
ACH-000009    11303.0
ACH-000011     4236.0
               ...   
ACH-002298    12731.0
ACH-002304    11844.0
ACH-002305    13255.0
ACH-000779    16591.0
ACH-001086    17464.0
Name: ZNF138_ZNF141, Length: 1080, dtype: float64

In [12]:
# Run the function on the first 10 paralog pairs and stitch the per-pair
# Series together column-wise, to check that concatenation works as expected
first_ten_pairs = filtered_combined_interaction_scores.columns[:10]
per_pair_scores = [weighted_PPI(name) for name in first_ten_pairs]
combined_weighted_PPI = pd.concat(per_pair_scores, axis=1)
display(combined_weighted_PPI)

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B
ACH-000004,5901.801720,6595.893908,4949.176686,2642.585126,6625.864159,4055.973211,6314.505243,6069.019004,5193.888059,7262.485138
ACH-000005,6003.794805,6608.201117,4753.371297,2814.144764,6214.974696,4164.973239,6379.721407,6165.251249,5277.675148,7568.812501
ACH-000007,5885.910399,6787.893882,4847.837725,2208.901811,6906.262902,3879.689940,6591.367258,6256.907510,5488.119691,7332.182435
ACH-000009,6017.783357,6498.765056,4861.663830,2393.063638,6268.454063,3962.615458,6439.497541,6050.697911,5252.519745,6800.119026
ACH-000011,5876.308229,6634.480771,4109.543934,2787.695334,6217.913949,3742.308274,6850.694413,6366.887163,5560.376642,7087.597947
...,...,...,...,...,...,...,...,...,...,...
ACH-002298,6074.409006,6828.228492,4747.750829,2343.013604,6074.351024,3978.210906,6935.288282,6123.117932,5581.557944,7001.301513
ACH-002304,6382.654361,6472.441883,5272.835599,2512.111811,6832.877134,4574.075909,6718.634289,6288.225732,5655.344709,7430.597478
ACH-002305,5987.939589,6986.861745,4432.380535,2502.372653,6324.907846,3831.096735,6709.835341,6068.152881,5542.994486,7043.099131
ACH-000779,6883.630916,7047.273695,5280.130450,2764.793503,7187.054953,4665.506576,7765.588724,6983.388897,6522.475027,8613.457998


In [13]:
def weighted_PPI(pair, gene_effect=None, interaction_scores=None):
    """
    Confidence-weighted mean essentiality of a paralog pair's shared interactors.

    For every cell line the result is ``sum(essentiality * weight) / sum(weight)``.
    Both sums run only over shared interactor genes that have a weight for
    ``pair`` and a non-missing essentiality value in that cell line.
    ``DataFrame.sum`` skips NaN in the numerator, so the denominator must skip
    the same genes; otherwise cell lines with missing genes are biased towards
    zero, and a cell line with no observed gene would report 0 instead of NaN.

    Args:
        pair: Column name in interaction scores matrix (e.g., "SMARCA2_SMARCA4")
        gene_effect: Optional DataFrame, rows = genes, columns = cell lines.
            Defaults to the notebook-level ``filtered_gene_effect``.
        interaction_scores: Optional DataFrame, rows = genes, columns = paralog
            pairs. Defaults to the notebook-level
            ``filtered_combined_interaction_scores``.

    Returns:
        pandas.Series: Weighted essentiality scores named ``pair`` and indexed
        by cell line. A cell line is NaN when none of the weighted genes has an
        observed value there. An empty Series is returned when the pair has no
        weighted gene present in ``gene_effect``.
    """
    if gene_effect is None:
        gene_effect = filtered_gene_effect
    if interaction_scores is None:
        interaction_scores = filtered_combined_interaction_scores

    # Interaction weights for this paralog pair, excluding NaN values
    weights = interaction_scores[pair].dropna()

    # Genes that have both an interaction weight and an essentiality row
    valid_genes = weights.index.intersection(gene_effect.index)

    # Return empty series if no valid shared interactors
    if len(valid_genes) == 0:
        return pd.Series(dtype=float, name=pair)

    weights = weights.loc[valid_genes]
    scores = gene_effect.loc[valid_genes]

    # Numerator: weighted essentiality summed over genes (NaN values are skipped)
    weighted_sum = scores.mul(weights, axis=0).sum(axis=0)

    # Denominator: per cell line, total weight of the genes actually observed there
    weight_total = scores.notna().astype(float).mul(weights, axis=0).sum(axis=0)

    # A zero total weight means "no data": report NaN instead of 0 or inf
    weighted_avg = weighted_sum / weight_total.where(weight_total > 0)

    weighted_avg.name = pair
    return weighted_avg

In [14]:
# Re-run the redefined function on the current test pair
# (the pair name is printed first for reference)
print(test_pair)
weighted_PPI(pair=test_pair)

ZNF138_ZNF141


ACH-000004     2226.0
ACH-000005     5013.0
ACH-000007     9851.0
ACH-000009    11303.0
ACH-000011     4236.0
               ...   
ACH-002298    12731.0
ACH-002304    11844.0
ACH-002305    13255.0
ACH-000779    16591.0
ACH-001086    17464.0
Name: ZNF138_ZNF141, Length: 1080, dtype: float64

In [15]:
# Main computation: evaluate weighted_PPI for every paralog pair.
# This step can take a significant amount of time.

pairs = filtered_combined_interaction_scores.columns.tolist()

# n_jobs=-1 uses all available CPU cores; the 'loky' backend runs the tasks
# in worker processes. tqdm wraps the pair list to show a progress bar.
compute_pair = delayed(weighted_PPI)
results = Parallel(n_jobs=-1, backend="loky")(
    compute_pair(pair) for pair in tqdm(pairs)
)

# Note: a pair with no valid shared interactors may come back as an empty Series.
# Optional: remove truly empty Series (if any)
# results = [r for r in results if not r.empty]

# Assemble one DataFrame from the per-pair Series:
# rows = cell lines, columns = paralog pairs, values = weighted essentiality scores
weighted_ppi_df = pd.concat(results, axis=1)

100%|██████████| 34047/34047 [20:32<00:00, 27.63it/s]


In [16]:
# Summarize the weighted PPI essentiality matrix:
# print its dimensions, then preview the first five rows
print(weighted_ppi_df.shape)
weighted_ppi_df.head(5)

(1080, 34047)


Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
ACH-000004,5901.80172,6595.893908,4949.176686,2642.585126,6625.864159,4055.973211,6314.505243,6069.019004,5193.888059,7262.485138,...,2226.0,3347.346537,6126.02312,2226.0,5845.290698,2639.543929,2226.0,7781.875643,3010.0,10963.708967
ACH-000005,6003.794805,6608.201117,4753.371297,2814.144764,6214.974696,4164.973239,6379.721407,6165.251249,5277.675148,7568.812501,...,5013.0,2587.453621,11267.238758,5013.0,7590.583971,5986.138087,5013.0,9867.048954,1796.0,10015.276685
ACH-000007,5885.910399,6787.893882,4847.837725,2208.901811,6906.262902,3879.68994,6591.367258,6256.90751,5488.119691,7332.182435,...,9851.0,3153.272961,7246.129282,9851.0,7844.162717,1894.34345,9851.0,12048.522911,2819.0,7367.83753
ACH-000009,6017.783357,6498.765056,4861.66383,2393.063638,6268.454063,3962.615458,6439.497541,6050.697911,5252.519745,6800.119026,...,11303.0,2722.278018,6925.943239,11303.0,7135.177804,1367.288877,11303.0,10859.135494,1537.0,13937.653126
ACH-000011,5876.308229,6634.480771,4109.543934,2787.695334,6217.913949,3742.308274,6850.694413,6366.887163,5560.376642,7087.597947,...,4236.0,4341.504743,4852.016724,4236.0,9354.502134,2489.637735,4236.0,9916.617919,5178.0,7646.262286


## Weighted PPI Essentiality Calculation (Z-Score Normalized)

Repeat the same weighted essentiality calculation using z-score normalized gene dependency data.

**Purpose**: Z-score normalization standardizes essentiality scores across genes, making them more comparable across different cell lines and reducing the influence of outlier genes.

The calculation process is identical to the raw rankings, but uses normalized input data.

In [17]:
# Initial implementation for z-score weighted PPI calculation
# Same logic as raw essentiality version, but using normalized data
def weighted_zPPI(pair, gene_effect=None, interaction_scores=None):
    """
    Confidence-weighted mean z-score essentiality of a pair's shared interactors.

    For every cell line the result is ``sum(z_score * weight) / sum(weight)``.
    Both sums run only over genes that have a weight for ``pair`` and a
    non-missing z-score in that cell line. ``DataFrame.sum`` skips NaN in the
    numerator, so the denominator must skip the same genes; otherwise cell
    lines with missing genes are shrunk towards zero.

    Args:
        pair: Column name in the interaction scores matrix (e.g., "SMARCA2_SMARCA4").
        gene_effect: Optional DataFrame, rows = genes, columns = cell lines.
            Defaults to the notebook-level ``zfiltered_gene_effect``.
        interaction_scores: Optional DataFrame, rows = genes, columns = paralog
            pairs. Defaults to the notebook-level
            ``zfiltered_combined_interaction_scores``.

    Returns:
        pandas.Series: Named ``pair`` and indexed by cell line. A cell line is
        NaN when no weighted gene has an observed z-score there.
    """
    if gene_effect is None:
        gene_effect = zfiltered_gene_effect
    if interaction_scores is None:
        interaction_scores = zfiltered_combined_interaction_scores

    weights = interaction_scores[pair]

    # Multiply z-score essentiality by interaction confidence weights
    weighted = gene_effect.mul(weights, axis=0)
    # Sum across shared interactor genes for each cell line (NaN values are skipped)
    weighted_sum = weighted.sum()

    # Total weight per cell line, counting only the genes that entered the numerator
    weight_total = weighted.notna().astype(float).mul(weights, axis=0).sum()

    # A zero total weight means "no data": report NaN instead of 0 or inf
    ess_of_shared_int = weighted_sum / weight_total.where(weight_total > 0)
    ess_of_shared_int.name = pair
    return ess_of_shared_int

In [18]:
# Check the z-score variant on SMARCA2_SMARCA4.
# Values differ from the raw-rank scores because the input is normalized.
test_pair = "SMARCA2_SMARCA4"
weighted_zPPI(pair=test_pair)

ACH-000004    0.002862
ACH-000005    0.002144
ACH-000007   -0.077479
ACH-000009   -0.012323
ACH-000011   -0.064770
                ...   
ACH-002298    0.004673
ACH-002304    0.117381
ACH-002305   -0.016208
ACH-000779    0.544792
ACH-001086    0.642072
Name: SMARCA2_SMARCA4, Length: 1080, dtype: float64

In [19]:
# Check the z-score variant on a second paralog pair
test_pair = "ZNF138_ZNF141"
weighted_zPPI(pair=test_pair)

ACH-000004   -1.243294
ACH-000005   -0.501103
ACH-000007    0.787279
ACH-000009    1.173953
ACH-000011   -0.708022
                ...   
ACH-002298    1.554236
ACH-002304    1.318024
ACH-002305    1.693780
ACH-000779    2.582172
ACH-001086    2.814656
Name: ZNF138_ZNF141, Length: 1080, dtype: float64

In [20]:
# Run the z-score function on the first 10 paralog pairs and join the
# per-pair Series column-wise, to verify batch processing
first_ten_zpairs = zfiltered_combined_interaction_scores.columns[:10]
per_pair_zscores = [weighted_zPPI(name) for name in first_ten_zpairs]
combined_weighted_zPPI = pd.concat(per_pair_zscores, axis=1)
display(combined_weighted_zPPI)

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B
ACH-000004,0.002862,0.062533,0.026525,0.212435,0.099896,0.010755,-0.066274,-0.002093,-0.036739,0.053385
ACH-000005,0.002144,0.051440,-0.030866,0.138716,-0.039655,0.001057,-0.060008,0.014927,-0.040357,0.112264
ACH-000007,-0.077479,0.050779,-0.001425,-0.267744,0.128119,-0.132352,-0.038123,0.005668,-0.016446,0.056899
ACH-000009,-0.012323,-0.114664,0.118458,-0.204803,-0.049253,-0.018661,-0.074328,-0.082202,-0.090721,-0.109324
ACH-000011,-0.064770,-0.033196,-0.192733,0.010315,-0.028806,-0.145801,0.039711,0.036585,0.000662,0.004127
...,...,...,...,...,...,...,...,...,...,...
ACH-002298,0.004673,0.075951,-0.013154,-0.219518,-0.076229,-0.055894,0.068305,-0.066218,-0.009071,-0.022883
ACH-002304,0.117381,-0.072244,0.226428,-0.088722,0.137897,0.219277,0.052250,0.036868,0.067822,0.131250
ACH-002305,-0.016208,0.170318,-0.061491,-0.048303,0.006333,-0.102313,0.020458,-0.088317,0.005135,-0.025897
ACH-000779,0.544792,0.261708,0.963028,0.405240,0.153338,0.709999,0.679359,0.350791,0.737042,0.517197


In [21]:
def weighted_zPPI(pair, gene_effect=None, interaction_scores=None):
    """
    Confidence-weighted mean z-score essentiality of a pair's shared interactors.

    Same calculation as weighted_PPI(), but on z-score normalized essentiality
    data. For every cell line the result is ``sum(z_score * weight) / sum(weight)``.
    Both sums run only over shared interactor genes that have a weight for
    ``pair`` and a non-missing z-score in that cell line. ``DataFrame.sum``
    skips NaN in the numerator, so the denominator must skip the same genes;
    otherwise cell lines with missing genes are shrunk towards zero.

    Args:
        pair: Column name in interaction scores matrix (e.g., "SMARCA2_SMARCA4")
        gene_effect: Optional DataFrame, rows = genes, columns = cell lines.
            Defaults to the notebook-level ``zfiltered_gene_effect``.
        interaction_scores: Optional DataFrame, rows = genes, columns = paralog
            pairs. Defaults to the notebook-level
            ``zfiltered_combined_interaction_scores``.

    Returns:
        pandas.Series: Weighted z-score essentiality named ``pair`` and indexed
        by cell line. A cell line is NaN when none of the weighted genes has an
        observed z-score there. An empty Series is returned when the pair has
        no weighted gene present in ``gene_effect``.
    """
    if gene_effect is None:
        gene_effect = zfiltered_gene_effect
    if interaction_scores is None:
        interaction_scores = zfiltered_combined_interaction_scores

    # Interaction weights for this paralog pair, excluding NaN values
    weights = interaction_scores[pair].dropna()

    # Genes that have both an interaction weight and a z-score essentiality row
    valid_genes = weights.index.intersection(gene_effect.index)

    # Return empty series if no valid shared interactors
    if len(valid_genes) == 0:
        return pd.Series(dtype=float, name=pair)

    weights = weights.loc[valid_genes]
    scores = gene_effect.loc[valid_genes]

    # Numerator: weighted z-scores summed over genes (NaN values are skipped)
    weighted_sum = scores.mul(weights, axis=0).sum(axis=0)

    # Denominator: per cell line, total weight of the genes actually observed there
    weight_total = scores.notna().astype(float).mul(weights, axis=0).sum(axis=0)

    # A zero total weight means "no data": report NaN instead of 0 or inf
    weighted_avg = weighted_sum / weight_total.where(weight_total > 0)

    weighted_avg.name = pair
    return weighted_avg

In [22]:
# Main computation for the z-score data: evaluate weighted_zPPI for every pair
pairs = zfiltered_combined_interaction_scores.columns.tolist()

# All CPU cores, 'loky' worker processes, tqdm progress bar over the pair list
compute_zpair = delayed(weighted_zPPI)
results = Parallel(n_jobs=-1, backend="loky")(
    compute_zpair(pair) for pair in tqdm(pairs)
)

# Optional: remove truly empty Series (if any)
# results = [r for r in results if not r.empty]

# Assemble one DataFrame: rows = cell lines, columns = paralog pairs
weighted_zppi_df = pd.concat(results, axis=1)

100%|██████████| 34047/34047 [24:11<00:00, 23.46it/s]  


In [23]:
# Summarize the z-score weighted matrix: dimensions, then the first five rows
print(weighted_zppi_df.shape)
weighted_zppi_df.head(5)

(1080, 34047)


Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
ACH-000004,0.002862,0.062533,0.026525,0.212435,0.099896,0.010755,-0.066274,-0.002093,-0.036739,0.053385,...,-1.243294,0.267907,-0.461856,-1.243294,-0.475454,0.333812,-1.243294,-0.689157,0.332907,0.361426
ACH-000005,0.002144,0.05144,-0.030866,0.138716,-0.039655,0.001057,-0.060008,0.014927,-0.040357,0.112264,...,-0.501103,0.490034,0.916263,-0.501103,-0.049484,1.617432,-0.501103,-0.08244,-0.50333,0.201085
ACH-000007,-0.077479,0.050779,-0.001425,-0.267744,0.128119,-0.132352,-0.038123,0.005668,-0.016446,0.056899,...,0.787279,0.041613,-0.167737,0.787279,0.046025,-0.388826,0.787279,0.512744,0.201341,-0.642591
ACH-000009,-0.012323,-0.114664,0.118458,-0.204803,-0.049253,-0.018661,-0.074328,-0.082202,-0.090721,-0.109324,...,1.173953,-0.139861,-0.254984,1.173953,-0.154696,-0.713703,1.173953,0.104081,-0.681736,1.258887
ACH-000011,-0.06477,-0.033196,-0.192733,0.010315,-0.028806,-0.145801,0.039711,0.036585,0.000662,0.004127,...,-0.708022,0.265611,-0.805716,-0.708022,0.522117,-0.314974,-0.708022,-0.050665,1.826285,-0.498036


In [24]:
# Persist both result matrices as parquet files in the repository's input/PPI folder
output_path = get_data_path(['input', 'PPI'], '')

for frame, fname in (
    (weighted_ppi_df, 'weighted_PPI_essentiality.parquet'),
    (weighted_zppi_df, 'weighted_zPPI_essentiality.parquet'),
):
    frame.to_parquet(os.path.join(output_path, fname))