# Identify Shared Interactors for Paralog Pairs

This notebook identifies shared protein-protein interactions (PPIs) between paralog gene pairs using STRING database confidence scores.

## Overview
For each paralog pair, we:
1. Load paralog pairs from DeKegel Table S8 data
2. Load protein-protein interaction data from STRING database
3. Map gene identifiers (Ensembl to Entrez)
4. Find shared interactors between each paralog pair
5. Calculate interaction confidence scores as the product of individual scores
6. Store results in an efficient parquet format

## Key Outputs
- **combined_interaction_scores.parquet**: Matrix of shared interactor scores for all paralog pairs
- Each column represents a paralog pair (e.g., SMARCA2_SMARCA4)
- Each row represents a shared interactor gene (Entrez ID)
- Values are the product of STRING confidence scores between each paralog and the shared interactor

In [1]:
# Import required libraries
import os
import csv
import pandas as pd
import numpy as np
from pandarallel import pandarallel  # For parallel processing of pandas operations
import pyarrow as pa
import pyarrow.parquet as pq  # For efficient storage of large dataframes

In [2]:
# set the base directory for the project
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))

# build paths inside the repo
get_data_path = lambda folders, fname: os.path.normpath(
    os.path.join(BASE_DIR, *folders, fname)
)

file_path_genenames = get_data_path(['data', 'input', 'other'], 'approved_and_previous_symbols.csv')
dekegel_table8_path = get_data_path(['data', 'input', 'other'], 'processed_DeKegel_TableS8.csv')
string_scores_path = get_data_path(['data', 'input', 'other'], 'STRING_confidence_score_all.txt')

In [None]:
# Load gene identifier mappings
# This file contains mappings between different gene identifier systems (Entrez, Ensembl, symbols)
id_map = pd.read_csv(file_path_genenames)

# Ensembl ID to Entrez ID mapping (for STRING database conversion)
# STRING uses Ensembl IDs, but our analysis uses Entrez IDs
id_map_cleaned = id_map.dropna(axis=0, how='any', subset=['entrez_id', 'ensembl_id']).reset_index(drop=True)
ensembl_to_entrez = dict(zip(id_map_cleaned.ensembl_id, id_map_cleaned.entrez_id))

In [4]:
# Load paralog pairs from DeKegel Table S8
# This creates a mapping from (gene1_entrez, gene2_entrez) tuples to gene pair names
paralog_pairs_dict = {}
with open(dekegel_table8_path, "r") as f :
    reader = csv.DictReader(f)
    for r in reader :
        # Extract the formatted gene pair name (e.g., "SMARCA2_SMARCA4")
        sorted_gene_pair = r['genepair']
        # Extract Entrez IDs for both genes in the pair
        a1 = int(r['A1_entrez'])
        a2 = int(r['A2_entrez'])
        # Store mapping: (entrez_id1, entrez_id2) -> "GENE1_GENE2"
        paralog_pairs_dict[(a1, a2)] = sorted_gene_pair

In [5]:
# Convert paralog pairs dictionary keys to a pandas Series for easier iteration
# Each element is a tuple of (entrez_id1, entrez_id2)
paralog_pairs = pd.Series(list(paralog_pairs_dict.keys()))

In [6]:
# Display the first paralog pair as an example
# Format: (entrez_id1, entrez_id2)
paralog_pairs[0]

(6595, 6597)

In [7]:
# Display total number of paralog pairs to be processed
len(paralog_pairs)

36648

In [8]:
# Load STRING database protein-protein interactions
# STRING provides confidence scores for protein interactions (0-1000 scale)
stringdb_ppis = set()

with open(string_scores_path, "r") as f:
    reader = csv.DictReader(f, delimiter=",")
    for row in reader:
        geneA = str(row['GeneA'])  # Ensembl ID format
        geneB = str(row['GeneB'])  # Ensembl ID format
        score = int(row['confidence_score'])  # STRING confidence score (0-1000)
        
        # Only process interactions between different genes
        if geneA != geneB:
            # Convert Ensembl IDs to Entrez IDs using our mapping
            if geneA in ensembl_to_entrez and geneB in ensembl_to_entrez:
                stringdb_ppis.add((ensembl_to_entrez[geneA], ensembl_to_entrez[geneB], score))

# Convert to DataFrame for easier manipulation
# Columns: Interactor1 (Entrez ID), Interactor2 (Entrez ID), Score (confidence)
stringdb_df = pd.DataFrame(list(stringdb_ppis), columns=['Interactor1', 'Interactor2', 'Score'])

In [None]:
# Create a standardized representation of gene pairs in STRING data
# Sort each pair so (A,B) and (B,A) are treated as the same interaction
stringdb_df['combined'] = stringdb_df.apply(lambda x: tuple(sorted([x['Interactor1'], x['Interactor2']])), axis=1)

# Similarly standardize the paralog pairs for comparison
paralog_pairs_sorted = [tuple(sorted(pair)) for pair in paralog_pairs]
paralog_pairs_set = set(map(tuple, paralog_pairs_sorted))

In [None]:
# Testing code to verify paralog pair filtering works correctly
# These lines check for specific gene pairs in both orientations
#stringdb_df.loc[(stringdb_df.Interactor1 == 8428) & (stringdb_df.Interactor2 == 51765)]
#stringdb_df.loc[(stringdb_df.Interactor1 == 51765) & (stringdb_df.Interactor2 == 8428)]

In [9]:
# Preview the STRING interaction data structure
# Shows first 3 rows with Interactor1, Interactor2 (Entrez IDs), and confidence Score
stringdb_df[:3]

Unnamed: 0,Interactor1,Interactor2,Score
0,1654,140691,278
1,57580,5330,295
2,8477,3820,230


In [10]:
# Create a fast lookup structure for protein interactions
# STRING interactions are bidirectional, so we need to represent both directions
from collections import defaultdict

# Create two dataframes: original direction and reversed direction
df1 = stringdb_df[['Interactor1', 'Interactor2', 'Score']]  # A -> B
df2 = stringdb_df.rename(columns={'Interactor1': 'Interactor2', 'Interactor2': 'Interactor1'})[['Interactor1', 'Interactor2', 'Score']]  # B -> A

# Combine both directions and remove duplicates
all_edges = pd.concat([df1, df2], ignore_index=True).drop_duplicates()

# Build efficient lookup structure: gene → {interactor: confidence_score}
# This allows O(1) lookup of all partners for any gene
gene_to_interactors = defaultdict(dict)

for row in all_edges.itertuples():
    gene_to_interactors[row.Interactor1][row.Interactor2] = row.Score

In [11]:
def get_interactor_scores_fast(pair, paralog_pairs_dict, gene_to_interactors):
    """
    Calculate shared interactor scores for a paralog pair.
    
    For each shared interactor, the score is calculated as:
    score = STRING_confidence(gene1, interactor) × STRING_confidence(gene2, interactor)
    
    This product represents the combined confidence that both paralogs 
    interact with the shared partner.
    
    Args:
        pair: Tuple of (gene1_entrez_id, gene2_entrez_id)
        paralog_pairs_dict: Mapping from gene pairs to formatted names
        gene_to_interactors: Lookup dict for gene interactions
    
    Returns:
        pandas.Series: Shared interactor scores indexed by interactor Entrez ID
    """
    g1, g2 = pair
    # Get the formatted name (e.g., "SMARCA2_SMARCA4") or create default
    name = paralog_pairs_dict.get(pair, f"{g1}_{g2}")

    # Get all interaction partners and their confidence scores for each gene
    g1_partners = gene_to_interactors.get(g1, {})  # {interactor_id: confidence_score}
    g2_partners = gene_to_interactors.get(g2, {})  # {interactor_id: confidence_score}

    # Find genes that interact with both paralogs (shared interactors)
    shared = set(g1_partners) & set(g2_partners)
    if not shared:
        return pd.Series(dtype=float, name=name)

    # Calculate combined confidence score as product of individual scores
    # Higher product = both paralogs have high confidence interactions with this gene
    result = pd.Series({i: g1_partners[i] * g2_partners[i] for i in shared})
    result.name = name
    return result

In [12]:
# Test the function with a specific paralog pair (6595, 6597)
# This will show all shared interactors and their combined confidence scores
pair = (6595, 6597)
args = get_interactor_scores_fast(pair, paralog_pairs_dict, gene_to_interactors)
args

8193      861063
8202      123256
387082     58081
79885     111748
2064      114534
           ...  
57337      48614
124923     88209
8189      438306
55294     154008
6143       45747
Name: SMARCA2_SMARCA4, Length: 1144, dtype: int64

In [13]:
# Test with another paralog pair (7697, 7700)
# This demonstrates the function works across different gene pairs
pair = (7697, 7700)
args = get_interactor_scores_fast(pair, paralog_pairs_dict, gene_to_interactors)
args

23389    40804
Name: ZNF138_ZNF141, dtype: int64

In [14]:
# Process all paralog pairs to calculate shared interactor scores
# This is the main computation step that creates our feature matrix
all_scores = []

for pair in paralog_pairs:
    # Calculate shared interactor scores for this paralog pair
    s = get_interactor_scores_fast(pair, paralog_pairs_dict, gene_to_interactors)
    
    # Only include pairs that have at least one shared interactor
    if not s.empty:
        all_scores.append(s)
    
    # Progress indicator: print every 1000 processed pairs
    if len(all_scores) % 1000 == 0:
        print(f"Processed {len(all_scores)} pairs")

Processed 1000 pairs
Processed 2000 pairs
Processed 3000 pairs
Processed 4000 pairs
Processed 5000 pairs
Processed 6000 pairs
Processed 7000 pairs
Processed 8000 pairs
Processed 9000 pairs
Processed 10000 pairs
Processed 11000 pairs
Processed 12000 pairs
Processed 13000 pairs
Processed 14000 pairs
Processed 15000 pairs
Processed 16000 pairs
Processed 17000 pairs
Processed 18000 pairs
Processed 19000 pairs
Processed 20000 pairs
Processed 21000 pairs
Processed 22000 pairs
Processed 23000 pairs
Processed 24000 pairs
Processed 25000 pairs
Processed 26000 pairs
Processed 27000 pairs
Processed 28000 pairs
Processed 29000 pairs
Processed 29000 pairs
Processed 30000 pairs
Processed 31000 pairs
Processed 32000 pairs
Processed 33000 pairs
Processed 34000 pairs


In [15]:
# Combine all individual paralog pair results into a single matrix
# Rows = shared interactor genes (Entrez IDs)
# Columns = paralog pairs (e.g., "SMARCA2_SMARCA4")
# Values = combined confidence scores (product of individual STRING scores)
all_scores_df = pd.concat(all_scores, axis=1)

# Sort by interactor gene ID for consistent ordering
all_scores_df_v2 = all_scores_df.sort_index(axis=0).copy()

In [16]:
#testing

# Data validation: Check specific interactor genes across all paralog pairs
# These tests verify the matrix structure and content

# Check all paralog pairs that share interactor gene 25
display(all_scores_df_v2.loc[all_scores_df_v2.index == 25])

# Check all paralog pairs that share interactor gene 23389
display(all_scores_df_v2.loc[all_scores_df_v2.index == 23389])

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
25,,,,,,,249378.0,736344.0,,,...,,,,,,,,,,


Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
23389,85986.0,,,,,,,,,,...,40804.0,,40804.0,40804.0,,,40804.0,40804.0,,40804.0


In [18]:
# Check all shared interactors for the SMARCA2_SMARCA4 paralog pair
# This shows which genes interact with both SMARCA2 and SMARCA4, and their combined scores
all_scores_df_v2.loc[all_scores_df_v2['SMARCA2_SMARCA4'].notna(), 'SMARCA2_SMARCA4']

58           211482.0
59           176231.0
60           831168.0
70           222438.0
71           665600.0
               ...   
100533467     64009.0
100996331     77841.0
102724631     77841.0
107282092     45796.0
114108587     44521.0
Name: SMARCA2_SMARCA4, Length: 1144, dtype: float64

In [19]:
# Display the final shared interactor score matrix
# This sparse matrix contains the core features for synthetic lethality prediction
print(all_scores_df_v2.shape)
print('')
display(all_scores_df_v2)

(18764, 34047)



Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118568804,,,,,,,,,,,...,,,,,,,,,,
119086082,,,,,,,,,,,...,,,,,,,,,,
120356739,,,,,,,,,,,...,,,,,,,,,,
122394733,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Save the shared interactor score matrix for downstream analysis
# Using Parquet format for efficient storage and fast loading of large sparse matrices
output_path = get_data_path(['data', 'input', 'PPI'], '')

# combined_interaction_scores.parquet contains:
# - Rows: Shared interactor genes (Entrez IDs)
# - Columns: Paralog pairs (formatted names like "SMARCA2_SMARCA4") 
# - Values: Product of STRING confidence scores
# - NaN: No shared interaction between that gene and paralog pair
all_scores_df_v2.to_parquet(os.path.join(output_path, 'combined_interaction_scores.parquet'), engine='pyarrow', index=True)