In [1]:
import os
import csv
import pandas as pd
import numpy as np
from pandarallel import pandarallel
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# set the base directory for the project
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))

# build paths inside the repo
get_data_path = lambda folders, fname: os.path.normpath(
    os.path.join(BASE_DIR, *folders, fname)
)

file_path_genenames = get_data_path(['input', 'other'], 'genenames.txt')
dekegel_table8_path = get_data_path(['input', 'other'], 'processed_DeKegel_TableS8.csv')
string_scores_path = get_data_path(['input', 'other'], 'STRING_confidence_score_all.txt')

In [3]:
# read HGNC
id_map_raw = pd.read_table(file_path_genenames, dtype = "str")

id_map = id_map_raw[['HGNC ID', 'Approved symbol', 'NCBI Gene ID', 'NCBI Gene ID(supplied by NCBI)', 'Ensembl gene ID', 'Ensembl ID(supplied by Ensembl)']]

id_map = id_map.copy()
id_map['entrez_id'] = id_map['NCBI Gene ID(supplied by NCBI)'].combine_first(id_map['NCBI Gene ID'])
id_map['ensembl_id'] = id_map['Ensembl ID(supplied by Ensembl)'].combine_first(id_map['Ensembl gene ID'])

id_map_na = id_map.dropna(axis=0, how='any', subset=['entrez_id', 'ensembl_id']).reset_index(drop=True)
id_map_na['entrez_id'] = id_map_na['entrez_id'].astype(int)

# make dictionary
entrez_to_common = dict(zip(id_map_na['entrez_id'], id_map_na['Approved symbol']))
ensembl_to_entrez = dict(zip(id_map_na['ensembl_id'], id_map_na['entrez_id']))

In [4]:
paralog_pairs_dict = {}
with open(dekegel_table8_path, "r") as f :
    reader = csv.DictReader(f)
    for r in reader :
        sorted_gene_pair = r['genepair']
        a1 = int(r['A1_entrez'])
        a2 = int(r['A2_entrez'])
        paralog_pairs_dict[(a1, a2)] = sorted_gene_pair

In [5]:
paralog_pairs = pd.Series(list(paralog_pairs_dict.keys()))

In [6]:
paralog_pairs[0]

(6595, 6597)

In [7]:
#paralog_pairs[0]

In [8]:
#paralog_pairs_dict[tuple(paralog_pairs[0])]

In [9]:
stringdb_ppis = set()

with open(string_scores_path, "r") as f:
    reader = csv.DictReader(f, delimiter=",")
    for row in reader:
        geneA = str(row['GeneA'])
        geneB = str(row['GeneB'])
        score = int(row['confidence_score'])
        if geneA != geneB:
            if geneA in ensembl_to_entrez and geneB in ensembl_to_entrez:
                stringdb_ppis.add((ensembl_to_entrez[geneA], ensembl_to_entrez[geneB], score))

stringdb_df = pd.DataFrame(list(stringdb_ppis), columns=['Interactor1', 'Interactor2', 'Score'])

In [10]:
# remove paralog pairs from stringdb_df
stringdb_df['combined'] = stringdb_df.apply(lambda x: tuple(sorted([x['Interactor1'], x['Interactor2']])), axis=1)

# Sort the pairs in paralog_pairs
paralog_pairs_sorted = [tuple(sorted(pair)) for pair in paralog_pairs]
paralog_pairs_set = set(map(tuple, paralog_pairs_sorted))

#Remove rows where the sorted gene pair is in paralog_pairs_set
#stringdb_df = stringdb_df[~stringdb_df['combined'].isin(paralog_pairs_set)]

In [11]:
# test if it works accurately or not
#stringdb_df.loc[(stringdb_df.Interactor1 == 8428) & (stringdb_df.Interactor2 == 51765)]
#stringdb_df.loc[(stringdb_df.Interactor1 == 51765) & (stringdb_df.Interactor2 == 8428)]

In [12]:
stringdb_df[:3]

Unnamed: 0,Interactor1,Interactor2,Score,combined
0,1654,140691,278,"(1654, 140691)"
1,57580,5330,295,"(5330, 57580)"
2,8477,3820,230,"(3820, 8477)"


In [13]:
stringdb_df.loc[(stringdb_df['Interactor1'] == 6595) & (stringdb_df['Interactor2'] == 6597),] # check it paralog pair is present in stringdb_df

Unnamed: 0,Interactor1,Interactor2,Score,combined
2913545,6595,6597,987,"(6595, 6597)"


In [14]:
# gene -> {interactor: score} dictionary
from collections import defaultdict

# Normalized interactions (both directions)
df1 = stringdb_df[['Interactor1', 'Interactor2', 'Score']]
df2 = stringdb_df.rename(columns={'Interactor1': 'Interactor2', 'Interactor2': 'Interactor1'})[['Interactor1', 'Interactor2', 'Score']]
all_edges = pd.concat([df1, df2], ignore_index=True).drop_duplicates()

# Build a map: gene → {interactor: score}
gene_to_interactors = defaultdict(dict)

for row in all_edges.itertuples():
    gene_to_interactors[row.Interactor1][row.Interactor2] = row.Score

In [15]:
def get_interactor_scores_fast(pair, paralog_pairs_dict, gene_to_interactors):
    g1, g2 = pair
    name = paralog_pairs_dict.get(pair, f"{g1}_{g2}")

    # Get interactors and scores
    g1_partners = gene_to_interactors.get(g1, {})
    g2_partners = gene_to_interactors.get(g2, {})

    # Find shared interactors
    shared = set(g1_partners) & set(g2_partners)
    if not shared:
        return pd.Series(dtype=float, name=name)

    # Compute product of scores
    result = pd.Series({i: g1_partners[i] * g2_partners[i] for i in shared})
    result.name = name
    return result

In [16]:
pair = (6595, 6597)
args = get_interactor_scores_fast(pair, paralog_pairs_dict, gene_to_interactors)
args

8193      861063
8202      123256
387082     58081
79885     111748
2064      114534
           ...  
57337      48614
124923     88209
8189      438306
55294     154008
6143       45747
Name: SMARCA2_SMARCA4, Length: 1144, dtype: int64

In [17]:
pair = (7697, 7700)
args = get_interactor_scores_fast(pair, paralog_pairs_dict, gene_to_interactors)
args

23389    40804
Name: ZNF138_ZNF141, dtype: int64

In [18]:
all_scores = []

for pair in paralog_pairs:
    s = get_interactor_scores_fast(pair, paralog_pairs_dict, gene_to_interactors)
    if not s.empty:
        all_scores.append(s)
    # print number of pairs
    if len(all_scores) % 1000 == 0:
        print(f"Processed {len(all_scores)} pairs")

Processed 1000 pairs
Processed 2000 pairs
Processed 3000 pairs
Processed 4000 pairs
Processed 5000 pairs
Processed 6000 pairs
Processed 7000 pairs
Processed 8000 pairs
Processed 9000 pairs
Processed 10000 pairs
Processed 11000 pairs
Processed 12000 pairs
Processed 13000 pairs
Processed 14000 pairs
Processed 15000 pairs
Processed 16000 pairs
Processed 17000 pairs
Processed 18000 pairs
Processed 19000 pairs
Processed 20000 pairs
Processed 21000 pairs
Processed 22000 pairs
Processed 23000 pairs
Processed 24000 pairs
Processed 25000 pairs
Processed 26000 pairs
Processed 27000 pairs
Processed 28000 pairs
Processed 29000 pairs
Processed 29000 pairs
Processed 30000 pairs
Processed 31000 pairs
Processed 32000 pairs
Processed 33000 pairs
Processed 34000 pairs


In [19]:
# Concatenate all scores into a DataFrame
all_scores_df = pd.concat(all_scores, axis=1)
all_scores_df_v2 = all_scores_df.sort_index(axis=0).copy()

In [20]:
#testing

display(all_scores_df_v2.loc[all_scores_df_v2.index == 25])

display(all_scores_df_v2.loc[all_scores_df_v2.index == 23389])

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
25,,,,,,,249378.0,736344.0,,,...,,,,,,,,,,


Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
23389,85986.0,,,,,,,,,,...,40804.0,,40804.0,40804.0,,,40804.0,40804.0,,40804.0


In [28]:
all_scores_df_v2.loc[all_scores_df_v2['SMARCA2_SMARCA4'].notna(), 'SMARCA2_SMARCA4']

58           211482.0
59           176231.0
60           831168.0
70           222438.0
71           665600.0
               ...   
100533467     64009.0
100996331     77841.0
102724631     77841.0
107282092     45796.0
114108587     44521.0
Name: SMARCA2_SMARCA4, Length: 1144, dtype: float64

In [29]:
all_scores_df_v2

Unnamed: 0,SMARCA2_SMARCA4,EXOC6_EXOC6B,STAG1_STAG2,RPL3_RPL3L,CNOT7_CNOT8,CKS1B_CKS2,HDAC1_HDAC2,HSP90AA1_HSP90AB1,RBBP4_RBBP7,PPP2R1A_PPP2R1B,...,ZNF138_ZNF141,ZNF320_ZNF611,ZNF141_ZNF195,ZNF195_ZNF253,NBPF1_NBPF15,ZIK1_ZNF530,ZNF138_ZNF737,ZNF93_ZNF141,ZIK1_ZNF211,ZNF117_ZNF493
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118568804,,,,,,,,,,,...,,,,,,,,,,
119086082,,,,,,,,,,,...,,,,,,,,,,
120356739,,,,,,,,,,,...,,,,,,,,,,
122394733,,,,,,,,,,,...,,,,,,,,,,


In [22]:
# save the files
output_path = get_data_path(['input', 'PPI'], '')

all_scores_df_v2.to_parquet(os.path.join(output_path, 'combined_interaction_scores.parquet'), engine='pyarrow', index=True)