In [1]:
import os
import csv
from collections import defaultdict
import itertools

In [22]:
# set the base directory for the project:
# two levels above the directory the notebook is launched from
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))


# build paths inside the repo
def get_data_path(folders, fname):
    """Return the normalised path ``BASE_DIR/<folders...>/<fname>``.

    Parameters
    ----------
    folders : sequence of str
        Directory names below BASE_DIR, outermost first.
    fname : str
        File name appended after the folders. An empty string yields the
        directory itself, because normpath drops the trailing separator.

    Returns
    -------
    str
        The joined path after ``os.path.normpath``.
    """
    return os.path.normpath(os.path.join(BASE_DIR, *folders, fname))


# input files, all located in <BASE_DIR>/input/other
file_path_genenames = get_data_path(['input', 'other'], 'genenames.txt')
file_humax_complexes = get_data_path(['input', 'other'], 'humap2_complexes_20200809.txt')
dekegel_table8_path = get_data_path(['input', 'other'], 'processed_DeKegel_TableS8.csv')
biogrid_physical_path = get_data_path(['input', 'other'], 'BIOGRID-MV-Physical-4.4.221.tab3.txt')
biogrid_all_path = get_data_path(['input', 'other'], 'BIOGRID-ALL-4.4.221.tab3.txt')

We read protein-protein interactions (PPIs) from two sources: BioGRID (either the multi-validated physical set or the full BioGRID-ALL set) and HuMAP complexes.

In [10]:
# Lookup tables filled from the tab-separated gene-name file:
#   entrez_to_common  : Entrez gene ID (int) -> approved symbol
#   uniprot_to_entrez : UniProt accession    -> Entrez gene ID (int)
entrez_to_common = {}
uniprot_to_entrez = {}
with open(file_path_genenames, "r") as genenames_file:
    for row in csv.DictReader(genenames_file, delimiter="\t"):
        # Rows lacking either an Entrez ID or an approved symbol are ignored.
        if not row['NCBI Gene ID(supplied by NCBI)'] or not row['Approved symbol']:
            continue

        entrez_id = int(row['NCBI Gene ID(supplied by NCBI)'])
        entrez_to_common[entrez_id] = row['Approved symbol']

        # The UniProt column may hold several accessions separated by "|";
        # each one is mapped to the same Entrez ID.
        uniprot_field = row['UniProt ID(supplied by UniProt)']
        if uniprot_field:
            for accession in uniprot_field.split("|"):
                uniprot_to_entrez[accession] = entrez_id

In [11]:
# Turn every complex into pairwise interactions between its members.
# Members whose UniProt accession has no Entrez mapping are dropped.
# Pairs are stored as ordered tuples in member order, so (a, b) and (b, a)
# are distinct entries of the set.
humap_ppis = set()
with open(file_humax_complexes, "r") as complexes_file:
    for complex_row in csv.DictReader(complexes_file):
        entrez_members = []
        # "Uniprot_ACCs" is split on whitespace into individual accessions.
        for accession in complex_row["Uniprot_ACCs"].split():
            if accession in uniprot_to_entrez:
                entrez_members.append(uniprot_to_entrez[accession])
        humap_ppis.update(itertools.combinations(entrez_members, 2))
print(len(humap_ppis))

62023


In [12]:
# Map each 'genepair' label to the (A1, A2) Entrez IDs of the paralog pair.
paralog_pairs = {}
with open(dekegel_table8_path, "r") as table_file:
    for row in csv.DictReader(table_file):
        a1_str = row['A1_entrez_new']
        a2_str = row['A2_entrez_new']

        # Pairs with a missing Entrez ID on either side are skipped silently.
        if "" in (a1_str, a2_str):
            continue

        try:
            entrez_pair = (int(a1_str), int(a2_str))
        except ValueError:
            # Non-numeric IDs cannot be matched against the PPI tables;
            # report the row and move on.
            print(f"Skipping entry with non-integer values: {a1_str}, {a2_str}")
        else:
            paralog_pairs[row['genepair']] = entrez_pair

In [26]:
# BioGRID inputs to process. The two lists are paired position by position:
# filenames[i] is the label used in the output file name for files[i].
files = [
    biogrid_physical_path,
    biogrid_all_path,
]
filenames = [
    'BIOGRID-MV-Physical-4.4.221',
    'BIOGRID-ALL-4.4.221',
]

In [29]:
# Organism ID used to keep only human-human interactions.
HSAPIENS = '9606'

# Experimental systems excluded from the physical interactions
# (RNA-based assays).
RNA_EXPERIMENTAL_SYSTEMS = frozenset(["Affinity Capture-RNA", "Protein-RNA"])


def load_biogrid_ppis(biogrid_path, known_genes):
    """Read physical human interactions from a tab-separated BioGRID file.

    A row is kept when both organism IDs equal HSAPIENS, the experimental
    system type is 'physical', the experimental system is not one of
    RNA_EXPERIMENTAL_SYSTEMS, the two Entrez IDs differ, and both IDs are
    contained in `known_genes`.

    Parameters
    ----------
    biogrid_path : str
        Path of the BioGRID file to read.
    known_genes : container of int
        Entrez IDs that are allowed to appear in the result.

    Returns
    -------
    set of (int, int)
        (interactor A, interactor B) Entrez ID pairs in row order; a pair
        listed in both directions yields two entries.

    Raises
    ------
    ValueError
        If an Entrez ID of a row that passed the organism and experimental
        system filters is not an integer.
    """
    ppis = set()
    # NOTE(review): the csv default quoting is active here; a field that
    # starts with a double quote would be read as a quoted field and could
    # merge columns or rows. Confirm the inputs are free of such fields, or
    # consider quoting=csv.QUOTE_NONE.
    with open(biogrid_path, "r") as handle:
        for row in csv.DictReader(handle, delimiter="\t"):
            if row['Organism ID Interactor A'] != HSAPIENS or row['Organism ID Interactor B'] != HSAPIENS:
                continue
            if row['Experimental System Type'] != 'physical':
                continue
            if row["Experimental System"] in RNA_EXPERIMENTAL_SYSTEMS:
                continue
            gene_a = int(row['Entrez Gene Interactor A'])
            gene_b = int(row['Entrez Gene Interactor B'])
            # Self-interactions and genes missing from the lookup are dropped.
            if gene_a != gene_b and gene_a in known_genes and gene_b in known_genes:
                ppis.add((gene_a, gene_b))
    return ppis


def build_interactor_map(*ppi_sets):
    """Return gene -> set of partner genes for the union of the given pair sets.

    Each pair is treated as undirected: both genes are recorded as a
    partner of the other.
    """
    neighbours = defaultdict(set)
    for ppis in ppi_sets:
        for gene_a, gene_b in ppis:
            neighbours[gene_a].add(gene_b)
            neighbours[gene_b].add(gene_a)
    return neighbours


def find_shared_interactors(neighbours, pairs):
    """Return pair label -> genes that interact with both members of the pair.

    The two members of the pair themselves are removed from the result.
    Genes without any recorded partner contribute an empty set.
    """
    shared = {}
    for pair_name, (a1, a2) in pairs.items():
        # .get() avoids inserting empty entries into a defaultdict.
        common = neighbours.get(a1, frozenset()) & neighbours.get(a2, frozenset())
        shared[pair_name] = set(common) - {a1, a2}
    return shared


def write_shared_interactors(out_path, shared, pairs):
    """Write one CSV row per pair: label, A1, A2 and ';'-joined shared genes.

    Shared genes are written in ascending order so that repeated runs
    produce identical files.
    """
    # newline="" lets the csv module control line endings; without it,
    # blank lines appear between rows on platforms that translate "\n".
    with open(out_path, "w", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=["Pair", "A1", "A2", "SharedInteractors"])
        writer.writeheader()
        for pair_name, genes in shared.items():
            a1, a2 = pairs[pair_name]
            writer.writerow({'Pair': pair_name, 'A1': a1, 'A2': a2,
                             'SharedInteractors': ";".join(str(g) for g in sorted(genes))})


if len(files) != len(filenames):
    raise ValueError("files and filenames must have the same length")

# Make sure the output directory exists before the first write.
output_dir = get_data_path(['input', 'PPI'], '')
os.makedirs(output_dir, exist_ok=True)

# One output file per BioGRID input; HuMAP pairs are added to each network.
for biogrid_path, dataset_name in zip(files, filenames):
    biogrid_ppis = load_biogrid_ppis(biogrid_path, entrez_to_common)
    interactors = build_interactor_map(biogrid_ppis, humap_ppis)
    shared_interactors = find_shared_interactors(interactors, paralog_pairs)

    outputname = os.path.join(output_dir, "shared_interactors_" + dataset_name + ".txt")
    write_shared_interactors(outputname, shared_interactors, paralog_pairs)

# Rebind the per-file results to empty containers once everything is written,
# so the large sets are not kept referenced under these names.
biogrid_ppis = set()
shared_interactors = {}