In [1]:
import pandas as pd
import csv
import os

In [2]:
# Project root: two directory levels above the directory the notebook runs in.
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir))


def get_data_path(folders, fname):
    """Return the normalised path of ``fname`` under ``BASE_DIR/<folders...>``.

    folders: sequence of directory names, joined in order below ``BASE_DIR``.
    fname:   file name appended last; an empty string yields the directory
             itself, because ``normpath`` drops the trailing separator.
    """
    joined = os.path.join(BASE_DIR, *folders, fname)
    return os.path.normpath(joined)


# Input tables produced earlier in the pipeline.
ranked_essentiality_files_path = get_data_path(
    ['output', 'ranked_essentiality'], 'ranked_essentiality.csv'
)
ranked_zessentiality_files_path = get_data_path(
    ['output', 'ranked_essentiality'], 'ranked_zessentiality.csv'
)

# Folder holding the shared-interactor tables (a directory, hence the empty file name).
shared_interactors_path = get_data_path(['input', 'PPI'], '')

In [3]:
# Load the ranked essentiality table; the first CSV column becomes the row index.
gene_effect = pd.read_csv(ranked_essentiality_files_path, index_col=0)

# Column labels are cast to integers so they can be matched against integer IDs.
gene_effect = gene_effect.set_axis(gene_effect.columns.astype(int), axis="columns")

In [4]:
folder = shared_interactors_path

# Every .txt entry in the folder is treated as a shared-interactor table.
txt_names = [name for name in os.listdir(folder) if name.endswith('.txt')]

# Full path of each table, in directory-listing order.
files = [os.path.join(folder, name) for name in txt_names]

# Label of each table: the third "_"-separated field of the file name without
# its extension. `filenames[k]` is the label of `files[k]`.
filenames = [os.path.splitext(name)[0].split("_")[2] for name in txt_names]

In [6]:
# Display the labels parsed from the .txt file names (same order as `files`).
filenames

['BIOGRID-MV-Physical-4.4.221', 'BIOGRID-ALL-4.4.221']

In [7]:
# For each shared-interactor table: build one column per gene pair holding, for
# every row of `gene_effect`, the mean over the pair's shared interactors, then
# write the resulting table into the input/PPI folder.

# restricting our analysis to those genes that have been screened in Depmap
screened_genes = set(gene_effect.columns)

# for pairs with no shared interactions we just use the overall mean value for that cell line
overall_mean = gene_effect.T.mean()

# The output folder is the same for every table, so resolve it once.
path = get_data_path(['input', 'PPI'], '')

for file, label in zip(files, filenames):

    # Fresh mapping per table so pairs from a previous table can never leak in.
    shared_interactors = {}

    # newline="" is what the csv module asks for when it is given a file object.
    with open(file, "r", newline="") as f:
        reader = csv.DictReader(f, delimiter=",")
        for r in reader:
            if r['SharedInteractors']:
                interactors = {int(x) for x in r['SharedInteractors'].split(';')}
            else:
                interactors = set()
            # Keep only interactors that are columns of `gene_effect`.
            shared_interactors[r['Pair']] = interactors & screened_genes

    pair_means = []
    for pair, interactors in shared_interactors.items():
        if interactors:
            shared_mean = gene_effect[list(interactors)].T.mean()
        else:
            shared_mean = overall_mean
        # rename() returns a new Series. Assigning `.name` directly would mutate
        # the single shared `overall_mean` object, so every fallback column
        # would end up labelled with the last pair that had no interactors.
        pair_means.append(shared_mean.rename(pair))

    paralog_shared_means = pd.concat(pair_means, axis=1)

    outputname = os.path.join(path, "ranked_BioGRID" + label + ".csv")
    paralog_shared_means.to_csv(outputname)

In [8]:
# Load the ranked z-essentiality table; the first CSV column becomes the row index.
gene_z_effect = pd.read_csv(ranked_zessentiality_files_path, index_col=0)

# Column labels are cast to integers so they can be matched against integer IDs.
gene_z_effect = gene_z_effect.set_axis(gene_z_effect.columns.astype(int), axis="columns")

In [9]:
# For each shared-interactor table: build one column per gene pair holding, for
# every row of `gene_z_effect`, the mean over the pair's shared interactors,
# then write the resulting table into the input/PPI folder.

# restricting our analysis to those genes that have been screened in Depmap
screened_genes = set(gene_z_effect.columns)

# for pairs with no shared interactions we just use the overall mean value for that cell line
overall_mean = gene_z_effect.T.mean()

# The output folder is the same for every table, so resolve it once.
path = get_data_path(['input', 'PPI'], '')

for file, label in zip(files, filenames):

    # Fresh mapping per table so pairs from a previous table can never leak in.
    shared_interactors = {}

    # newline="" is what the csv module asks for when it is given a file object.
    with open(file, "r", newline="") as f:
        reader = csv.DictReader(f, delimiter=",")
        for r in reader:
            if r['SharedInteractors']:
                interactors = {int(x) for x in r['SharedInteractors'].split(';')}
            else:
                interactors = set()
            # Keep only interactors that are columns of `gene_z_effect`.
            shared_interactors[r['Pair']] = interactors & screened_genes

    pair_means = []
    for pair, interactors in shared_interactors.items():
        if interactors:
            shared_mean = gene_z_effect[list(interactors)].T.mean()
        else:
            shared_mean = overall_mean
        # rename() returns a new Series. Assigning `.name` directly would mutate
        # the single shared `overall_mean` object, so every fallback column
        # would end up labelled with the last pair that had no interactors.
        pair_means.append(shared_mean.rename(pair))

    paralog_shared_means = pd.concat(pair_means, axis=1)

    outputname = os.path.join(path, "z_ranked_BioGRID" + label + ".csv")
    paralog_shared_means.to_csv(outputname)