In [1]:
import pandas as pd
import csv
import os

In [None]:
# set the base directory for the project
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))

# build paths inside the repo
get_data_path = lambda folders, fname: os.path.normpath(
    os.path.join(BASE_DIR, *folders, fname)
)

ranked_essentiality_files_path = get_data_path(['data', 'output', 'ranked_essentiality'], 'ranked_essentiality.csv')
ranked_zessentiality_files_path = get_data_path(['data', 'output', 'ranked_essentiality'], 'ranked_zessentiality.csv')
shared_interactors_path = get_data_path(['data', 'input', 'PPI'], '')

In [3]:
gene_effect = pd.read_csv(ranked_essentiality_files_path, index_col=0)
gene_effect.columns = gene_effect.columns.astype(int)

In [4]:
gene_z_effect = pd.read_csv(ranked_zessentiality_files_path, index_col=0)
gene_z_effect.columns = gene_z_effect.columns.astype(int)

In [5]:
folder = shared_interactors_path

files = list()
filenames = list()

for filename in os.listdir(folder):
    if filename.endswith('.txt'):
        
        base_filename = os.path.splitext(filename)[0]
        filenames.append(base_filename.split("_")[2])
        
        file_directory = os.path.join(folder, filename)
        files.append(file_directory)
    else:
        continue

In [6]:
filenames

['BIOGRID-MV-Physical-4.4.221', 'BIOGRID-ALL-4.4.221']

In [None]:
def process_shared_interactors(files, filenames, effect_df, output_prefix):
    """
    For each shared interactors file, calculate mean effect for each paralog pair across cell lines.
    If a pair has no shared interactors, use the overall mean for that cell line.
    Saves output as CSV for each file.
    """
    for m, file in enumerate(files):
        shared_interactors = {}
        with open(file, "r") as f:
            reader = csv.DictReader(f, delimiter=",")
            for r in reader:
                if r['SharedInteractors']:
                    # Convert gene IDs to int to match effect_df columns
                    shared_interactors[r['Pair']] = set([int(x) for x in r['SharedInteractors'].split(';') if x])
                else:
                    shared_interactors[r['Pair']] = set()
        # Restrict to screened genes (as int)
        screened_genes = set(effect_df.columns)
        for i in shared_interactors:
            shared_interactors[i] = shared_interactors[i].intersection(screened_genes)
        pair_means = []
        overall_mean = effect_df.T.mean()
        for j in shared_interactors:
            if len(shared_interactors[j]) > 0:
                shared_mean = effect_df[list(shared_interactors[j])].T.mean()
            else:
                shared_mean = overall_mean
            shared_mean.name = j
            pair_means.append(shared_mean)
        paralog_shared_means = pd.concat(pair_means, axis=1)
        print(f"File {filenames[m]}: Generated {paralog_shared_means.shape[1]} pair means with {paralog_shared_means.shape[0]} cell lines")
        if 'SMARCA2_SMARCA4' in paralog_shared_means.columns:
            print(f"Example means for SMARCA2_SMARCA4 across cell lines:")
            print(paralog_shared_means['SMARCA2_SMARCA4'].head())
        path = get_data_path(['data', 'input', 'PPI'], '')
        outputname = os.path.join(path, f"{output_prefix}_BioGRID{filenames[m]}.csv")
        paralog_shared_means.to_csv(outputname)

In [11]:
# For ranked essentiality
process_shared_interactors(files, filenames, gene_effect, "ranked")

File BIOGRID-MV-Physical-4.4.221: Generated 36623 pair means with 1080 cell lines
Example means for SMARCA2_SMARCA4 across cell lines:
ACH-000004    6429.137931
ACH-000005    7018.293103
ACH-000007    7132.241379
ACH-000009    6743.482759
ACH-000011    6440.965517
Name: SMARCA2_SMARCA4, dtype: float64
File BIOGRID-ALL-4.4.221: Generated 36623 pair means with 1080 cell lines
Example means for SMARCA2_SMARCA4 across cell lines:
ACH-000004    6285.550336
ACH-000005    6601.261745
ACH-000007    6847.067114
ACH-000009    6892.536913
ACH-000011    6902.859060
Name: SMARCA2_SMARCA4, dtype: float64


In [12]:
# For z-ranked essentiality
process_shared_interactors(files, filenames, gene_z_effect, "z_ranked")

File BIOGRID-MV-Physical-4.4.221: Generated 36623 pair means with 1080 cell lines
Example means for SMARCA2_SMARCA4 across cell lines:
ACH-000004   -0.076228
ACH-000005    0.020407
ACH-000007    0.027020
ACH-000009   -0.002028
ACH-000011   -0.136976
Name: SMARCA2_SMARCA4, dtype: float64
File BIOGRID-ALL-4.4.221: Generated 36623 pair means with 1080 cell lines
Example means for SMARCA2_SMARCA4 across cell lines:
ACH-000004   -0.153113
ACH-000005   -0.041593
ACH-000007   -0.042827
ACH-000009   -0.029603
ACH-000011   -0.019966
Name: SMARCA2_SMARCA4, dtype: float64
