In [1]:
import pandas as pd
import csv
import os

In [2]:
# set the base directory for the project
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))

# build paths inside the repo
get_data_path = lambda folders, fname: os.path.normpath(
    os.path.join(BASE_DIR, *folders, fname)
)

ranked_essentiality_files_path = get_data_path(['output', 'ranked_essentiality'], 'ranked_essentiality.csv')
ranked_zessentiality_files_path = get_data_path(['output', 'ranked_essentiality'], 'ranked_zessentiality.csv')
shared_interactors_path = get_data_path(['input', 'PPI'], '')

In [3]:
gene_effect = pd.read_csv(ranked_essentiality_files_path, index_col=0)
gene_effect.columns = gene_effect.columns.astype(int)

In [4]:
gene_z_effect = pd.read_csv(ranked_zessentiality_files_path, index_col=0)
gene_z_effect.columns = gene_z_effect.columns.astype(int)

In [5]:
folder = shared_interactors_path

files = list()
filenames = list()

for filename in os.listdir(folder):
    if filename.endswith('.txt'):
        
        base_filename = os.path.splitext(filename)[0]
        filenames.append(base_filename.split("_")[2])
        
        file_directory = os.path.join(folder, filename)
        files.append(file_directory)
    else:
        continue

In [6]:
filenames

['BIOGRID-MV-Physical-4.4.221', 'BIOGRID-ALL-4.4.221']

In [7]:
shared_interactors = {}

m = 0

for file in files:

    with open(file, "r") as f:
        reader = csv.DictReader(f, delimiter = ",")
        for r in reader :
            if r['SharedInteractors']:
                shared_interactors[r['Pair']] = set([int(x) for x in r['SharedInteractors'].split(';')])
            else:
                shared_interactors[r['Pair']] = set([])
    
    # restricting our analysis to those genes that have been screened in Depmap
    screened_genes = set(gene_effect.columns)
    for i in shared_interactors:
        shared_interactors[i] = shared_interactors[i].intersection(screened_genes)

    # Print example pair and its shared interactors
    if shared_interactors:
        example_pair = list(shared_interactors.keys())[0]
        example_interactors = shared_interactors[example_pair]
        print(f"File {filenames[m]}: Example pair '{example_pair}' has {len(example_interactors)} shared interactors")

    pair_means=[]
    # for pairs with no shared interactions we just use the overall mean value for that cell line
    overall_mean = gene_effect.T.mean()
    for j in shared_interactors:
        if len(shared_interactors[j]) > 0:
            shared_mean = gene_effect[list(shared_interactors[j])].T.mean()
        else:
            shared_mean = overall_mean
        shared_mean.name = j
        pair_means.append(shared_mean)

    paralog_shared_means = pd.concat(pair_means, axis=1)

    # Print information about pair means
    print(f"File {filenames[m]}: Generated {paralog_shared_means.shape[1]} pair means with {paralog_shared_means.shape[0]} cell lines")
    
    # Show example pair means for SMARCA2_SMARCA4 if it exists
    if 'SMARCA2_SMARCA4' in paralog_shared_means.columns:
        print(f"Example means for SMARCA2_SMARCA4 across cell lines:")
        print(paralog_shared_means['SMARCA2_SMARCA4'].head())

    path = get_data_path(['input', 'PPI'], '')
    outputname = os.path.join(path, "ranked_BioGRID"+filenames[m]+".csv")
    paralog_shared_means.to_csv(outputname)
        
    m = m + 1

    shared_interactors = {}

File BIOGRID-MV-Physical-4.4.221: Example pair 'SMARCA2_SMARCA4' has 60 shared interactors
File BIOGRID-MV-Physical-4.4.221: Generated 36623 pair means with 1080 cell lines
Example means for SMARCA2_SMARCA4 across cell lines:
ACH-000004    6429.137931
ACH-000005    7018.293103
ACH-000007    7132.241379
ACH-000009    6743.482759
ACH-000011    6440.965517
Name: SMARCA2_SMARCA4, dtype: float64
File BIOGRID-MV-Physical-4.4.221: Generated 36623 pair means with 1080 cell lines
Example means for SMARCA2_SMARCA4 across cell lines:
ACH-000004    6429.137931
ACH-000005    7018.293103
ACH-000007    7132.241379
ACH-000009    6743.482759
ACH-000011    6440.965517
Name: SMARCA2_SMARCA4, dtype: float64
File BIOGRID-ALL-4.4.221: Example pair 'SMARCA2_SMARCA4' has 151 shared interactors
File BIOGRID-ALL-4.4.221: Generated 36623 pair means with 1080 cell lines
Example means for SMARCA2_SMARCA4 across cell lines:
ACH-000004    6285.550336
ACH-000005    6601.261745
ACH-000007    6847.067114
ACH-000009    

In [8]:
shared_interactors = {}

m = 0  # Reset m to 0 for the second loop

for file in files:

    with open(file, "r") as f:
        reader = csv.DictReader(f, delimiter = ",")
        for r in reader :
            if r['SharedInteractors']:
                shared_interactors[r['Pair']] = set([int(x) for x in r['SharedInteractors'].split(';')])
            else:
                shared_interactors[r['Pair']] = set([])
    
    # restricting our analysis to those genes that have been screened in Depmap
    screened_genes = set(gene_z_effect.columns)
    for i in shared_interactors:
        shared_interactors[i] = shared_interactors[i].intersection(screened_genes)

    # Print example pair and its shared interactors
    if shared_interactors:
        example_pair = list(shared_interactors.keys())[0]
        example_interactors = shared_interactors[example_pair]
        print(f"File {filenames[m]}: Example pair '{example_pair}' has {len(example_interactors)} shared interactors")

    pair_means=[]
    # for pairs with no shared interactions we just use the overall mean value for that cell line
    overall_mean = gene_z_effect.T.mean()
    for j in shared_interactors:
        if len(shared_interactors[j]) > 0:
            shared_mean = gene_z_effect[list(shared_interactors[j])].T.mean()
        else:
            shared_mean = overall_mean
        shared_mean.name = j
        pair_means.append(shared_mean)

    paralog_shared_means = pd.concat(pair_means, axis=1)

    # Print information about pair means
    print(f"File {filenames[m]}: Generated {paralog_shared_means.shape[1]} pair means with {paralog_shared_means.shape[0]} cell lines")
    
    # Show example pair means for SMARCA2_SMARCA4 if it exists
    if 'SMARCA2_SMARCA4' in paralog_shared_means.columns:
        print(f"Example z-score means for SMARCA2_SMARCA4 across cell lines:")
        print(paralog_shared_means['SMARCA2_SMARCA4'].head())

    path = get_data_path(['input', 'PPI'], '')
    outputname = os.path.join(path, "z_ranked_BioGRID"+filenames[m]+".csv")
    paralog_shared_means.to_csv(outputname)
        
    m = m + 1

    shared_interactors = {}  # Clear shared_interactors for next iteration

File BIOGRID-MV-Physical-4.4.221: Example pair 'SMARCA2_SMARCA4' has 60 shared interactors
File BIOGRID-MV-Physical-4.4.221: Generated 36623 pair means with 1080 cell lines
Example z-score means for SMARCA2_SMARCA4 across cell lines:
ACH-000004   -0.076228
ACH-000005    0.020407
ACH-000007    0.027020
ACH-000009   -0.002028
ACH-000011   -0.136976
Name: SMARCA2_SMARCA4, dtype: float64
File BIOGRID-MV-Physical-4.4.221: Generated 36623 pair means with 1080 cell lines
Example z-score means for SMARCA2_SMARCA4 across cell lines:
ACH-000004   -0.076228
ACH-000005    0.020407
ACH-000007    0.027020
ACH-000009   -0.002028
ACH-000011   -0.136976
Name: SMARCA2_SMARCA4, dtype: float64
File BIOGRID-ALL-4.4.221: Example pair 'SMARCA2_SMARCA4' has 151 shared interactors
File BIOGRID-ALL-4.4.221: Example pair 'SMARCA2_SMARCA4' has 151 shared interactors
File BIOGRID-ALL-4.4.221: Generated 36623 pair means with 1080 cell lines
Example z-score means for SMARCA2_SMARCA4 across cell lines:
ACH-000004   -