In [1]:
# import modules
import os
import re
import csv
import pandas as pd
import numpy as np
import pyarrow as pa
from collections import defaultdict

In [2]:
# Project root: two directory levels above the notebook's working directory.
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir))


def get_data_path(folders, fname):
    """Return the normalised path of `fname` under `BASE_DIR/<folders...>`.

    `folders` is a sequence of directory names that are joined in order below
    `BASE_DIR`; `fname` is appended last.
    """
    return os.path.normpath(os.path.join(BASE_DIR, *folders, fname))


# Ranked essentiality tables (raw ranks and z-scored variant).
file_path_gene_essentiality = get_data_path(
    ['data', 'output', 'ranked_essentiality'], 'ranked_essentiality.csv'
)
file_path_gene_zessentiality = get_data_path(
    ['data', 'output', 'ranked_essentiality'], 'ranked_zessentiality.csv'
)

# Paralog pair table.
dekegel_table8_path = get_data_path(
    ['data', 'input', 'other'], 'processed_DeKegel_TableS8.csv'
)

# Gene identifier mapping table.
file_path_genenames = get_data_path(
    ['data', 'input', 'other'], 'approved_and_previous_symbols.csv'
)

# Ensembl BioMart export.
file_path_ensembl = get_data_path(['data', 'input', 'other'], 'mart_export.txt')

In [3]:
# read the gene identifier mapping file
id_map = pd.read_csv(file_path_genenames)

# Build an Entrez ID -> Ensembl gene ID lookup.
# Rows missing either identifier are dropped first; if an Entrez ID appears on
# several rows, dict() keeps the Ensembl ID of the last one.
id_map_cleaned = id_map.dropna(axis=0, how='any', subset=['entrez_id', 'ensembl_id']).reset_index(drop=True)
entrez_id_to_ensembl = dict(zip(id_map_cleaned.entrez_id, id_map_cleaned.ensembl_id))

### Gene Effect Score

#### Differential activity score
- Null hypothesis: the activity of the gene stays the same in two different conditions
- Alternative hypothesis: the activity of the gene is different in two different conditions

In [4]:
# Ranked essentiality table; the first CSV column becomes the row index
# (the displayed rows below are labelled with ACH-... identifiers).
gene_essentiality = pd.read_csv(file_path_gene_essentiality, sep=',', index_col=0, low_memory=False)

In [5]:
# Z-scored variant of the essentiality table, loaded with the same layout
# (first CSV column becomes the row index).
gene_z_essentiality = pd.read_csv(file_path_gene_zessentiality, sep=',', index_col=0, low_memory=False)

In [6]:
# Cast the column labels to int so they can be matched against the Entrez IDs
# used as keys of `entrez_id_to_ensembl`, then replace each Entrez ID with its
# Ensembl ID. Labels without a mapping are left unchanged by rename() (see the
# integer label that survives in the displayed header).
# NOTE(review): this cell rebinds its own inputs; re-running it without
# reloading the CSVs applies the int cast to already-renamed labels.
gene_essentiality.columns = gene_essentiality.columns.astype(int)
gene_essentiality = gene_essentiality.rename(columns=entrez_id_to_ensembl)
display(gene_essentiality[:3])

# Same relabelling for the z-scored table.
gene_z_essentiality.columns = gene_z_essentiality.columns.astype(int)
gene_z_essentiality = gene_z_essentiality.rename(columns=entrez_id_to_ensembl)
display(gene_z_essentiality[:3])

Unnamed: 0,ENSG00000121410,ENSG00000175899,ENSG00000171428,ENSG00000156006,ENSG00000196136,ENSG00000114771,ENSG00000127837,ENSG00000129673,ENSG00000090861,ENSG00000183044,...,ENSG00000236311,ENSG00000218819,ENSG00000185055,ENSG00000184224,100505705,ENSG00000204403,ENSG00000250733,ENSG00000214447,ENSG00000283787,ENSG00000162621
ACH-000004,11172.0,5346.0,15919.0,14531.0,7827.0,15880.0,681.0,3342.0,238.0,11965.0,...,,,,,,,,,,
ACH-000005,3608.0,14189.0,17197.0,17118.0,4887.0,11519.0,785.0,9371.0,98.0,3221.0,...,,,,,,,,,,
ACH-000007,9393.0,12274.0,15246.0,16239.0,16454.0,16579.0,645.0,12509.0,91.0,16790.0,...,,,,,,,,,,


Unnamed: 0,ENSG00000121410,ENSG00000175899,ENSG00000171428,ENSG00000156006,ENSG00000196136,ENSG00000114771,ENSG00000127837,ENSG00000129673,ENSG00000090861,ENSG00000183044,...,ENSG00000236311,ENSG00000218819,ENSG00000185055,ENSG00000184224,100505705,ENSG00000204403,ENSG00000250733,ENSG00000214447,ENSG00000283787,ENSG00000162621
ACH-000004,0.59194,-1.861455,0.767406,0.765687,-1.188311,0.721604,-0.474503,-1.111145,0.112406,0.51984,...,,,,,,,,,,
ACH-000005,-1.651996,0.848431,1.19561,1.503581,-2.063585,-0.574681,-0.316621,0.907069,-0.778173,-1.745288,...,,,,,,,,,,
ACH-000007,0.064182,0.26159,0.541912,1.252863,1.380053,0.929378,-0.529154,1.957518,-0.822702,1.769753,...,,,,,,,,,,


### Paralog Pairs


In [7]:
# Collect the paralog pairs for which BOTH genes are columns of `gene_essentiality`.
#   paralog_pairs_dict : (A1 Ensembl ID, A2 Ensembl ID) -> 'genepair' label from the table
#   list_of_paralogs   : set (despite the name) of every Ensembl ID in a retained pair
paralog_pairs_dict = {}
list_of_paralogs = set()
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are handed to the parser untouched.
with open(dekegel_table8_path, "r", newline="") as f:
    reader = csv.DictReader(f)
    for r in reader:
        sorted_gene_pair = r['genepair']
        a1 = r['A1_ensembl_new']
        a2 = r['A2_ensembl_new']
        if a1 in gene_essentiality.columns and a2 in gene_essentiality.columns:
            paralog_pairs_dict[(a1, a2)] = sorted_gene_pair
            list_of_paralogs.update((a1, a2))

In [None]:
#paralog_pairs = []
#for key in paralog_pairs_dict.keys():
#    if key[0] != key[1]:
#        paralog_pairs.append([key[0], key[1]])

In [8]:
# Series of (A1, A2) Ensembl ID tuples, in the insertion order of paralog_pairs_dict.
paralog_pairs = pd.Series(list(paralog_pairs_dict.keys()))

In [9]:
paralog_pairs[0]

('ENSG00000080503', 'ENSG00000127616')

In [10]:
len(paralog_pairs)

35656

In [11]:
# Check if any tuple contains None/NA elements
def has_na_in_tuple(pair):
    """Return True when `pair` is missing or contains a missing element.

    Parameters
    ----------
    pair : object
        Usually a tuple of gene identifiers. Scalars and other iterables
        (lists, arrays, ...) are accepted as well.

    Returns
    -------
    bool
        True if `pair` itself is NA (None, NaN, pd.NA, ...) or if any of its
        scalar elements is NA; False otherwise. Strings are treated as
        non-missing values, not as containers.
    """
    if isinstance(pair, (str, bytes)):
        return False

    try:
        elements = list(pair)
    except TypeError:
        # Not iterable, so `pair` is a scalar: test it directly.
        return bool(pd.isna(pair))

    # Only scalar elements are tested: pd.isna() on a nested list/array returns
    # an element-wise array, whose truth value is ambiguous.
    return any(pd.api.types.is_scalar(element) and pd.isna(element) for element in elements)

# Apply the check to all pairs; `has_internal_na` is a boolean Series aligned with `paralog_pairs`
has_internal_na = paralog_pairs.apply(has_na_in_tuple)
print(f"Pairs with internal NA values: {has_internal_na.sum()}")

# Show pairs with internal NA (if any)
if has_internal_na.any():
    print(paralog_pairs[has_internal_na])

Pairs with internal NA values: 0


### Protein-coding paralog pairs w/ their corresponding GO term annotations

Ensembl Filters:

- Paralogous Human Genes: Only
- Gene type: protein_coding

Attributes:

- Gene Stable ID
- GO Term Accession
- GO Term Name
- GO Domain

In [12]:
# Read the Ensembl export (comma-separated, every column kept as a string)
# and rename its columns to the short names used in the rest of the notebook.
raw_data = pd.read_table(file_path_ensembl, sep=',', dtype='str')
raw_data = raw_data.rename(columns={'Gene stable ID':'ensembl_id', 'Transcript stable ID':'ensembl_transcript_id', 
                                    'GO term accession': 'GO_accession', 'GO term name':'GO_name',
                                    'GO domain':'GO_domain'})
raw_data.head()

Unnamed: 0,ensembl_id,ensembl_transcript_id,GO_accession,GO_name,GO_domain
0,ENSG00000198763,ENST00000361453,GO:0008137,NADH dehydrogenase (ubiquinone) activity,molecular_function
1,ENSG00000198763,ENST00000361453,GO:0006120,"mitochondrial electron transport, NADH to ubiq...",biological_process
2,ENSG00000198763,ENST00000361453,GO:0016020,membrane,cellular_component
3,ENSG00000198763,ENST00000361453,GO:0016491,oxidoreductase activity,molecular_function
4,ENSG00000198763,ENST00000361453,GO:0005739,mitochondrion,cellular_component


In [13]:
# Keep only the rows whose Ensembl ID appears in the cleaned ID mapping table.
# NOTE(review): the variable name says "protein coding", but this line only tests
# membership in `id_map_cleaned`; presumably the protein-coding restriction comes
# from the Ensembl export filters — confirm.
protein_coding_genes = raw_data.loc[raw_data['ensembl_id'].isin(id_map_cleaned.ensembl_id)]
# Remove the rows without a GO accession.
filtered_protein_coding_genes = protein_coding_genes.loc[~protein_coding_genes['GO_accession'].isna()].reset_index(drop=True)
# Split by GO domain: biological process (BP) ...
filtered_BP_genes = filtered_protein_coding_genes.loc[filtered_protein_coding_genes['GO_domain'] == 'biological_process',].reset_index(drop=True)
# ... and cellular component (CC).
filtered_CC_genes = filtered_protein_coding_genes.loc[filtered_protein_coding_genes['GO_domain'] == 'cellular_component',].reset_index(drop=True)

# Report the number of distinct genes and distinct GO accessions per domain.
print(f'# of genes w/ their corresponding GO term annotations (BP): {filtered_BP_genes.ensembl_id.nunique()} \n',
      f'# of unique GO term accession (BP): {filtered_BP_genes.GO_accession.nunique()}')

print(f'# of genes w/ their corresponding GO term annotations (CC): {filtered_CC_genes.ensembl_id.nunique()} \n',
      f'# of unique GO term accession (CC): {filtered_CC_genes.GO_accession.nunique()}')

display(filtered_BP_genes.head())
display(filtered_CC_genes.head())

# of genes w/ their corresponding GO term annotations (BP): 16207 
 # of unique GO term accession (BP): 11911
# of genes w/ their corresponding GO term annotations (CC): 16832 
 # of unique GO term accession (CC): 1662


Unnamed: 0,ensembl_id,ensembl_transcript_id,GO_accession,GO_name,GO_domain
0,ENSG00000198763,ENST00000361453,GO:0006120,"mitochondrial electron transport, NADH to ubiq...",biological_process
1,ENSG00000198763,ENST00000361453,GO:0032981,mitochondrial respiratory chain complex I asse...,biological_process
2,ENSG00000198763,ENST00000361453,GO:0006120,"mitochondrial electron transport, NADH to ubiq...",biological_process
3,ENSG00000198763,ENST00000361453,GO:0006120,"mitochondrial electron transport, NADH to ubiq...",biological_process
4,ENSG00000198763,ENST00000361453,GO:0006120,"mitochondrial electron transport, NADH to ubiq...",biological_process


Unnamed: 0,ensembl_id,ensembl_transcript_id,GO_accession,GO_name,GO_domain
0,ENSG00000198763,ENST00000361453,GO:0016020,membrane,cellular_component
1,ENSG00000198763,ENST00000361453,GO:0005739,mitochondrion,cellular_component
2,ENSG00000198763,ENST00000361453,GO:0005743,mitochondrial inner membrane,cellular_component
3,ENSG00000198763,ENST00000361453,GO:0070469,respirasome,cellular_component
4,ENSG00000198763,ENST00000361453,GO:0005747,mitochondrial respiratory chain complex I,cellular_component


In [14]:
# Keep only the annotation rows whose gene belongs to a retained paralog pair
# (`list_of_paralogs` is the set of Ensembl IDs collected while reading the pair table).
filtered_BP_paralogs = filtered_BP_genes.loc[filtered_BP_genes['ensembl_id'].isin(list_of_paralogs)].reset_index(drop=True) 

print(f'# of paralogs w/ their corresponding GO term annotations (BP): {filtered_BP_paralogs.ensembl_id.nunique()} \n',
      f'# of unique GO term accession (BP): {filtered_BP_paralogs.GO_accession.nunique()}')

display(filtered_BP_paralogs.head())

# Same restriction for the cellular component annotations.
filtered_CC_paralogs = filtered_CC_genes.loc[filtered_CC_genes['ensembl_id'].isin(list_of_paralogs)].reset_index(drop=True)

print(f'# of paralogs w/ their corresponding GO term annotations (CC): {filtered_CC_paralogs.ensembl_id.nunique()} \n',
      f'# of unique GO term accession (CC): {filtered_CC_paralogs.GO_accession.nunique()}')

display(filtered_CC_paralogs.head())

# of paralogs w/ their corresponding GO term annotations (BP): 11955 
 # of unique GO term accession (BP): 11305


Unnamed: 0,ensembl_id,ensembl_transcript_id,GO_accession,GO_name,GO_domain
0,ENSG00000114374,ENST00000651177,GO:0006511,ubiquitin-dependent protein catabolic process,biological_process
1,ENSG00000114374,ENST00000651177,GO:0016579,protein deubiquitination,biological_process
2,ENSG00000114374,ENST00000651177,GO:0006508,proteolysis,biological_process
3,ENSG00000114374,ENST00000651177,GO:0016579,protein deubiquitination,biological_process
4,ENSG00000114374,ENST00000651177,GO:0016477,cell migration,biological_process


# of paralogs w/ their corresponding GO term annotations (CC): 12423 
 # of unique GO term accession (CC): 1524


Unnamed: 0,ensembl_id,ensembl_transcript_id,GO_accession,GO_name,GO_domain
0,ENSG00000114374,ENST00000651177,GO:0005737,cytoplasm,cellular_component
1,ENSG00000114374,ENST00000651177,GO:0005737,cytoplasm,cellular_component
2,ENSG00000114374,ENST00000651177,GO:0005634,nucleus,cellular_component
3,ENSG00000114374,ENST00000651177,GO:0005829,cytosol,cellular_component
4,ENSG00000114374,ENST00000338981,GO:0005737,cytoplasm,cellular_component


In [15]:
# One row per (GO accession, GO name) with the set of paralog genes annotated to it.
# Using a set collapses the repeated gene/term rows visible in the tables above.
sum_bp_paralogs = filtered_BP_paralogs.groupby(['GO_accession', 'GO_name'])['ensembl_id'].apply(set).reset_index(name="genes")
display(sum_bp_paralogs.head())

# Same aggregation for the cellular component terms.
sum_cc_paralogs = filtered_CC_paralogs.groupby(['GO_accession', 'GO_name'])['ensembl_id'].apply(set).reset_index(name="genes")
display(sum_cc_paralogs.head())

Unnamed: 0,GO_accession,GO_name,genes
0,GO:0000002,mitochondrial genome maintenance,"{ENSG00000196365, ENSG00000114120, ENSG0000017..."
1,GO:0000003,reproduction,{ENSG00000189409}
2,GO:0000012,single strand break repair,"{ENSG00000096717, ENSG00000137074, ENSG0000022..."
3,GO:0000017,alpha-glucoside transport,"{ENSG00000100170, ENSG00000140675}"
4,GO:0000018,regulation of DNA recombination,"{ENSG00000114030, ENSG00000182481}"


Unnamed: 0,GO_accession,GO_name,genes
0,GO:0000015,phosphopyruvate hydratase complex,"{ENSG00000188316, ENSG00000074800, ENSG0000010..."
1,GO:0000118,histone deacetylase complex,"{ENSG00000108840, ENSG00000094916, ENSG0000019..."
2,GO:0000123,histone acetyltransferase complex,"{ENSG00000111653, ENSG00000136504, ENSG0000010..."
3,GO:0000124,SAGA complex,"{ENSG00000114166, ENSG00000162227, ENSG0000016..."
4,GO:0000137,Golgi cis cisterna,"{ENSG00000174567, ENSG00000188626, ENSG0000013..."


In [16]:
# Filter biological processes by gene-set size: keep terms with more than 5 and
# fewer than 100 genes (both bounds are exclusive, i.e. 6 to 99 genes).
# NOTE(review): the earlier wording said "5 to 100"; confirm whether the bounds
# were meant to be inclusive.
sum_bp_paralogs['sm_biological_process'] = sum_bp_paralogs['genes'].apply(lambda x: 100 > len(x) > 5)
sum_bp_paralogs_df = sum_bp_paralogs[sum_bp_paralogs['sm_biological_process'] == True].reset_index(drop=True).drop(['sm_biological_process'], axis=1)
display(sum_bp_paralogs_df.head())

Unnamed: 0,GO_accession,GO_name,genes
0,GO:0000002,mitochondrial genome maintenance,"{ENSG00000196365, ENSG00000114120, ENSG0000017..."
1,GO:0000038,very long-chain fatty acid metabolic process,"{ENSG00000101986, ENSG00000140284, ENSG0000011..."
2,GO:0000045,autophagosome assembly,"{ENSG00000198925, ENSG00000034713, ENSG0000015..."
3,GO:0000050,urea cycle,"{ENSG00000118520, ENSG00000081181, ENSG0000012..."
4,GO:0000070,mitotic sister chromatid segregation,"{ENSG00000166851, ENSG00000037042, ENSG0000023..."


In [17]:
# Filter cellular components by gene-set size: keep terms with more than 5 and
# fewer than 100 genes (both bounds are exclusive, i.e. 6 to 99 genes).
# NOTE(review): the earlier wording said "5 to 100"; confirm whether the bounds
# were meant to be inclusive.
sum_cc_paralogs['sm_cellular_component'] = sum_cc_paralogs['genes'].apply(lambda x: 100 > len(x) > 5)
sum_cc_paralogs_df = sum_cc_paralogs[sum_cc_paralogs['sm_cellular_component'] == True].reset_index(drop=True).drop(['sm_cellular_component'], axis=1)
display(sum_cc_paralogs_df.head())

Unnamed: 0,GO_accession,GO_name,genes
0,GO:0000118,histone deacetylase complex,"{ENSG00000108840, ENSG00000094916, ENSG0000019..."
1,GO:0000123,histone acetyltransferase complex,"{ENSG00000111653, ENSG00000136504, ENSG0000010..."
2,GO:0000124,SAGA complex,"{ENSG00000114166, ENSG00000162227, ENSG0000016..."
3,GO:0000137,Golgi cis cisterna,"{ENSG00000174567, ENSG00000188626, ENSG0000013..."
4,GO:0000138,Golgi trans cisterna,"{ENSG00000198964, ENSG00000130733, ENSG0000008..."


In [19]:
def precompute_go_effects(go_df, gene_effect):
    """Average the per-gene scores of every GO term that has enough scored genes.

    Parameters
    ----------
    go_df : pandas.DataFrame
        One row per GO term, with a 'GO_accession' column and a 'genes' column
        holding an iterable of gene identifiers.
    gene_effect : pandas.DataFrame
        Score table whose column labels are gene identifiers.

    Returns
    -------
    tuple
        ``(go_to_mean_effect, valid_go_df)``: a dict mapping each retained GO
        accession to the row-wise mean (``mean(axis=1)``) over its genes'
        columns, and a DataFrame built from the retained rows of ``go_df``.
        A term is retained only when at least two of its genes are columns of
        ``gene_effect``.
    """
    scored_genes = gene_effect.columns
    mean_by_go = {}
    kept_rows = []

    for _, term in go_df.iterrows():
        accession = term['GO_accession']
        present = [gene for gene in list(term['genes']) if gene in scored_genes]
        if len(present) < 2:
            # Fewer than two scored genes: skip this term.
            continue
        mean_by_go[accession] = gene_effect[present].mean(axis=1)
        kept_rows.append(term)

    return mean_by_go, pd.DataFrame(kept_rows)

def build_gene_to_go_map(go_df):
    """Invert a GO-term table into a gene -> {GO accessions} lookup.

    `go_df` provides one row per GO term with a 'GO_accession' column and a
    'genes' column holding an iterable of gene identifiers. The result is a
    ``defaultdict(set)`` keyed by gene identifier.
    """
    accessions_by_gene = defaultdict(set)
    for _, term in go_df.iterrows():
        accession = term['GO_accession']
        for member in term['genes']:
            accessions_by_gene[member].add(accession)
    return accessions_by_gene

def process_gene_pair_optimized(pair, go_df, go_to_mean_effect, gene_to_go):
    """Collect the GO terms shared by both genes of a pair and their mean scores.

    Parameters
    ----------
    pair : tuple
        ``(gene_a, gene_b)`` identifiers.
    go_df : pandas.DataFrame
        GO-term table with 'GO_accession', 'GO_name' and 'genes' columns.
    go_to_mean_effect : dict
        GO accession -> pandas.Series of per-row mean scores.
    gene_to_go : dict
        Gene identifier -> set of GO accessions.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(pair_df, pair_essentiality_df)``. ``pair_df`` has one row per shared
        GO term ('paralog_pairs', 'GO_accession', 'GO_name', 'len_of_go'),
        sorted by 'len_of_go' descending. ``pair_essentiality_df`` has one
        column per shared GO accession. Both are empty DataFrames when the
        genes share no GO term.
    """
    a1, a2 = pair
    # Sort the shared accessions: raw set order for strings can change between
    # interpreter runs, which would change the column order and therefore which
    # GO term wins idxmin/idxmax ties downstream.
    shared_go_ids = sorted(gene_to_go.get(a1, set()).intersection(gene_to_go.get(a2, set())))

    if not shared_go_ids:
        return pd.DataFrame(), pd.DataFrame()

    # One scan of go_df for all shared terms (instead of one scan per term);
    # the first row per accession is used, as with the previous .iloc[0] lookup.
    shared_rows = (
        go_df[go_df['GO_accession'].isin(shared_go_ids)]
        .drop_duplicates(subset='GO_accession', keep='first')
        .set_index('GO_accession')
    )

    # A stable sort keeps equally sized terms in accession order.
    pair_df = pd.DataFrame({
        'paralog_pairs': [pair] * len(shared_go_ids),
        'GO_accession': shared_go_ids,
        'GO_name': [shared_rows.at[go_id, 'GO_name'] for go_id in shared_go_ids],
        'len_of_go': [len(shared_rows.at[go_id, 'genes']) for go_id in shared_go_ids],
    }).sort_values(by='len_of_go', ascending=False, kind='mergesort').reset_index(drop=True)

    # Build the score table in one go rather than inserting column by column.
    pair_essentiality_df = pd.DataFrame(
        {go_id: go_to_mean_effect[go_id] for go_id in shared_go_ids}
    )

    return pair_df, pair_essentiality_df


In [20]:
# Precompute mean gene effect values for each GO term
go_to_mean_effect, filtered_go_df = precompute_go_effects(sum_bp_paralogs_df, gene_essentiality)

# Build a lookup map from gene → GO terms
gene_to_go = build_gene_to_go_map(filtered_go_df)

In [21]:
def _rowwise_extremes(scores):
    """Row-wise (idxmin, min, idxmax, max) of `scores` as NumPy arrays; NaN for rows without any value."""
    n_rows = len(scores)
    has_value = scores.notna().any(axis=1).to_numpy()

    min_label = np.full(n_rows, np.nan, dtype=object)
    max_label = np.full(n_rows, np.nan, dtype=object)
    min_value = np.full(n_rows, np.nan, dtype=float)
    max_value = np.full(n_rows, np.nan, dtype=float)

    if has_value.any():
        # idxmin/idxmax are only evaluated on rows holding at least one value:
        # calling them on an all-NaN row is deprecated (and later an error) in pandas.
        usable = scores.loc[has_value]
        min_label[has_value] = usable.idxmin(axis=1).to_numpy()
        min_value[has_value] = usable.min(axis=1).to_numpy()
        max_label[has_value] = usable.idxmax(axis=1).to_numpy()
        max_value[has_value] = usable.max(axis=1).to_numpy()

    return min_label, min_value, max_label, max_value


def summarize_GO_scores(pair_df, pair_essentiality_df, pair, paralog_pairs_dict=None):
    """Summarise, per row of `pair_essentiality_df`, the GO terms shared by a pair.

    Parameters
    ----------
    pair_df : pandas.DataFrame
        One row per shared GO term with 'GO_accession' and 'len_of_go' columns.
    pair_essentiality_df : pandas.DataFrame
        One column per GO accession; the index labels are reported as 'cell_line'.
    pair : tuple of str
        The two gene identifiers; joined with '|' when no label is available.
    paralog_pairs_dict : dict, optional
        Maps `pair` to a display label.

    Returns
    -------
    pandas.DataFrame or None
        None when either input is empty. Otherwise one row per index label of
        `pair_essentiality_df` with, for each row: the lowest-scoring GO term
        among the terms of smallest 'len_of_go' ('smallest_*'), and the
        lowest / highest scoring GO term over all columns ('min_*', 'max_*').
        Rows that contain no value at all get NaN in the affected columns.
    """
    if pair_df.empty or pair_essentiality_df.empty:
        return None

    # --- smallest GO term(s) by gene-set size ---
    min_len = pair_df['len_of_go'].min()
    smallest_go_ids = pair_df.loc[pair_df['len_of_go'] == min_len, 'GO_accession'].tolist()
    smallest_subset = pair_essentiality_df[smallest_go_ids]
    smallest_label, smallest_value, _, _ = _rowwise_extremes(smallest_subset)

    # --- min/max GO term over all shared terms ---
    min_label, min_value, max_label, max_value = _rowwise_extremes(pair_essentiality_df)

    # --- paralog pair label ---
    if paralog_pairs_dict is not None:
        pair_name = paralog_pairs_dict.get(pair, '|'.join(pair))
    else:
        pair_name = '|'.join(pair)

    return pd.DataFrame({
        'cell_line': pair_essentiality_df.index,
        'paralog_pair': pair_name,
        'smallest_GO_accession': smallest_label,
        'smallest_gene_effect': smallest_value,
        'min_GO_accession': min_label,
        'min_gene_effect': min_value,
        'max_GO_accession': max_label,
        'max_gene_effect': max_value,
    })


In [22]:
# Per-pair BP summaries on the ranked essentiality values.
all_summaries = []

for pair in paralog_pairs:
    # GO terms shared by both genes of the pair, plus their per-cell-line means
    pair_df, pair_essentiality_df = process_gene_pair_optimized(
        pair, filtered_go_df, go_to_mean_effect, gene_to_go
    )
    summary_df = summarize_GO_scores(pair_df, pair_essentiality_df, pair, paralog_pairs_dict)
    
    # None means there was nothing to summarise for this pair
    if summary_df is not None:
        all_summaries.append(summary_df)

# One long table: a row per (paralog pair, cell line).
# Note: pd.concat raises if `all_summaries` is empty.
final_df = pd.concat(all_summaries).reset_index(drop=True)

In [23]:
###### Test ######
# Manual spot check: recompute the summary for one pair directly from
# `sum_bp_paralogs_df` / `go_to_mean_effect` and compare it, for one cell line,
# with the corresponding row of `final_df`.

#PRKACA_PRKACB
pair = {'ENSG00000072062', 'ENSG00000142875'}

# GO terms whose gene set contains both genes of the pair
filtered_df = sum_bp_paralogs_df[sum_bp_paralogs_df['genes'].apply(lambda x: pair.issubset(x))]
filtered_df.insert(filtered_df.shape[1], 'len_of_genes', filtered_df['genes'].apply(len))
display(filtered_df)

# per-cell-line mean score of each of those GO terms
go_effect_subset = {go: go_to_mean_effect[go] for go in filtered_df['GO_accession']}
go_effect_df = pd.DataFrame(go_effect_subset)

# GO term(s) with the fewest genes
min_len = filtered_df['len_of_genes'].min()
smallest_df = filtered_df[filtered_df['len_of_genes'] == min_len]
smallest_go_ids = smallest_df['GO_accession'].tolist()
smallest_subset = go_effect_df[smallest_go_ids]

#smallest GOs: GO:0070613 and GO:0034380

result_df = pd.DataFrame(index=go_effect_df.index)
result_df['smallest_GO_accession'] = smallest_subset.idxmin(axis=1)
result_df['smallest_gene_effect'] = smallest_subset.min(axis=1)
result_df['min_GO_accession'] = go_effect_df.idxmin(axis=1)
result_df['min_gene_effect'] = go_effect_df.min(axis=1)
result_df['max_GO_accession'] = go_effect_df.idxmax(axis=1)
result_df['max_gene_effect'] = go_effect_df.max(axis=1)

# include the mean score of every shared GO term for comparison
result_df = pd.concat([result_df, go_effect_df], axis=1)
display(result_df.loc[result_df.index == 'ACH-000022'])

# the same cell line as produced by the main loop
display(final_df.loc[(final_df.paralog_pair == 'PRKACA_PRKACB') & (final_df.cell_line == 'ACH-000022'), :])

###### Test ######

Unnamed: 0,GO_accession,GO_name,genes,len_of_genes
95,GO:0001843,neural tube closure,"{ENSG00000147050, ENSG00000104290, ENSG0000007...",58
226,GO:0003091,renal water homeostasis,"{ENSG00000171885, ENSG00000169344, ENSG0000016...",16
846,GO:0010737,protein kinase A signaling,"{ENSG00000165059, ENSG00000136167, ENSG0000021...",15
1585,GO:0034380,high-density lipoprotein particle assembly,"{ENSG00000165059, ENSG00000118137, ENSG0000014...",7
2936,GO:0070613,regulation of protein processing,"{ENSG00000204084, ENSG00000142875, ENSG0000007...",7


Unnamed: 0,smallest_GO_accession,smallest_gene_effect,min_GO_accession,min_gene_effect,max_GO_accession,max_gene_effect,GO:0001843,GO:0003091,GO:0010737,GO:0034380,GO:0070613
ACH-000022,GO:0034380,6369.285714,GO:0034380,6369.285714,GO:0070613,10245.285714,9228.87931,9576.0,10074.5,6369.285714,10245.285714


Unnamed: 0,cell_line,paralog_pair,smallest_GO_accession,smallest_gene_effect,min_GO_accession,min_gene_effect,max_GO_accession,max_gene_effect
164173,ACH-000022,PRKACA_PRKACB,GO:0034380,6369.285714,GO:0034380,6369.285714,GO:0070613,10245.285714


In [24]:
# Same BP pipeline as above, run on the z-scored essentiality table.
go_to_mean_effect_bp_norm, filtered_go_df_bp_norm = precompute_go_effects(sum_bp_paralogs_df, gene_z_essentiality)

# gene → GO terms lookup restricted to the retained terms
gene_to_go_map_bp_norm = build_gene_to_go_map(filtered_go_df_bp_norm)

bp_norm_summaries = []

for pair in paralog_pairs:
    pair_df, pair_essentiality_df = process_gene_pair_optimized(
        pair, filtered_go_df_bp_norm, go_to_mean_effect_bp_norm, gene_to_go_map_bp_norm
    )

    summary_df = summarize_GO_scores(pair_df, pair_essentiality_df, pair, paralog_pairs_dict)

    # None means there was nothing to summarise for this pair
    if summary_df is not None:
        bp_norm_summaries.append(summary_df)

# Concatenate into the final normalized BP summary (a row per pair and cell line)
final_bp_norm_df = pd.concat(bp_norm_summaries).reset_index(drop=True)

In [25]:
# Coverage check: number of distinct pair labels that received a BP summary vs. number of distinct input pairs.
final_bp_norm_df['paralog_pair'].nunique(), paralog_pairs.nunique()

(15795, 35656)

In [26]:
# Give the BP result columns domain-specific names so the ranked and z-scored
# tables can be merged side by side ('z' prefix = computed from the z-scored table).
# 'cell_line' and 'paralog_pair' are left unchanged.
final_df = final_df.rename(columns={
    'smallest_GO_accession': 'smallest_BP_GO_accession',
    'smallest_gene_effect': 'smallest_BP_GO_essentiality',
    'min_GO_accession': 'min_BP_GO_accession', 
    'min_gene_effect': 'min_BP_GO_essentiality',
    'max_GO_accession': 'max_BP_GO_accession', 
    'max_gene_effect': 'max_BP_GO_essentiality',
})

final_bp_norm_df = final_bp_norm_df.rename(columns={
    'smallest_GO_accession': 'zsmallest_BP_GO_accession',
    'smallest_gene_effect': 'zsmallest_BP_GO_essentiality',
    'min_GO_accession': 'zmin_BP_GO_accession', 
    'min_gene_effect': 'zmin_BP_GO_essentiality',
    'max_GO_accession': 'zmax_BP_GO_accession', 
    'max_gene_effect': 'zmax_BP_GO_essentiality',
})

In [27]:
# Combine the ranked and z-scored BP summaries; the outer join keeps
# (pair, cell line) rows that exist in only one of the two tables.
# NOTE(review): assumes each (paralog_pair, cell_line) combination occurs once
# per table — duplicated pair labels would multiply rows in the merge; verify.
BP_GO_merged_df = pd.merge(
    final_df, final_bp_norm_df,
    on=['paralog_pair', 'cell_line'],
    how='outer',
)

In [28]:
BP_GO_merged_df[:3]

Unnamed: 0,cell_line,paralog_pair,smallest_BP_GO_accession,smallest_BP_GO_essentiality,min_BP_GO_accession,min_BP_GO_essentiality,max_BP_GO_accession,max_BP_GO_essentiality,zsmallest_BP_GO_accession,zsmallest_BP_GO_essentiality,zmin_BP_GO_accession,zmin_BP_GO_essentiality,zmax_BP_GO_accession,zmax_BP_GO_essentiality
0,ACH-000001,A1CF_RBM47,GO:0016554,8093.583333,GO:0016554,8093.583333,GO:0016554,8093.583333,GO:0016554,-0.693127,GO:0016554,-0.693127,GO:0016554,-0.693127
1,ACH-000004,A1CF_RBM47,GO:0016554,10267.583333,GO:0016554,10267.583333,GO:0016554,10267.583333,GO:0016554,-0.030233,GO:0016554,-0.030233,GO:0016554,-0.030233
2,ACH-000005,A1CF_RBM47,GO:0016554,10641.0,GO:0016554,10641.0,GO:0016554,10641.0,GO:0016554,0.100157,GO:0016554,0.100157,GO:0016554,0.100157


In [29]:
def summarize_GO_scores(pair_df, pair_essentiality_df, pair, paralog_pairs_dict=None):
    """NaN-tolerant per-row summary of the GO terms shared by a gene pair.

    Rows and then columns of `pair_essentiality_df` that are entirely NaN are
    discarded before summarising. Returns None when nothing is left, when
    none of the smallest GO terms (minimum 'len_of_go' in `pair_df`) survives
    as a column, or when those columns hold no value. Otherwise returns a
    DataFrame with one row per remaining index label ('cell_line') giving the
    pair label, the lowest-scoring term among the smallest terms
    ('smallest_*'), and the lowest / highest scoring term overall
    ('min_*' / 'max_*'). The pair label comes from `paralog_pairs_dict` when
    given, falling back to the two identifiers joined with '|'.
    """
    if pair_df.empty or pair_essentiality_df.empty:
        return None

    # Strip rows, then columns, that carry no value at all.
    scores = pair_essentiality_df.dropna(how='all')
    if scores.empty:
        return None
    scores = scores.dropna(axis=1, how='all')
    if scores.empty:
        return None

    cell_lines = scores.index

    # GO term(s) with the smallest gene set that still have a score column.
    smallest_size = pair_df['len_of_go'].min()
    candidates = pair_df[pair_df['len_of_go'] == smallest_size]['GO_accession']
    smallest_go_ids = [accession for accession in candidates if accession in scores.columns]
    if len(smallest_go_ids) == 0:
        return None

    smallest_scores = scores[smallest_go_ids].dropna(how='all')
    if smallest_scores.empty:
        return None

    # Label of the pair: dictionary entry if available, else "geneA|geneB".
    fallback_label = '|'.join(pair)
    if paralog_pairs_dict is None:
        pair_name = fallback_label
    else:
        pair_name = paralog_pairs_dict.get(pair, fallback_label)

    # Row-wise extremes, re-aligned to the full list of retained cell lines.
    summary = {
        'cell_line': cell_lines,
        'paralog_pair': [pair_name] * len(cell_lines),
        'smallest_GO_accession': smallest_scores.idxmin(axis=1).reindex(cell_lines).values,
        'smallest_gene_effect': smallest_scores.min(axis=1).reindex(cell_lines).values,
        'min_GO_accession': scores.idxmin(axis=1).reindex(cell_lines).values,
        'min_gene_effect': scores.min(axis=1).reindex(cell_lines).values,
        'max_GO_accession': scores.idxmax(axis=1).reindex(cell_lines).values,
        'max_gene_effect': scores.max(axis=1).reindex(cell_lines).values,
    }
    return pd.DataFrame(summary)

In [30]:
# Precompute for CC
go_to_mean_effect_cc, filtered_go_df_cc = precompute_go_effects(sum_cc_paralogs_df, gene_essentiality)

# Build gene → GO map for CC
gene_to_go_cc = build_gene_to_go_map(filtered_go_df_cc)

all_summaries_cc = []

for pair in paralog_pairs:
    pair_df, pair_essentiality_df = process_gene_pair_optimized(
        pair, filtered_go_df_cc, go_to_mean_effect_cc, gene_to_go_cc
    )
    
    summary_df = summarize_GO_scores(pair_df, pair_essentiality_df, pair, paralog_pairs_dict)
    
    if summary_df is not None:
        all_summaries_cc.append(summary_df)

# Concatenate the results
final_df_cc = pd.concat(all_summaries_cc).reset_index(drop=True)

In [31]:
# Coverage check: number of distinct pair labels that received a CC summary vs. number of distinct input pairs.
final_df_cc['paralog_pair'].nunique(), paralog_pairs.nunique()

(6296, 35656)

In [32]:
display(final_df_cc.loc[(final_df_cc.paralog_pair == 'PRKACA_PRKACB') & (final_df_cc.cell_line == 'ACH-000022'), :])

Unnamed: 0,cell_line,paralog_pair,smallest_GO_accession,smallest_gene_effect,min_GO_accession,min_gene_effect,max_GO_accession,max_gene_effect
118813,ACH-000022,PRKACA_PRKACB,GO:0005952,8908.375,GO:0005952,8908.375,GO:0097546,10236.24


In [33]:
# Precompute for CC using normalized data
go_to_mean_effect_cc_norm, filtered_go_df_cc_norm = precompute_go_effects(sum_cc_paralogs_df, gene_z_essentiality)

# Build gene → GO map
gene_to_go_cc_norm = build_gene_to_go_map(filtered_go_df_cc_norm)

# Process all paralog pairs
all_summaries_cc_norm = []

for pair in paralog_pairs:
    pair_df, pair_essentiality_df = process_gene_pair_optimized(
        pair, filtered_go_df_cc_norm, go_to_mean_effect_cc_norm, gene_to_go_cc_norm
    )

    summary_df = summarize_GO_scores(pair_df, pair_essentiality_df, pair, paralog_pairs_dict)

    if summary_df is not None:
        all_summaries_cc_norm.append(summary_df)

# Concatenate results
final_df_cc_norm = pd.concat(all_summaries_cc_norm).reset_index(drop=True)

In [34]:
# Coverage check: number of distinct pair labels that received a z-scored CC summary vs. number of distinct input pairs.
final_df_cc_norm['paralog_pair'].nunique(), paralog_pairs.nunique()

(6296, 35656)

In [35]:
# Give the CC result columns domain-specific names so the ranked and z-scored
# tables can be merged side by side ('z' prefix = computed from the z-scored table).
# 'cell_line' and 'paralog_pair' are left unchanged.
final_df_cc = final_df_cc.rename(columns={
    'smallest_GO_accession': 'smallest_CC_GO_accession',
    'smallest_gene_effect': 'smallest_CC_GO_essentiality',
    'min_GO_accession': 'min_CC_GO_accession', 
    'min_gene_effect': 'min_CC_GO_essentiality',
    'max_GO_accession': 'max_CC_GO_accession', 
    'max_gene_effect': 'max_CC_GO_essentiality',
})

final_df_cc_norm = final_df_cc_norm.rename(columns={
    'smallest_GO_accession': 'zsmallest_CC_GO_accession',
    'smallest_gene_effect': 'zsmallest_CC_GO_essentiality',
    'min_GO_accession': 'zmin_CC_GO_accession', 
    'min_gene_effect': 'zmin_CC_GO_essentiality',
    'max_GO_accession': 'zmax_CC_GO_accession', 
    'max_gene_effect': 'zmax_CC_GO_essentiality',
})

In [36]:
# Combine the ranked and z-scored CC summaries; the outer join keeps
# (pair, cell line) rows that exist in only one of the two tables.
# NOTE(review): assumes each (paralog_pair, cell_line) combination occurs once
# per table — duplicated pair labels would multiply rows in the merge; verify.
CC_GO_merged_df = pd.merge(
    final_df_cc, final_df_cc_norm,
    on=['paralog_pair', 'cell_line'],
    how='outer',
)

In [37]:
CC_GO_merged_df[:3]

Unnamed: 0,cell_line,paralog_pair,smallest_CC_GO_accession,smallest_CC_GO_essentiality,min_CC_GO_accession,min_CC_GO_essentiality,max_CC_GO_accession,max_CC_GO_essentiality,zsmallest_CC_GO_accession,zsmallest_CC_GO_essentiality,zmin_CC_GO_accession,zmin_CC_GO_essentiality,zmax_CC_GO_accession,zmax_CC_GO_essentiality
0,ACH-000001,A2M_C3,GO:0072562,10072.863636,GO:0072562,10072.863636,GO:0072562,10072.863636,GO:0072562,0.131185,GO:0072562,0.131185,GO:0072562,0.131185
1,ACH-000004,A2M_C3,GO:0072562,9107.590909,GO:0072562,9107.590909,GO:0072562,9107.590909,GO:0072562,-0.191389,GO:0072562,-0.191389,GO:0072562,-0.191389
2,ACH-000005,A2M_C3,GO:0072562,9475.579545,GO:0072562,9475.579545,GO:0072562,9475.579545,GO:0072562,-0.090731,GO:0072562,-0.090731,GO:0072562,-0.090731


In [None]:
# Save the merged BP and CC summaries as parquet files.
# The empty file name makes get_data_path() return the directory itself.
output_path = get_data_path(['data', 'input', 'GO'], '')

# Create the target directory on a fresh checkout; to_parquet() does not create it.
os.makedirs(output_path, exist_ok=True)

BP_GO_merged_df.to_parquet(os.path.join(output_path, 'go_BP_ranked_essentiality.parquet'), engine='pyarrow', index=True)
CC_GO_merged_df.to_parquet(os.path.join(output_path, 'go_CC_ranked_essentiality.parquet'), engine='pyarrow', index=True)