In [2]:
# import modules
import os
import csv
import pandas as pd
import numpy as np
import pyarrow as pa
from collections import defaultdict

In [3]:
# set the base directory for the project
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))

# build paths inside the repo
get_data_path = lambda folders, fname: os.path.normpath(
    os.path.join(BASE_DIR, *folders, fname)
)

file_path_gene_expression = get_data_path(['output', 'processed_DepMap22Q4'], 'expression_data.csv')
file_path_gene_zexpression = get_data_path(['output', 'processed_DepMap22Q4'], 'zexpression_data.csv')

dekegel_table8_path = get_data_path(['input', 'other'], 'processed_DeKegel_TableS8.csv')

file_path_genenames = get_data_path(['input', 'other'], 'genenames.txt')

file_path_ensembl = get_data_path(['input', 'other'], 'mart_export.txt')

In [8]:
# read HGNC
id_map_raw = pd.read_table(file_path_genenames, dtype = "str")

id_map = id_map_raw[['HGNC ID', 'Approved symbol', 'NCBI Gene ID', 'NCBI Gene ID(supplied by NCBI)', 'Ensembl gene ID', 'Ensembl ID(supplied by Ensembl)']]

id_map = id_map.copy()
id_map['entrez_id'] = id_map['NCBI Gene ID(supplied by NCBI)'].combine_first(id_map['NCBI Gene ID'])
id_map['ensembl_id'] = id_map['Ensembl ID(supplied by Ensembl)'].combine_first(id_map['Ensembl gene ID'])

id_map_na = id_map.dropna(axis=0, how='any', subset=['entrez_id', 'ensembl_id']).reset_index(drop=True)
id_map_na['entrez_id'] = id_map_na['entrez_id'].astype(int)
id_map_na[:3]
#id_map.hgnc_id = id_map.hgnc_id.apply(lambda x: x.split(':')[1])

Unnamed: 0,HGNC ID,Approved symbol,NCBI Gene ID,NCBI Gene ID(supplied by NCBI),Ensembl gene ID,Ensembl ID(supplied by Ensembl),entrez_id,ensembl_id
0,HGNC:5,A1BG,1,1,ENSG00000121410,ENSG00000121410,1,ENSG00000121410
1,HGNC:37133,A1BG-AS1,503538,503538,ENSG00000268895,ENSG00000268895,503538,ENSG00000268895
2,HGNC:24086,A1CF,29974,29974,ENSG00000148584,ENSG00000148584,29974,ENSG00000148584


### Gene Expression Profiles

#### Differential expression analysis
- Null hypothesis: the activity of the gene stays the same in two different conditions
- Alternative hypothesis: the activity of the gene is different in two different conditions

In [9]:
gene_expression = pd.read_csv(file_path_gene_expression, 
                            index_col=0, low_memory=False)
gene_expression = gene_expression.drop(['cell_name'], axis=1)
gene_expression.columns = gene_expression.columns.astype(int)
gene_expression[:3]

Unnamed: 0,7105,64102,8813,57147,55732,2268,3075,2519,2729,4800,...,8916,29970,255027,883,221468,55486,552900,220074,440519,284391
ACH-001113,4.331992,0.0,7.36466,2.792855,4.471187,0.028569,1.226509,3.044394,6.500005,4.739848,...,1.367371,1.201634,0.0,1.659925,0.286881,6.397974,4.149747,2.831877,2.153805,0.0
ACH-001289,4.567424,0.584963,7.106641,2.543496,3.50462,0.0,0.189034,3.813525,4.221877,3.481557,...,3.053111,6.17971,0.748461,3.019702,1.807355,5.778734,3.134221,3.689299,1.669027,1.485427
ACH-001339,3.15056,0.0,7.379118,2.333424,4.228049,0.056584,1.31034,6.687201,3.682573,3.273516,...,2.596935,1.0,0.0,1.815575,0.62293,6.20418,3.363171,1.895303,0.15056,1.269033


In [10]:
zexpression_data = pd.read_csv(file_path_gene_zexpression,
                               index_col=0, low_memory=False)
zexpression_data = zexpression_data.drop(['cell_name'], axis=1)
zexpression_data.columns = zexpression_data.columns.astype(int)
zexpression_data[:3]

Unnamed: 0,7105,64102,8813,57147,55732,2268,3075,2519,2729,4800,...,8916,29970,255027,883,221468,55486,552900,220074,440519,284391
ACH-001113,0.588734,-0.202156,1.341424,0.782413,1.015674,-0.333763,-0.419278,-1.154559,1.615601,0.920474,...,-1.225467,-0.14346,-0.692331,-1.157672,-0.770213,1.149857,1.662524,-0.122769,0.97997,-1.078924
ACH-001289,0.731947,1.493293,0.94251,0.325524,-0.214189,-0.356637,-0.882359,-0.73089,-0.360896,-0.750642,...,0.631138,3.312879,0.182974,0.446167,1.850917,0.02758,0.242377,1.015032,0.405184,0.672085
ACH-001339,-0.129934,-0.202156,1.363777,-0.059381,0.706304,-0.311333,-0.38186,0.852051,-0.828794,-1.026939,...,0.128724,-0.283457,-0.692331,-0.974084,-0.190901,0.798635,0.562549,-1.365604,-1.395212,0.417002


In [11]:
entrez_id_to_ensembl = dict(zip(id_map_na.entrez_id, id_map_na.ensembl_id))

gene_expression = gene_expression.rename(columns=entrez_id_to_ensembl) #replace the entrez_id with ensembl_id
display(gene_expression[:3])

zexpression_data = zexpression_data.rename(columns=entrez_id_to_ensembl) #replace the entrez_id with ensembl_id
display(zexpression_data[:3])

Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSG00000138641,ENSG00000151967,ENSG00000156968,ENSG00000171097,ENSG00000172738,ENSG00000175193,ENSG00000183336,ENSG00000284922,ENSG00000196081,ENSG00000223547
ACH-001113,4.331992,0.0,7.36466,2.792855,4.471187,0.028569,1.226509,3.044394,6.500005,4.739848,...,1.367371,1.201634,0.0,1.659925,0.286881,6.397974,4.149747,2.831877,2.153805,0.0
ACH-001289,4.567424,0.584963,7.106641,2.543496,3.50462,0.0,0.189034,3.813525,4.221877,3.481557,...,3.053111,6.17971,0.748461,3.019702,1.807355,5.778734,3.134221,3.689299,1.669027,1.485427
ACH-001339,3.15056,0.0,7.379118,2.333424,4.228049,0.056584,1.31034,6.687201,3.682573,3.273516,...,2.596935,1.0,0.0,1.815575,0.62293,6.20418,3.363171,1.895303,0.15056,1.269033


Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSG00000138641,ENSG00000151967,ENSG00000156968,ENSG00000171097,ENSG00000172738,ENSG00000175193,ENSG00000183336,ENSG00000284922,ENSG00000196081,ENSG00000223547
ACH-001113,0.588734,-0.202156,1.341424,0.782413,1.015674,-0.333763,-0.419278,-1.154559,1.615601,0.920474,...,-1.225467,-0.14346,-0.692331,-1.157672,-0.770213,1.149857,1.662524,-0.122769,0.97997,-1.078924
ACH-001289,0.731947,1.493293,0.94251,0.325524,-0.214189,-0.356637,-0.882359,-0.73089,-0.360896,-0.750642,...,0.631138,3.312879,0.182974,0.446167,1.850917,0.02758,0.242377,1.015032,0.405184,0.672085
ACH-001339,-0.129934,-0.202156,1.363777,-0.059381,0.706304,-0.311333,-0.38186,0.852051,-0.828794,-1.026939,...,0.128724,-0.283457,-0.692331,-0.974084,-0.190901,0.798635,0.562549,-1.365604,-1.395212,0.417002


In [12]:
# map lineage to gene expression file
#sample_info = pd.read_csv("/Users/narod/Documents/GitHub/SL_prediction/ccle_files/sample_info.csv", sep=',', usecols=['DepMap_ID', 'lineage'], low_memory=False)  
#lineage_dict = dict(zip(sample_info.DepMap_ID, sample_info.lineage))
#gene_expression.insert(0, 'lineage', gene_expression.index.map(lineage_dict))

### Paralog Pairs


In [14]:
paralog_pairs_dict = {}
list_of_paralogs = set()
#paralog_pairs = []
with open(dekegel_table8_path, "r") as f :
    reader = csv.DictReader(f)
    for r in reader :
        sorted_gene_pair = r['genepair']
        a1 = r['A1_ensembl_new']
        a2 = r['A2_ensembl_new']
        if a1 in gene_expression.columns and a2 in gene_expression.columns:
            #paralog_pairs_dict[sorted_gene_pair] = (a1, a2)
            paralog_pairs_dict[(a1, a2)] = sorted_gene_pair
            list_of_paralogs.update((a1, a2))
            #paralog_pairs.append(set([a1, a2]))

In [8]:
#paralog_pairs = []
#for key in paralog_pairs_dict.keys():
#    if key[0] != key[1]:
#        paralog_pairs.append([key[0], key[1]])

In [15]:
paralog_pairs = pd.Series(list(paralog_pairs_dict.keys()))

In [16]:
paralog_pairs[0]

('ENSG00000080503', 'ENSG00000127616')

In [11]:
#paralog_pairs_dict[tuple(paralog_pairs[0])]

### Protein-coding paralog pairs w/ their corresponding GO term annotations

Ensembl Filters:

- Paralogous Human Genes: Only
- Gene type: protein_coding

Attributes:

- Gene Stable ID
- GO Term Accession
- GO Term Name
- GO Domain

In [18]:
# read the ensemble file
raw_data = pd.read_table(file_path_ensembl, sep=',', dtype='str')
raw_data = raw_data.rename(columns={'Gene stable ID':'ensembl_id', 'Transcript stable ID':'ensembl_transcript_id', 
                                    'GO term accession': 'GO_accession', 'GO term name':'GO_name',
                                    'GO domain':'GO_domain'})
raw_data.head()

Unnamed: 0,ensembl_id,ensembl_transcript_id,GO_accession,GO_name,GO_domain
0,ENSG00000198763,ENST00000361453,GO:0008137,NADH dehydrogenase (ubiquinone) activity,molecular_function
1,ENSG00000198763,ENST00000361453,GO:0006120,"mitochondrial electron transport, NADH to ubiq...",biological_process
2,ENSG00000198763,ENST00000361453,GO:0016020,membrane,cellular_component
3,ENSG00000198763,ENST00000361453,GO:0016491,oxidoreductase activity,molecular_function
4,ENSG00000198763,ENST00000361453,GO:0005739,mitochondrion,cellular_component


In [19]:
protein_coding_genes = raw_data.loc[raw_data['ensembl_id'].isin(id_map_na.ensembl_id)] # filter out the protein coding genes only
filtered_protein_coding_genes = protein_coding_genes.loc[~protein_coding_genes['GO_accession'].isna()].reset_index(drop=True) #remove the ones without GO_accession 
filtered_BP_genes = filtered_protein_coding_genes.loc[filtered_protein_coding_genes['GO_domain'] == 'biological_process',].reset_index(drop=True) #filter the biological process domain
filtered_CC_genes = filtered_protein_coding_genes.loc[filtered_protein_coding_genes['GO_domain'] == 'cellular_component',].reset_index(drop=True) #filter the cellular component domain

print(f'# of genes w/ their corresponding GO term annotations (BP): {filtered_BP_genes.ensembl_id.nunique()} \n',
      f'# of unique GO term accession (BP): {filtered_BP_genes.GO_accession.nunique()}')

print(f'# of genes w/ their corresponding GO term annotations (CC): {filtered_CC_genes.ensembl_id.nunique()} \n',
      f'# of unique GO term accession (CC): {filtered_CC_genes.GO_accession.nunique()}')

display(filtered_BP_genes.head())
display(filtered_CC_genes.head())

# of genes w/ their corresponding GO term annotations (BP): 16207 
 # of unique GO term accession (BP): 11911
# of genes w/ their corresponding GO term annotations (CC): 16832 
 # of unique GO term accession (CC): 1662


Unnamed: 0,ensembl_id,ensembl_transcript_id,GO_accession,GO_name,GO_domain
0,ENSG00000198763,ENST00000361453,GO:0006120,"mitochondrial electron transport, NADH to ubiq...",biological_process
1,ENSG00000198763,ENST00000361453,GO:0032981,mitochondrial respiratory chain complex I asse...,biological_process
2,ENSG00000198763,ENST00000361453,GO:0006120,"mitochondrial electron transport, NADH to ubiq...",biological_process
3,ENSG00000198763,ENST00000361453,GO:0006120,"mitochondrial electron transport, NADH to ubiq...",biological_process
4,ENSG00000198763,ENST00000361453,GO:0006120,"mitochondrial electron transport, NADH to ubiq...",biological_process


Unnamed: 0,ensembl_id,ensembl_transcript_id,GO_accession,GO_name,GO_domain
0,ENSG00000198763,ENST00000361453,GO:0016020,membrane,cellular_component
1,ENSG00000198763,ENST00000361453,GO:0005739,mitochondrion,cellular_component
2,ENSG00000198763,ENST00000361453,GO:0005743,mitochondrial inner membrane,cellular_component
3,ENSG00000198763,ENST00000361453,GO:0070469,respirasome,cellular_component
4,ENSG00000198763,ENST00000361453,GO:0005747,mitochondrial respiratory chain complex I,cellular_component


In [20]:
# filter out the paralogs only
filtered_BP_paralogs = filtered_BP_genes.loc[filtered_BP_genes['ensembl_id'].isin(list_of_paralogs)].reset_index(drop=True) 

print(f'# of paralogs w/ their corresponding GO term annotations (BP): {filtered_BP_paralogs.ensembl_id.nunique()} \n',
      f'# of unique GO term accession (BP): {filtered_BP_paralogs.GO_accession.nunique()}')

display(filtered_BP_paralogs.head())

filtered_CC_paralogs = filtered_CC_genes.loc[filtered_CC_genes['ensembl_id'].isin(list_of_paralogs)].reset_index(drop=True)

print(f'# of paralogs w/ their corresponding GO term annotations (CC): {filtered_CC_paralogs.ensembl_id.nunique()} \n',
      f'# of unique GO term accession (CC): {filtered_CC_paralogs.GO_accession.nunique()}')

display(filtered_CC_paralogs.head())

# of paralogs w/ their corresponding GO term annotations (BP): 12012 
 # of unique GO term accession (BP): 11314


Unnamed: 0,ensembl_id,ensembl_transcript_id,GO_accession,GO_name,GO_domain
0,ENSG00000114374,ENST00000651177,GO:0006511,ubiquitin-dependent protein catabolic process,biological_process
1,ENSG00000114374,ENST00000651177,GO:0016579,protein deubiquitination,biological_process
2,ENSG00000114374,ENST00000651177,GO:0006508,proteolysis,biological_process
3,ENSG00000114374,ENST00000651177,GO:0016579,protein deubiquitination,biological_process
4,ENSG00000114374,ENST00000651177,GO:0016477,cell migration,biological_process


# of paralogs w/ their corresponding GO term annotations (CC): 12493 
 # of unique GO term accession (CC): 1526


Unnamed: 0,ensembl_id,ensembl_transcript_id,GO_accession,GO_name,GO_domain
0,ENSG00000114374,ENST00000651177,GO:0005737,cytoplasm,cellular_component
1,ENSG00000114374,ENST00000651177,GO:0005737,cytoplasm,cellular_component
2,ENSG00000114374,ENST00000651177,GO:0005634,nucleus,cellular_component
3,ENSG00000114374,ENST00000651177,GO:0005829,cytosol,cellular_component
4,ENSG00000114374,ENST00000338981,GO:0005737,cytoplasm,cellular_component


In [21]:
# group by GO_accession and GO_name
sum_bp_paralogs = filtered_BP_paralogs.groupby(['GO_accession', 'GO_name'])['ensembl_id'].apply(set).reset_index(name="genes")
display(sum_bp_paralogs.head())

sum_cc_paralogs = filtered_CC_paralogs.groupby(['GO_accession', 'GO_name'])['ensembl_id'].apply(set).reset_index(name="genes")
display(sum_cc_paralogs.head())

Unnamed: 0,GO_accession,GO_name,genes
0,GO:0000002,mitochondrial genome maintenance,"{ENSG00000115204, ENSG00000114120, ENSG0000011..."
1,GO:0000003,reproduction,{ENSG00000189409}
2,GO:0000012,single strand break repair,"{ENSG00000225830, ENSG00000096717, ENSG0000013..."
3,GO:0000017,alpha-glucoside transport,"{ENSG00000100170, ENSG00000140675}"
4,GO:0000018,regulation of DNA recombination,"{ENSG00000114030, ENSG00000182481}"


Unnamed: 0,GO_accession,GO_name,genes
0,GO:0000015,phosphopyruvate hydratase complex,"{ENSG00000108515, ENSG00000111674, ENSG0000007..."
1,GO:0000118,histone deacetylase complex,"{ENSG00000162227, ENSG00000163517, ENSG0000016..."
2,GO:0000123,histone acetyltransferase complex,"{ENSG00000156983, ENSG00000043143, ENSG0000017..."
3,GO:0000124,SAGA complex,"{ENSG00000162227, ENSG00000108773, ENSG0000027..."
4,GO:0000137,Golgi cis cisterna,"{ENSG00000116209, ENSG00000175265, ENSG0000016..."


In [22]:
# filter biological processes - include processes with 5 to 100 expresses genes

sum_bp_paralogs['sm_biological_process'] = sum_bp_paralogs['genes'].apply(lambda x: 100 > len(x) > 5)
sum_bp_paralogs_df = sum_bp_paralogs[sum_bp_paralogs['sm_biological_process'] == True].reset_index(drop=True).drop(['sm_biological_process'], axis=1)
display(sum_bp_paralogs_df.head())

Unnamed: 0,GO_accession,GO_name,genes
0,GO:0000002,mitochondrial genome maintenance,"{ENSG00000115204, ENSG00000114120, ENSG0000011..."
1,GO:0000038,very long-chain fatty acid metabolic process,"{ENSG00000113396, ENSG00000184227, ENSG0000011..."
2,GO:0000045,autophagosome assembly,"{ENSG00000138069, ENSG00000174437, ENSG0000013..."
3,GO:0000050,urea cycle,"{ENSG00000118520, ENSG00000021826, ENSG0000024..."
4,GO:0000070,mitotic sister chromatid segregation,"{ENSG00000072501, ENSG00000186185, ENSG0000012..."


In [23]:
# filter cellular component - include processes with 5 to 100 expresses genes

sum_cc_paralogs['sm_cellular_component'] = sum_cc_paralogs['genes'].apply(lambda x: 100 > len(x) > 5)
sum_cc_paralogs_df = sum_cc_paralogs[sum_cc_paralogs['sm_cellular_component'] == True].reset_index(drop=True).drop(['sm_cellular_component'], axis=1)
display(sum_cc_paralogs_df.head())

Unnamed: 0,GO_accession,GO_name,genes
0,GO:0000118,histone deacetylase complex,"{ENSG00000162227, ENSG00000163517, ENSG0000016..."
1,GO:0000123,histone acetyltransferase complex,"{ENSG00000156983, ENSG00000043143, ENSG0000017..."
2,GO:0000124,SAGA complex,"{ENSG00000162227, ENSG00000108773, ENSG0000027..."
3,GO:0000137,Golgi cis cisterna,"{ENSG00000116209, ENSG00000175265, ENSG0000016..."
4,GO:0000138,Golgi trans cisterna,"{ENSG00000116209, ENSG00000130733, ENSG0000008..."


In [24]:
# for i in range(sum_cc_paralogs_df.shape[0]):
#     if set(('ENSG00000138758', 'ENSG00000125354')).issubset(sum_cc_paralogs_df.genes[i]):
#         print(sum_cc_paralogs_df.GO_accession[i])

In [25]:
def precompute_go_effects(go_df, gene_expression):
    go_to_mean_effect = {}
    valid_go_df = []

    for _, row in go_df.iterrows():
        go_id = row['GO_accession']
        gene_set = list(row['genes'])
        genes_in_effect = [g for g in gene_set if g in gene_expression.columns]
        if len(genes_in_effect) >= 2:
            go_to_mean_effect[go_id] = gene_expression[genes_in_effect].mean(axis=1)
            valid_go_df.append(row)

    return go_to_mean_effect, pd.DataFrame(valid_go_df)

def build_gene_to_go_map(go_df):
    gene_to_go = defaultdict(set)
    for _, row in go_df.iterrows():
        go_id = row['GO_accession']
        for gene in row['genes']:
            gene_to_go[gene].add(go_id)
    return gene_to_go

def process_gene_pair_optimized(pair, go_df, go_to_mean_effect, gene_to_go):
    a1, a2 = pair
    shared_go_ids = gene_to_go.get(a1, set()).intersection(gene_to_go.get(a2, set()))

    if not shared_go_ids:
        return pd.DataFrame(), pd.DataFrame()

    go_processes_for_pair = []
    go_name_list = []
    len_of_go = []
    pair_expression_df = pd.DataFrame()

    for go_id in shared_go_ids:
        row = go_df[go_df['GO_accession'] == go_id].iloc[0]
        gene_set = row['genes']
        go_processes_for_pair.append(go_id)
        go_name_list.append(row['GO_name'])
        len_of_go.append(len(gene_set))
        pair_expression_df[go_id] = go_to_mean_effect[go_id]

    pair_df = pd.DataFrame({
        'paralog_pairs': [pair] * len(go_processes_for_pair),
        'GO_accession': go_processes_for_pair,
        'GO_name': go_name_list,
        'len_of_go': len_of_go
    }).sort_values(by='len_of_go', ascending=False).reset_index(drop=True)

    return pair_df, pair_expression_df

In [26]:
def summarize_GO_expression_scores(pair_df, pair_expression_df, pair, paralog_pairs_dict=None):
    if pair_df.empty or pair_expression_df.empty:
        return None

    result = {}

    # --- smallest_GO_accession (per cell line) ---
    min_len = pair_df['len_of_go'].min()
    smallest_df = pair_df[pair_df['len_of_go'] == min_len]
    smallest_go_ids = smallest_df['GO_accession'].tolist()
    smallest_subset = pair_expression_df[smallest_go_ids]

    # For each cell line, get the GO with the highest expression among the smallest GOs
    result['smallest_GO_accession'] = smallest_subset.idxmax(axis=1)
    result['smallest_gene_expression'] = smallest_subset.max(axis=1)

    # --- min/max GO by expression (per cell line) ---
    result['min_GO_accession'] = pair_expression_df.idxmin(axis=1)
    result['min_gene_expression'] = pair_expression_df.min(axis=1)

    result['max_GO_accession'] = pair_expression_df.idxmax(axis=1)
    result['max_gene_expression'] = pair_expression_df.max(axis=1)

    # --- paralog pair label ---
    if paralog_pairs_dict is not None:
        result['paralog_pair'] = paralog_pairs_dict.get(pair, '|'.join(pair))
    else:
        result['paralog_pair'] = '|'.join(pair)

    # --- return as DataFrame ---
    return pd.DataFrame({
        'cell_line': pair_expression_df.index,
        'paralog_pair': result['paralog_pair'],
        'smallest_GO_accession': result['smallest_GO_accession'].values,
        'smallest_gene_expression': result['smallest_gene_expression'].values,
        'min_GO_accession': result['min_GO_accession'].values,
        'min_gene_expression': result['min_gene_expression'].values,
        'max_GO_accession': result['max_GO_accession'].values,
        'max_gene_expression': result['max_gene_expression'].values,
    })

In [27]:
# Step 1: Precompute mean gene expression per GO term
go_to_mean_expression, filtered_go_df_expr = precompute_go_effects(
    sum_bp_paralogs_df, gene_expression
)

# Step 2: Build gene → GO term mapping
gene_to_go_expr = build_gene_to_go_map(filtered_go_df_expr)

# Step 4: Run loop over paralog pairs
expression_summaries = []

for pair in paralog_pairs:
    pair_df, pair_expression_df = process_gene_pair_optimized(
        pair, filtered_go_df_expr, go_to_mean_expression, gene_to_go_expr
    )
    summary_df = summarize_GO_expression_scores(pair_df, pair_expression_df, pair, paralog_pairs_dict)

    if summary_df is not None:
        expression_summaries.append(summary_df)

# Step 5: Final combined result
final_expression_df = pd.concat(expression_summaries).reset_index(drop=True)

In [28]:
final_expression_df[:3]

Unnamed: 0,cell_line,paralog_pair,smallest_GO_accession,smallest_gene_expression,min_GO_accession,min_gene_expression,max_GO_accession,max_gene_expression
0,ACH-001113,SMARCA2_SMARCA4,GO:0070316,4.449597,GO:0045596,2.391188,GO:2000819,4.702816
1,ACH-001289,SMARCA2_SMARCA4,GO:0070316,4.992393,GO:0045596,2.311941,GO:2000819,5.106746
2,ACH-001339,SMARCA2_SMARCA4,GO:0070316,4.407439,GO:0045596,2.210668,GO:2000819,4.510038


In [29]:
final_expression_df['paralog_pair'].nunique(), paralog_pairs.nunique()

(15793, 36237)

In [30]:
###### Test ######

#PRKACA_PRKACB
#pair = {'ENSG00000072062', 'ENSG00000142875'}

#PTK2_PTK2B
pair = {'ENSG00000169398', 'ENSG00000120899'}

filtered_df = sum_bp_paralogs_df[sum_bp_paralogs_df['genes'].apply(lambda x: pair.issubset(x))]
filtered_df.insert(filtered_df.shape[1], 'len_of_genes', filtered_df['genes'].apply(len))

go_effect_subset = {go: go_to_mean_expression[go] for go in filtered_df['GO_accession']}
go_effect_df = pd.DataFrame(go_effect_subset)

min_len = filtered_df['len_of_genes'].min()
smallest_df = filtered_df[filtered_df['len_of_genes'] == min_len]
smallest_go_ids = smallest_df['GO_accession'].tolist()
smallest_subset = go_effect_df[smallest_go_ids]

#smallest GOs: GO:0007172 and GO:1902531

result_df = pd.DataFrame(index=go_effect_df.index)
result_df['smallest_GO_accession'] = smallest_subset.idxmax(axis=1)
result_df['smallest_gene_expression'] = smallest_subset.max(axis=1)
result_df['min_GO_accession'] = go_effect_df.idxmin(axis=1)
result_df['min_gene_expression'] = go_effect_df.min(axis=1)
result_df['max_GO_accession'] = go_effect_df.idxmax(axis=1)
result_df['max_gene_expression'] = go_effect_df.max(axis=1)

# include all GO expression values for comparison
result_df = pd.concat([result_df, go_effect_df], axis=1)
display(result_df.loc[result_df.index == 'ACH-000022'])

# --- paralog pair ---
display(final_expression_df.loc[(final_expression_df.paralog_pair == 'PRKACA_PRKACB') & (final_expression_df.cell_line == 'ACH-000022'), :])

###### Test ######

Unnamed: 0,smallest_GO_accession,smallest_gene_expression,min_GO_accession,min_gene_expression,max_GO_accession,max_gene_expression,GO:0007172,GO:0007173,GO:0007229,GO:0018108,GO:0030155,GO:0045860,GO:0048010,GO:0051128,GO:0051239,GO:0071310,GO:1902531,GO:2000060
ACH-000022,GO:0007172,4.378402,GO:0018108,2.120749,GO:0007172,4.378402,4.378402,3.406025,2.990841,2.120749,2.714527,2.675518,2.553962,3.321127,2.837618,2.525463,4.360907,2.968965


Unnamed: 0,cell_line,paralog_pair,smallest_GO_accession,smallest_gene_expression,min_GO_accession,min_gene_expression,max_GO_accession,max_gene_expression
220029,ACH-000022,PRKACA_PRKACB,GO:0034380,2.107183,GO:0070613,1.742455,GO:0001843,3.426666


In [34]:
go_to_mean_expr_norm, filtered_go_df_expr_norm = precompute_go_effects(
    sum_bp_paralogs_df, zexpression_data
)

gene_to_go_expr_norm = build_gene_to_go_map(filtered_go_df_expr_norm)

# Step 3: Process paralog pairs
normalized_expr_summaries = []

for pair in paralog_pairs:
    pair_df, pair_expr_df = process_gene_pair_optimized(
        pair, filtered_go_df_expr_norm, go_to_mean_expr_norm, gene_to_go_expr_norm
    )
    summary_df = summarize_GO_expression_scores(
        pair_df, pair_expr_df, pair, paralog_pairs_dict
    )

    if summary_df is not None:
        normalized_expr_summaries.append(summary_df)

# Step 4: Final DataFrame
final_expr_norm_df = pd.concat(normalized_expr_summaries).reset_index(drop=True)

In [35]:
final_expr_norm_df['paralog_pair'].nunique(), paralog_pairs.nunique()

(15793, 36237)

In [36]:
final_expression_df = final_expression_df.rename(columns={
    'smallest_GO_accession': 'smallest_BP_GO_accession',
    'smallest_gene_expression': 'smallest_BP_GO_gene_expression',
    'min_GO_accession': 'min_BP_GO_accession', 
    'min_gene_expression': 'min_BP_GO_gene_expression',
    'max_GO_accession': 'max_BP_GO_accession', 
    'max_gene_expression': 'max_BP_GO_gene_expression',
})

final_expr_norm_df = final_expr_norm_df.rename(columns={
    'smallest_GO_accession': 'zsmallest_BP_GO_accession',
    'smallest_gene_expression': 'zsmallest_BP_GO_gene_expression',
    'min_GO_accession': 'zmin_BP_GO_accession', 
    'min_gene_expression': 'zmin_BP_GO_gene_expression',
    'max_GO_accession': 'zmax_BP_GO_accession', 
    'max_gene_expression': 'zmax_BP_GO_gene_expression',
})

In [37]:
BP_GO_merged_df = pd.merge(
    final_expression_df, final_expr_norm_df,
    on=['paralog_pair', 'cell_line'],
    how='outer',
)

In [38]:
BP_GO_merged_df[:3]

Unnamed: 0,cell_line,paralog_pair,smallest_BP_GO_accession,smallest_BP_GO_gene_expression,min_BP_GO_accession,min_BP_GO_gene_expression,max_BP_GO_accession,max_BP_GO_gene_expression,zsmallest_BP_GO_accession,zsmallest_BP_GO_gene_expression,zmin_BP_GO_accession,zmin_BP_GO_gene_expression,zmax_BP_GO_accession,zmax_BP_GO_gene_expression
0,ACH-000001,A1CF_RBM47,GO:0016554,0.977073,GO:0016554,0.977073,GO:0016554,0.977073,GO:0016554,-0.37568,GO:0016554,-0.37568,GO:0016554,-0.37568
1,ACH-000002,A1CF_RBM47,GO:0016554,0.293647,GO:0016554,0.293647,GO:0016554,0.293647,GO:0016554,-0.78886,GO:0016554,-0.78886,GO:0016554,-0.78886
2,ACH-000003,A1CF_RBM47,GO:0016554,2.008663,GO:0016554,2.008663,GO:0016554,2.008663,GO:0016554,0.650103,GO:0016554,0.650103,GO:0016554,0.650103


In [39]:
# Step 1: Precompute mean z-expression per GO term (CC ontology)
go_to_mean_expr_cc, filtered_go_df_expr_cc = precompute_go_effects(
    sum_cc_paralogs_df, gene_expression
)

# Step 2: Build gene → GO term map
gene_to_go_expr_cc = build_gene_to_go_map(filtered_go_df_expr_cc)

# Step 3: Process paralog pairs and summarize GO scores
normalized_expr_summaries_cc = []

for pair in paralog_pairs:
    pair_df, pair_expr_df = process_gene_pair_optimized(
        pair,
        filtered_go_df_expr_cc,
        go_to_mean_expr_cc,
        gene_to_go_expr_cc
    )

    summary_df = summarize_GO_expression_scores(
        pair_df,
        pair_expr_df,
        pair,
        paralog_pairs_dict
    )

    if summary_df is not None:
        normalized_expr_summaries_cc.append(summary_df)

# Step 4: Combine results
final_expr_cc_df = pd.concat(normalized_expr_summaries_cc).reset_index(drop=True)

In [40]:
# Step 1: Precompute mean z-expression per GO term (CC ontology)
go_to_mean_expr_norm_cc, filtered_go_df_expr_norm_cc = precompute_go_effects(
    sum_cc_paralogs_df, zexpression_data
)

# Step 2: Build gene → GO term map
gene_to_go_expr_norm_cc = build_gene_to_go_map(filtered_go_df_expr_norm_cc)

# Step 3: Process paralog pairs and summarize GO scores
normalized_expr_summaries_cc = []

for pair in paralog_pairs:
    pair_df, pair_expr_df = process_gene_pair_optimized(
        pair,
        filtered_go_df_expr_norm_cc,
        go_to_mean_expr_norm_cc,
        gene_to_go_expr_norm_cc
    )

    summary_df = summarize_GO_expression_scores(
        pair_df,
        pair_expr_df,
        pair,
        paralog_pairs_dict
    )

    if summary_df is not None:
        normalized_expr_summaries_cc.append(summary_df)

# Step 4: Combine results
final_expr_norm_cc_df = pd.concat(normalized_expr_summaries_cc).reset_index(drop=True)

In [41]:
final_expr_cc_df = final_expr_cc_df.rename(columns={
    'smallest_GO_accession': 'smallest_CC_GO_accession',
    'smallest_gene_expression': 'smallest_CC_GO_gene_expression',
    'min_GO_accession': 'min_CC_GO_accession', 
    'min_gene_expression': 'min_CC_GO_gene_expression',
    'max_GO_accession': 'max_CC_GO_accession', 
    'max_gene_expression': 'max_CC_GO_gene_expression',
})

final_expr_norm_cc_df = final_expr_norm_cc_df.rename(columns={
    'smallest_GO_accession': 'zsmallest_CC_GO_accession',
    'smallest_gene_expression': 'zsmallest_CC_GO_gene_expression',
    'min_GO_accession': 'zmin_CC_GO_accession', 
    'min_gene_expression': 'zmin_CC_GO_gene_expression',
    'max_GO_accession': 'zmax_CC_GO_accession', 
    'max_gene_expression': 'zmax_CC_GO_gene_expression',
})

In [42]:
CC_GO_merged_df = pd.merge(
    final_expr_cc_df, final_expr_norm_cc_df,
    on=['paralog_pair', 'cell_line'],
    how='outer',
)

In [43]:
CC_GO_merged_df[:3]

Unnamed: 0,cell_line,paralog_pair,smallest_CC_GO_accession,smallest_CC_GO_gene_expression,min_CC_GO_accession,min_CC_GO_gene_expression,max_CC_GO_accession,max_CC_GO_gene_expression,zsmallest_CC_GO_accession,zsmallest_CC_GO_gene_expression,zmin_CC_GO_accession,zmin_CC_GO_gene_expression,zmax_CC_GO_accession,zmax_CC_GO_gene_expression
0,ACH-000001,A2M_C3,GO:0072562,2.248426,GO:0072562,2.248426,GO:0072562,2.248426,GO:0072562,0.115564,GO:0072562,0.115564,GO:0072562,0.115564
1,ACH-000002,A2M_C3,GO:0072562,1.409691,GO:0072562,1.409691,GO:0072562,1.409691,GO:0072562,-0.363556,GO:0072562,-0.363556,GO:0072562,-0.363556
2,ACH-000003,A2M_C3,GO:0072562,2.82625,GO:0072562,2.82625,GO:0072562,2.82625,GO:0072562,0.790761,GO:0072562,0.790761,GO:0072562,0.790761


In [44]:
CC_GO_merged_df.loc[(CC_GO_merged_df['paralog_pair'] == 'PRKACA_PRKACB') & (CC_GO_merged_df['cell_line'] == 'ACH-000022'), ]

Unnamed: 0,cell_line,paralog_pair,smallest_CC_GO_accession,smallest_CC_GO_gene_expression,min_CC_GO_accession,min_CC_GO_gene_expression,max_CC_GO_accession,max_CC_GO_gene_expression,zsmallest_CC_GO_accession,zsmallest_CC_GO_gene_expression,zmin_CC_GO_accession,zmin_CC_GO_gene_expression,zmax_CC_GO_accession,zmax_CC_GO_gene_expression
7437076,ACH-000022,PRKACA_PRKACB,GO:0005952,2.395621,GO:0097546,2.282365,GO:0005952,2.395621,GO:0005952,-0.383464,GO:0005952,-0.383464,GO:0097546,-0.190227


In [None]:
# save the files
output_path = get_data_path(['input', 'GO'], '')

BP_GO_merged_df.to_parquet(os.path.join(output_path, 'go_BP_expression.parquet'), engine='pyarrow', index=True)
CC_GO_merged_df.to_parquet(os.path.join(output_path, 'go_CC_expression.parquet'), engine='pyarrow', index=True)