In [1]:
# import modules
import os
import re
import pandas as pd
import numpy as np
import gc  # garbage collector
from functools import reduce

In [2]:
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, ".."))

# build paths inside the repo
get_data_path = lambda folders, fname: os.path.normpath(
    os.path.join(BASE_DIR, *folders, fname)
)

# breast cancer cell lines
breast_cancer_df_path = get_data_path(['output', 'breast_cancer'], 'paralog_pairs_breast_cancer_df.csv')

# load features
depmap_folder_path = get_data_path(['output', 'processed_DepMap22Q4'], '')


In [3]:
breast_cancer_df = pd.read_csv(breast_cancer_df_path, low_memory=False)
breast_cancer_df[:3]

Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,A1_new,A2_new,A1_ensembl_new,A2_ensembl_new,DepMap_ID,cell_line,cancer_type,mutation_status,her_label,rb1_defects
0,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,SMARCA2,SMARCA4,ENSG00000080503,ENSG00000127616,ACH-000017,SKBR3,Breast,No mutation,her2_pos,unknown
1,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,SMARCA2,SMARCA4,ENSG00000080503,ENSG00000127616,ACH-000019,MCF7,Breast,PIK3CA mutated,her2_neg,unknown
2,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,SMARCA2,SMARCA4,ENSG00000080503,ENSG00000127616,ACH-000028,KPL1,Breast,PIK3CA mutated,her2_neg,unknown


In [4]:
def get_feature_files(folder):
    """Return sorted list of CSV file paths and their base names (without extension) from a folder."""
    csv_files = [
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.endswith('.csv')
    ]
    csv_files = sorted(csv_files)
    filenames = [os.path.splitext(os.path.basename(f))[0] for f in csv_files]
    return csv_files, filenames

In [5]:
feature_files, feature_names = get_feature_files(depmap_folder_path)
feature_files

['/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/common_essentials.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/copy_number_data.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/expression_data.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/gene_effect_data.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/mutation_data.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_predictio

In [6]:
# extract the mutation data, adn common essentials from files and filenames
remove_idx = [0, 4]
ufeature_files = np.delete(feature_files, remove_idx)
ufeature_names = np.delete(feature_names, remove_idx)

In [None]:
def process_single_depmap_file(file, cell_lines):
    """Process a single DepMap file and return melted dataframe"""
    
    # Load and filter data
    filtered_df = pd.read_csv(file, index_col=0, low_memory=False)
    filtered_df = filtered_df[filtered_df.index.isin(cell_lines['DepMap_ID'])].reset_index()
    filtered_df = filtered_df.rename(columns={"index":"DepMap_ID"})

    # Convert wide format to long format
    melt_df = pd.melt(filtered_df.drop(["cell_name"], axis=1), 
                      id_vars=["DepMap_ID"], 
                      value_vars=filtered_df.drop(["cell_name"], axis=1).columns,
                      var_name='entrez_id', value_name='value')
    
    melt_df = melt_df.astype({'entrez_id':'int'})
    feature_name = os.path.basename(file).split(".")[0]
    melt_df = melt_df.rename(columns={'value': feature_name})
    
    # Clean up memory
    del filtered_df
    return melt_df

def process_depmap_files_memory_efficient(ufiles, cell_lines):
    """Process DepMap files one at a time to reduce memory usage"""
    results = []
    
    for file in ufiles:
        melt_df = process_single_depmap_file(file, cell_lines)
        results.append(melt_df)
        del melt_df
    
    return results

def annotate_features_memory_efficient(target_df, ufiles, cell_lines):
    """Annotate features one file at a time to reduce memory usage"""
    df = target_df.copy()
    
    for file in ufiles:
        
        # Process file using the shared function
        melt_df = process_single_depmap_file(file, cell_lines)
        feature_name = os.path.basename(file).split(".")[0]
        
        # Annotate A1
        df = pd.merge(
            df,
            melt_df.rename(columns={'entrez_id': 'A1_entrez'}),
            on=['DepMap_ID', 'A1_entrez'],
            how='left'
        )
        df = df.rename(columns={feature_name: f'A1_{feature_name}'})
        
        # Annotate A2
        df = pd.merge(
            df,
            melt_df.rename(columns={'entrez_id': 'A2_entrez'}),
            on=['DepMap_ID', 'A2_entrez'],
            how='left'
        )
        df = df.rename(columns={feature_name: f'A2_{feature_name}'})
        
        # Clean up memory
        del melt_df
        
        print(f"Completed annotation for {file}")
    
    return df

In [8]:
# Memory-efficient annotation workflow
import gc  # garbage collector

# Extract cell lines from breast cancer data
cell_lines = pd.DataFrame(breast_cancer_df["DepMap_ID"].unique(), columns=["DepMap_ID"])
print(f"Processing {len(cell_lines)} cell lines")

# Process DepMap features with memory management
print("\n=== Annotating DepMap Features ===")
annotated_df = annotate_features_memory_efficient(breast_cancer_df, ufeature_files, cell_lines)

# Force garbage collection after major processing
gc.collect()

print(f"Shape after DepMap annotation: {annotated_df.shape}")

Starting memory-efficient annotation...
Processing 94 cell lines

=== Annotating DepMap Features ===
Annotating /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/copy_number_data.csv...
Processing /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/copy_number_data.csv...
Completed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/copy_number_data.csv
Completed annotation for /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/copy_number_data.csv
Annotating /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/process

In [9]:
annotated_df[:3]

Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,A1_copy_number_data,A2_copy_number_data,A1_expression_data,A2_expression_data,A1_gene_effect_data,A2_gene_effect_data,A1_zexpression_data,A2_zexpression_data,A1_zgene_effect_data,A2_zgene_effect_data
0,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0.920247,0.817367,2.803227,6.202418,0.08888,-1.519226,-1.269232,0.009565,0.368872,-3.538792
1,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0.865182,1.014479,5.109361,6.877008,0.126018,-0.484287,0.368598,0.711733,0.542432,-0.356751
2,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0.876326,1.032156,4.920293,7.112596,0.095795,-0.06504,0.234321,0.956952,0.401188,0.932271


In [10]:
# Add pairwise features (lightweight operation)
def calculate_pairwise_features(df):
    return df.assign(
        rMaxExp_A1A2 = lambda x: np.amax(x[['A1_expression_data', 'A2_expression_data']], axis=1),
        rMinExp_A1A2 = lambda x: np.amin(x[['A1_expression_data', 'A2_expression_data']], axis=1),
        max_cn = lambda x: np.amax(x[['A1_copy_number_data', 'A2_copy_number_data']], axis=1),
        min_cn = lambda x: np.amin(x[['A1_copy_number_data', 'A2_copy_number_data']], axis=1),
        rMaxESS_A1A2 = lambda x: np.amax(x[['A1_gene_effect_data', 'A2_gene_effect_data']], axis=1),
        rMinESS_A1A2 = lambda x: np.amin(x[['A1_gene_effect_data', 'A2_gene_effect_data']], axis=1),
        zMaxExp_A1A2 = lambda x: np.amax(x[['A1_zexpression_data', 'A2_zexpression_data']], axis=1),
        zMinExp_A1A2 = lambda x: np.amin(x[['A1_zexpression_data', 'A2_zexpression_data']], axis=1),
        zMaxESS_A1A2 = lambda x: np.amax(x[['A1_zgene_effect_data', 'A2_zgene_effect_data']], axis=1),
        zMinESS_A1A2 = lambda x: np.amin(x[['A1_zgene_effect_data', 'A2_zgene_effect_data']], axis=1)
    )

enriched_df = calculate_pairwise_features(annotated_df)

# Clear the previous dataframe
del annotated_df
gc.collect()

print(f"Shape after pairwise features: {enriched_df.shape}")

Shape after pairwise features: (3444912, 69)


In [11]:
# Memory-efficient mutation annotation
print("\n=== Annotating Mutation Features ===")

def annotate_mutations_memory_efficient(mapped_df, mutation_file):
    
    # Load only the required cell lines
    cell_line_filter = mapped_df['DepMap_ID'].unique()
    
    # Read mutation data in chunks if it's very large
    mutation_df = pd.read_csv(mutation_file, low_memory=False)
    filtered = mutation_df[mutation_df['DepMap_ID'].isin(cell_line_filter)]
    
    # Clean up original mutation data
    del mutation_df
    gc.collect()
    
    filtered = filtered[['entrez_id', 'DepMap_ID', 'Damaging', 'VariantInfo']].copy()
    filtered['VariantInfo'] = 1
    filtered = filtered.drop_duplicates(subset=['entrez_id', 'DepMap_ID']).reset_index(drop=True)

    # Merge A1 mutations
    mapped_df = pd.merge(
        mapped_df,
        filtered.rename(columns={
            'entrez_id': 'A1_entrez',
            'VariantInfo': 'A1_mut',
            'Damaging': 'A1_Deleterious'
        }),
        on=['DepMap_ID', 'A1_entrez'],
        how='left'
    )

    # Merge A2 mutations
    mapped_df = pd.merge(
        mapped_df,
        filtered.rename(columns={
            'entrez_id': 'A2_entrez',
            'VariantInfo': 'A2_mut',
            'Damaging': 'A2_Deleterious'
        }),
        on=['DepMap_ID', 'A2_entrez'],
        how='left'
    )
    
    # Clean up
    del filtered
    gc.collect()

    return mapped_df

def finalize_mutation_flags(df):
    pd.set_option('future.no_silent_downcasting', True)

    df['A1_Deleterious'] = df['A1_Deleterious'].fillna(0).astype(int)
    df['A2_Deleterious'] = df['A2_Deleterious'].fillna(0).astype(int)
    df['A1_mut'] = df['A1_mut'].fillna(0).astype(int)
    df['A2_mut'] = df['A2_mut'].fillna(0).astype(int)

    df['Protein_Altering'] = (df['A1_mut'] + df['A2_mut']).astype(int)
    df['Damaging'] = (df['A1_Deleterious'] + df['A2_Deleterious']).astype(int)

    # Optional: drop rows with any remaining NaNs
    # df.dropna(axis=0, how='any', inplace=True)

    return df

# Apply mutation annotations
mutation_file = get_data_path(['output', 'processed_DepMap22Q4'], 'mutation_data.csv')
enriched_df = annotate_mutations_memory_efficient(enriched_df, mutation_file)

# Finalize mutation flags
enriched_df = finalize_mutation_flags(enriched_df)

print(f"Shape after mutation annotation: {enriched_df.shape}")


=== Annotating Mutation Features ===
Shape after mutation annotation: (3444912, 75)


In [None]:
# Common essential filtering
print("\n=== Filtering Common Essentials ===")

def filter_common_essentials(df, essentials_df):
    """Filter out common essential gene pairs"""
    
    # Filter out pairs where A1 is a common essential
    filtered_df = df[~df["A1_entrez"].isin(essentials_df["Essentials"])]
    # Filter out pairs where A2 is a common essential  
    mut_filtered_df = filtered_df[~filtered_df["A2_entrez"].isin(essentials_df["Essentials"])]
    mut_filtered_df = mut_filtered_df.reset_index(drop=True)
    
    before_count = len(df)
    after_count = len(mut_filtered_df)
    print(f"Filtered out {before_count - after_count} pairs containing common essential genes")
    print(f"Remaining pairs: {after_count}")
    
    return mut_filtered_df

# Load common essentials
common_essentials_path = get_data_path(['output', 'processed_DepMap22Q4'], 'common_essentials.csv')
common_essentials_df = pd.read_csv(common_essentials_path)

# Apply common essentials filtering for VERSION 1 (WITHOUT common essentials)
enriched_df_filtered = filter_common_essentials(enriched_df, common_essentials_df)

print(f"Shape after common essentials filtering: {enriched_df_filtered.shape}")

# Keep original dataframe for VERSION 2 (WITH common essentials)
enriched_df_with_essentials = enriched_df.copy()
print(f"Shape for version WITH common essentials: {enriched_df_with_essentials.shape}")

# Continue with filtered version as main pipeline
enriched_df = enriched_df_filtered
del enriched_df_filtered
gc.collect()



=== Filtering Common Essentials ===
Filtered out 217328 pairs containing common essential genes
Remaining pairs: 3227584
Shape after common essentials filtering: (3227584, 75)
Shape for version WITH common essentials: (3444912, 75)


0

In [13]:
# Save intermediate results to avoid re-processing
print("\n=== Saving Intermediate Results ===")
intermediate_path = get_data_path(['output', 'breast_cancer'], 'paralog_pairs_breast_cancer_df_depmap_annotated.parquet')
enriched_df.to_parquet(intermediate_path, index=False)
print(f"Saved intermediate results: {intermediate_path}")

intermediate_path_for_essentials = get_data_path(['output', 'breast_cancer'], 'paralog_pairs_breast_cancer_df_w_essentials_depmap_annotated.parquet')
enriched_df_with_essentials.to_parquet(intermediate_path_for_essentials, index=False)
print(f"Saved intermediate results: {intermediate_path_for_essentials}")


=== Saving Intermediate Results ===
Saved intermediate results: /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/breast_cancer/paralog_pairs_breast_cancer_df_depmap_annotated.parquet
Saved intermediate results: /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/breast_cancer/paralog_pairs_breast_cancer_df_w_essentials_depmap_annotated.parquet


In [14]:
# Add network features annotation - PPI and GO features (following 02_annotate_networkfeatures.ipynb exactly)
print("\n=== Annotating Network Features ===")

# Extract cell lines from breast cancer data
cell_lines = pd.DataFrame(enriched_df["DepMap_ID"].unique(), columns=["DepMap_ID"])

# Paths to network feature files
go_files_path = get_data_path(['input', 'GO'], '')

# Get only the specific 2 PPI files needed
ppi_files = [
    get_data_path(['input', 'PPI'], 'weighted_PPI_expression.parquet'),
    get_data_path(['input', 'PPI'], 'weighted_PPI_essentiality.parquet')
]

# Filter to only existing files
ppi_files = [f for f in ppi_files if os.path.exists(f)]
ppi_filenames = [os.path.basename(f).split(".")[0] for f in ppi_files]

# Get GO files (only parquet files)
go_files = list()
go_filenames = list()

for filename in os.listdir(go_files_path):
    if filename.endswith('.parquet'):
        base_filename = os.path.split(filename)[1]
        go_filenames.append(base_filename.split(".")[0])

        file_directory = os.path.join(go_files_path, filename)
        go_files.append(file_directory)

# GO dictionary for mapping (exact from original)
go_dict = {
    'go_BP_expression':'smallest_BP_GO_expression',
    'go_CC_expression':'smallest_CC_GO_expression',
    'go_BP_ranked_essentiality':'smallest_BP_GO_essentiality',
    'go_CC_ranked_essentiality':'smallest_CC_GO_essentiality'
}

def process_ppi_files(ppi_files, cell_lines):
    results = []

    for file in ppi_files:
        
        # Only process parquet files
        ppi_df = pd.read_parquet(file)
        ppi_df = ppi_df[ppi_df.index.isin(cell_lines['DepMap_ID'])].reset_index()
        ppi_df = ppi_df.rename(columns={"index":"DepMap_ID"})

        # Convert wide format to long format
        melt_ppi = ppi_df.melt(id_vars=["DepMap_ID"], var_name="genepair", value_name="PPI")
        melt_ppi = melt_ppi.rename(columns={'PPI': os.path.basename(file).split(".")[0]})
        results.append(melt_ppi)
        print(f"Processed {file}")
            
    return results

def process_go_files(go_files, cell_lines, go_dict):
    results = []

    for file in go_files:
        basename = os.path.basename(file).split(".")[0]
        value_col = go_dict.get(basename)

        if not value_col:
            print(f"Skipping file {file} as it does not match any known GO file names in go_dict.")
            continue

        # Only process parquet files
        go_df = pd.read_parquet(file)
        go_df = go_df[go_df['cell_line'].isin(cell_lines['DepMap_ID'])]
        go_df = go_df[['cell_line', 'paralog_pair', value_col]]

        go_df = go_df.rename(columns={"cell_line": "DepMap_ID", "paralog_pair": "genepair", value_col: basename})
        results.append(go_df)
        print(f"Processed {file}")
            
    return results

# Process PPI files
list_of_ppi_files = process_ppi_files(ppi_files, cell_lines)

# Process GO files
list_of_go_files = process_go_files(go_files, cell_lines, go_dict)

# Merge all PPI dataframes on 'DepMap_ID' and 'genepair' (exact from original)
ppi_merged_df = reduce(lambda left, right: pd.merge(left, right, on=['DepMap_ID', 'genepair'], how='outer'), list_of_ppi_files)

# Update the column names in the PPI merged dataframe (focused on the 2 specific files)
rename_dict = {
    'weighted_PPI_expression': 'weighted_PPI_expression',                  
    'weighted_PPI_essentiality': 'weighted_PPI_essentiality'
}

ppi_merged_df = ppi_merged_df.rename(columns=rename_dict)
print(f"PPI columns after rename: {list(ppi_merged_df.columns)}")

# Merge all GO dataframes on 'DepMap_ID' and 'genepair' (exact from original)
go_merged_df = reduce(lambda left, right: pd.merge(left, right, on=['DepMap_ID', 'genepair'], how='outer'), list_of_go_files)

# Update the column names in the GO merged dataframe (exact from original)
rename_dict = {
    'go_CC_expression':'smallest_CC_GO_expression',
    'go_BP_ranked_essentiality':'smallest_BP_GO_essentiality',
    'go_BP_expression':'smallest_BP_GO_expression',
    'go_CC_ranked_essentiality': 'smallest_CC_GO_essentiality',
}

go_merged_df = go_merged_df.rename(columns=rename_dict)     

# Combine all PPI and GO dataframes (exact from original)
network_all_df = pd.merge(ppi_merged_df, go_merged_df, on=['DepMap_ID', 'genepair'], how='outer')

# Merge network features with main dataframe (exact from original)
enriched_df = pd.merge(enriched_df, network_all_df, on=['DepMap_ID', 'genepair'], how='left')

print(f"Shape after network annotation: {enriched_df.shape}")

# Clean up memory
del list_of_ppi_files, list_of_go_files, ppi_merged_df, go_merged_df
gc.collect()

print(f"Network features annotation completed")


=== Annotating Network Features ===
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/PPI/weighted_PPI_expression.parquet
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/PPI/weighted_PPI_essentiality.parquet
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/GO/go_CC_expression.parquet
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/GO/go_BP_ranked_essentiality.parquet
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/GO/go_BP_expression.parquet
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos

In [15]:
# Add ranked essentiality features (from 03_annotate_ranked_essentiality.ipynb)
print("\n=== Annotating Ranked Essentiality Features ===")

ranked_essentiality_files_path = get_data_path(['output', 'ranked_essentiality'], '')

def process_ranked_ess_files(ranked_files, cell_lines):

    results = []

    for file in ranked_files:
        print(f"Processing ranked file: {file}...")
        
        ranked_df = pd.read_csv(file, index_col=0)
        filtered_ranked_df = ranked_df[ranked_df.index.isin(cell_lines['DepMap_ID'])]
        filtered_ranked_df = filtered_ranked_df.reset_index(drop=False).rename(columns={"index":"DepMap_ID"})
        
        # Convert wide format to long format
        melt_ranked_df = pd.melt(filtered_ranked_df, id_vars=["DepMap_ID"], var_name="entrez_id", value_name="value")
        melt_ranked_df['entrez_id'] = melt_ranked_df['entrez_id'].astype(int)
        melt_ranked_df = melt_ranked_df.rename(columns={'value': os.path.basename(file).split(".")[0]})
        results.append(melt_ranked_df)
        
        # Clean up memory
        del ranked_df, filtered_ranked_df, melt_ranked_df
        gc.collect()
        
        print(f"Completed ranked file: {file}")

    return results

def annotate_ranked_features(target_df, processed_ranked_ess_files):

    df = target_df.copy()

    # Annotate A1
    for ranked_df in processed_ranked_ess_files:
        df = pd.merge(
            df,
            ranked_df.rename(columns={'entrez_id': 'A1_entrez'}),
            on=['DepMap_ID', 'A1_entrez'],
            how='left'
        )

    df = df.rename(columns={
        'ranked_essentiality':'A1_rank',
        'ranked_zessentiality':'zA1_rank'
    })

    # Annotate A2
    for ranked_df in processed_ranked_ess_files:
        df = pd.merge(
            df,
            ranked_df.rename(columns={'entrez_id': 'A2_entrez'}),
            on=['DepMap_ID', 'A2_entrez'],
            how='left'
        )

    df = df.rename(columns={
        'ranked_essentiality':'A2_rank',
        'ranked_zessentiality':'zA2_rank'
    })

    return df

# Process ranked essentiality features
ranked_files, _ = get_feature_files(ranked_essentiality_files_path)
processed_ranked_features = process_ranked_ess_files(ranked_files, cell_lines)

# Annotate ranked features
enriched_df = annotate_ranked_features(enriched_df, processed_ranked_features)

# Add ranked pairwise features
def calculate_ranked_pairwise_features(df):
    return df.assign(
        max_ranked_A1A2 = lambda x: np.amax(x[['A1_rank', 'A2_rank']], axis=1),
        min_ranked_A1A2 = lambda x: np.amin(x[['A1_rank', 'A2_rank']], axis=1),
        z_max_ranked_A1A2 = lambda x: np.amax(x[['zA1_rank', 'zA2_rank']], axis=1),
        z_min_ranked_A1A2 = lambda x: np.amin(x[['zA1_rank', 'zA2_rank']], axis=1),
    )

enriched_df = calculate_ranked_pairwise_features(enriched_df)

print(f"Shape after ranked essentiality annotation: {enriched_df.shape}")


=== Annotating Ranked Essentiality Features ===
Processing ranked file: /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/ranked_essentiality/ranked_essentiality.csv...
Completed ranked file: /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/ranked_essentiality/ranked_essentiality.csv
Processing ranked file: /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/ranked_essentiality/ranked_zessentiality.csv...
Completed ranked file: /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/ranked_essentiality/ranked_zessentiality.csv
Shape after ranked essentiality annotation: (3227584, 89)


In [16]:
enriched_df.head()

Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,smallest_BP_GO_expression,smallest_CC_GO_essentiality,A1_rank,zA1_rank,A2_rank,zA2_rank,max_ranked_A1A2,min_ranked_A1A2,z_max_ranked_A1A2,z_min_ranked_A1A2
0,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,4.265264,5189.875,14677.0,0.656169,491.0,-0.947882,14677.0,491.0,0.656169,-0.947882
1,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,4.535862,7734.375,15721.0,0.913013,1854.0,-0.614466,15721.0,1854.0,0.913013,-0.614466
2,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,4.881678,8400.25,14888.0,0.708079,7562.0,0.781824,14888.0,7562.0,0.781824,0.708079
3,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,4.753704,,,,,,,,,
4,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,4.543316,8006.25,17378.0,1.320666,5275.0,0.222379,17378.0,5275.0,1.320666,0.222379


In [17]:
enriched_df.isna().sum()

prediction_rank                0
prediction_percentile          0
old_genepair                   0
genepair                       0
A1                             0
                          ...   
zA2_rank                 1824352
max_ranked_A1A2          1727008
min_ranked_A1A2          1727008
z_max_ranked_A1A2        1727008
z_min_ranked_A1A2        1727008
Length: 89, dtype: int64

In [18]:
# Add preprocessing function for missing value handling
print("\n=== Handling Missing Values ===")

def preprocess_dataset(df, old_df=None,
                       required_genepairs_col='genepair',
                       dropna_cols=None,
                       fillna_zero_cols=None,
                       fillna_large_cols=None,
                       fillna_large_value=18000):
    """
    Preprocess a training or testing dataset:
    - Keep only rows with genepairs present in `old_df` (optional)
    - Drop rows with NaN in specific columns
    - Fill missing values with default values
    
    Parameters:
        df (pd.DataFrame): Dataset to process
        old_df (pd.DataFrame): Dataset with allowed genepairs (optional)
        required_genepairs_col (str): Column name for genepairs to match
        dropna_cols (list): Columns for which rows with NaNs should be dropped
        fillna_zero_cols (list): Columns to fill NaNs with 0
        fillna_large_cols (list): Columns to fill NaNs with large constant
        fillna_large_value (int or float): The large value to fill (default: 18000)

    Returns:
        pd.DataFrame: Cleaned and processed DataFrame
    """
    
    df_filtered = df.copy()

    # Step 1: Filter rows to only those in old_df (if provided)
    if old_df is not None:
        df_filtered = df_filtered[df_filtered[required_genepairs_col].isin(old_df[required_genepairs_col])].copy()

    # Step 2: Drop rows with any NA in required columns
    if dropna_cols:
        initial_rows = len(df_filtered)
        df_filtered = df_filtered.dropna(axis=0, how='any', subset=dropna_cols)
        dropped_rows = initial_rows - len(df_filtered)
        if dropped_rows > 0:
            print(f"Dropped {dropped_rows} rows due to missing values in critical columns: {dropna_cols}")

    # Step 3: Fill NaNs with 0 or large number
    if fillna_zero_cols:
        for col in fillna_zero_cols:
            if col in df_filtered.columns:
                na_count = df_filtered[col].isna().sum()
                if na_count > 0:
                    df_filtered[col] = df_filtered[col].fillna(0)
                    print(f"Filled {na_count} missing values in '{col}' with 0")

    if fillna_large_cols:
        for col in fillna_large_cols:
            if col in df_filtered.columns:
                na_count = df_filtered[col].isna().sum()
                if na_count > 0:
                    df_filtered[col] = df_filtered[col].fillna(fillna_large_value)
                    print(f"Filled {na_count} missing values in '{col}' with {fillna_large_value}")

    # Step 4: Reset index for clean result
    return df_filtered.reset_index(drop=True)

# Check missing values before preprocessing
print("Missing values before preprocessing:")
missing_before = enriched_df.isna().sum()
missing_cols = missing_before[missing_before > 0]
if len(missing_cols) > 0:
    print(missing_cols)
else:
    print("No missing values found!")

# Define missing value handling strategy
drop_na_values = ['rMaxExp_A1A2', 'rMinExp_A1A2', 'max_ranked_A1A2', 'min_ranked_A1A2']
fillna_values = ['weighted_PPI_expression', 'smallest_BP_GO_expression', 'smallest_CC_GO_expression']
fillna_values_v2 = ['weighted_PPI_essentiality', 'smallest_BP_GO_essentiality', 'smallest_CC_GO_essentiality']

# Apply preprocessing to handle missing values
enriched_df_clean = preprocess_dataset(
    df=enriched_df,
    old_df=None,  # Don't filter by genepairs since this is already our target dataset
    dropna_cols=drop_na_values,
    fillna_zero_cols=fillna_values,
    fillna_large_cols=fillna_values_v2
)

# Check missing values after preprocessing
print("\nMissing values after preprocessing:")
missing_after = enriched_df_clean.isna().sum()
missing_cols_after = missing_after[missing_after > 0]
if len(missing_cols_after) > 0:
    print(missing_cols_after)
else:
    print("✓ All missing values handled successfully!")

print(f"\nShape before preprocessing: {enriched_df.shape}")
print(f"Shape after preprocessing: {enriched_df_clean.shape}")

# Update the enriched_df with the cleaned version
enriched_df = enriched_df_clean
del enriched_df_clean
gc.collect()


=== Handling Missing Values ===
Missing values before preprocessing:
depmap_hit                     2905916
A1_entrez_new                      846
A2_entrez_new                     1316
A1_new                             846
A2_new                            1316
A1_ensembl_new                     846
A2_ensembl_new                    1316
A1_copy_number_data             833207
A2_copy_number_data             836272
A1_expression_data             1076953
A2_expression_data             1080733
A1_gene_effect_data            1811488
A2_gene_effect_data            1824352
A1_zexpression_data            1076953
A2_zexpression_data            1080733
A1_zgene_effect_data           1811488
A2_zgene_effect_data           1824352
rMaxExp_A1A2                   1068763
rMinExp_A1A2                   1068763
max_cn                          826157
min_cn                          826157
rMaxESS_A1A2                   1727008
rMinESS_A1A2                   1727008
zMaxExp_A1A2                   10

0

In [19]:
print(f'Number of unique gene pairs: {enriched_df.genepair.nunique()}')
print(f'Number of unique cell lines: {enriched_df.cell_line.nunique()}')
enriched_df[:3]

Number of unique gene pairs: 31262
Number of unique cell lines: 45


Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,smallest_BP_GO_expression,smallest_CC_GO_essentiality,A1_rank,zA1_rank,A2_rank,zA2_rank,max_ranked_A1A2,min_ranked_A1A2,z_max_ranked_A1A2,z_min_ranked_A1A2
0,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,4.265264,5189.875,14677.0,0.656169,491.0,-0.947882,14677.0,491.0,0.656169,-0.947882
1,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,4.535862,7734.375,15721.0,0.913013,1854.0,-0.614466,15721.0,1854.0,0.913013,-0.614466
2,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,4.881678,8400.25,14888.0,0.708079,7562.0,0.781824,14888.0,7562.0,0.781824,0.708079


In [20]:
# Save the fully annotated breast cancer dataset
print("\n=== Saving Final Annotated Dataset ===")

# Final dataset name following the naming convention
final_output_path = get_data_path(['output', 'breast_cancer'], 'paralog_pairs_breast_cancer_df_fully_annotated.parquet')
enriched_df.to_parquet(final_output_path, index=False)

print(f"Saved fully annotated breast cancer dataset: {final_output_path}")
print(f"Final shape: {enriched_df.shape}")

# Display summary of available features
print(f"\nFinal dataset contains {enriched_df.shape[1]} features:")
print(f"- DepMap features (expression, copy number, gene effect, mutations)")
print(f"- Network features (PPI and GO terms)")
print(f"- Ranked essentiality features")
print(f"- Pairwise computed features")

# Show columns added in each step
depmap_cols = [col for col in enriched_df.columns if any(x in col for x in ['A1_', 'A2_']) and not any(y in col for y in ['weighted_PPI', 'GO', 'smallest', 'rank'])]
network_cols = [col for col in enriched_df.columns if any(x in col for x in ['weighted_PPI', 'GO', 'smallest'])]
ranked_cols = [col for col in enriched_df.columns if 'rank' in col]
pairwise_cols = [col for col in enriched_df.columns if any(x in col for x in ['Max', 'Min', 'max_', 'min_'])]

print(f"\nDepMap features: {len(depmap_cols)} columns")
print(f"Network features: {len(network_cols)} columns")
print(f"Ranked features: {len(ranked_cols)} columns") 
print(f"Pairwise features: {len(pairwise_cols)} columns")

# Show sample of final data
print("\nSample of final annotated data:")
print(enriched_df.head(3))


=== Saving Final Annotated Dataset ===
Saved fully annotated breast cancer dataset: /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/breast_cancer/paralog_pairs_breast_cancer_df_fully_annotated.parquet
Final shape: (1406790, 89)

Final dataset contains 89 features:
- DepMap features (expression, copy number, gene effect, mutations)
- Network features (PPI and GO terms)
- Ranked essentiality features
- Pairwise computed features

DepMap features: 24 columns
Network features: 6 columns
Ranked features: 9 columns
Pairwise features: 17 columns

Sample of final annotated data:
   prediction_rank  prediction_percentile     old_genepair         genepair  \
0                1                    0.1  SMARCA2_SMARCA4  SMARCA2_SMARCA4   
1                1                    0.1  SMARCA2_SMARCA4  SMARCA2_SMARCA4   
2                1                    0.1  SMARCA2_SMARCA4  SMARCA2_SMARCA4   

        A1       A2  A

In [21]:
intermediate_path_for_essentials = get_data_path(['output', 'breast_cancer'], 'paralog_pairs_breast_cancer_df_w_essentials_depmap_annotated.parquet')
enriched_df_with_essentials = pd.read_parquet(intermediate_path_for_essentials)

In [24]:
# Process VERSION 2: Complete annotation pipeline WITH common essentials
print("\n=== Processing Alternative Version (WITH Common Essentials) ===")

display(enriched_df_with_essentials[:3])

display(enriched_df_with_essentials.shape)

# Extract cell lines for alternative version
cell_lines_alt = pd.DataFrame(enriched_df_with_essentials["DepMap_ID"].unique(), columns=["DepMap_ID"])

# Apply network features annotation to alternative version
print("Applying network features annotation to version WITH common essentials...")
print("Reusing network features from main pipeline (same cell lines, same data)...")

# Merge network features with main dataframe (exact from original)
enriched_df_with_essentials = pd.merge(enriched_df_with_essentials, network_all_df, on=['DepMap_ID', 'genepair'], how='left')

print(f"Shape after network annotation: {enriched_df_with_essentials.shape}")


# Apply ranked essentiality features to alternative version
print("Applying ranked essentiality features to version WITH common essentials...")
print("Reusing ranked essentiality features from main pipeline (same cell lines, same data)...")

# Annotate ranked features
enriched_df_with_essentials = annotate_ranked_features(enriched_df_with_essentials, processed_ranked_features)

enriched_df_with_essentials = calculate_ranked_pairwise_features(enriched_df_with_essentials)

print(f"Shape after ranked essentiality annotation (WITH essentials): {enriched_df_with_essentials.shape}")

# Apply missing value preprocessing to alternative version
print("Applying missing value preprocessing to version WITH common essentials...")

enriched_df_with_essentials_clean = preprocess_dataset(
    df=enriched_df_with_essentials,
    old_df=None,
    dropna_cols=drop_na_values,
    fillna_zero_cols=fillna_values,
    fillna_large_cols=fillna_values_v2
)

print(f"Shape after preprocessing (WITH essentials): {enriched_df_with_essentials_clean.shape}")

# Save alternative version WITH common essentials
alternative_output_path = get_data_path(['output', 'breast_cancer'], 'paralog_pairs_breast_cancer_df_fully_annotated_with_essentials.parquet')
enriched_df_with_essentials_clean.to_parquet(alternative_output_path, index=False)

print(f"Saved alternative dataset (WITH common essentials): {alternative_output_path}")
print(f"Alternative dataset final shape: {enriched_df_with_essentials_clean.shape}")

# Store final shape for summary
essentials_final_shape = enriched_df_with_essentials_clean.shape[0]

print("# of genepairs in alternative version (WITH common essentials):", 
      enriched_df_with_essentials_clean.genepair.nunique())
print("# of cell lines in alternative version (WITH common essentials):",
      enriched_df_with_essentials_clean.cell_line.nunique())

# Clean up alternative version variables
#del enriched_df_with_essentials, enriched_df_with_essentials_clean
#gc.collect()

print("✓ Alternative version (WITH common essentials) processing completed")


=== Processing Alternative Version (WITH Common Essentials) ===


Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,zMaxExp_A1A2,zMinExp_A1A2,zMaxESS_A1A2,zMinESS_A1A2,A1_Deleterious,A1_mut,A2_Deleterious,A2_mut,Protein_Altering,Damaging
0,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0.009565,-1.269232,0.368872,-3.538792,1,1,0,0,1,1
1,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0.711733,0.368598,0.542432,-0.356751,0,0,0,0,0,0
2,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0.956952,0.234321,0.932271,0.401188,0,0,0,0,0,0


(3444912, 75)

Applying network features annotation to version WITH common essentials...
Reusing network features from main pipeline (same cell lines, same data)...
Shape after network annotation: (3444912, 81)
Applying ranked essentiality features to version WITH common essentials...
Reusing ranked essentiality features from main pipeline (same cell lines, same data)...
Shape after ranked essentiality annotation (WITH essentials): (3444912, 89)
Applying missing value preprocessing to version WITH common essentials...
Dropped 1934082 rows due to missing values in critical columns: ['rMaxExp_A1A2', 'rMinExp_A1A2', 'max_ranked_A1A2', 'min_ranked_A1A2']
Filled 90450 missing values in 'weighted_PPI_expression' with 0
Filled 826605 missing values in 'smallest_BP_GO_expression' with 0
Filled 1230615 missing values in 'smallest_CC_GO_expression' with 0
Filled 90450 missing values in 'weighted_PPI_essentiality' with 18000
Filled 827775 missing values in 'smallest_BP_GO_essentiality' with 18000
Filled 1234845