In [18]:
# import modules
import os
import re
import gc
import pandas as pd
import numpy as np
from natsort import natsorted
import pyarrow.parquet as pq
from tqdm.notebook import tqdm

In [2]:
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, ".."))

# build paths inside the repo
get_data_path = lambda folders, fname: os.path.normpath(
    os.path.join(BASE_DIR, *folders, fname)
)

# cancer cell lines
depmap_folder_path = get_data_path(['output', 'processed_DepMap22Q4'], '')
paralog_pairs_path = get_data_path(['input', 'other'], 'processed_DeKegel_TableS8.csv')
model_info_path = get_data_path(['input', 'DepMap22Q4'], 'Model.csv')

In [3]:
paralog_pairs = pd.read_csv(paralog_pairs_path)
paralog_pairs = paralog_pairs.dropna(axis=0, how='any', subset=['A1_entrez_new', 'A2_entrez_new']).reset_index(drop=True)
paralog_pairs['A1_entrez_new'] = paralog_pairs['A1_entrez_new'].astype(int).astype(str)
paralog_pairs['A2_entrez_new'] = paralog_pairs['A2_entrez_new'].astype(int).astype(str)
paralog_pairs[:3]

Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,shared_ppi_mean_essentiality,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr,A1_entrez_new,A2_entrez_new,A1_new,A2_new,A1_ensembl_new,A2_ensembl_new
0,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0.225382,0.627875,18.609973,34.302868,6595,6597,SMARCA2,SMARCA4,ENSG00000080503,ENSG00000127616
1,2,0.1,EXOC6_EXOC6B,EXOC6_EXOC6B,EXOC6,EXOC6B,54536,23233,ENSG00000138190,ENSG00000144036,...,0.285886,0.069456,6.390812,11.168367,54536,23233,EXOC6,EXOC6B,ENSG00000138190,ENSG00000144036
2,3,0.1,STAG1_STAG2,STAG1_STAG2,STAG1,STAG2,10274,10735,ENSG00000118007,ENSG00000101972,...,0.329993,0.854086,13.103716,22.097616,10274,10735,STAG1,STAG2,ENSG00000118007,ENSG00000101972


In [4]:
paralog_pairs_updated = paralog_pairs[['prediction_rank', 'prediction_percentile', 'old_genepair', 'genepair',
                                        'A1_new', 'A2_new', 'A1_entrez_new','A2_entrez_new', 'A1_ensembl_new', 'A2_ensembl_new',
                                        'prediction_score', 'validated_SL', 'n_screens', 'n_screens_SL',
                                        'depmap_hit', 'min_sequence_identity', 'closest', 'WGD', 'family_size',
                                        'cds_length_ratio', 'shared_domains', 'has_pombe_ortholog',
                                        'has_essential_pombe_ortholog', 'has_cerevisiae_ortholog',
                                        'has_essential_cerevisiae_ortholog', 'conservation_score', 'mean_age',
                                        'either_in_complex', 'mean_complex_essentiality', 'colocalisation',
                                        'interact', 'n_total_ppi', 'fet_ppi_overlap',
                                        'shared_ppi_mean_essentiality', 'gtex_spearman_corr',
                                        'gtex_min_mean_expr', 'gtex_max_mean_expr']]
paralog_pairs_updated = paralog_pairs_updated.rename(columns={'A1_entrez_new':'A1_entrez', 'A2_entrez_new': 'A2_entrez', 'A1_new': 'A1', 'A2_new': 'A2', 'A1_ensembl_new': 'A1_ensembl', 'A2_ensembl_new': 'A2_ensembl'})

In [5]:
paralog_pairs_updated[:3]

Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,either_in_complex,mean_complex_essentiality,colocalisation,interact,n_total_ppi,fet_ppi_overlap,shared_ppi_mean_essentiality,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr
0,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,True,0.387262,0.333333,True,302,114.614142,0.225382,0.627875,18.609973,34.302868
1,2,0.1,EXOC6_EXOC6B,EXOC6_EXOC6B,EXOC6,EXOC6B,54536,23233,ENSG00000138190,ENSG00000144036,...,True,0.486857,0.25,True,53,29.782706,0.285886,0.069456,6.390812,11.168367
2,3,0.1,STAG1_STAG2,STAG1_STAG2,STAG1,STAG2,10274,10735,ENSG00000118007,ENSG00000101972,...,True,0.897254,0.0,False,74,39.412527,0.329993,0.854086,13.103716,22.097616


## Process DepMap files

In [6]:
paralogs = set(paralog_pairs_updated['A1_entrez'].unique()).union(paralog_pairs_updated['A2_entrez'].unique())

In [7]:
def get_feature_files(folder):
    """Return sorted list of CSV file paths and their base names (without extension) from a folder."""
    csv_files = [
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.endswith('.csv')
    ]
    csv_files = sorted(csv_files)
    filenames = [os.path.splitext(os.path.basename(f))[0] for f in csv_files]
    return csv_files, filenames

In [8]:
feature_files, feature_names = get_feature_files(depmap_folder_path)
feature_files

['/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/common_essentials.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/copy_number_data.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/expression_data.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/gene_effect_data.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/mutation_data.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_predictio

In [9]:
# extract the mutation data, adn common essentials from files and filenames
remove_idx = [0, 3, 4, 6]
ufeature_files = np.delete(feature_files, remove_idx)
ufeature_names = np.delete(feature_names, remove_idx)

In [10]:
def process_depmap_files(ufiles, paralogs):
    results = []

    for file in ufiles:
        df = pd.read_csv(file, index_col=0, low_memory=False)
        df = df.drop(['cell_name'], axis=1, errors='ignore')  # Drop 'cell_name' if it exists
        filtered_df = df.loc[:, df.columns.isin(paralogs)]
        filtered_df = filtered_df.reset_index(drop=False)
        filtered_df = filtered_df.rename(columns={"index":"DepMap_ID"})

        # Drop cell_name column if it exists
        if "cell_name" in filtered_df.columns:
            filtered_df = filtered_df.drop(["cell_name"], axis=1)

        # Convert wide format to long format
        feature_cols = [col for col in filtered_df.columns if col != "DepMap_ID"]
        melt_df = pd.melt(filtered_df, 
                          id_vars=["DepMap_ID"], 
                          value_vars=feature_cols,
                          var_name='entrez_id', 
                          value_name=os.path.basename(file).split(".")[0])
        
        # Keep entrez_id as string for consistency with paralog pairs data
        melt_df['entrez_id'] = melt_df['entrez_id'].astype(str)
        results.append(melt_df)
        print(f"Processed {file}")
            
    return results

In [11]:
def annotate_features(processed_features):
    """
    Merge paralog pairs with genomic features for both A1 and A2 genes.
    
    Args:
        paralog_pairs_df: DataFrame with paralog pairs
        processed_features: List of melted feature DataFrames from process_depmap_files
    
    Returns:
        DataFrame with features annotated for both A1 and A2 genes
    """
    
    # Start with the first feature as base
    base_df = processed_features[0].copy()
    
    # Merge all features together on DepMap_ID and entrez_id
    for feature_df in processed_features[1:]:
        base_df = pd.merge(base_df, feature_df, on=['DepMap_ID', 'entrez_id'], how='outer')
        
    return base_df

def annotate_mutations(mapped_df, mutation_df):
    """
    Annotate mapped dataframe with mutation information for both A1 and A2 genes.
    
    Args:
        mapped_df: DataFrame with paralog pairs and features
        mutation_df: DataFrame with mutation data
    
    Returns:
        DataFrame with mutation annotations added
    """
    # Filter mutations for relevant cell lines and ensure consistent data types
    filtered = mutation_df[mutation_df['DepMap_ID'].isin(mapped_df['DepMap_ID'])].copy()
    filtered = filtered[['entrez_id', 'DepMap_ID', 'Damaging', 'VariantInfo']].copy()
    
    # Ensure entrez_id is string type for consistent merging
    filtered['entrez_id'] = filtered['entrez_id'].astype(str)
    
    # Create binary indicator for any variant
    filtered['VariantInfo'] = 1
    filtered = filtered.drop_duplicates(subset=['entrez_id', 'DepMap_ID']).reset_index(drop=True)

    return filtered


In [12]:
# Load mutation data
mutation_df = pd.read_csv(feature_files[4], low_memory=False)

# Process all feature files for all paralogs
print("Processing DepMap feature files...")
processed_features = process_depmap_files(ufeature_files, paralogs)

mapped_df = annotate_features(processed_features)
print(f"Mapped dataframe shape: {mapped_df.shape}")

# Add mutation annotations
print("Adding mutation annotations...")
mapped_mutation_df = annotate_mutations(mapped_df, mutation_df)
print(f"Final dataframe shape: {mapped_mutation_df.shape}")

final_df = pd.merge(mapped_df, mapped_mutation_df,
        left_on=['DepMap_ID', 'entrez_id'],
        right_on=['DepMap_ID', 'entrez_id'],
        how='left'
    )


Processing DepMap feature files...
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/copy_number_data.csv
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/expression_data.csv
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_DepMap22Q4/zexpression_data.csv
Mapped dataframe shape: (23696157, 5)
Adding mutation annotations...
Final dataframe shape: (1151800, 4)


In [13]:
final_df['VariantInfo'] = final_df['VariantInfo'].fillna(0).astype(int)
final_df['Damaging'] = final_df['Damaging'].fillna(0).astype(int)

final_df = final_df.rename(columns={'Damaging':'Deleterious', 'VariantInfo': 'mut'})

final_df_cleared = final_df.dropna(subset=['copy_number_data', 'expression_data'], axis=0, how='any')
final_df_sorted = final_df_cleared.sort_values(by=['entrez_id', 'DepMap_ID']).reset_index(drop=True)

In [14]:
final_df_sorted

Unnamed: 0,DepMap_ID,entrez_id,copy_number_data,expression_data,zexpression_data,Deleterious,mut
0,ACH-000001,1,0.888349,3.116032,0.155751,0,0
1,ACH-000002,1,1.028104,1.395063,-0.700537,0,0
2,ACH-000003,1,1.022828,1.400538,-0.697813,0,0
3,ACH-000004,1,1.162923,4.178715,0.684501,0,0
4,ACH-000005,1,1.089594,4.003602,0.597372,0,0
...,...,...,...,...,...,...,...
18523145,ACH-002785,9997,0.925357,0.604071,-4.274450,0,0
18523146,ACH-002800,9997,0.477553,1.996389,-2.540767,0,0
18523147,ACH-002834,9997,1.236121,2.021480,-2.509524,0,0
18523148,ACH-002847,9997,0.960344,3.463361,-0.714126,0,0


In [15]:
# Remove large variables
del final_df
del final_df_cleared
del mapped_df
del mapped_mutation_df

# Run garbage collection
gc.collect()

9

In [24]:
# Save intermediate data for server processing
print("Saving intermediate data...")

# Save final_df_sorted
final_df_sorted.to_parquet(get_data_path(['output'], 'final_df_sorted.parquet'))
print(f"Saved final_df_sorted with shape: {final_df_sorted.shape}")

# Save paralog_pairs_updated
paralog_pairs_updated.to_parquet(get_data_path(['output'], 'paralog_pairs_updated.parquet'))
print(f"Saved paralog_pairs_updated with shape: {paralog_pairs_updated.shape}")

print("Data saved successfully!")

Saving intermediate data...
Saved final_df_sorted with shape: (18523150, 7)
Saved paralog_pairs_updated with shape: (36623, 38)
Data saved successfully!


## Annotate DepMap data to paralog pairs

In [None]:
#model_info_df = pd.read_csv(model_info_path)
#DepMapID_to_cancer = dict(zip(model_info_df['ModelID'], model_info_df['OncotreeLineage']))
#ModelID_to_cellline_name = dict(zip(model_info_df['ModelID'], model_info_df['StrippedCellLineName']))

In [None]:
#final_df_sorted = final_df_sorted.assign(
#    cell_line = final_df_sorted['DepMap_ID'].map(ModelID_to_cellline_name),
#    cancer_type = final_df_sorted['DepMap_ID'].map(DepMapID_to_cancer)
#)

#final_df_sorted[:3]

In [16]:
# extract all unique DepMap IDs
depmap_ids = final_df_sorted['DepMap_ID'].unique()

# Add a dummy key to both dataframes
paralog_pairs_updated['key'] = 1
depmap_df = pd.DataFrame({'DepMap_ID': depmap_ids})
depmap_df['key'] = 1

# Cartesian product
expanded_paralog_pairs = pd.merge(paralog_pairs_updated, depmap_df, on='key').drop('key', axis=1)

expanded_paralog_pairs[:3]

Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,mean_complex_essentiality,colocalisation,interact,n_total_ppi,fet_ppi_overlap,shared_ppi_mean_essentiality,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr,DepMap_ID
0,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0.387262,0.333333,True,302,114.614142,0.225382,0.627875,18.609973,34.302868,ACH-000001
1,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0.387262,0.333333,True,302,114.614142,0.225382,0.627875,18.609973,34.302868,ACH-000002
2,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0.387262,0.333333,True,302,114.614142,0.225382,0.627875,18.609973,34.302868,ACH-000003


In [17]:
expanded_paralog_pairs['genepair'].nunique() , expanded_paralog_pairs['DepMap_ID'].nunique()

(36623, 1399)

In [None]:
# Define features to map
features = ['expression_data', 'zexpression_data', 'copy_number_data', 'mut', 'Deleterious']

# Build lookup: (DepMap_ID, entrez_id) -> feature dict
lookup = final_df_sorted.set_index(['DepMap_ID', 'entrez_id'])[features].to_dict('index')

def get_features(depmap_id, entrez_id):
    return lookup.get((depmap_id, entrez_id), {f: np.nan for f in features})

# Map features for A1 and A2 using list comprehensions with progress bars
for f in features:
    print(f"Mapping {f} for A1...")
    expanded_paralog_pairs[f'A1_{f}'] = [
        get_features(depmap_id, a1_entrez)[f]
        for depmap_id, a1_entrez in tqdm(
            zip(expanded_paralog_pairs['DepMap_ID'], expanded_paralog_pairs['A1_entrez']),
            total=len(expanded_paralog_pairs)
        )
    ]
    print(f"Mapping {f} for A2...")
    expanded_paralog_pairs[f'A2_{f}'] = [
        get_features(depmap_id, a2_entrez)[f]
        for depmap_id, a2_entrez in tqdm(
            zip(expanded_paralog_pairs['DepMap_ID'], expanded_paralog_pairs['A2_entrez']),
            total=len(expanded_paralog_pairs)
        )
    ]

In [26]:
expanded_paralog_pairs.shape

(51235577, 48)

In [27]:
pd.options.mode.copy_on_write = True  # reduce copies in pandas ≥2.0

def calculate_pairwise_features_ip(df: pd.DataFrame) -> pd.DataFrame:
    steps = [
        ("zExp max/min", ('A1_zexpression_data', 'A2_zexpression_data', 'zMaxExp_A1A2', 'zMinExp_A1A2')),
        ("rExp max/min", ('A1_expression_data', 'A2_expression_data', 'rMaxExp_A1A2', 'rMinExp_A1A2')),
        ("copy number max/min", ('A1_copy_number_data', 'A2_copy_number_data', 'max_cn', 'min_cn')),
    ]

    for label, (c1, c2, max_col, min_col) in tqdm(steps, desc="Pairwise features", unit="block"):
        a1 = df[c1].to_numpy(copy=False)
        a2 = df[c2].to_numpy(copy=False)
        df[max_col] = np.maximum(a1, a2)
        df[min_col] = np.minimum(a1, a2)

    return df


def finalize_mutation_flags_ip(df: pd.DataFrame) -> pd.DataFrame:
    flag_cols = ['A1_Deleterious','A2_Deleterious','A1_mut','A2_mut']
    steps = flag_cols + ['Protein_Altering', 'Damaging']

    with tqdm(total=len(steps), desc="Mutation flags", unit="col") as pbar:
        for col in flag_cols:
            s = df[col]
            if s.isna().any():
                df[col] = s.fillna(0).astype('uint8', copy=False)
            else:
                df[col] = s.astype('uint8', copy=False)
            pbar.update(1)

        df['Protein_Altering'] = (df['A1_mut'].to_numpy(copy=False)
                                  + df['A2_mut'].to_numpy(copy=False)).astype('uint8', copy=False)
        pbar.update(1)

        df['Damaging'] = (df['A1_Deleterious'].to_numpy(copy=False)
                          + df['A2_Deleterious'].to_numpy(copy=False)).astype('uint8', copy=False)
        pbar.update(1)

    return df

In [28]:
# Apply with progress
expanded_paralog_pairs = calculate_pairwise_features_ip(expanded_paralog_pairs)
expanded_paralog_pairs = finalize_mutation_flags_ip(expanded_paralog_pairs)

Pairwise features:   0%|          | 0/3 [00:00<?, ?block/s]

Mutation flags:   0%|          | 0/6 [00:00<?, ?col/s]

In [29]:
expanded_paralog_pairs.loc[expanded_paralog_pairs['A1_entrez'] == '6595',]

Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,A1_Deleterious,A2_Deleterious,zMaxExp_A1A2,zMinExp_A1A2,rMaxExp_A1A2,rMinExp_A1A2,max_cn,min_cn,Protein_Altering,Damaging
0,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0,0,-0.170854,-0.253171,5.950002,4.349790,1.555071,0.683473,0,0
1,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0,0,1.031422,0.482802,6.657068,6.042644,1.028104,1.005384,0,0
2,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0,0,-0.090127,-1.135906,6.106641,2.990955,0.837040,0.828969,0,0
3,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0,0,1.914890,1.010879,7.286604,7.164404,1.810565,1.238790,0,0
4,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0,0,1.885330,1.181772,7.328585,7.244982,1.699552,1.267558,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1394,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0,0,0.826296,-0.476418,5.753818,5.735522,1.393345,1.214934,0,0
1395,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0,0,0.606688,-1.414372,5.444601,4.834408,1.105940,0.852305,1,0
1396,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0,0,0.325279,-0.256222,6.505732,4.229588,1.284490,0.754979,0,0
1397,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,0,0,-0.421199,-0.624399,5.593354,3.997292,0.943964,0.639318,0,0


In [31]:
expanded_paralog_pairs.isna().sum() 

prediction_rank                             0
prediction_percentile                       0
old_genepair                                0
genepair                                    0
A1                                          0
A2                                          0
A1_entrez                                   0
A2_entrez                                   0
A1_ensembl                                  0
A2_ensembl                                  0
prediction_score                            0
validated_SL                                0
n_screens                                   0
n_screens_SL                                0
depmap_hit                           46161404
min_sequence_identity                       0
closest                                     0
WGD                                         0
family_size                                 0
cds_length_ratio                            0
shared_domains                              0
has_pombe_ortholog                

In [32]:
# Save expanded_paralog_pairs
output_path = get_data_path(['output'], 'expanded_paralog_pairs.parquet')
expanded_paralog_pairs.to_parquet(
    output_path,
    index=False,         # no need to store the index unless you need it later
    engine="pyarrow"    # faster than fastparquet for wide tables
)

print("Data saved successfully!")

Saved expanded_paralog_pairs with shape: (51235577, 56)
Data saved successfully!
