### Annotation of DepMap data to CRISPR screens

In [1]:
import os
import pandas as pd
import numpy as np
from functools import reduce

In [2]:
# set the base directory for the project
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))

# build paths inside the repo
get_data_path = lambda folders, fname: os.path.normpath(
    os.path.join(BASE_DIR, *folders, fname)
)

depmap_folder_path = get_data_path(['data', 'output', 'processed_DepMap22Q4'], '')
crispr_screens_path = get_data_path(['data','output', 'processed_CRISPR_screens'], '')

In [3]:
def get_feature_files(folder):
    """Return sorted list of CSV file paths and their base names (without extension) from a folder."""
    csv_files = [
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.endswith('.csv')
    ]
    csv_files = sorted(csv_files)
    filenames = [os.path.splitext(os.path.basename(f))[0] for f in csv_files]
    return csv_files, filenames

In [4]:
feature_files, feature_names = get_feature_files(depmap_folder_path)
feature_files

['/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_DepMap22Q4/common_essentials.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_DepMap22Q4/copy_number_data.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_DepMap22Q4/expression_data.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_DepMap22Q4/gene_effect_data.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_DepMap22Q4/mutation_data.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/cont

In [5]:
# extract the mutation data, adn common essentials from files and filenames
remove_idx = [0, 4]
ufeature_files = np.delete(feature_files, remove_idx)
ufeature_names = np.delete(feature_names, remove_idx)

In [6]:
def get_csv_files_from_folder(folder_path):
    return sorted([
        os.path.join(folder_path, file)
        for file in os.listdir(folder_path)
        if file.endswith('.csv')
    ])

In [7]:
target_pair_files = get_csv_files_from_folder(crispr_screens_path)
target_pair_files

['/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_CRISPR_screens/processed_ito_df.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_CRISPR_screens/processed_ito_df_CCLE22Q4.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_CRISPR_screens/processed_ito_df_labelled.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_CRISPR_screens/processed_ito_df_network.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_CRISPR_screens/processed_ito_df_parrish_labelled.csv',
 '/Users/narod/Library/CloudStor

In [10]:
def process_depmap_files(ufiles, cell_lines):

    results = []

    for file in ufiles:
        filtered_df = pd.read_csv(file, index_col=0, low_memory=False)
        filtered_df = filtered_df[filtered_df.index.isin(cell_lines['DepMap_ID'])].reset_index()
        filtered_df = filtered_df.rename(columns={"index":"DepMap_ID"})

        # Convert wide format to long format
        melt_df = pd.melt(filtered_df.drop(["cell_name"], axis=1), 
                          id_vars=["DepMap_ID"], value_vars=filtered_df.drop(["cell_name"], axis=1).columns,
                          var_name='entrez_id', value_name='value')
        
        melt_df = melt_df.astype({'entrez_id':'int'})
        melt_df = melt_df.rename(columns={'value': os.path.basename(file).split(".")[0]})
        results.append(melt_df)
        print(f"Processed {file}")
            
    return results

In [11]:
def annotate_features(target_df, processed_features):
    df = target_df.copy()

    # Annotate A1
    for feature_df in processed_features:
        df = pd.merge(
            df,
            feature_df.rename(columns={'entrez_id': 'A1_entrez'}),
            on=['DepMap_ID', 'A1_entrez'],
            how='left'
        )

    df = df.rename(columns={
        'copy_number_data': 'A1_copy_number_data',
        'expression_data': 'A1_expression_data',
        'gene_effect_data': 'A1_gene_effect_data',
        'zexpression_data': 'A1_zexpression_data',
        'zgene_effect_data': 'A1_zgene_effect_data'
    })

    # Annotate A2
    for feature_df in processed_features:
        df = pd.merge(
            df,
            feature_df.rename(columns={'entrez_id': 'A2_entrez'}),
            on=['DepMap_ID', 'A2_entrez'],
            how='left'
        )

    df = df.rename(columns={
        'copy_number_data': 'A2_copy_number_data',
        'expression_data': 'A2_expression_data',
        'gene_effect_data': 'A2_gene_effect_data',
        'zexpression_data': 'A2_zexpression_data',
        'zgene_effect_data': 'A2_zgene_effect_data'
    })

    return df


In [None]:
def annotate_mutations(mapped_df, mutation_df):
    # Keep only mutation rows for the cell lines in mapped_df
    m = mutation_df.loc[
        mutation_df['DepMap_ID'].isin(mapped_df['DepMap_ID']),
        ['entrez_id', 'DepMap_ID', 'Damaging']
    ].copy()

    # Collapse variant-level rows -> one row per (gene, cell line)
    # A gene is "mutated" if it appears at least once in mutation_df for that cell line
    # A gene is "deleterious" if ANY of its variants is damaging (max over 0/1)
    agg = (m.groupby(['entrez_id', 'DepMap_ID'], as_index=False)
             .agg(
                 mut_any=('Damaging', 'size'),        # count of variant rows
                 deleterious=('Damaging', 'max')      # any damaging variant
             ))

    # Convert mut_any count -> binary 0/1
    agg['mut_any'] = (agg['mut_any'] > 0).astype(int)

    # Merge A1 flags
    mapped_df = mapped_df.merge(
        agg.rename(columns={
            'entrez_id': 'A1_entrez',
            'mut_any': 'A1_mut',
            'deleterious': 'A1_Deleterious'
        }),
        on=['DepMap_ID', 'A1_entrez'],
        how='left'
    )

    # Merge A2 flags
    mapped_df = mapped_df.merge(
        agg.rename(columns={
            'entrez_id': 'A2_entrez',
            'mut_any': 'A2_mut',
            'deleterious': 'A2_Deleterious'
        }),
        on=['DepMap_ID', 'A2_entrez'],
        how='left'
    )

    # Fill missing values: gene had no mutation rows for that cell line
    for c in ['A1_mut', 'A1_Deleterious', 'A2_mut', 'A2_Deleterious']:
        mapped_df[c] = mapped_df[c].fillna(0).astype(int)

    return mapped_df


In [13]:
def filter_common_essentials(df, essentials_df):
    filtered_df = df[~df["A1_entrez"].isin(essentials_df["Essentials"])] 
    mut_filtered_df = filtered_df[~filtered_df["A2_entrez"].isin(essentials_df["Essentials"])]
    mut_filtered_df = mut_filtered_df.reset_index(drop=True)
    return mut_filtered_df

In [22]:
mutation_df = pd.read_csv(feature_files[4], low_memory=False)
common_essentials_df = pd.read_csv(feature_files[0], low_memory=False)
annotated_datasets = []

for i, file in enumerate(target_pair_files):
    # Load target pair dataset
    target_df = pd.read_csv(file)

    # Extract cell lines for that dataset
    cell_lines = pd.DataFrame(target_df["DepMap_ID"].unique(), columns=["DepMap_ID"])
    
    # Process feature files for this dataset's cell lines
    processed_features = process_depmap_files(ufeature_files, cell_lines)
    
    mapped_df = annotate_features(target_df, processed_features)
    mapped_df = mapped_df.sort_values(['genepair', 'DepMap_ID'], ascending=[True, True]).reset_index(drop=True)

    print(f"Processed {file} with shape: {mapped_df.shape}")

    # Add mutation annotations
    mut_mapped_df = annotate_mutations(mapped_df, mutation_df)

    # Common essential filtering
    final_df = filter_common_essentials(mut_mapped_df, common_essentials_df)

    annotated_datasets.append(final_df)
    print(f"Final shape: {final_df.shape}")

Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_DepMap22Q4/copy_number_data.csv
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_DepMap22Q4/expression_data.csv
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_DepMap22Q4/gene_effect_data.csv
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_DepMap22Q4/zexpression_data.csv
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/data/output/processed_DepMap22Q4/zgene_effect_data.csv
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.keb

In [23]:
def calculate_pairwise_features(df):
    return df.assign(
        zMaxExp_A1A2 = lambda x: np.amax(x[['A1_zexpression_data', 'A2_zexpression_data']], axis=1),
        zMinExp_A1A2 = lambda x: np.amin(x[['A1_zexpression_data', 'A2_zexpression_data']], axis=1),

        rMaxExp_A1A2 = lambda x: np.amax(x[['A1_expression_data', 'A2_expression_data']], axis=1),
        rMinExp_A1A2 = lambda x: np.amin(x[['A1_expression_data', 'A2_expression_data']], axis=1),

        max_cn = lambda x: np.amax(x[['A1_copy_number_data', 'A2_copy_number_data']], axis=1),
        min_cn = lambda x: np.amin(x[['A1_copy_number_data', 'A2_copy_number_data']], axis=1),

        zMaxESS_A1A2 = lambda x: np.amax(x[['A1_zgene_effect_data', 'A2_zgene_effect_data']], axis=1),
        zMinESS_A1A2 = lambda x: np.amin(x[['A1_zgene_effect_data', 'A2_zgene_effect_data']], axis=1),

        rMaxESS_A1A2 = lambda x: np.amax(x[['A1_gene_effect_data', 'A2_gene_effect_data']], axis=1),
        rMinESS_A1A2 = lambda x: np.amin(x[['A1_gene_effect_data', 'A2_gene_effect_data']], axis=1)
    )

In [24]:
final_annotated_datasets = []

for df in annotated_datasets:
    enriched_df = calculate_pairwise_features(df)
    final_annotated_datasets.append(enriched_df)
    print(f"Final shape: {enriched_df.shape}")

Final shape: (49764, 36)


In [25]:
def finalize_mutation_flags(df):
    pd.set_option('future.no_silent_downcasting', True)

    df['A1_Deleterious'] = df['A1_Deleterious'].fillna(0).astype(int)
    df['A2_Deleterious'] = df['A2_Deleterious'].fillna(0).astype(int)
    df['A1_mut'] = df['A1_mut'].fillna(0).astype(int)
    df['A2_mut'] = df['A2_mut'].fillna(0).astype(int)

    df['Protein_Altering'] = (df['A1_mut'] + df['A2_mut']).astype(int)
    df['Damaging'] = (df['A1_Deleterious'] + df['A2_Deleterious']).astype(int)

    # Optional: drop rows with any remaining NaNs
    # df.dropna(axis=0, how='any', inplace=True)

    return df

In [None]:
cleaned_final_datasets = []

for df in final_annotated_datasets:
    cleaned_df = finalize_mutation_flags(df)
    cleaned_final_datasets.append(cleaned_df)

In [None]:
# summary of the training dataset
ito = cleaned_final_datasets[0]

# Analyze gene pair and cell line triplets in ito2
print(f"Total number of rows (gene pair - cell line combinations): {len(ito)}")
print(f"Number of unique gene pairs: {ito['genepair'].nunique()}")
print(f"Number of unique cell lines: {ito['cell_line'].nunique()}")
print(f"Number of unique gene pair - cell line combinations: {ito[['genepair', 'cell_line']].drop_duplicates().shape[0]}")

# Show some sample data
print("\nSample of the data:")
print(ito.head())

### save annotated datasets

In [None]:
output_dir = get_data_path(['data', 'output', 'processed_CRISPR_screens'], '')

for i, df in enumerate(cleaned_final_datasets):
    base_filename = os.path.splitext(os.path.basename(target_pair_files[i]))[0]
    output_path = os.path.join(output_dir, f"{base_filename}_CCLE22Q4.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")