In [1]:
# import modules
import os
import pandas as pd
import pyarrow.parquet as pq
import numpy as np

In [2]:
# Resolve the repository root as two directory levels above the working directory.
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))


def get_data_path(folders, fname):
    """Return the normalised path ``BASE_DIR/<folders...>/<fname>``.

    folders -- sequence of directory names below ``BASE_DIR``.
    fname   -- file name appended last; an empty string yields the folder
               itself, because ``normpath`` drops the trailing separator.
    """
    parts = [BASE_DIR, *folders, fname]
    return os.path.normpath(os.path.join(*parts))


# Folders read by the cells below.
ranked_essentiality_files_path = get_data_path(['output', 'ranked_essentiality'], '')
crispr_screens_path = get_data_path(['output', 'processed_CRISPR_screens'], '')


In [3]:
def get_feature_files(folder):
    """List the CSV files found directly inside *folder*.

    Returns a tuple ``(paths, names)``: ``paths`` holds the joined
    folder/file paths in sorted order, and ``names`` holds the matching
    file names with their extension stripped.
    """
    paths = []
    for entry in os.listdir(folder):
        if entry.endswith('.csv'):
            paths.append(os.path.join(folder, entry))
    paths.sort()

    names = []
    for path in paths:
        stem, _ext = os.path.splitext(os.path.basename(path))
        names.append(stem)
    return paths, names

In [4]:
# Collect the ranked-essentiality CSV paths and their extension-less names;
# the bare `feature_files` expression shows the path list as the cell output.
feature_files, feature_names = get_feature_files(ranked_essentiality_files_path)
feature_files

['/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/ranked_essentiality/ranked_essentiality.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/ranked_essentiality/ranked_zessentiality.csv']

In [5]:
def get_target_files(folder, pattern_suffix='_network'):
    """Return the CSV files in *folder* whose base name ends with ``pattern_suffix``.

    Parameters
    ----------
    folder : str
        Directory to scan (not recursive).
    pattern_suffix : str, default '_network'
        Text that must appear immediately before the ``.csv`` extension.

    Returns
    -------
    (csv_files, filenames) : tuple of two lists
        ``csv_files`` are the sorted, joined paths; ``filenames`` are the
        matching base names with the extension and the trailing
        ``pattern_suffix`` removed.
    """
    # Match the pattern as a real suffix: a name such as 'x_network_old.csv'
    # merely contains it and must not be picked up.
    wanted_ending = pattern_suffix + '.csv'
    csv_files = sorted(
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.endswith(wanted_ending)
    )

    filenames = []
    for path in csv_files:
        stem = os.path.splitext(os.path.basename(path))[0]
        # Strip only the trailing occurrence; slicing by length (instead of
        # stem[:-len(...)]) stays correct when pattern_suffix is ''.
        filenames.append(stem[:len(stem) - len(pattern_suffix)])
    return csv_files, filenames

In [6]:
# Collect the CRISPR screen CSVs selected by the default '_network' pattern and
# their base names; the bare `crispr_files` expression shows the path list.
crispr_files, crispr_filenames = get_target_files(crispr_screens_path)
crispr_files

['/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_harle_df_network.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_ito_df_network.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_klingbeil_df_network.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_parrish_df_network.csv']

In [9]:
def process_ranked_ess_files(ranked_files, cell_lines):
    """Load wide rank tables and reshape them to long format for the given cell lines.

    Each CSV is read with its first column as the row index and every other
    column header convertible to ``int``. Rows whose index value is not in
    ``cell_lines['DepMap_ID']`` are dropped before reshaping.

    Parameters
    ----------
    ranked_files : iterable of str
        Paths of the CSV files to load.
    cell_lines : pandas.DataFrame
        Must contain a ``DepMap_ID`` column listing the rows to keep.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input file with columns ``DepMap_ID``, ``entrez_id``
        (int) and a value column named after the file (the base name up to
        its first dot).
    """
    results = []

    for file in ranked_files:
        ranked_df = pd.read_csv(file, index_col=0)
        keep_rows = ranked_df.index.isin(cell_lines['DepMap_ID'])

        # Name the index explicitly before resetting it. Relying on the default
        # 'index' column name breaks as soon as the CSV's first column has a
        # header, which would leave no 'DepMap_ID' column for melt().
        filtered_ranked_df = (
            ranked_df.loc[keep_rows]
            .rename_axis('DepMap_ID')
            .reset_index()
        )

        # Convert wide format to long format; the value column takes the file's name
        value_column = os.path.basename(file).split(".")[0]
        melt_ranked_df = filtered_ranked_df.melt(
            id_vars=['DepMap_ID'],
            var_name='entrez_id',
            value_name=value_column,
        )
        melt_ranked_df['entrez_id'] = melt_ranked_df['entrez_id'].astype(int)

        results.append(melt_ranked_df)
        print(f"Processed {file}")

    return results

In [10]:
def annotate_features(target_df, processed_ranked_ess_files):
    """Attach per-gene rank columns to a gene-pair table.

    For gene A1 and then gene A2, every frame in
    ``processed_ranked_ess_files`` is left-merged onto a copy of
    ``target_df`` using ``DepMap_ID`` plus that gene's ``*_entrez`` column
    as the key. After each gene's merges the value columns
    ``ranked_essentiality`` / ``ranked_zessentiality`` are renamed to
    ``<gene>_rank`` / ``z<gene>_rank``, so the second pass can reuse the
    original column names. ``target_df`` itself is not modified.
    """
    # (key column, renames applied once this gene's merges are done)
    gene_plan = (
        ('A1_entrez', {'ranked_essentiality': 'A1_rank', 'ranked_zessentiality': 'zA1_rank'}),
        ('A2_entrez', {'ranked_essentiality': 'A2_rank', 'ranked_zessentiality': 'zA2_rank'}),
    )

    annotated = target_df.copy()
    for entrez_key, final_names in gene_plan:
        for ranked_df in processed_ranked_ess_files:
            lookup = ranked_df.rename(columns={'entrez_id': entrez_key})
            annotated = annotated.merge(lookup, on=['DepMap_ID', entrez_key], how='left')
        annotated = annotated.rename(columns=final_names)

    return annotated


In [11]:
# Annotate every CRISPR screen with per-gene rank features; one DataFrame per file
# is collected in `annotated_datasets`, in the same order as `crispr_files`.
annotated_datasets = []

# NOTE(review): the enumerate index `i` is not used inside this loop.
for i, file in enumerate(crispr_files):
    # Load target pair dataset
    target_df = pd.read_csv(file)
    
    # Extract cell lines for that dataset
    cell_lines = pd.DataFrame(target_df["DepMap_ID"].unique(), columns=["DepMap_ID"])

    # The rank CSVs are loaded again on every iteration and restricted to this
    # screen's cell lines.
    processed_features = process_ranked_ess_files(feature_files, cell_lines)

    # Add the A1/A2 rank columns, then order rows by gene pair and cell line.
    mapped_df = annotate_features(target_df, processed_features)
    mapped_df = mapped_df.sort_values(['genepair', 'DepMap_ID'], ascending=[True, True]).reset_index(drop=True)

    print(f"Processed {file} with shape: {mapped_df.shape}")
    
    annotated_datasets.append(mapped_df)
    

Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/ranked_essentiality/ranked_essentiality.csv
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/ranked_essentiality/ranked_zessentiality.csv
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_harle_df_network.csv with shape: (8658, 53)


In [12]:
def calculate_pairwise_features(df):
    """Return a copy of *df* with row-wise max/min of the A1/A2 rank columns.

    Adds ``max_ranked_A1A2`` / ``min_ranked_A1A2`` (from ``A1_rank`` and
    ``A2_rank``) and ``z_max_ranked_A1A2`` / ``z_min_ranked_A1A2`` (from
    ``zA1_rank`` and ``zA2_rank``). The input frame is left unchanged.

    Missing values are skipped by the reductions: a row with only one of the
    two ranks present takes that rank as both max and min, and the result is
    NaN only when both are missing.
    """
    rank_pair = df[['A1_rank', 'A2_rank']]
    zrank_pair = df[['zA1_rank', 'zA2_rank']]

    return df.assign(
        max_ranked_A1A2=rank_pair.max(axis=1),
        min_ranked_A1A2=rank_pair.min(axis=1),
        z_max_ranked_A1A2=zrank_pair.max(axis=1),
        z_min_ranked_A1A2=zrank_pair.min(axis=1),
    )

In [13]:
# Add the pairwise max/min rank columns to every annotated screen, keeping the
# same order as `annotated_datasets`.
final_annotated_datasets = []

for df in annotated_datasets:
    enriched_df = calculate_pairwise_features(df)
    final_annotated_datasets.append(enriched_df)
    # Report the shape after the pairwise columns were added
    print(f"Final shape: {enriched_df.shape}")

Final shape: (8658, 57)


In [14]:
# summary of ito
# NOTE(review): index 0 is the first entry of the sorted `crispr_files` list, and
# the listing displayed for that list starts with the harle file — so this frame
# may not be the ito screen. Confirm which dataset is intended.
ito = final_annotated_datasets[0]

# Count rows, unique gene pairs, unique cell lines, and unique (gene pair, cell line) combinations
print(f"Total number of rows (gene pair - cell line combinations): {len(ito)}")
print(f"Number of unique gene pairs: {ito['genepair'].nunique()}")
print(f"Number of unique cell lines: {ito['cell_line'].nunique()}")
print(f"Number of unique gene pair - cell line combinations: {ito[['genepair', 'cell_line']].drop_duplicates().shape[0]}")

# Show some sample data
print("\nSample of the data:")
print(ito.head())

Total number of rows (gene pair - cell line combinations): 8658
Number of unique gene pairs: 333
Number of unique cell lines: 26
Number of unique gene pair - cell line combinations: 8658

Sample of the data:
    genepair sorted_gene_pair    A1    A2  A1_entrez  A2_entrez   DepMap_ID  \
0  ABL1_ABL2        ABL1|ABL2  ABL1  ABL2         25         27  ACH-000094   
1  ABL1_ABL2        ABL1|ABL2  ABL1  ABL2         25         27  ACH-000114   
2  ABL1_ABL2        ABL1|ABL2  ABL1  ABL2         25         27  ACH-000138   
3  ABL1_ABL2        ABL1|ABL2  ABL1  ABL2         25         27  ACH-000219   
4  ABL1_ABL2        ABL1|ABL2  ABL1  ABL2         25         27  ACH-000222   

  cell_line  SL org_A1  ... smallest_BP_GO_expression  \
0   HPAF-II   0   ABL1  ...                  3.320432   
1  SU.86.86   0   ABL1  ...                  3.210991   
2   CFPAC-1   0   ABL1  ...                  3.232917   
3     A-375   0   ABL1  ...                  3.142405   
4    AsPC-1   0   ABL1  ...     

In [13]:
# Percentage of missing values in the rank columns (per-gene and pairwise) of the first dataset
ranked_columns = ['A1_rank', 'zA1_rank', 'A2_rank', 'zA2_rank', 'max_ranked_A1A2', 'min_ranked_A1A2', 'z_max_ranked_A1A2', 'z_min_ranked_A1A2']

# NaN count per column divided by the number of rows, expressed as a percentage
missing_values = final_annotated_datasets[0][ranked_columns].isna().sum() / len(final_annotated_datasets[0]) * 100
missing_values

A1_rank              13.773009
zA1_rank             13.773009
A2_rank              14.576803
zA2_rank             14.576803
max_ranked_A1A2      10.115746
min_ranked_A1A2      10.115746
z_max_ranked_A1A2    10.115746
z_min_ranked_A1A2    10.115746
dtype: float64

In [14]:
# Count missing values in every column of the first dataset
final_annotated_datasets[0].isna().sum()

genepair                           0
A1                                 0
A2                                 0
A1_entrez                          0
A2_entrez                          0
DepMap_ID                          0
cell_line                          0
Gemini_FDR                       224
raw_LFC                            0
SL                                 0
org_A1                             0
org_A2                             0
A1_copy_number_data              168
A1_expression_data               352
A1_gene_effect_data             6854
A1_zexpression_data              352
A1_zgene_effect_data            6854
A2_copy_number_data              290
A2_expression_data               528
A2_gene_effect_data             7254
A2_zexpression_data              528
A2_zgene_effect_data            7254
A1_Deleterious                     0
A1_mut                             0
A2_Deleterious                     0
A2_mut                             0
zMaxExp_A1A2                      11
z

In [16]:
# Write each annotated dataset to the processed_CRISPR_screens folder as
# "<base name>_ranked_ess.csv".
output_dir = get_data_path(['output', 'processed_CRISPR_screens'], '')

# NOTE(review): datasets and names are paired by position, so this assumes
# `final_annotated_datasets` and `crispr_filenames` were built in the same order
# within the same kernel session — confirm after a Restart & Run All.
for i, df in enumerate(final_annotated_datasets):
    base_filename = crispr_filenames[i]
    output_path = os.path.join(output_dir, f"{base_filename}_ranked_ess.csv")
    # index=False: the row index is not written to the file
    df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

Saved: /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_parrish_df_ranked_ess.csv
