### Annotate Network Features to CRISPR Screens

In [1]:
# import modules
import os
import csv
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from functools import reduce

In [2]:
# set the base directory for the project
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))

# build paths inside the repo
get_data_path = lambda folders, fname: os.path.normpath(
    os.path.join(BASE_DIR, *folders, fname)
)

crispr_screens_path = get_data_path(['output', 'processed_CRISPR_screens'], '')
ppi_files_path = get_data_path(['input', 'PPI'], '')
go_files_path = get_data_path(['input', 'GO'], '')

In [3]:
def get_target_files(folder, pattern_suffix='_CCLE22Q4'):
    """Return sorted list of CSV file paths and their base names (without _CCLE22Q4 suffix) from a folder."""
    csv_files = [
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.endswith('.csv') and pattern_suffix in f
    ]
    csv_files = sorted(csv_files)
    # Extract base names without the pattern suffix and .csv extension
    filenames = [
        os.path.splitext(os.path.basename(f))[0].replace(pattern_suffix, '') 
        for f in csv_files
    ]
    return csv_files, filenames

In [4]:
crispr_files, filenames = get_target_files(crispr_screens_path)
crispr_files

['/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_harle_df_CCLE22Q4.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_ito_df_CCLE22Q4.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_klingbeil_df_CCLE22Q4.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_parrish_CCLE22Q4.csv']

In [7]:
# save pathnames of csv PPI files
ppi_files = list()
ppi_filenames = list()

for filename in os.listdir(ppi_files_path):
    if filename.endswith('.csv'):
    
        base_filename = os.path.split(filename)[1]
        ppi_filenames.append(base_filename.split(".")[0])
        
        file_directory = os.path.join(ppi_files_path, filename)
        ppi_files.append(file_directory)
    elif filename.endswith('.parquet'):
        base_filename = os.path.split(filename)[1]
        ppi_filenames.append(base_filename.split(".")[0])

        file_directory = os.path.join(ppi_files_path, filename)
        ppi_files.append(file_directory)
    else:
        continue

In [8]:
remove_idx = [4]
ppi_files = np.delete(ppi_files, remove_idx)
ppi_filenames = np.delete(ppi_filenames, remove_idx)
ppi_files

array(['/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/PPI/ranked_BioGRIDBIOGRID-MV-Physical-4.4.221.csv',
       '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/PPI/weighted_PPI_expression.parquet',
       '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/PPI/weighted_zPPI_essentiality.parquet',
       '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/PPI/z_ranked_BioGRIDBIOGRID-ALL-4.4.221.csv',
       '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/PPI/z_ranked_BioGRIDBIOGRID-MV-Physical-4.4.221.csv',
       '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My

In [9]:
# save pathnames of csv PPI ranked files
go_files = list()
go_filenames = list()

for filename in os.listdir(go_files_path):
    if filename.endswith('.csv'):
    
        base_filename = os.path.split(filename)[1]
        go_filenames.append(base_filename.split(".")[0])
        
        file_directory = os.path.join(go_files_path, filename)
        go_files.append(file_directory)
    elif filename.endswith('.parquet'):
        base_filename = os.path.split(filename)[1]
        go_filenames.append(base_filename.split(".")[0])

        file_directory = os.path.join(go_files_path, filename)
        go_files.append(file_directory)
    else:
        continue

In [10]:
go_files

['/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/GO/go_CC_expression.parquet',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/GO/go_BP_ranked_essentiality.parquet',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/GO/go_BP_expression.parquet',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/GO/go_CC_ranked_essentiality.parquet']

In [11]:
go_dict = {
    'go_BP_expression':'smallest_BP_GO_expression',
    'go_CC_expression':'smallest_CC_GO_expression',
    'go_BP_ranked_essentiality':'smallest_BP_GO_essentiality',
    'go_CC_ranked_essentiality':'smallest_CC_GO_essentiality'
}

In [12]:
def process_ppi_files(ppi_files, cell_lines):

    results = []

    for file in ppi_files:
        if file.endswith('.csv'):
            ppi_df = pd.read_csv(file, index_col=0, low_memory=False)
            ppi_df = ppi_df[ppi_df.index.isin(cell_lines['DepMap_ID'])].reset_index()
            ppi_df = ppi_df.rename(columns={"index":"DepMap_ID"})

            # Convert wide format to long format
            melt_ppi = ppi_df.melt(id_vars=["DepMap_ID"], var_name="genepair", value_name="PPI")
            melt_ppi = melt_ppi.rename(columns={'PPI': os.path.basename(file).split(".")[0]})
            results.append(melt_ppi)
            print(f"Processed {file}")

        else:
            ppi_df = pd.read_parquet(file)
            ppi_df = ppi_df[ppi_df.index.isin(cell_lines['DepMap_ID'])].reset_index()
            ppi_df = ppi_df.rename(columns={"index":"DepMap_ID"})

            # Convert wide format to long format
            melt_ppi = ppi_df.melt(id_vars=["DepMap_ID"], var_name="genepair", value_name="PPI")
            melt_ppi = melt_ppi.rename(columns={'PPI': os.path.basename(file).split(".")[0]})
            results.append(melt_ppi)
            print(f"Processed {file}")
            
    return results

In [13]:
def process_go_files(go_files, cell_lines, go_dict):

    results = []

    for file in go_files:
        basename = os.path.basename(file).split(".")[0]
        value_col = go_dict.get(basename)

        if not value_col:
            print(f"Skipping file {file} as it does not match any known GO file names in go_dict.")
            continue

        if file.endswith('.parquet'):
            go_df = pd.read_parquet(file)
            go_df = go_df[go_df['cell_line'].isin(cell_lines['DepMap_ID'])]
            go_df = go_df[['cell_line', 'paralog_pair', value_col]]

        go_df = go_df.rename(columns={"cell_line": "DepMap_ID", "paralog_pair": "genepair", value_col: basename})
        results.append(go_df)
        print(f"Processed {file}")
            
    return results

In [14]:
annotated_datasets = []

for i, file in enumerate(crispr_files):
    # Load target pair dataset
    target_df = pd.read_csv(file)

    # Extract cell lines for that dataset
    cell_lines = pd.DataFrame(target_df["DepMap_ID"].unique(), columns=["DepMap_ID"])

    # Process PPI files
    list_of_ppi_files = process_ppi_files(ppi_files, cell_lines)

    # Process GO files
    list_of_go_files = process_go_files(go_files, cell_lines, go_dict)

    # Merge all PPI dataframes on 'DepMap_ID' and 'genepair'
    ppi_merged_df = reduce(lambda left, right: pd.merge(left, right, on=['DepMap_ID', 'genepair'], how='outer'), list_of_ppi_files)

    # Update the column names in the PPI merged dataframe
    rename_dict = {
    'combined_weighted_PPI_expression_new':'Expression_weighted_PPI',                  
    'combined_weighted_PPI_essentiality_new':'ranked_Essentiality_weighted_PPI',
    'z_ranked_BioGRIDBIOGRID-MV-Physical-4':'z_ranked_BioGRID_MW',
    'ranked_BioGRIDBIOGRID-MV-Physical-4':'ranked_BioGRID_MV',
    'z_ranked_BioGRIDBIOGRID-ALL-4':'z_ranked_BioGRID_ALL',
    'ranked_BioGRIDBIOGRID-ALL-4':'ranked_BioGRID_ALL'
    }

    ppi_merged_df = ppi_merged_df.rename(columns=rename_dict)

    # Merge all GO dataframes on 'DepMap_ID' and 'genepair'    
    go_merged_df = reduce(lambda left, right: pd.merge(left, right, on=['DepMap_ID', 'genepair'], how='outer'), list_of_go_files)

    # Update the column names in the GO merged dataframe
    rename_dict = {
    'GO_CC_expression':'smallest_CC_GO_expression',
    'go_BP_ranked_essentiality':'smallest_BP_GO_essentiality',
    'go_BP_expression':'smallest_BP_GO_expression',
    'go_CC_ranked_essentiality': 'smallest_CC_GO_essentiality',
    }

    go_merged_df = go_merged_df.rename(columns=rename_dict)     

    # Combine all PPI and GO dataframes
    network_all_df = pd.merge(ppi_merged_df, go_merged_df, on=['DepMap_ID', 'genepair'], how='outer')
    
    target_pairs_annotated = pd.merge(target_df, network_all_df, on=['DepMap_ID', 'genepair'], how='left')

    annotated_datasets.append(target_pairs_annotated)

    print(f"Processed {file} with shape: {target_pairs_annotated.shape}")
    

Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/PPI/ranked_BioGRIDBIOGRID-MV-Physical-4.4.221.csv
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/PPI/weighted_PPI_expression.parquet
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/PPI/weighted_zPPI_essentiality.parquet
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/PPI/z_ranked_BioGRIDBIOGRID-ALL-4.4.221.csv
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/input/PPI/z_ranked_BioGRIDBIOGRID-MV-Physical-4.4.221.csv
Processed /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/

In [15]:
# summary of ito
ito = annotated_datasets[0]

# Analyze gene pair and cell line triplets in ito2
print(f"Total number of rows (gene pair - cell line combinations): {len(ito)}")
print(f"Number of unique gene pairs: {ito['genepair'].nunique()}")
print(f"Number of unique cell lines: {ito['cell_line'].nunique()}")
print(f"Number of unique gene pair - cell line combinations: {ito[['genepair', 'cell_line']].drop_duplicates().shape[0]}")

# Show some sample data
print("\nSample of the data:")
print(ito.head())

Total number of rows (gene pair - cell line combinations): 8658
Number of unique gene pairs: 333
Number of unique cell lines: 26
Number of unique gene pair - cell line combinations: 8658

Sample of the data:
    genepair sorted_gene_pair    A1    A2  A1_entrez  A2_entrez   DepMap_ID  \
0  ABL1_ABL2        ABL1|ABL2  ABL1  ABL2         25         27  ACH-000094   
1  ABL1_ABL2        ABL1|ABL2  ABL1  ABL2         25         27  ACH-000114   
2  ABL1_ABL2        ABL1|ABL2  ABL1  ABL2         25         27  ACH-000138   
3  ABL1_ABL2        ABL1|ABL2  ABL1  ABL2         25         27  ACH-000219   
4  ABL1_ABL2        ABL1|ABL2  ABL1  ABL2         25         27  ACH-000222   

  cell_line  SL org_A1  ... weighted_zPPI_essentiality  z_ranked_BioGRID_ALL  \
0   HPAF-II   0   ABL1  ...                   0.011043              0.101473   
1  SU.86.86   0   ABL1  ...                  -0.038360             -0.197943   
2   CFPAC-1   0   ABL1  ...                  -0.125673             -0.180782 

In [16]:
# Checking the percentage of missing values in PPI-related columns
ppi_columns = ['ranked_BioGRID_MV', 'weighted_PPI_expression',
               'weighted_zPPI_essentiality', 'z_ranked_BioGRID_ALL',
               'z_ranked_BioGRID_MW', 'weighted_zPPI_expression', 'ranked_BioGRID_ALL',
               'weighted_PPI_essentiality', 'go_CC_expression',
               'smallest_BP_GO_essentiality', 'smallest_BP_GO_expression',
               'smallest_CC_GO_essentiality'
]

missing_values = ito[ppi_columns].isna().sum() / len(ito) * 100
missing_values

ranked_BioGRID_MV              59.089859
weighted_PPI_expression        35.135135
weighted_zPPI_essentiality     40.540541
z_ranked_BioGRID_ALL           46.638947
z_ranked_BioGRID_MW            59.089859
weighted_zPPI_expression       35.135135
ranked_BioGRID_ALL             46.638947
weighted_PPI_essentiality      40.540541
go_CC_expression               73.943174
smallest_BP_GO_essentiality    54.516055
smallest_BP_GO_expression      50.381150
smallest_CC_GO_essentiality    76.114576
dtype: float64

In [59]:
output_dir = get_data_path(['output', 'processed_CRISPR_screens'], '')

for i, df in enumerate(annotated_datasets):
    base_filename = filenames[i]
    output_path = os.path.join(output_dir, f"{base_filename}_network.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

Saved: /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_ito_df_network.csv
