### Annotate Network Features to CRISPR Screens

In [7]:
# import modules
import os
import re
import csv
import numpy as np
import pandas as pd
import pyarrow.parquet as pq

In [8]:
def get_data_path(folders, fname):
    """Build a normalised path below the ``DRIVE_PATH`` environment variable.

    Parameters
    ----------
    folders : iterable of str
        Directory names, joined with '/' in the given order.
    fname : str
        File name appended after the folders. With '' the trailing separator
        is removed by ``os.path.normpath``, so the folder path is returned.

    Returns
    -------
    str
        ``os.path.normpath`` of ``$DRIVE_PATH/<folders...>/<fname>``.

    Raises
    ------
    KeyError
        If ``DRIVE_PATH`` is not set in the environment.
    """
    return os.path.normpath(os.environ['DRIVE_PATH'] + '/' + '/'.join(folders) + '/' + fname)
# Folder that holds the processed CRISPR screen tables. The empty file name
# makes get_data_path return the (normalised) directory path itself.
crispr_screens_path = get_data_path(
    ['GitRepos', 'context_specific_SL_prediction', 'output', 'processed_CRISPR_screens'],
    '',
)

In [9]:
def get_target_files(folder, pattern_suffix='_CCLE22Q4'):
    """List the CSV files in *folder* whose name contains *pattern_suffix*.

    Returns a tuple ``(paths, names)``. ``paths`` holds the matching file
    paths (``folder`` joined with the file name) in sorted order; ``names``
    holds, in the same order, the file names with the ``.csv`` extension
    dropped and every occurrence of ``pattern_suffix`` removed.
    """
    matching_paths = []
    for entry in os.listdir(folder):
        if not entry.endswith('.csv'):
            continue
        if pattern_suffix not in entry:
            continue
        matching_paths.append(os.path.join(folder, entry))
    matching_paths.sort()

    # Derive the short names from the already sorted paths so both lists line up.
    stems = []
    for path in matching_paths:
        stem, _ext = os.path.splitext(os.path.basename(path))
        stems.append(stem.replace(pattern_suffix, ''))

    return matching_paths, stems

In [10]:
# Collect the processed screen CSVs; `filenames` holds the matching base names
# with the default '_CCLE22Q4' suffix removed. The last line displays the paths.
crispr_files, filenames = get_target_files(crispr_screens_path)
crispr_files

['/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_ito_df_CCLE22Q4.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_klingbeil_df_CCLE22Q4.csv']

In [4]:
# One row per distinct DepMap_ID of the screen table, in order of first appearance.
# NOTE(review): `target_pairs` is not assigned in any cell visible before this
# one — presumably it is loaded from one of `crispr_files`; confirm and add the
# loading step so the notebook runs top to bottom.
cell_lines = pd.DataFrame({"DepMap_ID": target_pairs['DepMap_ID'].unique()})

In [None]:
# # save pathnames of csv PPI files
# folder = '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/features/ppi_calculations'

# ppi_files = list()
# ppi_filenames = list()

# for filename in os.listdir(folder):
#     if filename.endswith('.parquet'):
#         base_filename = os.path.split(filename)[1]
#         ppi_filenames.append(base_filename.split(".")[0])

#         file_directory = os.path.join(folder, filename)
#         ppi_files.append(file_directory)

#     else:
#         continue

In [5]:
# Collect the PPI feature tables (.csv and .parquet) found in the PPI input folder.
folder = '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/PPI'

ppi_files = []
ppi_filenames = []

for filename in os.listdir(folder):
    # Files of any other type are ignored.
    if not filename.endswith(('.csv', '.parquet')):
        continue

    # Base name = text before the first '.', so dotted version strings inside
    # a file name are cut off together with the extension.
    ppi_filenames.append(os.path.basename(filename).split(".")[0])
    ppi_files.append(os.path.join(folder, filename))

In [6]:
#remove_idx = [1]
#ppi_files = np.delete(ppi_files, remove_idx)
#ppi_filenames = np.delete(ppi_filenames, remove_idx)
# Display the collected PPI file paths (in the order os.listdir returned them).
ppi_files

['/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/PPI/z_ranked_BioGRIDBIOGRID-MV-Physical-4.4.221.tab3.csv',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/PPI/combined_weighted_PPI_essentiality_new.parquet',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/PPI/ranked_BioGRIDBIOGRID-MV-Physical-4.4.221.tab3.csv',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/PPI/z_ranked_BioGRIDBIOGRID-ALL-4.4.221.tab3.csv',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/PPI/combined_weighted_PPI_expression_new.parquet',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/PPI/ranked_BioGRIDBIOGRID-ALL-4.4.221.tab3.csv']

In [7]:
# Collect the GO feature tables (.csv and .parquet) found in the GO input folder.
folder = '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/GO'

go_files = []
go_filenames = []

for filename in os.listdir(folder):
    # Files of any other type are ignored.
    if not filename.endswith(('.csv', '.parquet')):
        continue

    # Base name = text before the first '.'.
    go_filenames.append(os.path.basename(filename).split(".")[0])
    go_files.append(os.path.join(folder, filename))

In [8]:
# Display the collected GO file paths (in the order os.listdir returned them).
go_files

['/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/GO/GO_CC_expression.parquet',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/GO/go_BP_ranked_essentiality.parquet',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/GO/go_BP_expression.parquet',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/GO/go_CC_ranked_essentiality.parquet']

In [9]:
# Maps the base name of each GO feature file (text before the first '.') to the
# value column that process_go_files keeps from that file.
go_dict = {
    # BP tables
    'go_BP_expression': 'smallest_BP_GO_gene_expression',
    'go_BP_ranked_essentiality': 'smallest_BP_GO_ranked',
    # CC tables
    'GO_CC_expression': 'smallest_CC_GO_gene_expression',
    'go_CC_ranked_essentiality': 'smallest_CC_GO_gene_effect',
}

In [10]:
def process_ppi_files(ppi_files, cell_lines):
    """Load wide PPI feature tables and reshape each one to long format.

    Every file is read as a table whose index holds DepMap IDs and whose
    columns are gene pairs. Rows are restricted to the IDs listed in
    ``cell_lines['DepMap_ID']`` and the table is melted to one row per
    (DepMap_ID, genepair).

    Parameters
    ----------
    ppi_files : iterable of str
        Paths of the feature tables. Paths ending in '.csv' are read with
        ``pd.read_csv`` (first column as index); every other path is read
        with ``pd.read_parquet``.
    cell_lines : pandas.DataFrame
        Must contain a 'DepMap_ID' column with the IDs to keep.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input file, in input order, with the columns
        'DepMap_ID', 'genepair' and a value column named after the file's
        base name (the text before the first '.').
    """
    depmap_ids = cell_lines['DepMap_ID']
    results = []

    for file in ppi_files:
        # Only the reader differs between the two formats; everything after
        # loading is shared.
        if file.endswith('.csv'):
            ppi_df = pd.read_csv(file, index_col=0, low_memory=False)
        else:
            ppi_df = pd.read_parquet(file)

        # Cut at the FIRST '.', so dotted version strings in the file name are
        # dropped too; the column renaming later in the notebook expects this.
        feature_name = os.path.basename(file).split(".")[0]

        ppi_df = ppi_df[ppi_df.index.isin(depmap_ids)]
        # Name the index explicitly so the ID column is always 'DepMap_ID',
        # whatever name (or none) the index carried in the file.
        ppi_df = ppi_df.rename_axis("DepMap_ID").reset_index()

        # Convert wide format to long format
        melt_ppi = ppi_df.melt(id_vars=["DepMap_ID"], var_name="genepair", value_name=feature_name)
        results.append(melt_ppi)
        print(f"Processed {file}")

    return results

In [11]:
# Long-format PPI tables, one per entry of ppi_files, restricted to the IDs in cell_lines.
list_of_ppi_files = process_ppi_files(ppi_files, cell_lines)

Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/PPI/z_ranked_BioGRIDBIOGRID-MV-Physical-4.4.221.tab3.csv
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/PPI/combined_weighted_PPI_essentiality_new.parquet
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/PPI/ranked_BioGRIDBIOGRID-MV-Physical-4.4.221.tab3.csv
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/PPI/z_ranked_BioGRIDBIOGRID-ALL-4.4.221.tab3.csv
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/PPI/combined_weighted_PPI_expression_new.parquet
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/PPI/ranked_BioGRIDBIOGRID-ALL-4.4.221

In [12]:
def process_go_files(go_files, cell_lines, go_dict):
    """Load long-format GO feature tables and reduce them to one value column.

    Parameters
    ----------
    go_files : iterable of str
        Paths of the GO feature tables ('.parquet' or '.csv'). Each table
        must provide the columns 'cell_line', 'paralog_pair' and the value
        column named in ``go_dict`` for that file.
    cell_lines : pandas.DataFrame
        Must contain a 'DepMap_ID' column; only rows whose 'cell_line' is
        listed there are kept.
    go_dict : dict
        Maps a file's base name (text before the first '.') to the name of
        the value column to keep. Files without an entry are skipped.

    Returns
    -------
    list of pandas.DataFrame
        One frame per processed file, in input order, with the columns
        'DepMap_ID', 'genepair' and the value column renamed to the file's
        base name.
    """
    depmap_ids = cell_lines['DepMap_ID']
    results = []

    for file in go_files:
        basename = os.path.basename(file).split(".")[0]
        value_col = go_dict.get(basename)

        if not value_col:
            print(f"Skipping file {file} as it does not match any known GO file names in go_dict.")
            continue

        # Load inside an explicit branch per format. Previously a file that
        # was listed in go_dict but was not a parquet file fell through with
        # `go_df` either unbound or still holding the previous file's data.
        if file.endswith('.parquet'):
            go_df = pd.read_parquet(file)
        elif file.endswith('.csv'):
            go_df = pd.read_csv(file, low_memory=False)
        else:
            print(f"Skipping file {file}: unsupported file type (expected .parquet or .csv).")
            continue

        go_df = go_df[go_df['cell_line'].isin(depmap_ids)]
        go_df = go_df[['cell_line', 'paralog_pair', value_col]]
        go_df = go_df.rename(columns={"cell_line": "DepMap_ID", "paralog_pair": "genepair", value_col: basename})
        results.append(go_df)
        print(f"Processed {file}")

    return results

In [13]:
# Long-format GO tables, one per file of go_files that has an entry in go_dict.
list_of_go_files = process_go_files(go_files, cell_lines, go_dict)

Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/GO/GO_CC_expression.parquet
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/GO/go_BP_ranked_essentiality.parquet
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/GO/go_BP_expression.parquet
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/GO/go_CC_ranked_essentiality.parquet


In [14]:
from functools import reduce


def _outer_merge_on_pair(left, right):
    # Outer join keeps every (DepMap_ID, genepair) that occurs in either table.
    return pd.merge(left, right, on=['DepMap_ID', 'genepair'], how='outer')


# Fold each list of long tables into a single table with one column per feature.
ppi_merged_df = reduce(_outer_merge_on_pair, list_of_ppi_files)
go_merged_df = reduce(_outer_merge_on_pair, list_of_go_files)

In [15]:
# Translate the file-derived PPI column names (file base name up to the first
# '.') into the short feature names used in the rest of the notebook.
# NOTE(review): 'z_ranked_BioGRID_MW' looks like a typo for '..._MV'; it is kept
# because ppi_columns further down uses the same spelling — change both together.
rename_dict = {
    # BioGRID-based features
    'ranked_BioGRIDBIOGRID-ALL-4': 'ranked_BioGRID_ALL',
    'z_ranked_BioGRIDBIOGRID-ALL-4': 'z_ranked_BioGRID_ALL',
    'ranked_BioGRIDBIOGRID-MV-Physical-4': 'ranked_BioGRID_MV',
    'z_ranked_BioGRIDBIOGRID-MV-Physical-4': 'z_ranked_BioGRID_MW',
    # weighted PPI features
    'combined_weighted_PPI_expression_new': 'Expression_weighted_PPI',
    'combined_weighted_PPI_essentiality_new': 'ranked_Essentiality_weighted_PPI',
}

ppi_merged_df = ppi_merged_df.rename(columns=rename_dict)

In [16]:
# Translate the file-derived GO column names into the feature names used in
# the rest of the notebook (this reuses, and overwrites, `rename_dict`).
rename_dict = {
    # BP tables
    'go_BP_expression': 'smallest_gene_expression',
    'go_BP_ranked_essentiality': 'smallest_GO_ranked_ess',
    # CC tables
    'GO_CC_expression': 'smallest_GO_CC_expression',
    'go_CC_ranked_essentiality': 'smallest_GO_CC_ranked_ess',
}

go_merged_df = go_merged_df.rename(columns=rename_dict)

In [17]:
# Combine PPI and GO features; the outer join keeps pairs present in only one of them.
network_all_df = ppi_merged_df.merge(
    go_merged_df,
    how='outer',
    on=['DepMap_ID', 'genepair'],
)

In [18]:
# Attach the network features to the screen rows; the left join keeps every
# row of target_pairs and leaves NaN where no feature is available.
target_pairs_annotated = target_pairs.merge(
    network_all_df,
    how='left',
    on=['DepMap_ID', 'genepair'],
)

In [19]:
#target_pairs_annotated[:3]

In [20]:
#target_pairs_annotated.isna().sum()  / len(target_pairs_annotated) * 100

In [21]:
#target_pairs_annotated.isna().sum()

In [22]:
# Report how many rows lack the expression-weighted PPI feature, next to the
# total number of rows in the annotated table.
_n_missing_expression_ppi = target_pairs_annotated['Expression_weighted_PPI'].isna().sum()
_n_rows = target_pairs_annotated.shape[0]

print('#of NAs in weighted_expression:', _n_missing_expression_ppi)
print('#of rows in the target dataset:', _n_rows)

#of NAs in weighted_expression: 3806
#of rows in the target dataset: 49753


In [23]:
# Percentage of rows with a missing value, per network feature column.
# Despite its name, the list holds both the PPI-derived and the GO-derived columns.
ppi_columns = [
    'z_ranked_BioGRID_MW',
    'ranked_BioGRID_MV',
    'z_ranked_BioGRID_ALL',
    'Expression_weighted_PPI',
    'ranked_Essentiality_weighted_PPI',
    'ranked_BioGRID_ALL',
    'smallest_GO_CC_expression',
    'smallest_GO_ranked_ess',
    'smallest_gene_expression',
    'smallest_GO_CC_ranked_ess',
]

_na_counts = target_pairs_annotated[ppi_columns].isna().sum()
missing_values = _na_counts.div(len(target_pairs_annotated)).mul(100)
missing_values

z_ranked_BioGRID_MW                 71.197717
ranked_BioGRID_MV                   71.197717
z_ranked_BioGRID_ALL                45.149036
Expression_weighted_PPI              7.649790
ranked_Essentiality_weighted_PPI    16.045264
ranked_BioGRID_ALL                  45.149036
smallest_GO_CC_expression           81.162945
smallest_GO_ranked_ess              53.088256
smallest_gene_expression            48.662392
smallest_GO_CC_ranked_ess           82.895504
dtype: float64

In [24]:
# Display (rows, columns) of the annotated table.
target_pairs_annotated.shape

(49753, 48)

In [25]:
# Preview the first three annotated rows.
target_pairs_annotated.head(3)

Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,DepMap_ID,cell_line,Gemini_FDR,raw_LFC,SL,...,z_ranked_BioGRID_MW,ranked_Essentiality_weighted_PPI,ranked_BioGRID_MV,z_ranked_BioGRID_ALL,Expression_weighted_PPI,ranked_BioGRID_ALL,smallest_GO_CC_expression,smallest_GO_ranked_ess,smallest_gene_expression,smallest_GO_CC_ranked_ess
0,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000022,PATU8988S_PANCREAS,0.998944,0.088856,False,...,,9763.43168,,,2.861362,,,,,
1,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000307,PK1_PANCREAS,0.986587,0.201704,False,...,,10787.05036,,,2.827488,,,,,
2,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000632,HS944T_SKIN,1.0,0.069772,False,...,,6173.971711,,,1.605424,,,,,


In [26]:
# Write the annotated pairs (missing values not yet filled) to the annotation
# output folder. The working directory is switched first because the target
# below is a relative path.
_annotation_output_dir = '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/output_files/'
os.chdir(_annotation_output_dir)

target_pairs_annotated.to_csv('./ito_pairs_ppi_new.csv', index=False)
#target_pairs_annotated.to_csv('./klingbeil_pairs/klingbeil_pairs_ppi_new.csv', index=False)
#target_pairs_annotated.to_csv('./parrish_pairs/parrish_pairs_ppi.csv', index=False)
#target_pairs_annotated.to_csv('./36K_dataset/36Kdataset_ppi.csv', index=False)


### Fill NaN values with the per-cell-line mean of each network feature

In [None]:
#mapped_df['B'] = mapped_df['B'].fillna(mapped_df['B'].mean())

In [None]:
# # Fill NaN values with mean per cell line
# for cl in merged_ppi['DepMap_ID'].unique():
#     rows_to_update = merged_ppi[merged_ppi['DepMap_ID'] == cl]
#     mean_of_cols = rows_to_update[col_names].mean()
#     merged_ppi.loc[merged_ppi['DepMap_ID'] == cl, col_names] = merged_ppi.loc[merged_ppi['DepMap_ID'] == cl, col_names].fillna(mean_of_cols)

In [None]:
# Fill NA values within each cell line (DepMap_ID) using that cell line's mean
# of the same column.
# The per-group means come from the built-in 'mean' transform (one vectorised
# pass) rather than a Python lambda called once per group and column; they are
# then used only where a value is missing. A column that is entirely NA within
# a cell line stays NA, and rows without a DepMap_ID keep their values.
# fillna aligns on the row index, so this relies on the index being unique —
# presumably true here since the frame comes straight from pd.merge; confirm
# if rows were re-indexed in between.
_cell_line_means = target_pairs_annotated.groupby("DepMap_ID")[ppi_columns].transform("mean")
target_pairs_annotated[ppi_columns] = target_pairs_annotated[ppi_columns].fillna(_cell_line_means)

In [None]:
# Percentage of rows still missing a value per feature column after the fill.
_na_counts_after_fill = target_pairs_annotated[ppi_columns].isna().sum()
missing_values = _na_counts_after_fill.div(len(target_pairs_annotated)).mul(100)
missing_values

In [None]:
from scipy.stats import pearsonr
from statsmodels.stats.multitest import multipletests

# Pearson correlation between each feature and its ranked counterpart, computed
# on the rows where all four columns are present.
# NOTE(review): `multipletests` is imported but not used in this cell.
# NOTE(review): 'Essentiality_weighted_PPI' and 'BioGRID_ALL' are not produced
# by the renaming cells above — presumably they come from target_pairs itself;
# verify, otherwise dropna raises a KeyError.
_ess_col, _ranked_ess_col = 'Essentiality_weighted_PPI', 'ranked_Essentiality_weighted_PPI'
_biogrid_col, _ranked_biogrid_col = 'BioGRID_ALL', 'ranked_BioGRID_ALL'

test_df = target_pairs_annotated.dropna(
    axis=0,
    how='any',
    subset=[_ess_col, _biogrid_col, _ranked_ess_col, _ranked_biogrid_col],
)

correlation, p_value = pearsonr(test_df[_ess_col], test_df[_ranked_ess_col])
correlation_2, p_value_2 = pearsonr(test_df[_biogrid_col], test_df[_ranked_biogrid_col])

print(f'correlation between PPI vs ranked PPI: {correlation}, {p_value}')
print(f'correlation between PPI vs ranked PPI (BG): {correlation_2}, {p_value_2}')

In [None]:
# in case there are too many NAs
# drop or place 0

#umapped_df = mapped_df[~mapped_df.isna().any(axis=1)].reset_index(drop=True)
#umapped_df

### Save to CSV

In [None]:
# Write the annotated pairs to the project-level output folder. The working
# directory is switched first because the target below is a relative path.
_project_output_dir = '/Users/narod/Documents/GitHub/context_specific_SL_prediction/output_files/'
os.chdir(_project_output_dir)

target_pairs_annotated.to_csv('./22Q4/ito_screen/ito_pairs_ppi_new.csv', index=False)

#os.chdir('/Users/narod/Documents/GitHub/context_specific_SL_prediction/output_files/in4mer/')

#os.chdir('/Users/narod/Documents/GitHub/context_specific_SL_prediction/output_files/zdLFC')

#ranked_ppi_mean_df.to_csv('./ito_screen/ito_pairs_ppi.csv', index=False)
#ranked_ppi_mean_df.to_csv('./dede_screen/dede_pairs_ppi.csv', index=False)
#ranked_ppi_mean_df.to_csv('./chymera_screen/chymera_pairs_ppi.csv', index=False)
#ranked_ppi_mean_df.to_csv('./parrish_screen/parrish_pairs_ppi.csv', index=False)
#ranked_ppi_mean_df.to_csv('./thompson_screen/thompson_pairs_ppi.csv', index=False)
#ranked_ppi_mean_df.to_csv('./klingbeil_screen/klingbeil_pairs_ppi.csv', index=False)
#ranked_ppi_mean_df.to_csv('./hart/hart_pairs_ppi.csv', index=False)
#ranked_ppi_mean_df.to_parquet('/Users/narod/Documents/GitHub/context_specific_SL_prediction/output_files/wt_mut_shap_values/mut_breast_paralog_pairs_ppi.parquet')

#avg_mapped_df.to_csv('./klingbeil_screen/klingbeil_pairs_ppi_v2.csv', index=False)
#avg_mapped_df.to_csv('./vicky_screen/vicky_pairs_ppi_v2.csv', index=False)