In [1]:
# import modules
import os
import pandas as pd
import numpy as np

In [2]:
# set the base directory for the project
# NOTE: BASE_DIR is resolved relative to the current working directory
# (two levels up), so the notebook must be started from its own folder.
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))


# build paths inside the repo
def get_data_path(folders, fname):
    """Return the normalised path BASE_DIR/<folders...>/<fname>.

    Parameters
    ----------
    folders : sequence of str
        Directory names below BASE_DIR, outermost first.
    fname : str
        File name to append; pass '' to get the directory itself
        (normpath drops the trailing separator).

    Returns
    -------
    str
        Normalised path string.
    """
    return os.path.normpath(os.path.join(BASE_DIR, *folders, fname))


crispr_screens_path = get_data_path(['output', 'processed_CRISPR_screens'], '')
dekegel_table8_path = get_data_path(['input', 'other'], 'processed_DeKegel_TableS8.csv')

In [3]:
def get_target_files(folder, pattern_suffix='_ranked_ess'):
    """Collect the CSV files in ``folder`` whose name ends with ``pattern_suffix``.

    Parameters
    ----------
    folder : str
        Directory to scan (not recursive).
    pattern_suffix : str, default '_ranked_ess'
        Text that must sit immediately before the ``.csv`` extension.

    Returns
    -------
    csv_files : list of str
        Matching paths (``folder`` joined with the file name), sorted.
    filenames : list of str
        The file names with the trailing ``pattern_suffix`` and ``.csv``
        extension removed, in the same order as ``csv_files``.
    """
    ending = pattern_suffix + '.csv'
    # Match on the real ending rather than anywhere in the name, so files that
    # merely contain the suffix (e.g. 'x_ranked_ess_scored.csv') are skipped.
    csv_files = sorted(
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.endswith(ending)
    )
    # Strip only the trailing suffix + extension; str.replace would also
    # remove occurrences of the suffix earlier in the name.
    filenames = [
        os.path.basename(path)[:-len(ending)]
        for path in csv_files
    ]
    return csv_files, filenames

In [4]:
# collect the CSVs matching the default '_ranked_ess' suffix and their base names
crispr_files, filenames = get_target_files(folder=crispr_screens_path)
crispr_files

['/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_ito_df_ranked_ess.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_klingbeil_df_ranked_ess.csv']

In [5]:
# load the processed De Kegel Table S8 file (prediction scores and features)
bp = pd.read_csv(dekegel_table8_path)

print(f'# of unique genepairs: {bp.genepair.nunique()}')
print('')

# spot-check how a few gene pairs are oriented in the 'genepair' column
for pair_to_check in ('ADAMTS6_ADAMTS10', 'ADAM8_ADAM12', 'USP4_USP15'):
    is_pair = bp['genepair'] == pair_to_check
    display(bp.loc[is_pair])

# of unique genepairs: 36648



Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,shared_ppi_mean_essentiality,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr,A1_entrez_new,A2_entrez_new,A1_new,A2_new,A1_ensembl_new,A2_ensembl_new
15506,15507,42.4,ADAMTS10_ADAMTS6,ADAMTS6_ADAMTS10,ADAMTS10,ADAMTS6,81794,11174,ENSG00000142303,ENSG00000049192,...,0.0,0.611649,1.630857,18.696327,11174.0,81794.0,ADAMTS6,ADAMTS10,ENSG00000049192,ENSG00000142303


Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,shared_ppi_mean_essentiality,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr,A1_entrez_new,A2_entrez_new,A1_new,A2_new,A1_ensembl_new,A2_ensembl_new
16781,16782,45.8,ADAM12_ADAM8,ADAM8_ADAM12,ADAM12,ADAM8,8038,101,ENSG00000148848,ENSG00000151651,...,0.0,0.18273,2.448785,21.963953,101.0,8038.0,ADAM8,ADAM12,ENSG00000151651,ENSG00000148848


Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,shared_ppi_mean_essentiality,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr,A1_entrez_new,A2_entrez_new,A1_new,A2_new,A1_ensembl_new,A2_ensembl_new
1688,1689,4.7,USP15_USP4,USP4_USP15,USP15,USP4,9958,7375,ENSG00000135655,ENSG00000114316,...,0.46926,0.7735,9.646537,26.546565,7375.0,9958.0,USP4,USP15,ENSG00000114316,ENSG00000135655


In [6]:
# columns kept from the prediction table: pair identifiers first,
# then the prediction score and the pair-level features
feature_columns = [
    'genepair', 'A1_entrez_new', 'A2_entrez_new',
    'prediction_score', 'min_sequence_identity', 'closest', 'WGD', 'family_size',
    'cds_length_ratio', 'shared_domains', 'has_pombe_ortholog',
    'has_essential_pombe_ortholog', 'has_cerevisiae_ortholog',
    'has_essential_cerevisiae_ortholog', 'conservation_score', 'mean_age',
    'either_in_complex', 'mean_complex_essentiality', 'colocalisation',
    'interact', 'n_total_ppi', 'fet_ppi_overlap',
    'gtex_spearman_corr', 'gtex_min_mean_expr', 'gtex_max_mean_expr',
]

# expose the '*_entrez_new' identifiers under the plain 'A1_entrez'/'A2_entrez' names
late_features_df = bp[feature_columns].rename(
    columns={'A1_entrez_new': 'A1_entrez', 'A2_entrez_new': 'A2_entrez'}
)

In [7]:
def integrate_features(df, features_df):
    """Left-merge pair-level features onto ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to annotate; must contain 'genepair', 'A1_entrez' and
        'A2_entrez'.
    features_df : pandas.DataFrame
        Feature table with the same three key columns plus the flag columns
        listed in ``bool_cols`` below.

    Returns
    -------
    pandas.DataFrame
        New frame (inputs are not modified) holding the left merge of ``df``
        with ``features_df``. The flag columns use the nullable 'boolean'
        dtype and are <NA> on rows for which no features matched.

    Raises
    ------
    KeyError
        If a key column or one of the flag columns is missing.
    """
    key_cols = ['genepair', 'A1_entrez', 'A2_entrez']
    integrated_df = pd.merge(df, features_df, on=key_cols, how='left')

    # NOTE(review): 'has_essential_pombe_ortholog' and 'has_cerevisiae_ortholog'
    # are not cast here although their sibling columns are - confirm intended.
    bool_cols = ['closest', 'WGD', 'has_pombe_ortholog', 'has_essential_cerevisiae_ortholog', 'either_in_complex', 'interact']

    # A plain astype(bool) maps NaN to True, which would mark every pair without
    # a feature match as closest/WGD/etc. The nullable dtype keeps them missing.
    return integrated_df.astype({col: 'boolean' for col in bool_cols})

In [8]:
# load each target pair dataset and annotate it with the prediction
# scores/features; results are kept in the same order as crispr_files
annotated_datasets = [
    integrate_features(pd.read_csv(screen_path), late_features_df)
    for screen_path in crispr_files
]

In [9]:
# summary of ito (first entry of annotated_datasets)
ito = annotated_datasets[0]

# size of the table in terms of gene pairs, cell lines and their combinations
n_rows = len(ito)
n_genepairs = ito['genepair'].nunique()
n_cell_lines = ito['cell_line'].nunique()
n_pair_line_combos = ito[['genepair', 'cell_line']].drop_duplicates().shape[0]

print(f"Total number of rows (gene pair - cell line combinations): {n_rows}")
print(f"Number of unique gene pairs: {n_genepairs}")
print(f"Number of unique cell lines: {n_cell_lines}")
print(f"Number of unique gene pair - cell line combinations: {n_pair_line_combos}")

# first rows as a quick look at the annotated table
print("\nSample of the data:")
print(ito.head())

Total number of rows (gene pair - cell line combinations): 49764
Number of unique gene pairs: 4524
Number of unique cell lines: 11
Number of unique gene pair - cell line combinations: 49764

Sample of the data:
      genepair       A1   A2  A1_entrez  A2_entrez   DepMap_ID  \
0  A3GALT2_ABO  A3GALT2  ABO   127550.0       28.0  ACH-000022   
1  A3GALT2_ABO  A3GALT2  ABO   127550.0       28.0  ACH-000307   
2  A3GALT2_ABO  A3GALT2  ABO   127550.0       28.0  ACH-000632   
3  A3GALT2_ABO  A3GALT2  ABO   127550.0       28.0  ACH-000681   
4  A3GALT2_ABO  A3GALT2  ABO   127550.0       28.0  ACH-000756   

                    cell_line  Gemini_FDR   raw_LFC     SL  ... mean_age  \
0          PATU8988S_PANCREAS    0.998944  0.088856  False  ...    226.1   
1                PK1_PANCREAS    0.986587  0.201704  False  ...    226.1   
2                 HS944T_SKIN    1.000000  0.069772  False  ...    226.1   
3                   A549_LUNG    0.977988  0.379455  False  ...    226.1   
4  GI1_CENTR

In [10]:
# gene pairs of the first annotated dataset that received no prediction score
first_dataset = annotated_datasets[0]
score_missing = first_dataset['prediction_score'].isna()
na_target_pairs = first_dataset.loc[score_missing, 'genepair'].unique()
print(f'# of unique genepairs with no prediction score: {len(na_target_pairs)}')

# of unique genepairs with no prediction score: 306


In [11]:
# lift the row limit (stays in effect for later cells) so that every
# column's missing-value count is shown
pd.options.display.max_rows = None
annotated_datasets[0].isna().sum()

genepair                                 0
A1                                       0
A2                                       0
A1_entrez                                0
A2_entrez                                0
DepMap_ID                                0
cell_line                                0
Gemini_FDR                             224
raw_LFC                                  0
SL                                       0
org_A1                                   0
org_A2                                   0
A1_copy_number_data                    168
A1_expression_data                     352
A1_gene_effect_data                   6854
A1_zexpression_data                    352
A1_zgene_effect_data                  6854
A2_copy_number_data                    290
A2_expression_data                     528
A2_gene_effect_data                   7254
A2_zexpression_data                    528
A2_zgene_effect_data                  7254
A1_Deleterious                           0
A1_mut     

In [12]:
# annotated tables are written into the processed CRISPR screens folder
output_dir = get_data_path(['output', 'processed_CRISPR_screens'], '')

# filenames and annotated_datasets are aligned by position
for base_filename, annotated_df in zip(filenames, annotated_datasets):
    output_path = os.path.join(output_dir, f"{base_filename}_scored.csv")
    annotated_df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

Saved: /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_ito_df_scored.csv
Saved: /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_klingbeil_df_scored.csv
