# Annotation of CCLE data to target dataset

In [6]:
import os
import pandas as pd
import numpy as np
from functools import reduce

In [7]:
def get_feature_files(folder):
    """List the CSV files found directly inside ``folder``.

    Returns a pair ``(paths, stems)``:
      * ``paths`` - full paths of every entry whose name ends in ``.csv``, sorted;
      * ``stems`` - the matching file names with directory and extension
        stripped, in the same order as ``paths``.
    """
    paths = []
    for entry in os.listdir(folder):
        if entry.endswith('.csv'):
            paths.append(os.path.join(folder, entry))
    paths.sort()

    stems = []
    for path in paths:
        stem, _extension = os.path.splitext(os.path.basename(path))
        stems.append(stem)

    return paths, stems

In [8]:
# Folder containing the processed CCLE 22Q4 feature tables (CSV files).
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
folder = '/Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed'
# feature_files: sorted CSV paths; feature_names: same files without directory/extension.
feature_files, feature_names = get_feature_files(folder)

In [9]:
# Display the discovered feature paths and their (sorted) order.
feature_files

['/Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/common_essentials.csv',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/copy_number_data.csv',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/expression_data.csv',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/gene_effect_data.csv',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/mutation_data.csv',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/mutation_data_updated.csv',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/zexpression_data.csv',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/zgene_effect_data.csv']

In [10]:
# Leave the mutation tables and the common-essentials list out of the per-gene
# feature files: they are loaded separately and are not melted like the others.
# The files are identified by name rather than by position so that adding or
# removing a CSV in the feature folder cannot silently shift the selection.
# NOTE(review): an earlier variant used remove_idx = [2, 3, 5, 6] for the
# Parrish run - confirm whether that configuration is still needed.
excluded_feature_names = {'common_essentials', 'mutation_data', 'mutation_data_updated'}
remove_idx = [
    idx for idx, name in enumerate(feature_names)
    if name in excluded_feature_names
]
ufeature_files = np.delete(feature_files, remove_idx)
ufeature_names = np.delete(feature_names, remove_idx)

In [11]:
def get_csv_files_from_folder(folder_path):
    """Return the sorted full paths of every ``.csv`` entry directly inside ``folder_path``."""
    csv_paths = []
    for name in os.listdir(folder_path):
        if not name.endswith('.csv'):
            continue
        csv_paths.append(os.path.join(folder_path, name))
    csv_paths.sort()
    return csv_paths

In [4]:
# Folder with the target gene-pair datasets (one CSV per CRISPR screen).
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
crispr_screens_folder = '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/crispr_screens'
# Sorted list of CSV paths; later cells iterate over it and reuse the file names for the outputs.
target_pair_files = get_csv_files_from_folder(crispr_screens_folder)
target_pair_files

['/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/crispr_screens/ito_pairs.csv',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/crispr_screens/klingbeil_dataset.csv',
 '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/crispr_screens/parrish_pairs_new.csv']

In [12]:
def process_depmap_files(ufiles, cell_lines):
    """Load wide feature tables and reshape each one to long format.

    Each CSV is read with its first column as the index (the cell-line
    identifier), restricted to the requested cell lines, stripped of its
    ``cell_name`` column and melted so that every remaining column header
    becomes an integer ``entrez_id``.

    Parameters
    ----------
    ufiles : iterable of str
        Paths of the feature CSV files to process.
    cell_lines : pandas.DataFrame
        Frame with a ``DepMap_ID`` column listing the cell lines to keep.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input file, in input order, with the columns
        ``DepMap_ID``, ``entrez_id`` and a value column named after the file
        (the base name up to its first dot).

    Raises
    ------
    KeyError
        If a file has no ``cell_name`` column.
    ValueError
        If a remaining column header cannot be converted to ``int``.
    """
    wanted_ids = cell_lines['DepMap_ID']
    results = []

    for file in ufiles:
        feature_name = os.path.basename(file).split(".")[0]

        wide_df = pd.read_csv(file, index_col=0, low_memory=False)
        wide_df = (
            wide_df[wide_df.index.isin(wanted_ids)]
            .drop(columns=["cell_name"])
            # Name the index explicitly so the id column is called DepMap_ID
            # whatever header (blank or otherwise) the CSV uses for it.
            .rename_axis("DepMap_ID")
            .reset_index()
        )

        # Wide -> long: every column other than the id is a gene column, so
        # value_vars can be left to its default (all non-id columns).
        melt_df = pd.melt(
            wide_df,
            id_vars=["DepMap_ID"],
            var_name='entrez_id',
            value_name=feature_name,
        )
        melt_df = melt_df.astype({'entrez_id': 'int'})

        results.append(melt_df)
        print(f"Processed {file}")

    return results

In [13]:
def annotate_features(target_df, processed_features):
    """Attach per-gene feature values to both members of every gene pair.

    Every frame in ``processed_features`` is left-joined onto ``target_df``
    twice: once matching its ``entrez_id`` to ``A1_entrez`` and once to
    ``A2_entrez`` (always together with ``DepMap_ID``). The value columns are
    prefixed with ``A1_`` / ``A2_`` before merging, so any feature name works
    without a hard-coded rename table and the two passes never collide.

    Parameters
    ----------
    target_df : pandas.DataFrame
        Gene-pair table with ``DepMap_ID``, ``A1_entrez`` and ``A2_entrez``
        columns. It is not modified.
    processed_features : iterable of pandas.DataFrame
        Long-format feature tables, each with ``DepMap_ID``, ``entrez_id`` and
        one or more value columns.

    Returns
    -------
    pandas.DataFrame
        ``target_df`` followed by all ``A1_<feature>`` columns and then all
        ``A2_<feature>`` columns, in the order of ``processed_features``.
    """
    key_columns = ('DepMap_ID', 'entrez_id')
    df = target_df.copy()

    # Finish every A1 merge before starting A2 so the output keeps the
    # "all A1 features, then all A2 features" column order.
    for gene in ('A1', 'A2'):
        gene_key = f'{gene}_entrez'
        for feature_df in processed_features:
            renames = {
                col: f'{gene}_{col}'
                for col in feature_df.columns
                if col not in key_columns
            }
            renames['entrez_id'] = gene_key
            # Left join: pairs without a feature value keep NaN. Row count is
            # preserved as long as (DepMap_ID, entrez_id) is unique per feature.
            df = pd.merge(
                df,
                feature_df.rename(columns=renames),
                on=['DepMap_ID', gene_key],
                how='left'
            )

    return df


In [14]:
def annotate_mutations(mapped_df, mutation_df):
    """Add per-gene mutation flags for both members of every gene pair.

    The mutation table is restricted to the cell lines present in
    ``mapped_df`` and collapsed to one row per (``entrez_id``, ``DepMap_ID``).
    That row is then left-joined onto the pairs twice, producing
    ``A1_mut`` / ``A1_Deleterious`` and ``A2_mut`` / ``A2_Deleterious``.

    When a gene has several mutation records in one cell line, the record
    with the largest ``Damaging`` value is kept, so the gene counts as
    damaged if any of its records is damaging. Simply keeping the first
    record would make the flag depend on the row order of ``mutation_df``.

    Parameters
    ----------
    mapped_df : pandas.DataFrame
        Gene-pair table with ``DepMap_ID``, ``A1_entrez`` and ``A2_entrez``.
    mutation_df : pandas.DataFrame
        Mutation records with ``entrez_id``, ``DepMap_ID``, ``Damaging`` and
        ``VariantInfo`` columns.

    Returns
    -------
    pandas.DataFrame
        ``mapped_df`` plus ``A1_Deleterious``, ``A1_mut``, ``A2_Deleterious``
        and ``A2_mut``. ``*_mut`` is 1 where a record exists; all four are
        NaN where the gene has no record in that cell line.
    """
    in_scope = mutation_df['DepMap_ID'].isin(mapped_df['DepMap_ID'])
    flags = mutation_df.loc[in_scope, ['entrez_id', 'DepMap_ID', 'Damaging', 'VariantInfo']].copy()
    # Any surviving record means "this gene is mutated in this cell line".
    flags['VariantInfo'] = 1
    # Put damaging records first so drop_duplicates keeps them.
    # Assumes Damaging is boolean/numeric (larger == damaging) - TODO confirm.
    flags = (
        flags.sort_values('Damaging', ascending=False, kind='stable')
        .drop_duplicates(subset=['entrez_id', 'DepMap_ID'])
        .reset_index(drop=True)
    )

    for gene in ('A1', 'A2'):
        gene_key = f'{gene}_entrez'
        mapped_df = pd.merge(
            mapped_df,
            flags.rename(columns={
                'entrez_id': gene_key,
                'VariantInfo': f'{gene}_mut',
                'Damaging': f'{gene}_Deleterious'
            }),
            on=['DepMap_ID', gene_key],
            how='left'
        )

    return mapped_df


In [15]:
def filter_common_essentials(df, essentials_df):
    """Drop every gene pair in which either gene is listed as common essential.

    A row is removed when its ``A1_entrez`` or its ``A2_entrez`` appears in
    ``essentials_df["Essentials"]``. The surviving rows are returned with a
    fresh 0..n-1 index; ``df`` itself is left untouched.
    """
    essential_ids = essentials_df["Essentials"]
    a1_is_essential = df["A1_entrez"].isin(essential_ids)
    a2_is_essential = df["A2_entrez"].isin(essential_ids)
    kept = df[~(a1_is_essential | a2_is_essential)]
    return kept.reset_index(drop=True)

In [16]:
# Look up the mutation table and the common-essentials list by file name
# rather than by position in the sorted listing, so that adding or removing a
# CSV in the feature folder cannot silently change which file is loaded.
feature_path_by_name = dict(zip(feature_names, feature_files))
mutation_df = pd.read_csv(feature_path_by_name['mutation_data_updated'], low_memory=False)
common_essentials_df = pd.read_csv(feature_path_by_name['common_essentials'], low_memory=False)
annotated_datasets = []

# One pass per target dataset; results are appended in the order of
# target_pair_files, which later cells rely on when indexing and saving.
for file in target_pair_files:
    # Load target pair dataset
    target_df = pd.read_csv(file)

    # Extract cell lines for that dataset
    cell_lines = pd.DataFrame(target_df["DepMap_ID"].unique(), columns=["DepMap_ID"])

    # Process feature files for this dataset's cell lines
    # (the feature CSVs are re-read for every dataset, filtered to its cell lines)
    processed_features = process_depmap_files(ufeature_files, cell_lines)

    mapped_df = annotate_features(target_df, processed_features)
    mapped_df = mapped_df.sort_values(['genepair', 'DepMap_ID'], ascending=[True, True]).reset_index(drop=True)

    print(f"Processed {file} with shape: {mapped_df.shape}")

    # Add mutation annotations
    mut_mapped_df = annotate_mutations(mapped_df, mutation_df)

    # Common essential filtering
    final_df = filter_common_essentials(mut_mapped_df, common_essentials_df)

    annotated_datasets.append(final_df)
    print(f"Final shape: {final_df.shape}")

Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/copy_number_data.csv
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/expression_data.csv
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/gene_effect_data.csv
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/zexpression_data.csv
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/zgene_effect_data.csv
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/input_files/crispr_screens/ito_pairs.csv with shape: (53427, 22)
Final shape: (49753, 26)
Processed /Users/narod/Documents/GitHub/context_specific_SL_prediction/input_files/ccle_files/22Q4_processed/copy_number_data.csv
Processed /Users/narod/Do

In [17]:
def calculate_pairwise_features(df):
    """Return a copy of ``df`` with row-wise max/min of each A1/A2 feature pair.

    For expression (z-scored and raw), copy number and gene effect (z-scored
    and raw), the larger and the smaller of the two genes' values are added
    as new columns. The reduction goes through ``np.amax`` / ``np.amin`` on a
    two-column frame, which skips NaN: if only one gene has a value, that
    value is reported as both the maximum and the minimum.
    """
    # (max column, min column, [A1 source, A2 source]); the order of this
    # table fixes the order of the appended columns.
    pair_specs = [
        ('zMaxExp_A1A2', 'zMinExp_A1A2', ['A1_zexpression_data', 'A2_zexpression_data']),
        ('rMaxExp_A1A2', 'rMinExp_A1A2', ['A1_expression_data', 'A2_expression_data']),
        ('max_cn', 'min_cn', ['A1_copy_number_data', 'A2_copy_number_data']),
        ('zMaxESS_A1A2', 'zMinESS_A1A2', ['A1_zgene_effect_data', 'A2_zgene_effect_data']),
        ('rMaxESS_A1A2', 'rMinESS_A1A2', ['A1_gene_effect_data', 'A2_gene_effect_data']),
    ]

    new_columns = {}
    for max_name, min_name, source_columns in pair_specs:
        pair_values = df[source_columns]
        new_columns[max_name] = np.amax(pair_values, axis=1)
        new_columns[min_name] = np.amin(pair_values, axis=1)

    return df.assign(**new_columns)

In [18]:
# Annotated datasets extended with the pair-level max/min columns,
# in the same order as annotated_datasets.
final_annotated_datasets = []

for df in annotated_datasets:
    # calculate_pairwise_features returns a new frame via DataFrame.assign.
    enriched_df = calculate_pairwise_features(df)
    final_annotated_datasets.append(enriched_df)
    print(f"Final shape: {enriched_df.shape}")

Final shape: (49753, 36)
Final shape: (53592, 39)
Final shape: (1838, 37)


In [19]:
def finalize_mutation_flags(df):
    """Turn the per-gene mutation columns into integer flags and add pair totals.

    ``A1_Deleterious``, ``A2_Deleterious``, ``A1_mut`` and ``A2_mut`` have
    their missing values replaced by 0 and are cast to ``int``. Two pair-level
    columns are then appended:

      * ``Protein_Altering`` = ``A1_mut`` + ``A2_mut``
      * ``Damaging`` = ``A1_Deleterious`` + ``A2_Deleterious``

    The work is done on a copy, so the frame passed in is not modified, and
    no global pandas option is changed. Rows with NaN in other columns are
    kept as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four per-gene flag columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with integer flag columns plus ``Protein_Altering`` and
        ``Damaging``.
    """
    df = df.copy()

    for column in ('A1_Deleterious', 'A2_Deleterious', 'A1_mut', 'A2_mut'):
        # Cast to float first so NaN is filled on a numeric column. This
        # avoids pandas' object-dtype fillna downcasting (and its warning)
        # without having to flip a global option.
        df[column] = df[column].astype(float).fillna(0).astype(int)

    df['Protein_Altering'] = df['A1_mut'] + df['A2_mut']
    df['Damaging'] = df['A1_Deleterious'] + df['A2_Deleterious']

    return df

In [20]:
# Datasets with integer mutation flags and the pair-level mutation totals,
# in the same order as final_annotated_datasets.
cleaned_final_datasets = []

for df in final_annotated_datasets:
    cleaned_df = finalize_mutation_flags(df)
    cleaned_final_datasets.append(cleaned_df)

In [21]:
# Quick summary of the first dataset (ito_pairs.csv sorts first in target_pair_files).
print('Ito')
print('# of unique gene pairs', cleaned_final_datasets[0]['genepair'].nunique())
print('# of rows', len(cleaned_final_datasets[0]))
# A row counts as "with NA" when at least one of its columns is missing.
print(f'# of rows with NA value: {int(cleaned_final_datasets[0].isna().any(axis=1).sum())}')

Ito
# of unique gene pairs 4523
# of rows 49753
# of rows with NA value: 9229


In [52]:
# First dataset in cleaned_final_datasets (the Ito screen, per the sorted file list).
ito = cleaned_final_datasets[0]

In [53]:
# Keep only the identifying columns and the raw gene-effect max/min of each pair.
ito = ito[['genepair', 'cell_line', 'A1_entrez', 'A2_entrez', 'rMaxESS_A1A2', 'rMinESS_A1A2']]
# Rows whose smaller raw gene-effect value is below -0.6.
# NOTE(review): -0.6 is an unexplained threshold - presumably an essentiality cutoff; confirm and name it.
ito2 = ito.loc[ito['rMinESS_A1A2'] < -0.6]

In [54]:
# Display the thresholded subset (rMinESS_A1A2 < -0.6).
ito2

Unnamed: 0,genepair,cell_line,A1_entrez,A2_entrez,rMaxESS_A1A2,rMinESS_A1A2
341,ACACA_ACACB,PATU8988S_PANCREAS,31,32,-0.043196,-1.445091
348,ACACA_ACACB,IPC298_SKIN,31,32,-0.015118,-1.066633
350,ACACA_ACACB,HSC5_SKIN,31,32,-0.083262,-1.136024
425,ACAD9_ACADS,IPC298_SKIN,28976,35,0.079261,-0.859776
436,ACAD9_ACADSB,IPC298_SKIN,28976,36,-0.038891,-0.859776
...,...,...,...,...,...,...
49386,YTHDF1_YTHDF2,IPC298_SKIN,54915,51441,0.113554,-0.685195
49392,YTHDF2_YTHDF3,HS944T_SKIN,51441,253943,-0.629745,-0.629745
49395,YTHDF2_YTHDF3,HS936T_SKIN,51441,253943,-0.640001,-0.640001
49397,YTHDF2_YTHDF3,IPC298_SKIN,51441,253943,-0.685195,-0.685195


In [56]:
# Summarize gene pair / cell line coverage.
# NOTE(review): every statistic below is computed on `ito` (all rows), not on
# the thresholded subset `ito2` - confirm which frame was intended.
print(f"Total number of rows (gene pair - cell line combinations): {len(ito)}")
print(f"Number of unique gene pairs: {ito['genepair'].nunique()}")
print(f"Number of unique cell lines: {ito['cell_line'].nunique()}")
# Equal to the row count when every (genepair, cell_line) combination occurs once.
print(f"Number of unique gene pair - cell line combinations: {ito[['genepair', 'cell_line']].drop_duplicates().shape[0]}")

# Show some sample data
print("\nSample of the data:")
print(ito.head())

Total number of rows (gene pair - cell line combinations): 49753
Number of unique gene pairs: 4523
Number of unique cell lines: 11
Number of unique gene pair - cell line combinations: 49753

Sample of the data:
      genepair                   cell_line  A1_entrez  A2_entrez  \
0  A3GALT2_ABO          PATU8988S_PANCREAS     127550         28   
1  A3GALT2_ABO                PK1_PANCREAS     127550         28   
2  A3GALT2_ABO                 HS944T_SKIN     127550         28   
3  A3GALT2_ABO                   A549_LUNG     127550         28   
4  A3GALT2_ABO  GI1_CENTRAL_NERVOUS_SYSTEM     127550         28   

   rMaxESS_A1A2  rMinESS_A1A2  
0     -0.135723     -0.135723  
1     -0.035230     -0.035230  
2     -0.179614     -0.179614  
3      0.099984      0.099984  
4     -0.096832     -0.096832  


### Save to CSV

In [None]:
# Destination folder for the annotated datasets.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
output_dir = '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/output_files'
# Create the folder up front so to_csv does not fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# cleaned_final_datasets was built in the order of target_pair_files, so the
# two are paired positionally; each output is named after its source file.
for source_file, df in zip(target_pair_files, cleaned_final_datasets):
    base_filename = os.path.splitext(os.path.basename(source_file))[0]
    output_path = os.path.join(output_dir, f"{base_filename}_CCLE22Q4.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")