## Construct HUGE dataset

For downstream analysis, this notebook constructs a HUGE dataset that combines the A1 gene dependency scores with the A2 CNV information.

**Input**
- CRISPR gene dependency scores: 
    - crispr_broad_paralog.csv (original gene dependency scores)
    - crispr_broad_paralog_standardZscore.csv
    - crispr_broad_paralog_robustZscore.csv
    - crispr_broad_paralog_ranking.csv
- Sanger CNV data: cnv_sanger_paralog.csv
- Pairs to test list: pairs_to_test.csv


**Output**
- Integrated dataset used for downstream analysis: HUGE_dataset.csv

In [1]:
## Import modules
import pandas as pd
import numpy as np

In [2]:
## Load dataset
# Every input lives under one project root. Change PROJECT_DIR to run the
# notebook on another machine instead of editing each path separately.
PROJECT_DIR = '/Users/amy/Desktop/SyntheticLethalityProject'
PARALOG_DIR = PROJECT_DIR + '/1_data_processing/04_paralog_genes'
SCORES_DIR = PROJECT_DIR + '/2_data_analysis/02_CRISPR_gene_dependency_scores_processing'

# Candidate paralog pairs to test
pairs_to_test = pd.read_csv(PROJECT_DIR + '/2_data_analysis/01_candidate_pairs_to_test/pairs_to_test.csv', index_col=None)

# Sanger CNV table and original CRISPR gene dependency scores
cnv_sanger = pd.read_csv(PARALOG_DIR + '/cnv_sanger_paralog.csv', index_col=None)
crispr_broad = pd.read_csv(PARALOG_DIR + '/crispr_broad_paralog.csv', index_col=None)

# Processed versions of the CRISPR scores (standard Z-score, robust Z-score, ranking)
stand_z_gene_effect = pd.read_csv(SCORES_DIR + '/crispr_broad_paralog_standardZscore.csv', index_col=None)
robust_z_gene_effect = pd.read_csv(SCORES_DIR + '/crispr_broad_paralog_robustZscore.csv', index_col=None)
rank_gene_effect = pd.read_csv(SCORES_DIR + '/crispr_broad_paralog_ranking.csv', index_col=None)

In [3]:
## Base table with the basic information of each paralog pair
# Each candidate pair row is repeated once per row of cnv_sanger, keeping the
# copies of a pair contiguous (pair 0 x N rows, then pair 1 x N rows, ...).
n_cnv_rows = cnv_sanger.shape[0]
repeated_pair_rows = np.repeat(pairs_to_test.values, n_cnv_rows, axis=0)
pairs_to_test_huge = pd.DataFrame(repeated_pair_rows, columns=pairs_to_test.columns)
pairs_to_test_huge = pairs_to_test_huge.reset_index(drop=True)

pairs_to_test_huge.head(2)

Unnamed: 0,sorted_gene_pair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,A2_hgnc_symbol,pairs_to_test,pairs_to_test_symbol
0,ABL1_ABL2,ABL1,ABL2,25,27,ENSG00000097007,ENSG00000143322,ABL2,25-27,ABL1_ABL2
1,ABL1_ABL2,ABL1,ABL2,25,27,ENSG00000097007,ENSG00000143322,ABL2,25-27,ABL1_ABL2


In [4]:
# BROAD ID
# Stack the BROAD_ID column of cnv_sanger once per candidate pair so that it
# lines up row-for-row with the repeated pair table built above.
n_pairs = pairs_to_test.shape[0]
broad_id_column = cnv_sanger[['BROAD_ID']]
broad_id_huge = pd.concat([broad_id_column for _ in range(n_pairs)], ignore_index=True)

## Dataframe contains basic information
# Column-wise join: pair information on the left, BROAD_ID on the right
pairs_to_test_concat = pd.concat([pairs_to_test_huge, broad_id_huge], axis=1)

pairs_to_test_concat.head(2)

Unnamed: 0,sorted_gene_pair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,A2_hgnc_symbol,pairs_to_test,pairs_to_test_symbol,BROAD_ID
0,ABL1_ABL2,ABL1,ABL2,25,27,ENSG00000097007,ENSG00000143322,ABL2,25-27,ABL1_ABL2,ACH-000948
1,ABL1_ABL2,ABL1,ABL2,25,27,ENSG00000097007,ENSG00000143322,ABL2,25-27,ABL1_ABL2,ACH-000011


In [5]:
## Based on the basic dataset, map A2 gene status and A1 dependency scores
## (original gene dependency score, standard Z-score, robust Z-score, ranking)
pairs_candidate = pairs_to_test.pairs_to_test.astype(str)

# One merged frame per pair is collected here and concatenated once after the
# loop; concatenating inside the loop re-copies the growing result every pass.
pair_frames = []

# NOTE(review): rows are selected by pair key, so a key that appears twice in
# pairs_to_test would be emitted twice per occurrence - confirm keys are unique.
for pair_key in pairs_candidate:

    # Rows of the base table that belong to this pair
    df = pairs_to_test_concat[pairs_to_test_concat['pairs_to_test'] == pair_key]

    # Every row of the slice carries the same gene IDs, so read them from the
    # first row (position 0 also works when the slice has a single row).
    a1_col = str(df.A1_entrez.iloc[0])
    a2_col = str(df.A2_entrez.iloc[0])

    # CNV information of A2: the column named after its Entrez ID
    df_cnv = cnv_sanger[['BROAD_ID', a2_col]].rename(columns={a2_col: 'A2_CNV'})

    # Gene dependency scores of A1 from the four score tables.
    # NOTE(review): the three processed tables are attached by row position, which
    # assumes they share crispr_broad's row order - confirm against how they were written.
    df_A1_score = pd.concat(
        [
            crispr_broad[['BROAD_ID', a1_col]].rename(columns={a1_col: 'A1_Chronosscore'}),
            stand_z_gene_effect[[a1_col]].rename(columns={a1_col: 'A1_StandardZscore'}),
            robust_z_gene_effect[[a1_col]].rename(columns={a1_col: 'A1_RobustZscore'}),
            rank_gene_effect[[a1_col]].rename(columns={a1_col: 'A1_Ranking'}),
        ],
        axis=1,
    )

    # Attach A2 status, then A1 scores; left joins keep every row of the base
    # table even when a BROAD_ID has no match in the CNV or score tables.
    df_map = pd.merge(df, df_cnv, on=['BROAD_ID'], how='left')
    df_map_2 = pd.merge(df_map, df_A1_score, on=['BROAD_ID'], how='left')

    pair_frames.append(df_map_2)

# Single concatenation; the per-pair index is kept as-is, as before
huge_df = pd.concat(pair_frames)

In [6]:
## Save the data
# Write the integrated table without the index column, then preview two rows
output_path = '/Users/amy/Desktop/SyntheticLethalityProject/2_data_analysis/03_HUGE_dataset/HUGE_dataset.csv'
huge_df.to_csv(output_path, index=False)

huge_df.head(2)

Unnamed: 0,sorted_gene_pair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,A2_hgnc_symbol,pairs_to_test,pairs_to_test_symbol,BROAD_ID,A2_CNV,A1_Chronosscore,A1_StandardZscore,A1_RobustZscore,A1_Ranking
0,ABL1_ABL2,ABL1,ABL2,25,27,ENSG00000097007,ENSG00000143322,ABL2,25-27,ABL1_ABL2,ACH-000948,Gain,0.052967,0.35345,0.492425,212.0
1,ABL1_ABL2,ABL1,ABL2,25,27,ENSG00000097007,ENSG00000143322,ABL2,25-27,ABL1_ABL2,ACH-000011,Neutral,-0.374167,-1.681954,-3.452557,692.0
