## Gene loss in tumor samples (TCGA)

**Input**
- CNA data for the A2 gene of each SL pair, downloaded from TCGA via cBioPortal (https://www.cbioportal.org/)
    - Only studies with more than one hundred cases are selected
    - We focus particularly on genes deleted in more than 1% of samples within a specific cancer type
- All of the candidate pairs are annotated with gene pair features: candidate_pairs_anno.csv
- tcga_study_abbreviations.txt: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations
- Cell line model information: model_list_20230307.csv

In [1]:
## Import modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

In [2]:
## Load the annotated candidate gene pairs
cand_pair = pd.read_csv('/Users/amy/Desktop/SyntheticLethalityProject/2_data_analysis/07_analysis_of_the_hits/candidate_pairs_anno.csv', index_col = None)
# Rows are kept in file order with their original index labels: no sorting is
# applied here (a sort whose result is not assigned back would have no effect).
# Preview the first two rows
cand_pair[:2]

Unnamed: 0,sorted_gene_pair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,A2_hgnc_symbol,pairs_to_test,pairs_to_test_symbol,...,min_sequence_identity,WGD,either_in_complex,prediction_score,closest,model_name,tissue,cancer_type,cancer_type_detail,SL
0,VPS4A_VPS4B,VPS4A,VPS4B,27183,9525,ENSG00000132612,ENSG00000119541,VPS4B,27183-9525,VPS4A_VPS4B,...,0.801802,True,False,0.183805,True,KP-2,Pancreas,Pancreatic Carcinoma,Pancreatic Carcinoma,True
1,UBALD1_UBALD2,UBALD2,UBALD1,283991,124402,ENSG00000185262,ENSG00000153443,UBALD1,283991-124402,UBALD2_UBALD1,...,0.610169,True,False,0.031177,True,MDA-MB-468,Breast,Breast Carcinoma,Breast Adenocarcinoma,True


In [3]:
## Load the TCGA copy-number alteration (CNA) table
# Tab-delimited file with STUDY_ID and SAMPLE_ID columns plus one column per A2 gene
SLs_A2_CNA = pd.read_csv(
    '/Users/amy/Desktop/SyntheticLethalityProject/sources/SLs_A2_CNA.txt',
    sep = '\t',
    index_col = None,
)
# Preview the first row
SLs_A2_CNA.head(1)

Unnamed: 0,STUDY_ID,SAMPLE_ID,VPS4B,UBALD1,EP300,ACSL4,LUC7L,NXT2,AXIN2,TLK1,AXIN1,NCOR2,DMXL1,POLR3G,SMARCC1,CSTF2T,CNOT7,CAPZA1,ENO1
0,laml_tcga_pan_can_atlas_2018,TCGA-AB-2989-03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Lookup table: TCGA study abbreviation -> full study name
tcga_study_codes = pd.read_table('/Users/amy/Desktop/SyntheticLethalityProject/sources/tcga_study_abbreviations.txt', index_col = None)

# Add the code for Colorectal Adenocarcinoma.
# The row is appended at the next free row label (instead of a hard-coded
# label) so an existing entry can never be overwritten, and it is only added
# when the code is not already present, so the table never holds it twice.
coadread_row = ['COADREAD', 'Colorectal Adenocarcinoma']
if not (tcga_study_codes['Study Abbreviation'] == coadread_row[0]).any():
    tcga_study_codes.loc[len(tcga_study_codes)] = coadread_row

# Preview the first row
tcga_study_codes[:1]

Unnamed: 0,Study Abbreviation,Study Name
0,LAML,Acute Myeloid Leukemia


In [5]:
## Derive the study code of each TCGA sample from its study identifier:
## the text before the first underscore, upper-cased
## (e.g. 'laml_tcga_pan_can_atlas_2018' -> 'LAML')
SLs_A2_CNA['Study Abbreviation'] = SLs_A2_CNA['STUDY_ID'].map(
    lambda study_id: study_id.partition('_')[0].upper()
)

In [6]:
## Percentage of samples carrying a deletion call (CNA == -2) for every A2 gene
## of the SL pairs, computed separately for each cancer type

def calc_gene_loss_percent(cna_calls):
    """Return the percentage of non-missing values in `cna_calls` that equal -2."""
    return (cna_calls == -2).sum() / cna_calls.count() * 100


def gene_loss_by_cancer_type(gene_symbol, cna, study_codes):
    """Compute the per-cancer-type deletion percentage of one gene.

    Parameters
    ----------
    gene_symbol : str
        Name of the gene column in `cna`.
    cna : pandas.DataFrame
        CNA table with a 'Study Abbreviation' column and a `gene_symbol` column.
    study_codes : pandas.DataFrame
        Lookup table with 'Study Abbreviation' and 'Study Name' columns.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns 'Study Abbreviation', 'Study Name',
        'A2' (the gene symbol) and 'deletion' (percentage), sorted by
        decreasing percentage.
    """
    # Get the CNA data for the selected gene
    gene_loss = cna[['Study Abbreviation', gene_symbol]]
    # Merge the CNA data with the study code information
    gene_loss = pd.merge(study_codes, gene_loss, on = ['Study Abbreviation'], how = 'left')
    # Clean the data: drop 'NP' entries and missing values, then cast to integer
    gene_loss = gene_loss[~(gene_loss[gene_symbol] == 'NP')]
    gene_loss = gene_loss.dropna(subset = [gene_symbol])
    gene_loss = gene_loss.astype({gene_symbol: 'int'})

    # Percentage of gene loss in each cancer type, highest first
    gene_loss = (
        gene_loss
        .groupby(['Study Abbreviation', 'Study Name'])
        .agg({gene_symbol: calc_gene_loss_percent})
        .reset_index()
        .sort_values(gene_symbol, ascending = False)
    )

    # Long format: the gene symbol goes to column 'A2', the percentage to 'deletion'
    return pd.melt(gene_loss, id_vars = ['Study Abbreviation', 'Study Name'],
                   var_name = 'A2', value_name = 'deletion').dropna()


A2_symbol = cand_pair[cand_pair['SL'] == True].A2_hgnc_symbol

# Iterate over the gene symbols themselves rather than over integer positions:
# `A2_symbol` keeps the row labels of `cand_pair`, so `A2_symbol[i]` would be a
# label lookup that fails whenever the SL rows are not labelled 0..n-1.
# Each gene is processed once; a gene that is the A2 member of several pairs
# would otherwise contribute duplicate rows.
per_gene_frames = [
    gene_loss_by_cancer_type(gene_symbol, SLs_A2_CNA, tcga_study_codes)
    for gene_symbol in A2_symbol.unique()
]

# Concatenate once at the end instead of growing the frame inside the loop
if per_gene_frames:
    gene_loss_in_ct = pd.concat(per_gene_frames)
else:
    # No SL pair: keep the expected columns so later cells still run
    gene_loss_in_ct = pd.DataFrame(columns = ['Study Abbreviation', 'Study Name', 'A2', 'deletion'])
        

In [7]:
## Attach the gene-pair label to each A2 gene and tidy up the column names
# Keep only the pairs flagged as SL
cand_pair = cand_pair.loc[cand_pair['SL'] == True]
# Mapping between the pair label and its A2 gene
gene_pair = cand_pair.loc[:, ['pairs_to_test_symbol', 'A2']]

column_renames = {
    'pairs_to_test_symbol': 'gene_pair',
    'Study Name': 'cancer_type',
    'deletion': 'deletion_frequency',
}
gene_loss_in_ct = (
    gene_loss_in_ct
    .merge(gene_pair, on = ['A2'], how = 'left')
    .loc[:, ['Study Name', 'deletion', 'A2', 'pairs_to_test_symbol']]
    .rename(columns = column_renames)
)

In [8]:
## Write the per-cancer-type deletion table to CSV (row index not written)
gene_loss_csv_path = '/Users/amy/Desktop/SyntheticLethalityProject/2_data_analysis/08_gene_loss_in_tcga/percentage_gene_loss_in_cancerType.csv'
gene_loss_in_ct.to_csv(gene_loss_csv_path, index = False)

**Bar plots showing the homozygous deletion (HD) frequency of each gene across cancer types**

In [9]:
## Load the dataset
gene_loss_in_ct = pd.read_csv('/Users/amy/Desktop/SyntheticLethalityProject/2_data_analysis/08_gene_loss_in_tcga/percentage_gene_loss_in_cancerType.csv', index_col = None)

## Drop the cancer types whose deletion frequency is below 1%
# `.copy()` makes the filtered frame independent, so the assignment below
# modifies it directly and unambiguously.
gene_loss_in_ct = gene_loss_in_ct[~(gene_loss_in_ct['deletion_frequency'] < 1)].copy()

## Replace the long cancer type name with a shortened one
# Assign on the frame itself: writing through a temporary
# `pd.DataFrame(gene_loss_in_ct)` wrapper only reaches the original when both
# objects share memory, which is not guaranteed (e.g. under Copy-on-Write).
is_cesc = gene_loss_in_ct['cancer_type'] == 'Cervical squamous cell carcinoma and endocervical adenocarcinoma'
gene_loss_in_ct.loc[is_cesc, 'cancer_type'] = 'Cervical squamous cell Ca and endocervical AdCa'

In [10]:
# Preview the first five rows of the filtered table
gene_loss_in_ct.head(5)

Unnamed: 0,cancer_type,deletion_frequency,A2,gene_pair
0,Esophageal carcinoma,3.846154,VPS4B,VPS4A_VPS4B
1,Head and Neck squamous cell carcinoma,3.288201,VPS4B,VPS4A_VPS4B
2,Stomach adenocarcinoma,2.739726,VPS4B,VPS4A_VPS4B
3,Pancreatic adenocarcinoma,2.73224,VPS4B,VPS4A_VPS4B
4,Testicular Germ Cell Tumors,2.013423,VPS4B,VPS4A_VPS4B


In [11]:
## Draw and save one horizontal bar plot per A2 gene
A2_symbol = gene_loss_in_ct.A2.unique()

for gene_symbol in A2_symbol:

    # Subset the data to the current gene
    df = gene_loss_in_ct[gene_loss_in_ct['A2'] == gene_symbol]

    # Figure height grows with the number of rows (one bar per row)
    f, ax = plt.subplots(1, 1, figsize = (3.5, df['cancer_type'].shape[0]*0.3))

    # All bars share one colour, so it is passed through `color`;
    # `palette` without a `hue` variable is deprecated in seaborn.
    sns.barplot(x='deletion_frequency', y='cancer_type', data=df, ax=ax, color="#CCCCCC",
                saturation=1, edgecolor='white', width=0.8)

    # Figure format
    ax.set_ylabel('')
    ax.set_xlabel('Homozygous deletion frequency (%)')
    ax.set_title(gene_symbol, fontsize = 12, fontstyle = 'italic', y = 1.02)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Save the figure first ...
    f.savefig('/Users/amy/Desktop/SyntheticLethalityProject/2_data_analysis/08_gene_loss_in_tcga/barplot_gene_loss_in_ct/barplot_'+gene_symbol+'.pdf', dpi=400, bbox_inches='tight')
    # ... then close this specific figure to release memory
    plt.close(f)
    