In [1]:
import pandas as pd
import os

In [2]:
def cytobands_hrr_genes(version, cytoband_hrr_path):
    '''
    Creates a new cytoband level (cytoband_HRR) where only cytobands are included that have a HRR gene on them.
    Additionally it creates a new patient level (patient_HRR) with the CIN etc. calculated from the filtered cytoband results.
    For every cohort subfolder of '../data/CIN_Output_version_<version>' that has a
    '<cohort>_level_cytobands.csv' file, two files are written into the same subfolder:
    '<cohort>_level_cytobands_HRR.csv' and '<cohort>_level_patient_HRR.csv'.
    Input:
    version (string): Indicates the version of the results folder
    cytoband_hrr_path (string): Path to the tab-separated annotation file cytoband_HRRgenes (needs a 'cytoband_chr' column)
    Output:
    None. If the results folder of the version does not exist, a message is printed and nothing is processed.
    '''

    file_name = '_level_cytobands.csv'
    output_path = '../data/CIN_Output_version_'+version

    ## Check if this version exists before anything is loaded.
    ## Return here: without the results folder, listing the cohorts below would fail.
    if not os.path.exists(output_path):
        print('Version results do not exist')
        return

    cytobands_hrr = pd.read_csv(cytoband_hrr_path, sep='\t', header = 0)
    ## add 'chr' as a prefix
    cytobands_hrr = annotateCytobandHRR(cytobands_hrr)

    ## Prefixes of the columns that should be included (str.startswith needs a tuple, so build it once)
    ## NOTE(review): this is a plain prefix match, so a prefix like 'chr1p3' would also keep 'chr1p36...' columns - confirm this is intended
    patterns = tuple(cytobands_hrr['cytoband_chr'].tolist())

    ## Every subfolder of the results folder is treated as one cohort
    for cancertype in getListOfSubfolders(output_path):

        cohort_prefix = output_path+'/'+cancertype+'/'+cancertype
        cyto_file = cohort_prefix+file_name

        ## Check if cytoband results exist
        if not checkForCytobandsResults(cyto_file):
            print('No cytoband level results found for cohort '+ cancertype)
            continue

        ## Load results
        cyto_results = pd.read_csv(cyto_file, sep=',', header = 0, index_col = 0)

        ## Filter the results to only include cytobands mentioned in the cytoband_HRRgenes file
        columns_to_keep = ['File Name'] + [col for col in cyto_results.columns if col.startswith(patterns)]
        cyto_results_filtered = cyto_results[columns_to_keep]

        ## Save new cytoband level
        cyto_results_filtered.to_csv(cohort_prefix+'_level_cytobands_HRR.csv', sep=',', header = True)

        ## Create new patient level and save it
        new_patient_level_results = createPatientHRRCIN_df(cyto_results_filtered)
        new_patient_level_results.to_csv(cohort_prefix+'_level_patient_HRR.csv', sep=',', header = True)

        print('Finished cohort '+ cancertype)
        
def getListOfSubfolders(folder_path):
    '''
    Collects the names of all directories that sit directly inside the given folder.
    Entries that are not directories are left out.
    Input:
    folder_path (string): Path to the parent folder
    Output:
    (list of strings): Names (not full paths) of the subfolders, in the order os.listdir reports them
    '''
    return [
        entry
        for entry in os.listdir(folder_path)
        if os.path.isdir(os.path.join(folder_path, entry))
    ]

def annotateCytobandHRR(cytobands_hrr):
    '''
    Puts the prefix 'chr' in front of every entry of the 'cytoband_chr' column.
    The given dataframe is changed in place and also handed back.
    Input:
    cytobands_hrr (dataframe): cytoband_HRRgenes dataframe with a 'cytoband_chr' column of strings
    Output:
    cytobands_hrr (dataframe): The same dataframe, 'cytoband_chr' column now carries the prefix 'chr'
    '''
    def add_chr_prefix(band):
        return 'chr' + band

    band_column = cytobands_hrr['cytoband_chr']
    cytobands_hrr['cytoband_chr'] = band_column.apply(add_chr_prefix)

    return cytobands_hrr

def checkForCytobandsResults(file_path):
    '''
    Tells whether the given path exists (os.path.exists, so an existing folder counts as well).
    Input:
    file_path (string): Path to check
    Output:
    (boolean): True if the path exists, otherwise False
    '''
    path_exists = os.path.exists(file_path)
    return path_exists

def createPatientHRRCIN_df(cyto_results_filtered):
    '''
    Based on the filtered cytoband results it creates a new patient level dataframe.
    For every row (one per 'File Name') it sums all cytoband columns that end with the same
    suffix (general cin, cn cin, number of cn loh etc.) into one '<name>_HRR' column.
    Input:
    cyto_results_filtered (dataframe): Filtered cytoband results dataframe with a 'File Name' column
    Output:
    new_df (dataframe): New patient level dataframe with the columns 'File Name' plus one column
                        per entry of sum_columns; rows are numbered 0..n-1. A result for which no
                        cytoband column matches is 0.
    '''
    ## Name of the new patient level column -> suffix of the cytoband columns that are summed into it
    sum_columns = {
        'general_cin_HRR': '_general_cin',
        'cn_cin_HRR': '_cn_cin',
        'number_cn_loh_HRR': '_number_cn_loh',
        'number_cn_gain_HRR': '_number_cn_gain',
        'number_cn_amp_HRR': '_number_cn_amp',
        'number_homo_del_HRR': '_number_homo_del',
        'number_hemi_del_HRR': '_number_hemi_del',
        'number_neutral_HRR': '_number_neutral',
        'number_gain_HRR': '_number_gain',
        'number_amp_HRR': '_number_amp'
    }

    ## Which columns belong to which result only depends on the column names,
    ## so it is resolved once instead of once per row
    all_columns = list(cyto_results_filtered.columns)
    columns_per_result = {
        sum_col: [col for col in all_columns if col.endswith(suffix)]
        for sum_col, suffix in sum_columns.items()
    }

    ## Collect one record per row and build the dataframe once at the end;
    ## growing a dataframe with pd.concat inside the loop copies all previous rows every time
    records = []
    for _, row in cyto_results_filtered.iterrows():
        record = {'File Name': row['File Name']}
        for sum_col, columns_to_sum in columns_per_result.items():
            record[sum_col] = row[columns_to_sum].sum()
        records.append(record)

    ## Passing the columns explicitly keeps the column order and also gives an
    ## empty dataframe with the right columns if there are no rows
    new_df = pd.DataFrame(records, columns=['File Name'] + list(sum_columns.keys()))

    return new_df

In [3]:
## Path to the tab-separated HRR annotation file; read below and passed to cytobands_hrr_genes
cytoband_hrr_path = '../data/cytoband_HRRgenes.tsv'

In [6]:
## Load the tab-separated annotation file and put 'chr' in front of every cytoband
cytobands_hrr = annotateCytobandHRR(pd.read_csv(cytoband_hrr_path, sep='\t', header = 0))
## Keep a comma-separated copy of the annotated table next to the original file
cytobands_hrr.to_csv('../data/cytoband_HRRgenes_annotated.csv', sep=',', header = True)

In [6]:
## Create the HRR cytoband level and HRR patient level files for all cohorts of results version '2_0'
cytobands_hrr_genes('2_0', cytoband_hrr_path)

Finished cohort TCGA-TGCT
Finished cohort TCGA-CESC
Finished cohort TCGA-STAD
Finished cohort TCGA-KIRP
Finished cohort TCGA-PAAD
Finished cohort TCGA-UVM
Finished cohort TCGA-MESO
Finished cohort TCGA-KIRC
Finished cohort TARGET-AML
Finished cohort TARGET-ALL-P2
Finished cohort TCGA-THCA
Finished cohort TCGA-LUSC
Finished cohort TCGA-GBM
Finished cohort TCGA-PCPG
Finished cohort TCGA-DLBC
Finished cohort TCGA-THYM
Finished cohort TCGA-CHOL
Finished cohort TCGA-UCS
Finished cohort TCGA-BLCA
Finished cohort TCGA-BRCA
Finished cohort TCGA-COAD
Finished cohort TCGA-PRAD
Finished cohort TCGA-ACC
Finished cohort TARGET-CCSK
Finished cohort TCGA-UCEC
Finished cohort TCGA-ESCA
Finished cohort TCGA-LGG
Finished cohort TCGA-LAML
Finished cohort TCGA-LUAD
Finished cohort TCGA-OV
Finished cohort TCGA-HNSC
Finished cohort TCGA-READ
Finished cohort TCGA-SARC
Finished cohort TCGA-LIHC
Finished cohort TCGA-KICH
Finished cohort TARGET-OS
Finished cohort TCGA-SKCM
