In [1]:
import pandas as pd
import os
import glob
import numpy as np

# BRCA

In [2]:
# Load data
# IMPORTANT: the clinical data here referse to the one downloaded from cBioportal (contains mol.Subtype) and not from GDC
HRD_scores_pan_cancer = pd.read_csv('../../HRD_score/data/HRD_scores_pan_cancer_annotated_typecorrect.csv', sep=',', header = 0)
HRD_scores_pan_cancer = pd.read_csv('../../HRD_score/data/HRD_scores_pan_cancer_annotated_v2.csv', sep=',', header = 0)
firebrowse_data = pd.read_csv('../data/gene_expression/firebrowse_BRCA/BRCA.mRNAseq_raw_counts.txt', sep='\t', header = 0)
clincial_data = pd.read_csv('../data/metadata/brca_tcga_pan_can_atlas_2018_clinical_data.tsv', sep = '\t', header = 0)

In [7]:
def curation(rna_counts):
    '''
    Splits up the HYBRIDIZATION R column into gene names and ID and only keeps the gene names as rownames
    Input:
    rna_counts (dataframe): count matrix
    Output: 
    rna_counts (dataframe): count matrix with new rownames (gene names)
    '''
    gene_list = list(rna_counts['HYBRIDIZATION R'])
    gene_names = [x.split('|')[0] for x in gene_list]
    rna_counts = rna_counts.drop(rna_counts.columns[0], axis = 1)
    rna_counts.index = gene_names
    
    return rna_counts

def prepareClinical(clinical):
    '''
    Extracts the needed part of the clinical data, set the sample ID as new rownames and transposes the matrix
    Input:
    clinical (dataframe): Clinical metadata
    Output:
    cut_clinical_named (dataframe): Subframe of the input with new rownames and transposed
    '''
    cut_clinical = clinical[['Patient ID', 'Sample ID', 'Subtype']]
    cut_clinical_named = cut_clinical.set_index('Sample ID')
    cut_clinical_named = cut_clinical_named.T
    
    return cut_clinical_named
    
def prepareScorefile(scorefile, cancertype):
    '''
    This functions prepares the HRD score file. Because it contains a column Sample ID which looks like (ID_1|ID_2),
    the column is cut and two new columns are added each containing one of the IDs.
    All IDs are given back seperatly as an addition.
    Input:
    scorefile (dataframe): Contains the results from HRDscar
    cancertype (string): Indicates the wanted cancertype project name (example: TCGA-BRCA)
    Output:
    scorefile_type_primary (dataframe): Subframe only containing the Primary sample types and with the column Sample ID split into two columns
    sample_id_score (list of strings): List all Sample IDs (ID_1 and ID_2)
    '''
    
    scorefile_type = scorefile[scorefile['Project ID'] == cancertype]
    scorefile_type_primary = scorefile_type[scorefile_type['Type'] == 'Primary']
    double_ids = list(scorefile_type_primary['Sample ID'])
    
    sample_id = list()
    sample_id1 =list()
    sample_id2 = list()
    
    for sampleid in double_ids:
        ids = sampleid.split(', ')
        id_1 = ids[0]
        id_2 = ids[1]
        sample_id1.append(id_1)
        sample_id2.append(id_2)
        sample_id.append(id_1)
        sample_id.append(id_2)
    sample_id_scores = list(np.unique(np.array(sample_id)))
    sample_id_scores = [s[:-1] for s in sample_id_scores]
    sample_id1 = [s[:-1] for s in sample_id1]
    sample_id2 = [s[:-1] for s in sample_id2]
    
    scorefile_type_primary['sample_id_1'] = ''
    scorefile_type_primary['sample_id_1'] = sample_id1
    scorefile_type_primary['sample_id_2'] = ''
    scorefile_type_primary['sample_id_2'] = sample_id2
    
    return scorefile_type_primary, sample_id_scores

def filterForMatches(clinical, rna_counts, scorefile, sample_id_scores):
    '''
    Compares all three dataframes with each other and only keeping cases which are in all 3 dataframes
    Input:
    clinical (dataframe):
    rna_counts (dataframe):
    scorefile (dataframe):
    sample_id_scores (list of strings):
    
    Output:
    clinical_reordered (dataframe):
    rna_counts_filtered (dataframe):
    scorefile_filtered_third (dataframe):
    '''
    
    col_order_count = rna_counts.columns.tolist()
    col_order_clinical = clinical.columns.tolist()
    set_id_HRD = set(sample_id_scores)
    set_id_clincial = set(col_order_clinical)
    
    matching = list(set_id_HRD.intersection(set_id_clincial))
    set_match = set(matching)
    set_id_count = set(col_order_count)
    matching = list(set_match.intersection(set_id_count))

    #Check for matches and delete dublicates
    scorefile_filtered = scorefile[scorefile['sample_id_1'].isin(matching) | scorefile['sample_id_2'].isin(matching)]
    scorefile_filtered_second = scorefile_filtered.drop_duplicates(subset='sample_id_1', keep='first')
    scorefile_filtered_third = scorefile_filtered_second.drop_duplicates(subset='sample_id_2', keep='first')

    clinical_filtered = clinical.loc[:, matching]
    rna_counts_filtered = rna_counts.loc[:,matching]
    
    clinical_reordered = clinical_filtered.reindex(columns=rna_counts_filtered.columns)

    return clinical_reordered, rna_counts_filtered, scorefile_filtered_third

def addScoreToClinical(clinical, scorefile):
    '''
    Adds the 4 HRD scores (HRDsum, LST,LOH,TAI) to the clinical data
    Input:
    clinical (dataframe):
    scorefile (dataframe):
    Output:
    clinical_t (dataframe):
    '''
    hrds = dict()
    loh = dict()
    lst = dict()
    tai = dict()

    for id_ in clinical.columns:
        if id_ in list(scorefile['sample_id_1']):
            hrds[id_] = scorefile[scorefile['sample_id_1'] == id_]['HRD_sum'].values[0]
            loh[id_] = scorefile[scorefile['sample_id_1'] == id_]['LOH'].values[0]
            lst[id_] = scorefile[scorefile['sample_id_1'] == id_]['LST'].values[0]
            tai[id_] = scorefile[scorefile['sample_id_1'] == id_]['TAI'].values[0]
        else:
            hrds[id_] = scorefile[scorefile['sample_id_2'] == id_]['HRD_sum'].values[0]
            loh[id_] = scorefile[scorefile['sample_id_2'] == id_]['LOH'].values[0]
            lst[id_] = scorefile[scorefile['sample_id_2'] == id_]['LST'].values[0]
            tai[id_] = scorefile[scorefile['sample_id_2'] == id_]['TAI'].values[0]
        

    clinical.loc['HRD_sum'] = hrds
    clinical.loc['LOH'] = loh
    clinical.loc['LST'] = lst
    clinical.loc['TAI'] = tai
    
    clinical_t = clinical.T
    clinical_t.fillna('undefined', inplace=True)
    
    return clinical_t

def statistics(clinical, rna_count, sample_id_scores):
    '''
    Function which prints out the number of matches between the datasets
    Input:
    clinincal (dataframe):
    rna_count (dataframe):
    sample_id_scores (list of string):
    '''
    col_order = rna_count.columns.tolist()
    col_order_clinical = clinical.columns.tolist()

    col_order.pop(0)

    print('Comparing clinical and raw counts dataset')
    set1 = set(col_order)
    set2 = set(col_order_clinical)
    
    matching = list(set1.intersection(set2))
    print('Number of cases in raw counts: '+ str(len(col_order)))
    print('Number of cases in clinical: '+ str(len(col_order_clinical)))
    print('Number of matches: ' + str(len(matching)))
    
    print('\n')
    
    print('Comparing clinical and HRDscores dataset')

    set1 = set(sample_id_scores)
    set2 = set(col_order_clinical)
    
    matching = list(set1.intersection(set2))
    print('Number of cases in HRDscores: '+ str(len(sample_id_scores)))
    print('Number of cases in clinical: '+ str(len(col_order_clinical)))
    print('Number of matches: ' + str(len(matching)))
    
    print('\n')

    print('Comparing clinical and HRDscores and raw counts dataset')
    set1 = set(matching)
    set2 = set(col_order)
    
    matching = list(set1.intersection(set2))
    print('Number of matches: ' + str(len(matching)))
    
    print('\n')
    
    print('Comparing raw count and HRDscores dataset')

    set1 = set(col_order)
    set2 = set(sample_id_scores)
    
    matching = list(set1.intersection(set2))
    print('Number of cases in raw counts: '+ str(len(col_order)))
    print('Number of cases in HRD scores: '+ str(len(sample_id_scores)))
    print('Number of matches: ' + str(len(matching)))

def saveData(clinical, rna_count, scorefile, path, cancertype):
    '''
    Saves the prepared files
    Input:
    clincial (dataframe):
    rna_count (dataframe):
    scorefile (dataframe):
    path (string):
    cancertype (string):
    '''
    rna_count.to_csv(path + 'raw_count_'+cancertype+'.csv', sep=',', index = True, header = True)
    clinical.to_csv(path + 'clinical_'+cancertype+'.csv', sep=',', header = True, index = True)
    scorefile.to_csv(path + 'HRD_scores_'+cancertype+'.csv', sep=',', header = True, index = None)

In [8]:
rna_counts_annotated = curation(firebrowse_data)
clinical_named = prepareClinical(clincial_data)
score_file_prep, sample_id_scores = prepareScorefile(HRD_scores_pan_cancer, 'TCGA-BRCA')

clinical_reordered, rna_counts_filtered, scorefile_filtered = filterForMatches(clinical_named, rna_counts_annotated, score_file_prep, sample_id_scores)
clinical_t = addScoreToClinical(clinical_reordered, scorefile_filtered)
statistics(clinical_named, rna_counts_annotated, sample_id_scores)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_1'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_1'] = sample_id1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_2'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
T

Comparing clinical and raw counts dataset
Number of cases in raw counts: 877
Number of cases in clinical: 1084
Number of matches: 764


Comparing clinical and HRDscores dataset
Number of cases in HRDscores: 2142
Number of cases in clinical: 1084
Number of matches: 1057


Comparing clinical and HRDscores and raw counts dataset
Number of matches: 746


Comparing raw count and HRDscores dataset
Number of cases in raw counts: 877
Number of cases in HRD scores: 2142
Number of matches: 806


In [13]:
print(rna_counts_annotated.shape)

(20532, 878)


In [14]:
statistics(clinical_named, rna_counts_annotated, sample_id_scores)

raw and clinical overlap
877
1084
764
#### hrd set and clinical overlap
2142
1084
1057
#### overlap of hrd and clinical overlaped with raw count
746
### hrd set and raw count overlap
877
2142
806


# LUAD

In [92]:
HRD_scores_pan_cancer = pd.read_csv('../../HRD_score/data/HRD_scores_pan_cancer_annotated_typecorrect.csv', sep=',', header = 0)
rna_count_luad = pd.read_csv('../data/gene_expression/LUAD_cohort/LUAD.mRNAseq_raw_counts.txt', sep='\t', header = 0)
clincial_luad = pd.read_csv('../data/gene_expression/LUAD_cohort/luad_tcga_pan_can_atlas_2018_clinical_data.tsv', sep = '\t', header = 0)

In [96]:
rna_counts_annotated = curation(rna_count_luad)
clinical_named = prepareClinical(clincial_luad)
score_file_prep, sample_id_scores = prepareScorefile(HRD_scores_pan_cancer, 'TCGA-LUAD')

clinical_reordered, rna_counts_filtered, scorefile_filtered = filterForMatches(clinical_named, rna_counts_annotated, score_file_prep, sample_id_scores)
clinical_t = addScoreToClinical(clinical_reordered, scorefile_filtered)
statistics(clinical_named, rna_counts_annotated, sample_id_scores)

(20532, 162)
(543, 21)
502
(2, 121)
(20532, 121)
(121, 6)
raw and clinical overlap
161
566
121
#### hrd set and clinical overlap
1026
566
502
#### overlap of hrd and clinical overlaped with raw count
120
### hrd set and raw count overlap
161
1026
148


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_1'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_1'] = sample_id1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_2'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
T

In [98]:
print(rna_counts_filtered.shape)
print(clinical_t.shape)
print(rna_count_luad.shape)

(20532, 121)
(121, 6)
(20532, 163)


# LUSC

In [101]:
HRD_scores_pan_cancer = pd.read_csv('../../HRD_score/data/HRD_scores_pan_cancer_annotated_typecorrect.csv', sep=',', header = 0)
rna_count_lusc = pd.read_csv('../data/gene_expression/LUSC_cohort/LUSC.mRNAseq_raw_counts.txt', sep='\t', header = 0)
clincial_lusc = pd.read_csv('../data/gene_expression/LUSC_cohort/lusc_tcga_pan_can_atlas_2018_clinical_data.tsv', sep = '\t', header = 0)

In [102]:
rna_counts_annotated = curation(rna_count_lusc)
clinical_named = prepareClinical(clincial_lusc)
score_file_prep, sample_id_scores = prepareScorefile(HRD_scores_pan_cancer, 'TCGA-LUSC')

clinical_reordered, rna_counts_filtered, scorefile_filtered = filterForMatches(clinical_named, rna_counts_annotated, score_file_prep, sample_id_scores)
clinical_t = addScoreToClinical(clinical_reordered, scorefile_filtered)
statistics(clinical_named, rna_counts_annotated, sample_id_scores)

(20532, 240)
(514, 21)
479
(2, 213)
(20532, 213)
(213, 6)
raw and clinical overlap
239
487
212
#### hrd set and clinical overlap
986
487
479
#### overlap of hrd and clinical overlaped with raw count
212
### hrd set and raw count overlap
239
986
233


# OV

In [7]:
HRD_scores_pan_cancer = pd.read_csv('../../HRD_score/data/HRD_scores_pan_cancer_annotated_typecorrect.csv', sep=',', header = 0)
HRD_scores_pan_cancer = pd.read_csv('../../HRD_score/data/HRD_scores_pan_cancer_annotated_v2.csv', sep=',', header = 0)
rna_count_ov = pd.read_csv('../data/gene_expression/OV_cohort/OV.mRNAseq_raw_counts.txt', sep='\t', header = 0)
clincial_ov = pd.read_csv('../data/gene_expression/OV_cohort/ov_tcga_pan_can_atlas_2018_clinical_data.tsv', sep = '\t', header = 0)

In [8]:
HRD_scores_pan_cancer['SAMPLE_ID'] = HRD_scores_pan_cancer['Sample ID'].str.slice(0, -1)

In [9]:
HRD_scores_pan_cancer_ov = HRD_scores_pan_cancer[HRD_scores_pan_cancer['Project ID'] == 'TCGA-OV']
HRD_scores_pan_cancer_prim = HRD_scores_pan_cancer_ov[HRD_scores_pan_cancer_ov['Type'] == 'Primary']

In [13]:
column_names = list(rna_count_ov.columns)

In [14]:
column_names.pop(0)

'HYBRIDIZATION R'

In [17]:
set1 = set(column_names)
set2 = set(list(HRD_scores_pan_cancer_prim['SAMPLE_ID']))
    
matching = list(set1.intersection(set2))

In [19]:
len(matching)

283

In [5]:
rna_counts_annotated = curation(rna_count_ov)
clinical_named = prepareClinical(clincial_ov)
score_file_prep, sample_id_scores = prepareScorefile(HRD_scores_pan_cancer, 'TCGA-OV')

clinical_reordered, rna_counts_filtered, scorefile_filtered = filterForMatches(clinical_named, rna_counts_annotated, score_file_prep, sample_id_scores)
clinical_t = addScoreToClinical(clinical_reordered, scorefile_filtered)
statistics(clinical_named, rna_counts_annotated, sample_id_scores)

(20442, 299)
(562, 21)
555


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_1'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_1'] = sample_id1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_2'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
T

(2, 281)
(20442, 281)
(281, 6)
raw and clinical overlap
298
585
290
#### hrd set and clinical overlap
1118
585
555
#### overlap of hrd and clinical overlaped with raw count
280
### hrd set and raw count overlap
298
1118
282


In [9]:
saveData(clinical_t, rna_counts_filtered, scorefile_filtered, '../data/gene_expression/prepared_data/', 'OV')