In [1]:
import pandas as pd
import os
import glob
import numpy as np

# BRCA

In [78]:
HRD_scores_pan_cancer = pd.read_csv('../../HRD_score/data/HRD_scores_pan_cancer_annotated_typecorrect.csv', sep=',', header = 0)
firebrowse_data = pd.read_csv('../data/gene_expression/firebrowse_BRCA/BRCA.mRNAseq_raw_counts.txt', sep='\t', header = 0)
clincial_data = pd.read_csv('../data/metadata/brca_tcga_pan_can_atlas_2018_clinical_data.tsv', sep = '\t', header = 0)

In [8]:
def curation(rna_counts):
    gene_list = list(rna_counts['HYBRIDIZATION R'])
    gene_names = [x.split('|')[0] for x in gene_list]
    rna_counts = rna_counts.drop(rna_counts.columns[0], axis = 1)
    rna_counts.index = gene_names
    print(rna_counts.shape)
    return rna_counts

def prepareClinical(clinical):
    cut_clinical = clinical[['Patient ID', 'Sample ID', 'Subtype']]
    cut_clinical_named = cut_clinical.set_index('Sample ID')
    cut_clinical_named = cut_clinical_named.T
    
    return cut_clinical_named
    
def prepareScorefile(scorefile, cancertype):
    scorefile_type = scorefile[scorefile['Project ID'] == cancertype]
    scorefile_type_primary = scorefile_type[scorefile_type['Type'] == 'Primary']
    double_ids = list(scorefile_type_primary['Sample ID'])
    
    sample_id = list()
    sample_id1 =list()
    sample_id2 = list()
    
    for sampleid in double_ids:
        ids = sampleid.split(', ')
        id_1 = ids[0]
        id_2 = ids[1]
        sample_id1.append(id_1)
        sample_id2.append(id_2)
        sample_id.append(id_1)
        sample_id.append(id_2)
    sample_id_scores = list(np.unique(np.array(sample_id)))
    sample_id_scores = [s[:-1] for s in sample_id_scores]
    sample_id1 = [s[:-1] for s in sample_id1]
    sample_id2 = [s[:-1] for s in sample_id2]
    
    scorefile_type_primary['sample_id_1'] = ''
    scorefile_type_primary['sample_id_1'] = sample_id1
    scorefile_type_primary['sample_id_2'] = ''
    scorefile_type_primary['sample_id_2'] = sample_id2
    print(scorefile_type_primary.shape)
    return scorefile_type_primary, sample_id_scores

def filterForMatches(clinical, rna_counts, scorefile, sample_id_scores):
    
    col_order_count = rna_counts.columns.tolist()
    col_order_clinical = clinical.columns.tolist()
    set_id_HRD = set(sample_id_scores)
    set_id_clincial = set(col_order_clinical)
    
    matching = list(set_id_HRD.intersection(set_id_clincial))
    print(len(matching))
    set_match = set(matching)
    set_id_count = set(col_order_count)
    matching = list(set_match.intersection(set_id_count))


    scorefile_filtered = scorefile[scorefile['sample_id_1'].isin(matching) | scorefile['sample_id_2'].isin(matching)]
    scorefile_filtered_second = scorefile_filtered.drop_duplicates(subset='sample_id_1', keep='first')
    scorefile_filtered_third = scorefile_filtered_second.drop_duplicates(subset='sample_id_2', keep='first')

    clinical_filtered = clinical.loc[:, matching]
    rna_counts_filtered = rna_counts.loc[:,matching]
    
    clinical_reordered = clinical_filtered.reindex(columns=rna_counts_filtered.columns)
    print(clinical_reordered.shape)
    print(rna_counts_filtered.shape)
    return clinical_reordered, rna_counts_filtered, scorefile_filtered_third

def addScoreToClinical(clinical, scorefile):
    hrds = dict()
    loh = dict()
    lst = dict()
    tai = dict()

    for id_ in clinical.columns:
        if id_ in list(scorefile['sample_id_1']):
            hrds[id_] = scorefile[scorefile['sample_id_1'] == id_]['HRD_sum'].values[0]
            loh[id_] = scorefile[scorefile['sample_id_1'] == id_]['LOH'].values[0]
            lst[id_] = scorefile[scorefile['sample_id_1'] == id_]['LST'].values[0]
            tai[id_] = scorefile[scorefile['sample_id_1'] == id_]['TAI'].values[0]
        else:
            hrds[id_] = scorefile[scorefile['sample_id_2'] == id_]['HRD_sum'].values[0]
            loh[id_] = scorefile[scorefile['sample_id_2'] == id_]['LOH'].values[0]
            lst[id_] = scorefile[scorefile['sample_id_2'] == id_]['LST'].values[0]
            tai[id_] = scorefile[scorefile['sample_id_2'] == id_]['TAI'].values[0]
        

    clinical.loc['HRD_sum'] = hrds
    clinical.loc['LOH'] = loh
    clinical.loc['LST'] = lst
    clinical.loc['TAI'] = tai
    
    clinical_t = clinical.T
    clinical_t.fillna('undefined', inplace=True)
    print(clinical_t.shape)
    return clinical_t

def statistics(clinical, rna_count, sample_id_scores):
    col_order = rna_count.columns.tolist()
    col_order_clinical = clinical.columns.tolist()

    col_order.pop(0)

    print('raw and clinical overlap')
    set1 = set(col_order)
    set2 = set(col_order_clinical)
    
    matching = list(set1.intersection(set2))
    print(len(col_order))
    print(len(col_order_clinical))
    print(len(matching))

    print('#### hrd set and clinical overlap')

    set1 = set(sample_id_scores)
    set2 = set(col_order_clinical)
    
    matching = list(set1.intersection(set2))
    print(len(sample_id_scores))
    print(len(col_order_clinical))
    print(len(matching))

    print('#### overlap of hrd and clinical overlaped with raw count')
    set1 = set(matching)
    set2 = set(col_order)
    
    matching = list(set1.intersection(set2))
    print(len(matching))

    print('### hrd set and raw count overlap')

    set1 = set(col_order)
    set2 = set(sample_id_scores)
    
    matching = list(set1.intersection(set2))
    print(len(col_order))
    print(len(sample_id_scores))
    print(len(matching))

def saveData(clinical, rna_count, scorefile, path, cancertype):
    rna_count.to_csv(path + 'raw_count_'+cancertype+'.csv', sep=',', index = True, header = True)
    clinical.to_csv(path + 'clinical_'+cancertype+'.csv', sep=',', header = True, index = True)
    scorefile.to_csv(path + 'HRD_scores_'+cancertype+'.csv', sep=',', header = True, index = None)

In [80]:
rna_counts_annotated = curation(firebrowse_data)
clinical_named = prepareClinical(clincial_data)
score_file_prep, sample_id_scores = prepareScorefile(HRD_scores_pan_cancer, 'TCGA-BRCA')

clinical_reordered, rna_counts_filtered, scorefile_filtered = filterForMatches(clinical_named, rna_counts_annotated, score_file_prep, sample_id_scores)
clinical_t = addScoreToClinical(clinical_reordered, scorefile_filtered)


(20532, 878)
(1077, 21)
1057
(2, 747)
(20532, 747)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_1'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_1'] = sample_id1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_2'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
T

(747, 6)


In [81]:
# Only keep gene symbol and make them rownames
gene_list = list(firebrowse_data['HYBRIDIZATION R'])
gene_names = [ x.split('|')[0] for x in gene_list]
firebrowse_data = firebrowse_data.drop(firebrowse_data.columns[0], axis=1)
firebrowse_data.index = gene_names
print(firebrowse_data.shape)

(20532, 878)


In [82]:
# Get the nessessary columns of the clinical
cut_clinical = clincial_data[['Patient ID', 'Sample ID', 'Subtype']]

In [83]:
cut_clinical_named = cut_clinical.set_index('Sample ID')
cut_clinical_named = cut_clinical_named.T

In [84]:
# Prepare the score file
# Split the Sample IDs and save both in new columns

HRD_scores_brca = HRD_scores_pan_cancer[HRD_scores_pan_cancer['Project ID'] == 'TCGA-BRCA']
HRD_scores_brca_primary = HRD_scores_brca[HRD_scores_brca['Type'] == 'Primary']
hrd_patient_id = list(HRD_scores_brca_primary['Sample ID'])

sample_id = list()
sample_id1 = list()
sample_id2 = list()
for sampleid in hrd_patient_id:
    ids = sampleid.split(', ')
    id_1 = ids[0]
    id_2 = ids[1]
    sample_id1.append(id_1)
    sample_id2.append(id_2)
    sample_id.append(id_1)
    sample_id.append(id_2)
sample_id_hrd = list(np.unique(np.array(sample_id)))
sample_id_hrd = [s[:-1] for s in sample_id_hrd]
sample_id1 = [s[:-1] for s in sample_id1]
sample_id2 = [s[:-1] for s in sample_id2]

HRD_scores_brca_primary['sample_id_1'] = ''
HRD_scores_brca_primary['sample_id_1'] = sample_id1
HRD_scores_brca_primary['sample_id_2'] = ''
HRD_scores_brca_primary['sample_id_2'] = sample_id2
print(HRD_scores_brca_primary.shape)

(1077, 21)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HRD_scores_brca_primary['sample_id_1'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HRD_scores_brca_primary['sample_id_1'] = sample_id1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HRD_scores_brca_primary['sample_id_2'] = ''
A value is trying to be set on a copy of a slice from a DataFrame

In [85]:
# Filter the three data set to extract only the matches

col_order_fire = firebrowse_data.columns.tolist()
col_order_clinical = cut_clinical_named.columns.tolist()
set_id_HRD = set(sample_id_hrd)
set_id_clincial = set(col_order_clinical)
    
matching = list(set_id_HRD.intersection(set_id_clincial))
print(len(matching))
set_match = set(matching)
set_id_fire = set(col_order_fire)
matching = list(set_match.intersection(set_id_fire))


HRD_scores_brca_primary_filtered = HRD_scores_brca_primary[HRD_scores_brca_primary['sample_id_1'].isin(matching) | HRD_scores_brca_primary['sample_id_2'].isin(matching)]
HRD_scores_brca_primary_filtered_second = HRD_scores_brca_primary_filtered.drop_duplicates(subset='sample_id_1', keep='first')
HRD_scores_brca_primary_filtered_third = HRD_scores_brca_primary_filtered_second.drop_duplicates(subset='sample_id_2', keep='first')

cut_clinical_named_filtered = cut_clinical_named.loc[:, matching]

firebrowse_data_filtered = firebrowse_data.loc[:,matching]
print(firebrowse_data_filtered.shape)
# Checking for dublicates or check number of cases

# test = list(HRD_scores_brca_primary_filtered['sample_id_1'])
# test2 = list(HRD_scores_brca_primary_filtered['sample_id_2'])
# duplicates = list(set([x for x in test2 if test2.count(x) > 1]))
# dublicates_2 = list(set([x for x in test2 if test2.count(x) > 1]))
# print(len(test))
# print(len(np.unique(np.array(test))))
# print(len(cut_clinical_named_filtered.columns.tolist()))
# print(len(firebrowse_data_filtered.columns.tolist()))

# extra_value = set(firebrowse_data_filtered.columns.tolist()) - set(list(HRD_scores_brca_primary_filtered_third['sample_id_1']) + list(HRD_scores_brca_primary_filtered_third['sample_id_2']))
# print(extra_value)

1057
(20532, 747)


In [86]:
# Reorder the clinical data
cut_clinical_named_filtered_reordered = cut_clinical_named_filtered.reindex(columns=firebrowse_data_filtered.columns)
print(cut_clinical_named_filtered_reordered.shape)

(2, 747)


In [87]:
# Extract the scores and add them to the clincial data
hrds = dict()
loh = dict()
lst = dict()
tai = dict()

for id_ in cut_clinical_named_filtered_reordered.columns:
    if id_ in list(HRD_scores_brca_primary_filtered_third['sample_id_1']):
        hrds[id_] = HRD_scores_brca_primary_filtered_third[HRD_scores_brca_primary_filtered_third['sample_id_1'] == id_]['HRD_sum'].values[0]
        loh[id_] = HRD_scores_brca_primary_filtered_third[HRD_scores_brca_primary_filtered_third['sample_id_1'] == id_]['LOH'].values[0]
        lst[id_] = HRD_scores_brca_primary_filtered_third[HRD_scores_brca_primary_filtered_third['sample_id_1'] == id_]['LST'].values[0]
        tai[id_] = HRD_scores_brca_primary_filtered_third[HRD_scores_brca_primary_filtered_third['sample_id_1'] == id_]['TAI'].values[0]
    else:
        hrds[id_] = HRD_scores_brca_primary_filtered_third[HRD_scores_brca_primary_filtered_third['sample_id_2'] == id_]['HRD_sum'].values[0]
        loh[id_] = HRD_scores_brca_primary_filtered_third[HRD_scores_brca_primary_filtered_third['sample_id_2'] == id_]['LOH'].values[0]
        lst[id_] = HRD_scores_brca_primary_filtered_third[HRD_scores_brca_primary_filtered_third['sample_id_2'] == id_]['LST'].values[0]
        tai[id_] = HRD_scores_brca_primary_filtered_third[HRD_scores_brca_primary_filtered_third['sample_id_2'] == id_]['TAI'].values[0]
        

cut_clinical_named_filtered_reordered.loc['HRD_sum'] = hrds
cut_clinical_named_filtered_reordered.loc['LOH'] = loh
cut_clinical_named_filtered_reordered.loc['LST'] = lst
cut_clinical_named_filtered_reordered.loc['TAI'] = tai

In [88]:
cut_clinical_named_filtered_reordered_t = cut_clinical_named_filtered_reordered.T
cut_clinical_named_filtered_reordered_t.fillna('undefined', inplace=True)
print(cut_clinical_named_filtered_reordered_t.shape)
cut_clinical_named_filtered_reordered_t['De'] = np.where(cut_clinical_named_filtered_reordered_t['Subtype'] == 'BRCA_Basal', 'Basal', 'other')

(747, 6)


In [49]:
firebrowse_data_filtered.to_csv('../data/gene_expression/prepared_data/raw_count_BRCA.csv', sep=',', index = True, header = True)
cut_clinical_named_filtered_reordered_t.to_csv('../data/gene_expression/prepared_data/clinical_BRCA.csv', sep=',', header = True, index = True)
HRD_scores_brca_primary_filtered_third.to_csv('../data/gene_expression/prepared_data/HRD_scores_BRCA.csv', sep=',', header = True, index = None)

In [89]:
# Check how much overlay there is in the three data sets
col_order = firebrowse_data.columns.tolist()
col_order_clinical = cut_clinical_named.columns.tolist()

col_order.pop(0)

print('raw and clinical overlap')
set1 = set(col_order)
set2 = set(col_order_clinical)
    
matching = list(set1.intersection(set2))
print(len(col_order))
print(len(col_order_clinical))
print(len(matching))

print('#### hrd set and clinical overlap')

set1 = set(sample_id_hrd)
set2 = set(col_order_clinical)
    
matching = list(set1.intersection(set2))
print(len(sample_id_hrd))
print(len(col_order_clinical))
print(len(matching))

print('#### overlap of hrd and clinical overlaped with raw count')
set1 = set(matching)
set2 = set(col_order)
    
matching = list(set1.intersection(set2))
print(len(matching))

print('### hrd set and raw count overlap')

set1 = set(col_order)
set2 = set(sample_id_hrd)
    
matching = list(set1.intersection(set2))
print(len(col_order))
print(len(sample_id_hrd))
print(len(matching))

#df2_filtered = cut_clinical_named.loc[:, col_order]

# reorder columns of second dataframe (df2_filtered) based on the list from reference dataframe
#df2_reordered = df2_filtered.reindex(columns=col_order)

raw and clinical overlap
877
1084
764
#### hrd set and clinical overlap
2142
1084
1057
#### overlap of hrd and clinical overlaped with raw count
746
### hrd set and raw count overlap
877
2142
806


In [90]:
statistics(clinical_named, rna_counts_annotated, sample_id_scores)

raw and clinical overlap
877
1084
764
#### hrd set and clinical overlap
2142
1084
1057
#### overlap of hrd and clinical overlaped with raw count
746
### hrd set and raw count overlap
877
2142
806


# LUAD

In [92]:
HRD_scores_pan_cancer = pd.read_csv('../../HRD_score/data/HRD_scores_pan_cancer_annotated_typecorrect.csv', sep=',', header = 0)
rna_count_luad = pd.read_csv('../data/gene_expression/LUAD_cohort/LUAD.mRNAseq_raw_counts.txt', sep='\t', header = 0)
clincial_luad = pd.read_csv('../data/gene_expression/LUAD_cohort/luad_tcga_pan_can_atlas_2018_clinical_data.tsv', sep = '\t', header = 0)

In [96]:
rna_counts_annotated = curation(rna_count_luad)
clinical_named = prepareClinical(clincial_luad)
score_file_prep, sample_id_scores = prepareScorefile(HRD_scores_pan_cancer, 'TCGA-LUAD')

clinical_reordered, rna_counts_filtered, scorefile_filtered = filterForMatches(clinical_named, rna_counts_annotated, score_file_prep, sample_id_scores)
clinical_t = addScoreToClinical(clinical_reordered, scorefile_filtered)
statistics(clinical_named, rna_counts_annotated, sample_id_scores)

(20532, 162)
(543, 21)
502
(2, 121)
(20532, 121)
(121, 6)
raw and clinical overlap
161
566
121
#### hrd set and clinical overlap
1026
566
502
#### overlap of hrd and clinical overlaped with raw count
120
### hrd set and raw count overlap
161
1026
148


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_1'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_1'] = sample_id1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_2'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
T

In [98]:
print(rna_counts_filtered.shape)
print(clinical_t.shape)
print(rna_count_luad.shape)

(20532, 121)
(121, 6)
(20532, 163)


# LUSC

In [101]:
HRD_scores_pan_cancer = pd.read_csv('../../HRD_score/data/HRD_scores_pan_cancer_annotated_typecorrect.csv', sep=',', header = 0)
rna_count_lusc = pd.read_csv('../data/gene_expression/LUSC_cohort/LUSC.mRNAseq_raw_counts.txt', sep='\t', header = 0)
clincial_lusc = pd.read_csv('../data/gene_expression/LUSC_cohort/lusc_tcga_pan_can_atlas_2018_clinical_data.tsv', sep = '\t', header = 0)

In [102]:
rna_counts_annotated = curation(rna_count_lusc)
clinical_named = prepareClinical(clincial_lusc)
score_file_prep, sample_id_scores = prepareScorefile(HRD_scores_pan_cancer, 'TCGA-LUSC')

clinical_reordered, rna_counts_filtered, scorefile_filtered = filterForMatches(clinical_named, rna_counts_annotated, score_file_prep, sample_id_scores)
clinical_t = addScoreToClinical(clinical_reordered, scorefile_filtered)
statistics(clinical_named, rna_counts_annotated, sample_id_scores)

(20532, 240)
(514, 21)
479
(2, 213)
(20532, 213)
(213, 6)
raw and clinical overlap
239
487
212
#### hrd set and clinical overlap
986
487
479
#### overlap of hrd and clinical overlaped with raw count
212
### hrd set and raw count overlap
239
986
233


# OV

In [2]:
HRD_scores_pan_cancer = pd.read_csv('../../HRD_score/data/HRD_scores_pan_cancer_annotated_typecorrect.csv', sep=',', header = 0)
rna_count_ov = pd.read_csv('../data/gene_expression/OV_cohort/OV.mRNAseq_raw_counts.txt', sep='\t', header = 0)
clincial_ov = pd.read_csv('../data/gene_expression/OV_cohort/ov_tcga_pan_can_atlas_2018_clinical_data.tsv', sep = '\t', header = 0)

In [5]:
rna_counts_annotated = curation(rna_count_ov)
clinical_named = prepareClinical(clincial_ov)
score_file_prep, sample_id_scores = prepareScorefile(HRD_scores_pan_cancer, 'TCGA-OV')

clinical_reordered, rna_counts_filtered, scorefile_filtered = filterForMatches(clinical_named, rna_counts_annotated, score_file_prep, sample_id_scores)
clinical_t = addScoreToClinical(clinical_reordered, scorefile_filtered)
statistics(clinical_named, rna_counts_annotated, sample_id_scores)

(20442, 299)
(562, 21)
555


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_1'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_1'] = sample_id1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scorefile_type_primary['sample_id_2'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
T

(2, 281)
(20442, 281)
(281, 6)
raw and clinical overlap
298
585
290
#### hrd set and clinical overlap
1118
585
555
#### overlap of hrd and clinical overlaped with raw count
280
### hrd set and raw count overlap
298
1118
282


In [9]:
saveData(clinical_t, rna_counts_filtered, scorefile_filtered, '../data/gene_expression/prepared_data/', 'OV')