In [1]:
import pandas as pd
import os
import glob
import numpy as np

In [2]:
# The sample sheets and the clinical tables of the two download batches are
# merged into one CSV file each. The merge is skipped when the merged files
# already exist.

path = '../data/metadata/'
file = 'gdc_samplesheet.csv'

sample_sheet_path = os.path.join(path, file)
clinical_path = os.path.join(path, 'clinical.csv')

# Both merged files are written together below, so the merge may only be
# skipped when both of them are present (not just the sample sheet).
if os.path.exists(sample_sheet_path) and os.path.exists(clinical_path):
    print('Smaplesheet has already been created and therefore the clinical data as well')

else:
    batch_1_sheet = pd.read_csv('../data/metadata/batch1/gdc_sample_sheet.2023-02-21.tsv', sep='\t', header=0)
    batch_2_sheet = pd.read_csv('../data/metadata/batch2/gdc_sample_sheet.2023-02-21.tsv', sep='\t', header=0)
    batch_1_clinical = pd.read_csv('../data/metadata/batch1/clinical.tsv', sep='\t', header=0)
    batch_2_clinical = pd.read_csv('../data/metadata/batch2/clinical.tsv', sep='\t', header=0)

    # Stack the two batches row-wise; the index is not written to disk.
    sample_sheet = pd.concat([batch_1_sheet, batch_2_sheet])
    clinical = pd.concat([batch_1_clinical, batch_2_clinical])

    sample_sheet.to_csv(sample_sheet_path, header=True, sep=',', index=False)
    clinical.to_csv(clinical_path, header=True, sep=',', index=False)

    # Check that nothing was lost: each merged table must have as many rows as
    # its two batches together (from missing_files we know there should be
    # 11582 files in the sample sheet).
    print(len(batch_1_sheet) + len(batch_2_sheet))
    print(len(sample_sheet))
    print(len(batch_1_clinical) + len(batch_2_clinical))
    print(len(clinical))

Smaplesheet has already been created and therefore the clinical data as well


In [3]:
# Extend the TCGA disease-study table with a 'Project ID' column, append the
# TARGET projects and store the result as CSV.
# The output file is rewritten on every run of this cell.
code_table = pd.read_csv('../data/TCGA_code_tables/diseaseStudy.tsv', sep='\t')
code_table = code_table.assign(**{'Project ID': 'TCGA-' + code_table['Study Abbreviation']})

# One record per TARGET project, with the same three columns as the TCGA table.
target_rows = pd.DataFrame(
    [
        ('ALL-P2', 'Acute Lymphoblastic Leukemia - Phase II', 'TARGET-ALL-P2'),
        ('AML', 'Acute Myeloid Leukemia', 'TARGET-AML'),
        ('CCSK', 'Clear Cell Sarcoma of the Kidney', 'TARGET-CCSK'),
        ('OS', 'Osteosarcoma', 'TARGET-OS'),
    ],
    columns=['Study Abbreviation', 'Study Name', 'Project ID'],
)

code_table = pd.concat([code_table, target_rows], ignore_index=True)
code_table.to_csv('../data/TCGA_code_tables/diseaseStudy_updated.csv', index=False)

In [4]:
# Normalises the 'Sample Type' label of the sample sheet: the two comma-separated
# parts can appear in either order, and every label is mapped to the spelling
# with the tumour part first and the normal part second.
# Each normalised label also maps to itself, so labels that are already in the
# normalised order are kept as they are.
sampletype_mapping = {
    'Additional - New Primary, Blood Derived Normal': 'Additional - New Primary, Blood Derived Normal',
    'Blood Derived Normal, Additional - New Primary': 'Additional - New Primary, Blood Derived Normal',
    'Blood Derived Normal, Metastatic': 'Metastatic, Blood Derived Normal',
    'Blood Derived Normal, Primary Blood Derived Cancer - Bone Marrow': 'Primary Blood Derived Cancer - Bone Marrow, Blood Derived Normal',
    'Blood Derived Normal, Primary Blood Derived Cancer - Peripheral Blood': 'Primary Blood Derived Cancer - Peripheral Blood, Blood Derived Normal',
    'Blood Derived Normal, Primary Tumor': 'Primary Tumor, Blood Derived Normal',
    'Blood Derived Normal, Recurrent Blood Derived Cancer - Bone Marrow': 'Recurrent Blood Derived Cancer - Bone Marrow, Blood Derived Normal',
    'Blood Derived Normal, Recurrent Tumor': 'Recurrent Tumor, Blood Derived Normal',
    'Bone Marrow Normal, Primary Blood Derived Cancer - Bone Marrow': 'Primary Blood Derived Cancer - Bone Marrow, Bone Marrow Normal',
    'Bone Marrow Normal, Primary Blood Derived Cancer - Peripheral Blood': 'Primary Blood Derived Cancer - Peripheral Blood, Bone Marrow Normal',
    'Bone Marrow Normal, Primary Tumor': 'Primary Tumor, Bone Marrow Normal',
    'Bone Marrow Normal, Recurrent Blood Derived Cancer - Bone Marrow': 'Recurrent Blood Derived Cancer - Bone Marrow, Bone Marrow Normal',
    'Buccal Cell Normal, Primary Tumor': 'Primary Tumor, Buccal Cell Normal',
    'Metastatic, Blood Derived Normal': 'Metastatic, Blood Derived Normal',
    'Metastatic, Solid Tissue Normal': 'Metastatic, Solid Tissue Normal',
    'Primary Blood Derived Cancer - Bone Marrow, Blood Derived Normal': 'Primary Blood Derived Cancer - Bone Marrow, Blood Derived Normal',
    'Primary Blood Derived Cancer - Bone Marrow, Bone Marrow Normal': 'Primary Blood Derived Cancer - Bone Marrow, Bone Marrow Normal',
    'Primary Blood Derived Cancer - Peripheral Blood, Blood Derived Normal': 'Primary Blood Derived Cancer - Peripheral Blood, Blood Derived Normal',
    'Primary Blood Derived Cancer - Peripheral Blood, Bone Marrow Normal': 'Primary Blood Derived Cancer - Peripheral Blood, Bone Marrow Normal',
    'Primary Blood Derived Cancer - Peripheral Blood, Solid Tissue Normal': 'Primary Blood Derived Cancer - Peripheral Blood, Solid Tissue Normal',
    'Primary Tumor, Blood Derived Normal': 'Primary Tumor, Blood Derived Normal',
    'Primary Tumor, Bone Marrow Normal': 'Primary Tumor, Bone Marrow Normal',
    'Primary Tumor, Buccal Cell Normal': 'Primary Tumor, Buccal Cell Normal',
    'Primary Tumor, Solid Tissue Normal': 'Primary Tumor, Solid Tissue Normal',
    # Identity entry for a normalised label that was previously missing here:
    # a sample sheet row carrying this spelling would otherwise stay unmapped.
    'Recurrent Blood Derived Cancer - Bone Marrow, Blood Derived Normal': 'Recurrent Blood Derived Cancer - Bone Marrow, Blood Derived Normal',
    'Recurrent Blood Derived Cancer - Bone Marrow, Bone Marrow Normal': 'Recurrent Blood Derived Cancer - Bone Marrow, Bone Marrow Normal',
    'Recurrent Tumor, Blood Derived Normal': 'Recurrent Tumor, Blood Derived Normal',
    'Recurrent Tumor, Solid Tissue Normal': 'Recurrent Tumor, Solid Tissue Normal',
    'Solid Tissue Normal, Metastatic': 'Metastatic, Solid Tissue Normal',
    'Solid Tissue Normal, Primary Blood Derived Cancer - Peripheral Blood': 'Primary Blood Derived Cancer - Peripheral Blood, Solid Tissue Normal',
    'Solid Tissue Normal, Primary Tumor': 'Primary Tumor, Solid Tissue Normal',
    'Solid Tissue Normal, Recurrent Tumor': 'Recurrent Tumor, Solid Tissue Normal',
}

# Collapses each normalised sample-type label into one of three categories:
# Primary, Metastatic or Recurrent.
type_mapping = {
    label: category
    for category, labels in (
        ('Primary', (
            'Additional - New Primary, Blood Derived Normal',
        )),
        ('Metastatic', (
            'Metastatic, Blood Derived Normal',
            'Metastatic, Solid Tissue Normal',
        )),
        ('Primary', (
            'Primary Blood Derived Cancer - Bone Marrow, Blood Derived Normal',
            'Primary Blood Derived Cancer - Bone Marrow, Bone Marrow Normal',
            'Primary Blood Derived Cancer - Peripheral Blood, Blood Derived Normal',
            'Primary Blood Derived Cancer - Peripheral Blood, Bone Marrow Normal',
            'Primary Blood Derived Cancer - Peripheral Blood, Solid Tissue Normal',
            'Primary Tumor, Blood Derived Normal',
            'Primary Tumor, Bone Marrow Normal',
            'Primary Tumor, Buccal Cell Normal',
            'Primary Tumor, Solid Tissue Normal',
        )),
        ('Recurrent', (
            'Recurrent Blood Derived Cancer - Bone Marrow, Bone Marrow Normal',
            'Recurrent Tumor, Blood Derived Normal',
            'Recurrent Tumor, Solid Tissue Normal',
            'Recurrent Blood Derived Cancer - Bone Marrow, Blood Derived Normal',
        )),
    )
    for label in labels
}

In [7]:
# Read the merged metadata tables and the HRD score table.
sample_sheet = pd.read_csv('../data/metadata/gdc_samplesheet.csv')
clinical = pd.read_csv('../data/metadata/clinical.csv')
HRD_scores = pd.read_csv('../data/HRD_scores_pan_cancer.csv')

# The second dot-separated field of each file name is stored as 'case_id'
# (placed directly after the first column).
HRD_scores.insert(1, "case_id", [name.split('.')[1] for name in HRD_scores['File Name']])

# 'Case ID' in the sample sheet is a comma-separated list; its first entry
# becomes 'case_submitter_id'.
sample_sheet = sample_sheet.assign(
    case_submitter_id=[ids.split(',')[0] for ids in sample_sheet['Case ID']]
)

# Attach the sample sheet information to every score row (inner join on file name).
score_sample = HRD_scores.merge(sample_sheet, on='File Name')

# Prepare the clinical data: only case, project and primary diagnosis are kept.
# Duplicated rows (a patient is listed once per treatment) are collapsed.
clinical = clinical.rename(columns={'project_id': 'Project ID'})

diagnosis_columns = ['case_submitter_id', 'Project ID', 'primary_diagnosis']
pid_primarydiag = clinical[diagnosis_columns]
pid_primarydiag_unique = (
    pid_primarydiag
    .drop_duplicates(subset=diagnosis_columns, keep='last')
    .reset_index(drop=True)
)

# Left join, so that the cases without clinical data (26 of them) are kept.
scsa_clinical = score_sample.merge(
    pid_primarydiag_unique,
    how='left',
    on=['case_submitter_id', 'Project ID'],
)


# Add three columns describing the sample type:
#   sampleType_correct - normalised 'Sample Type' label ("" when the label is unknown)
#   Type               - Primary, Recurrent or Metastatic ("" when unknown)
#   PID_Type           - project ID and Type joined by a blank
corrected_labels = [sampletype_mapping.get(raw, "") for raw in scsa_clinical['Sample Type']]
scsa_clinical['sampleType_correct'] = corrected_labels
scsa_clinical['Type'] = [type_mapping.get(label, "") for label in corrected_labels]

scsa_clinical['PID_Type'] = scsa_clinical['Project ID'] + ' ' + scsa_clinical['Type']

# Attach the full cancer name ('Study Name') of every project.
# Inner join: rows whose 'Project ID' is not in the code table are dropped.
code_table = pd.read_csv('../data/TCGA_code_tables/diseaseStudy_updated.csv')
code_table = code_table.loc[:, ['Study Name', 'Project ID']]

scsa_clinical = scsa_clinical.merge(code_table, on='Project ID')



# Write the complete annotated table, then the subset of primary samples.
scsa_clinical.to_csv('../data/HRD_scores_pan_cancer_annotated_typecorrect.csv', index=False)

is_primary = scsa_clinical['Type'].eq('Primary')
scsa_clinical_primary = scsa_clinical[is_primary]
scsa_clinical_primary.to_csv('../data/HRD_scores_pan_cancer_annotated_primary.csv', index=False)




  clinical =  pd.read_csv('../data/metadata/clinical.csv', sep=',', header = 0)


## Tests
Tests whether the data is complete, etc.

These checks do not have to be run; they were only used for verification during coding.

Might get deleted

In [None]:
# Checks 


# print(pid_primarydiag[['case_submitter_id','Project ID']].value_counts(ascending = True))
# print(pid_primarydiag_unique[['case_submitter_id','Project ID','primary_diagnosis']].value_counts(ascending = True))

# print(sample_sheet['case_submitter_id'].isin(clinical['case_submitter_id']).value_counts())
# print(HRD_scores['case_id'].isin(clinical['case_id']).value_counts())
# print(HRD_scores['case_id'])
# print(HRD_scores[HRD_scores['case_id'].duplicated() == True])
# print(clinical[clinical['case_id'].duplicated() == True])


# df_missing = HRD_scores[~HRD_scores['File Name'].isin(scsa_clinical['File Name'])]
# print(len(df_missing))
# df_missing = score_sample[~score_sample['File Name'].isin(scsa_clinical['File Name'])]
# print(len(HRD_scores))
# print(len(sample_sheet))
# print(len(clinical))
# print(len(df_missing))
# print(df_missing['File Name'])
# df_missing.to_csv('../data/missing_subtype_files.csv', sep=',', header = True, index = None)

# print(sample_sheet['case_submitter_id'].isin(clinical['case_submitter_id']).value_counts())


In [12]:
# Count the segment files per project; the project is the part of the file
# name in front of the first dot.
files = list(map(os.path.basename, glob.glob('../data/allele_specific_cnv/allele_cnv_txt/*.seg.txt')))
projects = [name.partition('.')[0] for name in files]
print(len(projects))

# The per-project counts are summed up again as a cross-check against the total.
sum_ = 0
for project in np.unique(projects):
    n_files = projects.count(project)
    print(project + ': ' + str(n_files))
    sum_ += n_files
print(sum_)

In [15]:
# Print how often every distinct 'Sample Type' label occurs in the sample
# sheet and count how many of the distinct labels contain a metastatic part.
sample_type = list(sample_sheet['Sample Type'])
sample_list = list(np.unique(sample_type))
sum_ = 0
for type_ in sample_list:
    # The parts are separated by ', ', so they have to be stripped: without
    # that, the second part keeps a leading blank and a label such as
    # 'Blood Derived Normal, Metastatic' is not recognised as metastatic.
    words = [word.strip() for word in type_.split(',')]
    if 'Metastatic' in words:
        sum_ = sum_ + 1
    print(type_ + ': ' + str(sample_type.count(type_)))
print('Metastatic types: ' + str(sum_))

Additional - New Primary, Blood Derived Normal: 5
Blood Derived Normal, Additional - New Primary: 5
Blood Derived Normal, Metastatic: 191
Blood Derived Normal, Primary Blood Derived Cancer - Bone Marrow: 114
Blood Derived Normal, Primary Blood Derived Cancer - Peripheral Blood: 9
Blood Derived Normal, Primary Tumor: 4445
Blood Derived Normal, Recurrent Blood Derived Cancer - Bone Marrow: 3
Blood Derived Normal, Recurrent Tumor: 32
Bone Marrow Normal, Primary Blood Derived Cancer - Bone Marrow: 59
Bone Marrow Normal, Primary Blood Derived Cancer - Peripheral Blood: 5
Bone Marrow Normal, Primary Tumor: 3
Bone Marrow Normal, Recurrent Blood Derived Cancer - Bone Marrow: 7
Buccal Cell Normal, Primary Tumor: 1
Metastatic, Blood Derived Normal: 195
Metastatic, Solid Tissue Normal: 2
Primary Blood Derived Cancer - Bone Marrow, Blood Derived Normal: 101
Primary Blood Derived Cancer - Bone Marrow, Bone Marrow Normal: 54
Primary Blood Derived Cancer - Peripheral Blood, Blood Derived Normal: 11
P

In [2]:
# Two parallel lists of counts (presumably cases listed on GDC vs. files
# present, one entry per project - verify). The absolute element-wise
# differences are summed up and printed.
gdc_cases = [1067, 561, 536, 517, 517, 510, 509, 507, 498, 493, 486, 466, 454, 436, 391, 369, 299, 293, 284,
             248, 190, 183, 181, 167, 162, 150, 123, 90, 86, 81, 80, 73, 66, 53, 48, 36, 11]
files_numbers = [1084, 589, 545, 521, 574, 542, 527, 544, 505, 514, 491, 468, 509, 436, 396, 373, 301, 294, 298,
                 254, 190, 184, 182, 172, 167, 156, 123, 90, 86, 81, 80, 92, 66, 53, 48, 36, 11]
diff = [abs(n_cases - n_files) for n_cases, n_files in zip(gdc_cases, files_numbers)]
print(sum(diff))

361
