In [54]:
import pandas as pd
import os
import glob
import numpy as np
from dateutil.relativedelta import relativedelta

In [55]:
# Define the path to the source folder
source_folder = './Resources/PeterMac_HRD_Clinical/'

In [56]:
# Get filenames of files with 'sample' column
qa_gi_file = glob.glob(source_folder + '*-combined_QA_GI_status.csv')
lc_table_file = glob.glob(source_folder + '*-combined_LC_table.csv')
cov_decay_file = glob.glob(source_folder + 'concat_cov_decay.csv')
dup_frac_file = glob.glob(source_folder + 'concat_dup_frac.csv')

In [57]:
# Get filename of files with 'episode'
mids_ep_file = glob.glob(source_folder + '*-combined_MIDS_episode.csv')
ee_dump_file = glob.glob(source_folder + 'HRD_Report_clean_EE_dump_*.csv')

In [58]:
# Read CSV files into DataFrames
qa_gi_df = pd.read_csv(qa_gi_file[0])
lc_table_df = pd.read_csv(lc_table_file[0])
cov_decay_df = pd.read_csv(cov_decay_file[0])
dup_frac_df = pd.read_csv(dup_frac_file[0])
mids_ep_df = pd.read_csv(mids_ep_file[0])
ee_dump_df = pd.read_csv(ee_dump_file[0])

In [59]:
# Merge DataFrames based on 'Sample' column
sample_merged_df = pd.merge(qa_gi_df, lc_table_df, on='Sample')
sample_merged_df = pd.merge(sample_merged_df, cov_decay_df, on='Sample')
sample_merged_df = pd.merge(sample_merged_df, dup_frac_df, on='Sample')

In [60]:
# Calculate, tidy, drop columns
sample_merged_df['MIDS'] = sample_merged_df['Sample'].str.split('-').str[-1]
sample_merged_df['Panel(M)'] = sample_merged_df['Total(M)']-sample_merged_df['WGS(M)']
sample_merged_df['Run'] = sample_merged_df['Run'].str.split('_').str[-1]
sample_merged_df.drop(columns=['QA_status','%CovOut', 'Cov_10%_qtile', 'Cov Het'], inplace = True)

In [61]:
# Fix % columns
percent_columns = ['%WGS', '25x', '50x', '100x', '200x', '500x', '1000x', 'Duplicate Fraction']
for column in percent_columns:
    sample_merged_df[column] = pd.to_numeric(sample_merged_df[column].str.replace('%',''), errors='coerce') / 100
    sample_merged_df[column] = sample_merged_df[column].apply(lambda x: round(x, 2))
sample_merged_df['%Panel'] = 1 - sample_merged_df['%WGS']

In [62]:
# Fix numeric columns
numeric_columns = ['GI_index', 'PPR']
for column in numeric_columns:
    sample_merged_df[column] = pd.to_numeric(sample_merged_df[column].replace('-', np.nan), errors='coerce')

In [63]:
# Function to hyphenate episode number
def hyphenate(episode):
    ep_str = str(episode)
    hyphen_ep = ep_str[:-4] + '-' + ep_str[-4:]
    return hyphen_ep

In [64]:
# Hyphenate mids_ep_df 'Episode'
mids_ep_df['Episode'] = mids_ep_df['Episode'].apply(hyphenate)

In [65]:
# Merge DataFrames based on 'Episode' column
episode_merged_df = pd.merge(mids_ep_df, ee_dump_df, on='Episode')

In [66]:
# Get header list for episode_merged_df 
header_list = episode_merged_df.columns.tolist()

# Write header list to text file for editing
with open('episode_merged_headers.txt', 'w') as f:
    for item in header_list:
        f.write("%s\n" % item)

In [67]:
# Use edited header list to delete extraneous columns
with open('episode_keep_headers.txt', 'r') as f:
    keep_episode_merged_headers = [line.strip() for line in f.readlines()]
episode_merged_df = episode_merged_df[keep_episode_merged_headers]

In [68]:
# Calculate, tidy columns
episode_merged_df['Run'] = episode_merged_df['Run'].str.split('_').str[-1]
episode_merged_df['VAF_1'] = pd.to_numeric(episode_merged_df['VAF_1'].replace('-', np.nan), errors='coerce')

In [69]:
# Merge episode-merged and sample-merged dataframes on 'Run' and 'MIDS'
full_merged_df = pd.merge(sample_merged_df, episode_merged_df, on=['Run', 'MIDS'], how = 'inner')

In [70]:
full_merged_df.head()

Unnamed: 0,Sample,GI_index,GI_status,Total(M),WGS(M),%WGS,PPR,RN,SNR,QAStatus,...,Panel(M),%Panel,Episode,State/Territory,Collected,Received,Est Tum Purity,Gene_1,HGVSp_1,VAF_1
0,200085906-407-S10,8.8,Positive,22.1,14.2,0.64,,0.08,4.08,Medium,...,7.9,0.36,1410-1517,TAS,23/5/2023,31/5/2023,90.0,TP53,p.(Thr155Serfs*6),92.0
1,200085907-387-S11,9.2,Positive,21.8,18.7,0.86,,0.08,1.34,Medium,...,3.1,0.14,1413-3340,QLD,1/6/2023,9/6/2023,25.0,TP53,p.(Ile195Thr),16.0
2,200085909-404-S13,0.8,Positive,22.5,16.5,0.73,0.28,0.07,4.64,High,...,6.0,0.27,1410-3300,TAS,15/5/2023,1/6/2023,90.0,TP53,p.(Arg158Hisfs*21),79.0
3,200085910-394-S14,10.2,Positive,21.0,16.9,0.8,0.2,0.08,4.67,High,...,4.1,0.2,1413-4688,VIC,7/6/2023,20/6/2023,70.0,TP53,p.(Cys238Phe),61.0
4,200085911-388-S15,4.7,Positive,17.0,13.1,0.77,0.45,0.1,3.64,High,...,3.9,0.23,1407-7410,VIC,26/5/2023,26/5/2023,80.0,TP53,p.(?),77.0


In [71]:
# Remove non-BRCA entries
condition = ~((full_merged_df['Gene_1'] == 'BRCA1') | (full_merged_df['Gene_1'] == 'BRCA2'))
full_merged_df['GI_status'] = full_merged_df['GI_status'].replace('Negative*', 'Negative')
modify_columns = ['Gene_1', 'HGVSp_1', 'VAF_1']
modify_rows = full_merged_df.index[condition]
full_merged_df.loc[modify_rows, modify_columns] = np.nan

In [72]:
# DropNA rows for Date and Purity
drop_columns = ['Collected', 'Received', 'Est Tum Purity']
full_merged_df.dropna(subset=drop_columns, inplace=True)

In [73]:
# Calculate sample age
full_merged_df['Collected'] = pd.to_datetime(full_merged_df['Collected'])
full_merged_df['Received'] = pd.to_datetime(full_merged_df['Received'])
full_merged_df['DaysOld'] = (full_merged_df['Received'] - full_merged_df['Collected']).dt.days
full_merged_df.drop(columns=['Collected','Received'], inplace = True)


  full_merged_df['Collected'] = pd.to_datetime(full_merged_df['Collected'])
  full_merged_df['Received'] = pd.to_datetime(full_merged_df['Received'])


In [74]:
# Get header list for full_merged_df 
header_list = full_merged_df.columns.tolist()

In [75]:

# Write header list to text file for manual re-ordering
with open('full_merged_headers_unordered.txt', 'w') as f:
    for item in header_list:
        f.write("%s\n" % item)

In [76]:
# Read re-ordered old headers to list
with open('full_merged_headers_reordered.txt', 'r') as f:
    old_headers = [line.strip() for line in f.readlines()]
    # print(len(old_headers))

# Read new header names to list (matching columns in 'PeterMac_HRD_Validation.csv')
with open('new_clin_column_names.txt', 'r') as f:
    new_headers = [line.strip() for line in f.readlines()]
    # print(len(new_headers))

# Create column re-naming dictionary
rename_dict = dict(zip(old_headers, new_headers))
# rename_dict

In [77]:
# Rename and reorder columns
full_merged_df = full_merged_df.reindex(columns=rename_dict).rename(columns=rename_dict)

In [78]:
# Save merged DataFrame to CSV file
full_merged_df.to_csv(source_folder + 'PeterMac_HRD_clinical_data.csv', index=False)