## Preprocessing

In [48]:
# Import our dependencies
from sklearn.preprocessing import StandardScaler
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.impute import SimpleImputer


In [49]:

# Load the two CSV inputs, keeping each path next to the read that uses it.
data_file_path = Path('../Resources/PeterMac_HRD_Validation.csv')
data_file_df = pd.read_csv(data_file_path)

agreement_data_path = Path('../Resources/HRD_Agreement.csv')
agreement_data_df = pd.read_csv(agreement_data_path)


In [50]:
# Merge the initial data with the agreement data on SampleID. The inner join
# keeps only samples present in both files; columns that exist in both frames
# receive the pandas default suffixes _x (left) and _y (right).
data_df = pd.merge(data_file_df, agreement_data_df, on='SampleID', how='inner')
# A bare `data_df.shape` in the middle of a cell is evaluated and thrown away
# (only the last expression is displayed), so print it explicitly.
print(data_df.shape)
data_df.columns

Index(['Run', 'SampleID', 'Source', 'MonthsOld', 'Purity', 'SeqRunID',
       'DDMSampleID', 'MIDS', 'TotalReads(M)', 'lpWGSReads(M)',
       'TargetPanelReads(M)', '%ReadslpWGS', '%ReadsPanel', '1000x', '500x',
       '200x', '100x', '50x', '25x', 'DupFrac', 'LowCovRegions',
       'PurityPloidyRatio', 'ResNoise', 'SignalNoiseRatio', 'QAStatus', 'Gene',
       'Variant', '%VariantFraction', 'MyriadGIScore_x', 'MyriadGIStatus_x',
       'SOPHiAGIIndex_x', 'SophiaGIStatus_x', 'MyriadGIScore_y',
       'MyriadGIStatus_y', 'SOPHiAGIIndex_y', 'SophiaGIStatus_y', 'Agreement1',
       'Agreement2'],
      dtype='object')

In [51]:
# Columns retained from the merged frame. Names ending in _x are the left-hand
# copies of columns that the merge found in both input files.
columns_to_keep = [
    'Purity', 'TotalReads(M)', '%ReadsPanel',
    '1000x', '500x', '200x', '100x', '50x', '25x',
    'DupFrac', 'LowCovRegions', 'PurityPloidyRatio',
    'ResNoise', 'SignalNoiseRatio', 'QAStatus', 'Gene',
    'SOPHiAGIIndex_x', 'SophiaGIStatus_x', 'Agreement2',
]

In [52]:
# Keep only the selected columns and strip the merge suffix from their names.
# .copy() makes data_df an independent frame, so the column assignments in
# later cells do not write into a selection of the merged frame (which can
# raise SettingWithCopyWarning).
data_df = data_df[columns_to_keep].copy()
# Remove '_x' only when it is a trailing suffix; str.replace would also delete
# an '_x' that happened to occur in the middle of a column name.
data_df.columns = [col[:-2] if col.endswith('_x') else col for col in data_df.columns]
data_df.head()

Unnamed: 0,Purity,TotalReads(M),%ReadsPanel,1000x,500x,200x,100x,50x,25x,DupFrac,LowCovRegions,PurityPloidyRatio,ResNoise,SignalNoiseRatio,QAStatus,Gene,SOPHiAGIIndex,SophiaGIStatus,Agreement2
0,20,7.3,19%,1%,5%,49%,84%,98%,100%,67%,769,-,0.13,2.95,Medium,.,3.2,1,1
1,30,7.3,24%,2%,12%,73%,97%,100%,100%,68%,580,0.2,0.11,2.91,High,.,-15.7,2,1
2,20,9.6,36%,41%,97%,100%,100%,100%,100%,58%,7,0.15,0.1,1.64,High,.,-4.6,2,1
3,20,8.9,37%,16%,71%,99%,100%,100%,100%,71%,33,0.17,0.09,3.49,High,.,-4.6,2,1
4,60,8.6,42%,2%,46%,100%,100%,100%,100%,81%,15,0.2,0.11,2.18,High,.,-8.2,2,1


In [53]:
# List the dtype of every column to see which ones were read as strings
# (object) and still need conversion.
data_df.dtypes

Purity                object
TotalReads(M)        float64
%ReadsPanel           object
1000x                 object
500x                  object
200x                  object
100x                  object
50x                   object
25x                   object
DupFrac               object
LowCovRegions          int64
PurityPloidyRatio     object
ResNoise              object
SignalNoiseRatio      object
QAStatus              object
Gene                  object
SOPHiAGIIndex         object
SophiaGIStatus         int64
Agreement2             int64
dtype: object

In [54]:
# Convert placeholder cells that are exactly '-' or '.' to NaN.
# DataFrame.replace returns a new frame rather than modifying in place, so the
# result must be assigned back; otherwise data_df is left unchanged and the
# placeholders survive into the following cells.
data_df = data_df.replace(['-', '.'], np.nan)
data_df

Unnamed: 0,Purity,TotalReads(M),%ReadsPanel,1000x,500x,200x,100x,50x,25x,DupFrac,LowCovRegions,PurityPloidyRatio,ResNoise,SignalNoiseRatio,QAStatus,Gene,SOPHiAGIIndex,SophiaGIStatus,Agreement2
0,20,7.3,19%,1%,5%,49%,84%,98%,100%,67%,769,,0.13,2.95,Medium,,3.2,1,1
1,30,7.3,24%,2%,12%,73%,97%,100%,100%,68%,580,0.2,0.11,2.91,High,,-15.7,2,1
2,20,9.6,36%,41%,97%,100%,100%,100%,100%,58%,7,0.15,0.1,1.64,High,,-4.6,2,1
3,20,8.9,37%,16%,71%,99%,100%,100%,100%,71%,33,0.17,0.09,3.49,High,,-4.6,2,1
4,60,8.6,42%,2%,46%,100%,100%,100%,100%,81%,15,0.2,0.11,2.18,High,,-8.2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,10,25.5,26%,88%,100%,100%,100%,100%,100%,67%,0,,0.13,1.08,Medium,,-9,2,1
135,,18.6,30%,0%,31%,99%,100%,100%,100%,89%,3,,0.13,0.4,Medium,,,3,0
136,40,18.3,34%,10%,62%,100%,100%,100%,100%,86%,11,,0.16,1.56,Medium,,-4.9,2,1
137,50,14.0,25%,1%,29%,91%,99%,100%,100%,83%,43,,0.13,1.7,Medium,BRCA2,-1.4,2,1


In [55]:
# Percent-formatted columns hold strings such as '19%'; drop the trailing
# sign and divide by 100 so each becomes a float fraction (e.g. 0.19).
columns_to_convert = ['%ReadsPanel', '1000x', '500x', '200x', '100x', '50x', '25x', 'DupFrac']
for pct_col in columns_to_convert:
    data_df[pct_col] = data_df[pct_col].str.rstrip('%').astype(float) / 100

In [56]:
# Parse the remaining number-like string columns; errors='coerce' turns any
# value that cannot be parsed into NaN instead of raising.
columns_to_convert = ['Purity', 'PurityPloidyRatio', 'ResNoise', 'SignalNoiseRatio', 'SOPHiAGIIndex']
for num_col in columns_to_convert:
    data_df[num_col] = pd.to_numeric(data_df[num_col], errors='coerce')
data_df.dtypes

Purity               float64
TotalReads(M)        float64
%ReadsPanel          float64
1000x                float64
500x                 float64
200x                 float64
100x                 float64
50x                  float64
25x                  float64
DupFrac              float64
LowCovRegions          int64
PurityPloidyRatio    float64
ResNoise             float64
SignalNoiseRatio     float64
QAStatus              object
Gene                  object
SOPHiAGIIndex        float64
SophiaGIStatus         int64
Agreement2             int64
dtype: object

In [59]:
# Restrict Gene to BRCA1 / BRCA2: any other value is blanked out to NaN.
data_df['Gene'] = data_df['Gene'].where(data_df['Gene'].isin(['BRCA1', 'BRCA2']), np.nan)

In [60]:
# Treat the integer SophiaGIStatus codes as a categorical variable so that
# get_dummies expands them into indicator columns instead of passing them
# through as a number.
data_df = data_df.astype({'SophiaGIStatus': 'category'})

In [61]:
# Columns to expand into one indicator column per category.
onehot_cols = ["QAStatus", "Gene", 'SophiaGIStatus']

# Perform one-hot encoding. The dtype is pinned to uint8 (0/1) because the
# default changed to bool in pandas 2.0; pinning keeps the indicators numeric
# regardless of the installed pandas version. Rows whose value is NaN get 0 in
# every indicator of that column (get_dummies ignores NaN by default).
one_hot_encoded = pd.get_dummies(data_df[onehot_cols], dtype=np.uint8)

In [62]:
# Append the indicator columns to the frame and drop the original categorical
# columns they were derived from.
data_df = pd.concat([data_df, one_hot_encoded], axis=1).drop(columns=onehot_cols)

In [63]:
# Count NaN cells per column to decide which columns need imputation.
missing_values = data_df.isna()
missing_counts = missing_values.sum(axis=0)
print(missing_counts)

Purity               28
TotalReads(M)         0
%ReadsPanel           0
1000x                 0
500x                  0
200x                  0
100x                  0
50x                   0
25x                   0
DupFrac               0
LowCovRegions         0
PurityPloidyRatio    46
ResNoise              3
SignalNoiseRatio      3
SOPHiAGIIndex        13
Agreement2            0
QAStatus_High         0
QAStatus_Low          0
QAStatus_Medium       0
Gene_BRCA1            0
Gene_BRCA2            0
SophiaGIStatus_1      0
SophiaGIStatus_2      0
SophiaGIStatus_3      0
SophiaGIStatus_4      0
dtype: int64


In [64]:
# Strategy name handed to SimpleImputer: fill each NaN with its column mean.
imputation_strategy = 'mean'

In [65]:
# Numeric columns whose NaN values will be filled by the imputer.
cols_to_impute = [
    'Purity',
    'PurityPloidyRatio',
    'SOPHiAGIIndex',
    'ResNoise',
    'SignalNoiseRatio',
]

In [66]:
# Fit the imputer on the selected columns, then fill their NaN values.
# The result is a plain NumPy array (column labels and index are not kept).
# NOTE(review): the imputer is fitted on every row; if a train/test split
# happens later, confirm this is intended.
imputer = SimpleImputer(strategy=imputation_strategy)
imputed = imputer.fit(data_df[cols_to_impute]).transform(data_df[cols_to_impute])

In [67]:
# Wrap the imputed array back into a DataFrame, restoring the column labels
# and the row index of data_df so it lines up when recombined.
imputed_df = pd.DataFrame(data=imputed, index=data_df.index, columns=cols_to_impute)

In [68]:
# Swap the original (NaN-containing) columns for their imputed versions; the
# imputed columns end up to the right of the untouched ones.
data_df = pd.concat(
    [data_df.drop(cols_to_impute, axis='columns'), imputed_df],
    axis='columns',
)

In [69]:
# Make the target the last column: pop it out of the frame, then re-append it.
target = 'Agreement2'
target_column = data_df.pop(target)
data_df[target] = target_column

In [70]:
# Preview the first rows of the preprocessed frame.
data_df.head()

Unnamed: 0,TotalReads(M),%ReadsPanel,1000x,500x,200x,100x,50x,25x,DupFrac,LowCovRegions,...,SophiaGIStatus_1,SophiaGIStatus_2,SophiaGIStatus_3,SophiaGIStatus_4,Purity,PurityPloidyRatio,SOPHiAGIIndex,ResNoise,SignalNoiseRatio,Agreement2
0,7.3,0.19,0.01,0.05,0.49,0.84,0.98,1.0,0.67,769,...,1,0,0,0,20.0,0.285591,3.2,0.13,2.95,1
1,7.3,0.24,0.02,0.12,0.73,0.97,1.0,1.0,0.68,580,...,0,1,0,0,30.0,0.2,-15.7,0.11,2.91,1
2,9.6,0.36,0.41,0.97,1.0,1.0,1.0,1.0,0.58,7,...,0,1,0,0,20.0,0.15,-4.6,0.1,1.64,1
3,8.9,0.37,0.16,0.71,0.99,1.0,1.0,1.0,0.71,33,...,0,1,0,0,20.0,0.17,-4.6,0.09,3.49,1
4,8.6,0.42,0.02,0.46,1.0,1.0,1.0,1.0,0.81,15,...,0,1,0,0,60.0,0.2,-8.2,0.11,2.18,1


In [71]:
# Show the (rows, columns) of the preprocessed frame.
data_df.shape

(139, 25)

In [72]:
# Write the preprocessed frame to a CSV in the current working directory,
# leaving out the row index.
data_df.to_csv('preprocessed_data.csv', index=False)