## Preprocessing

In [1]:
# Import our dependencies
from sklearn.preprocessing import StandardScaler
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:

# Location of the clinical data CSV, relative to this notebook
data_file_path = Path('../Resources/PeterMac_HRD_Clinical/correct_PeterMac_HRD_clinical_data.csv')

# Read the CSV into the working DataFrame
data_df = pd.read_csv(filepath_or_buffer=data_file_path)


In [3]:
# Columns retained for preprocessing; every other column in the CSV is discarded
columns_to_keep = [
    'Purity', 'TotalReads(M)', '%ReadsPanel',
    '1000x', '500x', '200x', '100x', '50x', '25x',
    'DupFrac', 'LowCovRegions', 'PurityPloidyRatio',
    'ResNoise', 'SignalNoiseRatio',
    'QAStatus', 'Gene',
    'SOPHiAGIIndex', 'SophiaGIStatus',
]

In [4]:
# Restrict the frame to the columns listed in columns_to_keep, then preview it
data_df = data_df.loc[:, columns_to_keep]
data_df.head()

Unnamed: 0,Purity,TotalReads(M),%ReadsPanel,1000x,500x,200x,100x,50x,25x,DupFrac,LowCovRegions,PurityPloidyRatio,ResNoise,SignalNoiseRatio,QAStatus,Gene,SOPHiAGIIndex,SophiaGIStatus
0,90,22.1,0.36,0.91,0.99,1.0,1.0,1.0,1.0,0.65,0,,0.08,4.08,Medium,,8.8,Positive
1,25,21.8,0.14,0.16,0.9,1.0,1.0,1.0,1.0,0.53,5,,0.08,1.34,Medium,,9.2,Positive
2,90,22.5,0.27,0.93,1.0,1.0,1.0,1.0,1.0,0.52,0,0.28,0.07,4.64,High,,0.8,Positive
3,70,21.0,0.2,0.12,0.9,1.0,1.0,1.0,1.0,0.73,0,0.2,0.08,4.67,High,,10.2,Positive
4,80,17.0,0.23,0.53,0.92,1.0,1.0,1.0,1.0,0.6,1,0.45,0.1,3.64,High,,4.7,Positive


In [5]:
# List the dtype pandas assigned to each retained column
data_df.dtypes

Purity                 int64
TotalReads(M)        float64
%ReadsPanel          float64
1000x                float64
500x                 float64
200x                 float64
100x                 float64
50x                  float64
25x                  float64
DupFrac              float64
LowCovRegions          int64
PurityPloidyRatio    float64
ResNoise             float64
SignalNoiseRatio     float64
QAStatus              object
Gene                  object
SOPHiAGIIndex        float64
SophiaGIStatus        object
dtype: object

In [6]:
# Treat '-' and '.' placeholder entries as missing values.
# DataFrame.replace returns a new frame rather than modifying in place, so the
# result has to be assigned back -- otherwise the substitution is discarded.
data_df = data_df.replace(['-', '.'], np.nan)
data_df

Unnamed: 0,Purity,TotalReads(M),%ReadsPanel,1000x,500x,200x,100x,50x,25x,DupFrac,LowCovRegions,PurityPloidyRatio,ResNoise,SignalNoiseRatio,QAStatus,Gene,SOPHiAGIIndex,SophiaGIStatus
0,90,22.1,0.36,0.91,0.99,1.00,1.00,1.00,1.00,0.65,0,,0.08,4.08,Medium,,8.8,Positive
1,25,21.8,0.14,0.16,0.90,1.00,1.00,1.00,1.00,0.53,5,,0.08,1.34,Medium,,9.2,Positive
2,90,22.5,0.27,0.93,1.00,1.00,1.00,1.00,1.00,0.52,0,0.28,0.07,4.64,High,,0.8,Positive
3,70,21.0,0.20,0.12,0.90,1.00,1.00,1.00,1.00,0.73,0,0.20,0.08,4.67,High,,10.2,Positive
4,80,17.0,0.23,0.53,0.92,1.00,1.00,1.00,1.00,0.60,1,0.45,0.10,3.64,High,,4.7,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,70,23.5,0.16,0.77,0.99,1.00,1.00,1.00,1.00,0.41,0,0.25,0.07,3.90,High,,-4.3,Negative
90,70,13.0,0.10,0.00,0.01,0.24,0.68,0.92,0.99,0.77,534,0.40,0.08,3.15,High,,9.9,Positive
91,75,22.6,0.31,0.88,1.00,1.00,1.00,1.00,1.00,0.71,0,0.25,0.09,2.49,High,,-0.9,Negative
92,90,19.2,0.30,0.28,0.78,1.00,1.00,1.00,1.00,0.79,0,0.18,0.09,2.94,High,,-17.3,Negative


In [7]:
# Force these columns to numeric dtypes; values that cannot be parsed become NaN
columns_to_convert = ['Purity', 'PurityPloidyRatio', 'ResNoise', 'SignalNoiseRatio', 'SOPHiAGIIndex']
for numeric_col in columns_to_convert:
    data_df[numeric_col] = pd.to_numeric(data_df[numeric_col], errors='coerce')
data_df.dtypes

Purity                 int64
TotalReads(M)        float64
%ReadsPanel          float64
1000x                float64
500x                 float64
200x                 float64
100x                 float64
50x                  float64
25x                  float64
DupFrac              float64
LowCovRegions          int64
PurityPloidyRatio    float64
ResNoise             float64
SignalNoiseRatio     float64
QAStatus              object
Gene                  object
SOPHiAGIIndex        float64
SophiaGIStatus        object
dtype: object

In [8]:
# Keep Gene only where it is BRCA1 or BRCA2; any other entry becomes NaN
data_df['Gene'] = data_df['Gene'].where(data_df['Gene'].isin(['BRCA1', 'BRCA2']), np.nan)

In [9]:
# Integer code assigned to each 'SophiaGIStatus' label
status_mapping = dict(Positive=1, Negative=2, Inconclusive=3, Rejected=4)

# Substitute the text labels in 'SophiaGIStatus' with their integer codes;
# values not present in the mapping are left unchanged
data_df['SophiaGIStatus'] = data_df['SophiaGIStatus'].replace(to_replace=status_mapping)

In [10]:
# Store 'SophiaGIStatus' as a categorical column (categories inferred from the values present)
data_df['SophiaGIStatus'] = pd.Categorical(data_df['SophiaGIStatus'])

In [11]:
# Categorical columns to expand into indicator (one-hot) columns
onehot_cols = ["QAStatus", "Gene", 'SophiaGIStatus']

# Perform one-hot encoding.
# dtype=int pins the indicators to 0/1 integers: the default dtype of
# get_dummies differs between pandas versions (uint8 before 2.0, bool from 2.0),
# which would otherwise change what ends up in the exported CSV.
one_hot_encoded = pd.get_dummies(data_df[onehot_cols], dtype=int)

In [12]:
# Swap the original categorical columns for their one-hot encoded counterparts:
# remove the source columns and append the indicator columns on the right
data_df = pd.concat([data_df.drop(columns=onehot_cols), one_hot_encoded], axis='columns')

In [13]:
# Count the NaN entries in every column
missing_values = data_df.isna()
missing_counts = missing_values.sum(axis=0)
print(missing_counts)

Purity                0
TotalReads(M)         0
%ReadsPanel           0
1000x                 0
500x                  0
200x                  0
100x                  0
50x                   0
25x                   0
DupFrac               0
LowCovRegions         0
PurityPloidyRatio    31
ResNoise              0
SignalNoiseRatio      0
SOPHiAGIIndex         1
QAStatus_High         0
QAStatus_Medium       0
Gene_BRCA1            0
Gene_BRCA2            0
SophiaGIStatus_1      0
SophiaGIStatus_2      0
SophiaGIStatus_3      0
dtype: int64


In [14]:
# Strategy name handed to SimpleImputer below for filling NaN entries
imputation_strategy = 'mean'

In [15]:
# Columns whose missing entries will be filled by the imputer
cols_to_impute = [
    'PurityPloidyRatio',
    'SOPHiAGIIndex',
]

In [16]:
# Fit the imputer on the selected columns, then fill their missing entries
imputer = SimpleImputer(strategy=imputation_strategy)
imputed = imputer.fit(data_df[cols_to_impute]).transform(data_df[cols_to_impute])

In [17]:
# Wrap the imputed array in a DataFrame that shares data_df's row index
imputed_df = pd.DataFrame(
    data=imputed,
    index=data_df.index,
    columns=cols_to_impute,
)

In [18]:
# Replace the original (partially missing) columns with their imputed versions;
# the imputed columns are appended after the remaining features
data_df = pd.concat(
    [data_df.drop(cols_to_impute, axis=1), imputed_df],
    axis='columns',
)

In [23]:
# Add indicator columns for categories that produced no one-hot column above
# (status code 4 = 'Rejected', and a 'Low' QA status), filled with 0.
# Only create a column when it does not already exist: assigning 0
# unconditionally would wipe real indicator values if one of these categories
# ever appears in the input data.
for absent_col in ('SophiaGIStatus_4', 'QAStatus_Low'):
    if absent_col not in data_df.columns:
        data_df[absent_col] = 0

In [24]:
# Preview the first rows of the fully preprocessed DataFrame
data_df.head()

Unnamed: 0,Purity,TotalReads(M),%ReadsPanel,1000x,500x,200x,100x,50x,25x,DupFrac,...,QAStatus_Medium,Gene_BRCA1,Gene_BRCA2,SophiaGIStatus_1,SophiaGIStatus_2,SophiaGIStatus_3,PurityPloidyRatio,SOPHiAGIIndex,SophiaGIStatus_4,QAStatus_Low
0,90,22.1,0.36,0.91,0.99,1.0,1.0,1.0,1.0,0.65,...,1,0,0,1,0,0,0.251111,8.8,0,0
1,25,21.8,0.14,0.16,0.9,1.0,1.0,1.0,1.0,0.53,...,1,0,0,1,0,0,0.251111,9.2,0,0
2,90,22.5,0.27,0.93,1.0,1.0,1.0,1.0,1.0,0.52,...,0,0,0,1,0,0,0.28,0.8,0,0
3,70,21.0,0.2,0.12,0.9,1.0,1.0,1.0,1.0,0.73,...,0,0,0,1,0,0,0.2,10.2,0,0
4,80,17.0,0.23,0.53,0.92,1.0,1.0,1.0,1.0,0.6,...,0,0,0,1,0,0,0.45,4.7,0,0


In [25]:
# Write the preprocessed frame to CSV, omitting the row index
data_df.to_csv(path_or_buf='preprocessed_for_prediction_data.csv', index=False)

In [26]:
# Show the final dimensions as (rows, columns)
data_df.shape

(94, 24)