## Preprocessing

In [33]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from pathlib import Path

# Provide the correct file path for charity_data.csv
data_file_path = Path('../Resources/PeterMac_HRD_Validation.csv')
data_df = pd.read_csv(data_file_path)
data_df.head()


Unnamed: 0,Run,SampleID,Source,MonthsOld,Purity,SeqRunID,DDMSampleID,MIDS,TotalReads(M),lpWGSReads(M),...,ResNoise,SignalNoiseRatio,QAStatus,Gene,Variant,%VariantFraction,MyriadGIScore,MyriadGIStatus,SOPHiAGIIndex,SophiaGIStatus
0,1,12749097,AZ,47,20,220121_NB501056_0748_AH2CV5BGXK,200058320-107-S2,2,7.3,5.9,...,0.13,2.95,Medium,.,.,.,51,1,3.2,1
1,1,12749205,AZ,93,30,220121_NB501056_0748_AH2CV5BGXK,200058326-106-S3,3,7.3,5.6,...,0.11,2.91,High,.,.,.,20,2,-15.7,2
2,1,12749267,AZ,71,20,220121_NB501056_0748_AH2CV5BGXK,200058327-104-S4,4,9.6,6.1,...,0.1,1.64,High,.,.,.,17,2,-4.6,2
3,1,12749335,AZ,167,20,220121_NB501056_0748_AH2CV5BGXK,200058329-109-S6,6,8.9,5.6,...,0.09,3.49,High,.,.,.,29,2,-4.6,2
4,1,12749366,AZ,131,60,220121_NB501056_0748_AH2CV5BGXK,200058330-91-S7,7,8.6,5.0,...,0.11,2.18,High,.,.,.,29,2,-8.2,2


In [34]:
# Drop the non-beneficial ID columns, SampleID, SeqRunID, DDMSampleID
# Make these columns into One hot columns QAStatus, Gene, Variant
# Make a Y value column we actually want to test 
# testing if there is a difference between MyriadGIStatus and SophiaGIStatus
# Non-agreement 
# As we are testing the agreement of MyriadGiStatus(1/2) and SophiaGIStatus(1/2/3/4), we need to 
# collapse the test into a binary result.
#    MyriadGIStatus        SophiaGIStatus      Laboratory Classsification  Binary Result
#       1                       1                   TruePositive                1
#       1                       2                   False Negative              0
#       1                       3                   Inconclusive                0
#       1                       4                   Inconclusive                0
#       2                       1                   False Positive              0
#       2                       2                   True Negative               1
#       2                       3                   Inconclusive                0
#       2                       4                   Inconclusive                0
# Remove columns for X sample MyriadGIStatus, SophiaGIStatus, MyriadGIScore, SophiaGIIndex
# Non-agreement
# the Y value will be Non-agreement
# We have to decide how to handle columns with actual non-values
# being PurityPloidyRatio, Variant. Our lack of records and sparse grouping 
# makes these columns liklely to be dropped.
#  YOUR CODE GOES HERE
columns_to_drop = ["SampleID", "SeqRunID", "DDMSampleID","MonthsOld"]

# Drop the specified columns from the DataFrame
data_df = data_df.drop(columns=columns_to_drop, axis=1)
data_df.head()

Unnamed: 0,Run,Source,Purity,MIDS,TotalReads(M),lpWGSReads(M),TargetPanelReads(M),%ReadslpWGS,%ReadsPanel,1000x,...,ResNoise,SignalNoiseRatio,QAStatus,Gene,Variant,%VariantFraction,MyriadGIScore,MyriadGIStatus,SOPHiAGIIndex,SophiaGIStatus
0,1,AZ,20,2,7.3,5.9,1.4,81%,19%,1%,...,0.13,2.95,Medium,.,.,.,51,1,3.2,1
1,1,AZ,30,3,7.3,5.6,1.7,76%,24%,2%,...,0.11,2.91,High,.,.,.,20,2,-15.7,2
2,1,AZ,20,4,9.6,6.1,3.5,64%,36%,41%,...,0.1,1.64,High,.,.,.,17,2,-4.6,2
3,1,AZ,20,6,8.9,5.6,3.3,63%,37%,16%,...,0.09,3.49,High,.,.,.,29,2,-4.6,2
4,1,AZ,60,7,8.6,5.0,3.6,58%,42%,2%,...,0.11,2.18,High,.,.,.,29,2,-8.2,2


In [35]:
# Finding attribute columns
application_categories = data_df.dtypes[data_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column of object type
columns = data_df[application_categories].nunique()

# Iterate through the columns and print the unique value counts for each column
# we iterate through every column in the dataframe, some of them of object type where we found the unique count
for column in data_df.columns:
    if column in columns.index:
        data_type = data_df[column].dtype
        print(f"{column.ljust(40)} (Data Type: {data_type}) - {columns[column]} unique value(s)")
    else:
        print(f"{column.ljust(20)} numeric- {data_df[column].nunique()} unique value(s)")



Run                  numeric- 13 unique value(s)
Source                                   (Data Type: object) - 4 unique value(s)
Purity                                   (Data Type: object) - 14 unique value(s)
MIDS                 numeric- 24 unique value(s)
TotalReads(M)        numeric- 95 unique value(s)
lpWGSReads(M)        numeric- 86 unique value(s)
TargetPanelReads(M)  numeric- 64 unique value(s)
%ReadslpWGS                              (Data Type: object) - 36 unique value(s)
%ReadsPanel                              (Data Type: object) - 36 unique value(s)
1000x                                    (Data Type: object) - 65 unique value(s)
500x                                     (Data Type: object) - 56 unique value(s)
200x                                     (Data Type: object) - 32 unique value(s)
100x                                     (Data Type: object) - 16 unique value(s)
50x                                      (Data Type: object) - 10 unique value(s)
25x               

In [36]:
# Convert columns we will not bin, from pobject type to numeric
column_names_to_convert = ['Purity']

# Step 1: Check the current data types of the columns
print(data_df[column_names_to_convert].dtypes)

# Step 2: Convert each column to numeric (if possible)
for col in column_names_to_convert:
    data_df[col] = pd.to_numeric(data_df[col], errors='coerce')

# Step 3: Check the new data types of the columns after the conversion
print(data_df[column_names_to_convert].dtypes)
# print(data_df["Source"].dtypes)

Purity    object
dtype: object
Purity    float64
dtype: object


In [37]:
print(data_df['%ReadslpWGS'].unique())
print(data_df['%ReadsPanel'].unique())


['81%' '76%' '64%' '63%' '58%' '54%' '90%' '89%' '84%' '85%' '98%' '72%'
 '68%' '69%' '74%' '66%' '73%' '70%' '80%' '86%' '15%' '91%' '62%' '79%'
 '88%' '92%' '71%' '77%' '59%' '60%' '55%' '65%' '67%' '57%' '61%' '75%']
['19%' '24%' '36%' '37%' '42%' '46%' '10%' '12%' '16%' '15%' '2%' '28%'
 '32%' '31%' '26%' '29%' '34%' '25%' '30%' '20%' '14%' '85%' '9%' '38%'
 '21%' '11%' '8%' '23%' '41%' '40%' '27%' '45%' '35%' '33%' '43%' '39%']


In [38]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
data_df['PurityPloidyRatio'] = data_df['PurityPloidyRatio'].replace('-', 0.0)
data_df['ResNoise'] = data_df['ResNoise'].replace('-', 0.0)
data_df['SignalNoiseRatio'] = data_df['SignalNoiseRatio'].replace('-', 0.0)

data_df['Gene'] = data_df['Gene'].replace('.', 'Unlisted')


# data_df['MonthsOld'] = data_df['MonthsOld'].fillna(0.0)
# data_df['MonthsOld'] = data_df['MonthsOld'].replace('.', 0.0)
data_df['Purity'] = data_df['Purity'].replace('.', 0.0)
data_df['%VariantFraction'] = data_df['%VariantFraction'].replace('.', 0.0)

# Convert the 'Purity' column to numeric, replacing '.' with 0.0
data_df['Purity'] = pd.to_numeric(data_df['Purity'], errors='coerce').fillna(0.0)

data_df['DupFrac'] = data_df['DupFrac'].replace('%', '', regex=True).astype(float)
data_df['%ReadslpWGS'] = data_df['%ReadslpWGS'].replace('%', '', regex=True).astype(float)
data_df['%ReadsPanel'] = data_df['%ReadsPanel'].replace('%', '', regex=True).astype(float)

data_df['Variant'] = data_df['Variant'].replace('.', 'Unlisted')

# Apply label encoding to 'Variant' column
label_encoder = LabelEncoder()
data_df['Variant'] = label_encoder.fit_transform(data_df['Variant'].astype(str))

data_df['1000x'] = data_df['1000x'].replace('%', '', regex=True).astype(float)

data_df['500x'] = data_df['1000x'].replace('%', '', regex=True).astype(float)
data_df['200x'] = data_df['1000x'].replace('%', '', regex=True).astype(float)
data_df['100x'] = data_df['1000x'].replace('%', '', regex=True).astype(float)
data_df['50x'] = data_df['1000x'].replace('%', '', regex=True).astype(float)
data_df['25x'] = data_df['1000x'].replace('%', '', regex=True).astype(float)



In [39]:
def calculate_non_agreement(row):
    if row['MyriadGIStatus'] == 1 and row['SophiaGIStatus'] == 1:
        return 1
    elif row['MyriadGIStatus'] == 1 and row['SophiaGIStatus'] == 2:
        return 0
    elif row['MyriadGIStatus'] == 1 and row['SophiaGIStatus'] == 3:
        return 0
    elif row['MyriadGIStatus'] == 1 and row['SophiaGIStatus'] == 4:
        return 0
    elif row['MyriadGIStatus'] == 2 and row['SophiaGIStatus'] == 1:
        return 0
    elif row['MyriadGIStatus'] == 2 and row['SophiaGIStatus'] == 2:
        return 1
    elif row['MyriadGIStatus'] == 2 and row['SophiaGIStatus'] == 3:
        return 0
    elif row['MyriadGIStatus'] == 2 and row['SophiaGIStatus'] == 4:
        return 0
    else:
        return None

# Apply the function to create the 'Non-agreement' column
data_df['Non-agreement'] = data_df.apply(calculate_non_agreement, axis=1)

#print(data_df['Non-agreement'].dtypes)
print(data_df)

     Run  Source  Purity  MIDS  TotalReads(M)  lpWGSReads(M)  \
0      1      AZ    20.0     2            7.3            5.9   
1      1      AZ    30.0     3            7.3            5.6   
2      1      AZ    20.0     4            9.6            6.1   
3      1      AZ    20.0     6            8.9            5.6   
4      1      AZ    60.0     7            8.6            5.0   
..   ...     ...     ...   ...            ...            ...   
134    6      AZ    10.0     3           25.5           18.8   
135   11  GREECE     0.0     4           18.6           13.0   
136    2      AZ    40.0    23           18.3           12.2   
137    5      AZ    50.0    23           14.0           10.5   
138   12  BRAZIL    60.0    23           21.6           15.5   

     TargetPanelReads(M)  %ReadslpWGS  %ReadsPanel  1000x  ...  \
0                    1.4         81.0         19.0    1.0  ...   
1                    1.7         76.0         24.0    2.0  ...   
2                    3.5         

In [40]:
print(data_df)

     Run  Source  Purity  MIDS  TotalReads(M)  lpWGSReads(M)  \
0      1      AZ    20.0     2            7.3            5.9   
1      1      AZ    30.0     3            7.3            5.6   
2      1      AZ    20.0     4            9.6            6.1   
3      1      AZ    20.0     6            8.9            5.6   
4      1      AZ    60.0     7            8.6            5.0   
..   ...     ...     ...   ...            ...            ...   
134    6      AZ    10.0     3           25.5           18.8   
135   11  GREECE     0.0     4           18.6           13.0   
136    2      AZ    40.0    23           18.3           12.2   
137    5      AZ    50.0    23           14.0           10.5   
138   12  BRAZIL    60.0    23           21.6           15.5   

     TargetPanelReads(M)  %ReadslpWGS  %ReadsPanel  1000x  ...  \
0                    1.4         81.0         19.0    1.0  ...   
1                    1.7         76.0         24.0    2.0  ...   
2                    3.5         

In [41]:
# Look at APPLICATION_TYPE value counts for binning  QAStatus

grouped_df = data_df.groupby("QAStatus").size().reset_index(name="COUNT")
sorted_df = grouped_df.sort_values(by="COUNT", ascending=False)
sorted_df = sorted_df.reset_index(drop=True)
print(sorted_df)
print('---')
grouped_df = data_df.groupby("Gene").size().reset_index(name="COUNT")
sorted_df = grouped_df.sort_values(by="COUNT", ascending=False)
sorted_df = sorted_df.reset_index(drop=True)
print(sorted_df)
print('---')
grouped_df = data_df.groupby("Variant").size().reset_index(name="COUNT")
sorted_df = grouped_df.sort_values(by="COUNT", ascending=False)
sorted_df = sorted_df.reset_index(drop=True)
print(sorted_df)
print('---')
grouped_df = data_df.groupby("PurityPloidyRatio").size().reset_index(name="COUNT")
sorted_df = grouped_df.sort_values(by="COUNT", ascending=False)
sorted_df = sorted_df.reset_index(drop=True)
print(sorted_df)

  QAStatus  COUNT
0     High     89
1   Medium     42
2      Low      8
---
       Gene  COUNT
0  Unlisted     99
1     BRCA1     21
2     BRCA2     18
3    RAD51D      1
---
    Variant  COUNT
0         0     99
1        14      2
2        21      2
3        36      1
4        27      1
5        22      1
6        23      1
7        24      1
8        25      1
9        37      1
10       26      1
11       28      1
12       35      1
13       20      1
14       30      1
15       31      1
16       32      1
17       33      1
18       34      1
19       29      1
20       19      1
21        1      1
22       18      1
23        2      1
24        3      1
25        4      1
26        5      1
27        6      1
28        7      1
29        8      1
30        9      1
31       10      1
32       11      1
33       12      1
34       13      1
35       15      1
36       16      1
37       17      1
38       38      1
---
   PurityPloidyRatio  COUNT
0                0.0     46
1    

In [42]:
# This is a dummy block of code in case we want to gather many column values into the same bucket
# application_types_to_replace = ["T9", "T13", "T12", "T2", "T14", "T25", "T29", "T15", "T17"]

# Replace the specified values in the "APPLICATION_TYPE" column with "Other"
# data_df['APPLICATION_TYPE'] = data_df['APPLICATION_TYPE'].replace(application_types_to_replace, "Other")

# Check the value counts after replacing
# print(data_df['APPLICATION_TYPE'].value_counts())

In [43]:
# Create one-hot columns for these 3 columns
onehot_cols = ["QAStatus", "Gene", "Variant"]  #"PurityPloidyRatio"]

# Use get_dummies() to one-hot encode only the categorical columns
one_hot_encoded = pd.get_dummies(data_df[onehot_cols])

# Concatenate the one-hot encoded columns with the original DataFrame
data_df = pd.concat([data_df, one_hot_encoded], axis=1)

# After this, you can print the data types of columns in the 'data_df' DataFrame
column_types = data_df.dtypes
print(column_types)


Run                      int64
Source                  object
Purity                 float64
MIDS                     int64
TotalReads(M)          float64
lpWGSReads(M)          float64
TargetPanelReads(M)    float64
%ReadslpWGS            float64
%ReadsPanel            float64
1000x                  float64
500x                   float64
200x                   float64
100x                   float64
50x                    float64
25x                    float64
DupFrac                float64
LowCovRegions            int64
PurityPloidyRatio       object
ResNoise                object
SignalNoiseRatio        object
QAStatus                object
Gene                    object
Variant                  int64
%VariantFraction        object
MyriadGIScore            int64
MyriadGIStatus           int64
SOPHiAGIIndex           object
SophiaGIStatus           int64
Non-agreement            int64
Variant                  int64
QAStatus_High            uint8
QAStatus_Low             uint8
QAStatus

In [44]:
data_df.to_csv('preprocessed_data.csv', index = False)

## Compile, Train and Evaluate the Model

In [45]:
def final_callback(message):
    print(f"Final callback: {message}")

In [46]:
def loss_accuracy_callback(epoch, loss, accuracy):
    print(f"Epoch {epoch}: Loss={loss:.4f}, Accuracy={accuracy:.4f}")

In [47]:
import tensorflow as tf
from tensorflow.keras.callbacks import CSVLogger
from datetime import datetime

def main_process(callback, X_train_scaled, y_train, nn, report_interval=5):
    # Train the model and store the training history
    fit_model = nn.fit(X_train_scaled, y_train, epochs=100, verbose=0, callbacks=[callback])  # Pass the callback here
    training_history = fit_model.history
    
    print("Training has started.")
    epochs = 100
    
    # Report loss and accuracy at the specified intervals
    for epoch in range(1, epochs + 1):
        if epoch % report_interval == 0 or epoch == epochs:
            loss = training_history['loss'][epoch - 1]
            accuracy = training_history['accuracy'][epoch - 1]
            print(f"Epoch {epoch}: Loss={loss:.4f}, Accuracy={accuracy:.4f}")#soos2
            callback.on_epoch_end(epoch, {'loss': loss, 'accuracy': accuracy})  # Manually call the on_epoch_end method

    result = "Task completed."
    # final_callback(result)  # Comment out or remove this line as it is not defined in the code
    print("Main process finished.")

# soos
current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
script_name = "Starter_Codev1"
csv_loss_accuracy_file = "startercodetest.csv"
#csv_loss_accuracy_file_name = f"{script_name}_test_result_{current_datetime}.csv"

# Create the CSVLogger callback and pass the file name
csv_logger = CSVLogger(csv_loss_accuracy_file)







## Optimisation
### In our original model, we created data frame application_df, which still exists.
- 1. I will create a pca method on this data frame which has had bucketing


In [48]:
# 1  pca method on application_df which has had bucketing performed for the original model.
data_df_x = data_df.copy() 
columns_to_drop = ["Gene", "Non-agreement", "MyriadGIStatus", "SophiaGIStatus", "MyriadGIScore", "SOPHiAGIIndex", "QAStatus"]
data_df_x = data_df_x.drop(columns=columns_to_drop, axis=1)


#data_df_x = data_df_x.drop(columns=["Gene","Source","Non_agreement", "MyriadGIStatus", "SophiaGIStatus", "MyriadGIScore", "SOPHiAGIIndex","QAStatus"], axis=1)
data_df_x.columns

Index(['Run', 'Source', 'Purity', 'MIDS', 'TotalReads(M)', 'lpWGSReads(M)',
       'TargetPanelReads(M)', '%ReadslpWGS', '%ReadsPanel', '1000x', '500x',
       '200x', '100x', '50x', '25x', 'DupFrac', 'LowCovRegions',
       'PurityPloidyRatio', 'ResNoise', 'SignalNoiseRatio', 'Variant',
       '%VariantFraction', 'Variant', 'QAStatus_High', 'QAStatus_Low',
       'QAStatus_Medium', 'Gene_BRCA1', 'Gene_BRCA2', 'Gene_RAD51D',
       'Gene_Unlisted'],
      dtype='object')

In [49]:
#data_df_x
# Step 1: Check which columns have missing values (NaN)
columns_with_missing_values = data_df_x.isna().any()

# Step 2: Output the columns where the count of missing values is larger than 0
columns_with_missing_values = columns_with_missing_values[columns_with_missing_values]

# Output the columns with missing values
print(columns_with_missing_values)
#print(data_df_x['%ReadslpWGS'].unique())
#print(data_df_x['%ReadsPanel'].unique())


Series([], dtype: bool)


In [50]:
# Check unique values in the "%VariantFraction" column
print(data_df['%VariantFraction'].unique())


[0.0 '42.6' '78.1' '78.5' '77.2' '22.1' '37.8' '56.4' '24.4' '87' '54.6'
 '51.1' '85.8' '71.8' '66.1' '9' '87.4' '84.1' '80.4' '73' '67.6' '17.3'
 '9.1' '35.8' '82.8' '80.5' 'Deleted' '18.7' '65.3' '51.8' '68.8' '62.6'
 '71.2' '89.8' '47' '59.6' '5.4' '26.7' '64.7']


In [51]:
# Replace "Deleted" with 0 in the '%ReadslpWGS' column
data_df_x['%VariantFraction'] = data_df_x['%VariantFraction'].replace('Deleted', 0)

# Convert the column to numeric (float) format
data_df_x['%VariantFraction'] = pd.to_numeric(data_df_x['%VariantFraction'])


In [52]:
#data_df_x.columns
import pandas as pd

# Assuming 'data_df_x' is the DataFrame containing the data
non_numeric_columns = data_df_x.select_dtypes(exclude=['number']).columns

print("Non-numeric columns:")
print(non_numeric_columns)


Non-numeric columns:
Index(['Source', 'PurityPloidyRatio', 'ResNoise', 'SignalNoiseRatio'], dtype='object')


In [53]:
non_numeric_columns = data_df_x.select_dtypes(exclude=[np.number]).columns
print(non_numeric_columns)



Index(['Source', 'PurityPloidyRatio', 'ResNoise', 'SignalNoiseRatio'], dtype='object')


In [54]:
# Check data types of the columns in data_df_x
print(data_df_x.dtypes)

# Check for missing values in data_df_x
print(data_df_x.isnull().sum())

# Convert the '%VariantFraction' column to numeric values, invalid values will be converted to NaN
data_df_x['%VariantFraction'] = pd.to_numeric(data_df_x['%VariantFraction'], errors='coerce')

# Create a mask to identify rows with NaN values in the '%VariantFraction' column
invalid_rows_mask = data_df_x['%VariantFraction'].isna()

# Use the mask to filter the DataFrame and get the rows with invalid values
invalid_rows = data_df_x[invalid_rows_mask]




# Check if any non-numeric values still exist in data_df_x
non_numeric_values = data_df_x.apply(pd.to_numeric, errors='coerce').isnull().sum()
print(non_numeric_values)


Run                      int64
Source                  object
Purity                 float64
MIDS                     int64
TotalReads(M)          float64
lpWGSReads(M)          float64
TargetPanelReads(M)    float64
%ReadslpWGS            float64
%ReadsPanel            float64
1000x                  float64
500x                   float64
200x                   float64
100x                   float64
50x                    float64
25x                    float64
DupFrac                float64
LowCovRegions            int64
PurityPloidyRatio       object
ResNoise                object
SignalNoiseRatio        object
Variant                  int64
%VariantFraction       float64
Variant                  int64
QAStatus_High            uint8
QAStatus_Low             uint8
QAStatus_Medium          uint8
Gene_BRCA1               uint8
Gene_BRCA2               uint8
Gene_RAD51D              uint8
Gene_Unlisted            uint8
dtype: object
Run                    0
Source                 0
Purity

In [55]:
# Check for non-numeric columns
non_numeric_cols = data_df_x.select_dtypes(exclude=[np.number]).columns
print("Non-numeric columns:", non_numeric_cols)

# Check for missing values
missing_values = data_df_x.isnull().sum()
print("Missing values:\n", missing_values)


Non-numeric columns: Index(['Source', 'PurityPloidyRatio', 'ResNoise', 'SignalNoiseRatio'], dtype='object')
Missing values:
 Run                    0
Source                 0
Purity                 0
MIDS                   0
TotalReads(M)          0
lpWGSReads(M)          0
TargetPanelReads(M)    0
%ReadslpWGS            0
%ReadsPanel            0
1000x                  0
500x                   0
200x                   0
100x                   0
50x                    0
25x                    0
DupFrac                0
LowCovRegions          0
PurityPloidyRatio      0
ResNoise               0
SignalNoiseRatio       0
Variant                0
%VariantFraction       0
Variant                0
QAStatus_High          0
QAStatus_Low           0
QAStatus_Medium        0
Gene_BRCA1             0
Gene_BRCA2             0
Gene_RAD51D            0
Gene_Unlisted          0
dtype: int64


In [56]:
columns_to_drop = ['Variant', 'Source']
data_df_x = data_df_x.drop(columns=columns_to_drop)


data_df_x.columns


Index(['Run', 'Purity', 'MIDS', 'TotalReads(M)', 'lpWGSReads(M)',
       'TargetPanelReads(M)', '%ReadslpWGS', '%ReadsPanel', '1000x', '500x',
       '200x', '100x', '50x', '25x', 'DupFrac', 'LowCovRegions',
       'PurityPloidyRatio', 'ResNoise', 'SignalNoiseRatio', '%VariantFraction',
       'QAStatus_High', 'QAStatus_Low', 'QAStatus_Medium', 'Gene_BRCA1',
       'Gene_BRCA2', 'Gene_RAD51D', 'Gene_Unlisted'],
      dtype='object')

In [57]:
invalid_rows = data_df_x['%VariantFraction'].apply(lambda x: not str(x).replace('.', '').isnumeric())

# Display the rows containing the invalid values
print(data_df_x[invalid_rows])


Empty DataFrame
Columns: [Run, Purity, MIDS, TotalReads(M), lpWGSReads(M), TargetPanelReads(M), %ReadslpWGS, %ReadsPanel, 1000x, 500x, 200x, 100x, 50x, 25x, DupFrac, LowCovRegions, PurityPloidyRatio, ResNoise, SignalNoiseRatio, %VariantFraction, QAStatus_High, QAStatus_Low, QAStatus_Medium, Gene_BRCA1, Gene_BRCA2, Gene_RAD51D, Gene_Unlisted]
Index: []

[0 rows x 27 columns]


In [58]:
data_df_x.to_csv('preprocessed_dat_x.csv', index = False)

In [59]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Assume data_df_x contains the features, and data_df contains the target "Non_agreement" column

# Split our preprocessed data into our features and target arrays
y2 = data_df["Non-agreement"].values
X2 = data_df_x.values

# Split the preprocessed data into a training and testing dataset
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=78)

# Now perform PCA only on the training data
pca = PCA(n_components=3)
X2_train_pca = pca.fit_transform(X2_train)

# Apply the same PCA transformation to the testing data
X2_test_pca = pca.transform(X2_test)


# Check the number of records in the original dataset
print(data_df_x.shape)

# Check the number of records after splitting into training and testing sets
print(X2_train.shape, X2_test.shape)

# Check the number of records after PCA transformation
print(X2_train_pca.shape, X2_test_pca.shape)

# Create a new DataFrame with the PCA data for both training and testing sets
df_train_pca = pd.DataFrame(X2_train_pca, columns=["PC1", "PC2", "PC3"])
df_test_pca = pd.DataFrame(X2_test_pca, columns=["PC1", "PC2", "PC3"])




(139, 27)
(111, 27) (28, 27)
(111, 3) (28, 3)


In [60]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X2_scaler = scaler.fit(X2_train)

# Scale the data
X2_train_scaled = X2_scaler.transform(X2_train)
X2_test_scaled = X2_scaler.transform(X2_test)

## Compile, Train and Evaluate my PCA bucketted Model

In [61]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
#  YOUR CODE GOES HERE
import tensorflow as tf
number_input_features = len(X2_train[0])
hidden_nodes_layer1 = 6
hidden_nodes_layer2 = 8
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1,
input_dim=number_input_features, activation = "relu")
)
# I am setting the regularization
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="tanh"))
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


#nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
# Check the structure of the model

nn.summary()







Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 6)                 168       
                                                                 
 dense_5 (Dense)             (None, 6)                 42        
                                                                 
 dense_6 (Dense)             (None, 8)                 56        
                                                                 
 dense_7 (Dense)             (None, 1)                 9         
                                                                 
Total params: 275 (1.07 KB)
Trainable params: 275 (1.07 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [62]:
# Compile the model
#  YOUR CODE GOES HERE
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [63]:
#main_process(loss_accuracy_callback, X2_train_scaled, y2_train, nn)
# Start the training and use the CSVLogger callback to automatically log epoch, loss, and accuracy data to the CSV file
main_process(csv_logger, X2_train_scaled, y2_train, nn, report_interval=5)

Training has started.
Epoch 5: Loss=0.6773, Accuracy=0.5856
Epoch 10: Loss=0.6027, Accuracy=0.7477
Epoch 15: Loss=0.5503, Accuracy=0.8018
Epoch 20: Loss=0.5129, Accuracy=0.8018
Epoch 25: Loss=0.4864, Accuracy=0.8018
Epoch 30: Loss=0.4656, Accuracy=0.8018
Epoch 35: Loss=0.4508, Accuracy=0.8018
Epoch 40: Loss=0.4387, Accuracy=0.8018
Epoch 45: Loss=0.4270, Accuracy=0.8018
Epoch 50: Loss=0.4146, Accuracy=0.8018
Epoch 55: Loss=0.4037, Accuracy=0.8018
Epoch 60: Loss=0.3929, Accuracy=0.8018
Epoch 65: Loss=0.3826, Accuracy=0.8018
Epoch 70: Loss=0.3726, Accuracy=0.8018
Epoch 75: Loss=0.3627, Accuracy=0.8108
Epoch 80: Loss=0.3524, Accuracy=0.8108
Epoch 85: Loss=0.3430, Accuracy=0.8198
Epoch 90: Loss=0.3334, Accuracy=0.8288
Epoch 95: Loss=0.3248, Accuracy=0.8378
Epoch 100: Loss=0.3158, Accuracy=0.8378
Main process finished.


In [64]:
import csv
import json
from keras.models import load_model
from datetime import datetime

# ... Your code to create and train the neural network ...
# ... Assuming you already have the `nn` model ...

# Get the current date and time
current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")

# Define the script name manually (replace 'Your_Script_Name' with the actual name of your script)
script_name = "PCAProject4_Codev1"

# Generate the output file name with the script name and date/time
output_file_name = f"{script_name}_{current_datetime}.h5"

# Save the model with the date and time in the file name
nn.save(output_file_name)

# Output CSV file name with the script name and date/time
csv_file_name = f"{script_name}_model_data_{current_datetime}.csv"
json_name = f"{script_name}_model_data_{current_datetime}.json"
# Extract model configuration (architecture and hyperparameters)
model_config = nn.get_config()

# Write the model information to the CSV file
with open(csv_file_name, "w", newline="") as csv_file:
    fieldnames = model_config.keys()  # Retrieve model configuration keys
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()  # Write header with field names
    writer.writerow(model_config)  # Write the model configuration to the CSV

# Corrected JSON-like string
corrected_json_str = '[{"class_name": "InputLayer", "config": {"batch_input_shape": [null, 43], "dtype": "float32", "sparse": false, "ragged": false, "name": "dense_input"}}, {"class_name": "Dense", "config": {"name": "dense", "trainable": true, "batch_input_shape": [null, 43], "dtype": "float32", "units": 8, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "dtype": "float32", "units": 5, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "dtype": "float32", "units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]'

# Convert the JSON-like string to a list of dictionaries
layers_list = json.loads(corrected_json_str)

with open(json_name, "w") as json_file:
    json.dump(layers_list, json_file)

# Print the list of dictionaries
print(layers_list)

[{'class_name': 'InputLayer', 'config': {'batch_input_shape': [None, 43], 'dtype': 'float32', 'sparse': False, 'ragged': False, 'name': 'dense_input'}}, {'class_name': 'Dense', 'config': {'name': 'dense', 'trainable': True, 'batch_input_shape': [None, 43], 'dtype': 'float32', 'units': 8, 'activation': 'relu', 'use_bias': True, 'kernel_initializer': {'class_name': 'GlorotUniform', 'config': {'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}}, {'class_name': 'Dense', 'config': {'name': 'dense_1', 'trainable': True, 'dtype': 'float32', 'units': 5, 'activation': 'relu', 'use_bias': True, 'kernel_initializer': {'class_name': 'GlorotUniform', 'config': {'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': Non

  saving_api.save_model(


In [65]:
import csv
from datetime import datetime

def write_test_result_to_csv(file_name, loss, accuracy, current_datetime):
    with open(file_name, "w", newline="") as csv_file:
        fieldnames = ["Metric", "Value"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()  # Write header with field names
        writer.writerow({"Metric": "Loss", "Value": loss})
        writer.writerow({"Metric": "Accuracy", "Value": accuracy})
        writer.writerow({"Metric": "Date", "Value": current_datetime})  # Write current date and time

# Get the current date and time
current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")

# Evaluate the model on the test data
model_loss, model_accuracy = nn.evaluate(X2_test_scaled, y2_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

# Output CSV file name for test results with the script name and date/time
#test_result_csv_file = f"{script_name}_loss_accuracy_{current_datetime}.csv"
test_result_csv_file = "test_results.csv"
# Write the test result to the CSV file with the current date and time
write_test_result_to_csv(test_result_csv_file, model_loss, model_accuracy, current_datetime)

1/1 - 0s - loss: 0.7959 - accuracy: 0.7857 - 184ms/epoch - 184ms/step
Loss: 0.7959306836128235, Accuracy: 0.7857142686843872


In [66]:
import csv
import json

def csv_to_json(csv_file_path):
    data = []

    with open(csv_file_path, 'r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            data.append(row)

    return data

csv_file_path = 'test_results.csv'

# Convert CSV data to JSON format
json_data = csv_to_json(csv_file_path)


json_file_path = f"{script_name}_loss_accuracy_{current_datetime}.json"
# Write the JSON data to a JSON file
# json_file_path = 'test_results.json'
with open(json_file_path, 'w') as json_file:
    json.dump(json_data, json_file, indent=4)

csv_file_path = 'startercodetest.csv'

# Convert CSV data to JSON format
json_data = csv_to_json(csv_file_path)

# Write the JSON data to a JSON file
# json_file_path = 'startercodetest.json'
json_file_path = f"{script_name}_test_result_{current_datetime}.json"
with open(json_file_path, 'w') as json_file:
    json.dump(json_data, json_file, indent=4)



In [67]:
from sklearn.metrics import confusion_matrix

# Assuming you have a trained model named 'nn' and test data 'X2_test_scaled', 'y2_test'
predictions = nn.predict(X2_test_scaled)

# Convert the probability predictions to class labels
predictions = (predictions > 0.5).astype(int)  # Assuming the threshold is 0.5 for binary classification

# Choose the class for which you want to calculate the binary confusion matrix
positive_class = 1

# Create binary labels for the positive class
y_true_binary = (y2_test == positive_class).astype(int)

# Calculate the binary confusion matrix
conf_matrix = confusion_matrix(y_true_binary, predictions)
np.set_printoptions(precision=2)
print(conf_matrix)

confusion_csv_file = f"{script_name}_confusion_matrix_{current_datetime}.csv"

np.savetxt(confusion_csv_file, conf_matrix, delimiter=",", fmt='%.2f')




[[ 2  6]
 [ 0 20]]


In [68]:
data_df_x.columns

Index(['Run', 'Purity', 'MIDS', 'TotalReads(M)', 'lpWGSReads(M)',
       'TargetPanelReads(M)', '%ReadslpWGS', '%ReadsPanel', '1000x', '500x',
       '200x', '100x', '50x', '25x', 'DupFrac', 'LowCovRegions',
       'PurityPloidyRatio', 'ResNoise', 'SignalNoiseRatio', '%VariantFraction',
       'QAStatus_High', 'QAStatus_Low', 'QAStatus_Medium', 'Gene_BRCA1',
       'Gene_BRCA2', 'Gene_RAD51D', 'Gene_Unlisted'],
      dtype='object')

In [69]:
import datetime

# Convert the non-numeric columns with 'NA' values to -1
non_numeric_columns = ['PurityPloidyRatio', 'ResNoise', 'SignalNoiseRatio']
for col in non_numeric_columns:
    try:
        column_idx = np.where(X2_test.dtype.names == col)[0][0]
        X2_test[col] = np.where(X2_test[col] == 'NA', -1, X2_test[col])
    except IndexError:
        print(f"Column '{col}' does not exist in the test data. Setting default value to -1.")

# Concatenate the processed X_test and y_test data
data_to_save = np.column_stack((X2_test, y2_test))

column_headers = ['Run', 'Purity', 'MIDS', 'TotalReads(M)', 'lpWGSReads(M)',
                  'TargetPanelReads(M)', '%ReadslpWGS', '%ReadsPanel', '1000x', '500x',
                  '200x', '100x', '50x', '25x', 'DupFrac', 'LowCovRegions',
                  'PurityPloidyRatio', 'ResNoise', 'SignalNoiseRatio', '%VariantFraction',
                  'QAStatus_High', 'QAStatus_Low', 'QAStatus_Medium', 'Gene_BRCA1',
                  'Gene_BRCA2', 'Gene_RAD51D', 'Gene_Unlisted','Y-Predict']



# Create a filename with the current datetime
#current_datetime = datetime.datetime. now().strftime("%Y%m%d%H%M%S")
#csv_file_name = f"processed_test_data_{current_datetime}.csv"
csv_file_name = "processed_test_data.csv"
data_with_headers = np.vstack((column_headers, data_to_save))



# Save the concatenated data to the file
np.savetxt(csv_file_name, data_with_headers, delimiter=',', fmt='%s')
# np.savetxt(csv_file_name, data_to_save, delimiter=',', fmt='%s')


def csv_to_json(csv_file_path):
    data = []

    with open(csv_file_path, 'r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            data.append(row)

    return data

csv_file_path = 'processed_test_data.csv'

# Convert CSV data to JSON format
json_data = csv_to_json(csv_file_path)

json_file_path = f"{script_name}_processed_test_data{current_datetime}.json"
with open(json_file_path, 'w') as json_file:
    json.dump(json_data, json_file, indent=4)




# Load the saved data from the file and print the first few rows
saved_data = np.genfromtxt(csv_file_name, delimiter=',', dtype=None, names=True, encoding=None)
print(saved_data[:5])



Column 'PurityPloidyRatio' does not exist in the test data. Setting default value to -1.
Column 'ResNoise' does not exist in the test data. Setting default value to -1.
Column 'SignalNoiseRatio' does not exist in the test data. Setting default value to -1.
[( 4, 70.,  4, 24. , 21.7, 2.3, 90., 10.,  0.,  0.,  0.,  0.,  0.,  0., 75., 14, 0.  , 0.13, 2.6 , 85.8, 0, 0, 1, 0, 1, 0, 0, 1)
 (12, 70.,  9, 17. , 11.6, 5.4, 68., 32., 67., 67., 67., 67., 67., 67., 66.,  0, 0.28, 0.14, 1.66,  0. , 1, 0, 0, 0, 0, 0, 1, 1)
 ( 6, 70., 11, 14.1, 10.3, 3.8, 73., 27.,  5.,  5.,  5.,  5.,  5.,  5., 81., 34, 0.35, 0.23, 1.35,  0. , 1, 0, 0, 0, 0, 0, 1, 1)
 (11,  0.,  6, 20.3, 14.1, 6.2, 70., 30., 96., 96., 96., 96., 96., 96., 62.,  0, 0.1 , 0.06, 2.14,  0. , 1, 0, 0, 0, 0, 0, 1, 1)
 (11,  0., 18, 18.4, 14.9, 3.5, 81., 19., 12., 12., 12., 12., 12., 12., 65.,  3, 0.  , 0.07, 1.96,  0. , 0, 0, 1, 0, 0, 0, 1, 1)]
