In [70]:
# Import our dependencies
# !pip install keras_tuner
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import keras_tuner as kt
import datetime
import shutil
import keras
import ast
import re


#  Import and read the charity_data.csv.
import pandas as pd
application_df = pd.read_csv("./TesticularCancerDataset.csv")
final_results_all = []
application_df['Overall Survival Status'].head(70)


0     1:DECEASED
1       0:LIVING
2       0:LIVING
3       0:LIVING
4       0:LIVING
         ...    
65    1:DECEASED
66           NaN
67           NaN
68      0:LIVING
69      0:LIVING
Name: Overall Survival Status, Length: 70, dtype: object

In [71]:
f = application_df.columns.tolist()
def sort_list_in_place(my_list):
    my_list.sort()
    return my_list

# Example usage:
example_list = [3, 1, 4, 1, 5, 9, 2]
sorted_list = sort_list_in_place(f)
for column in sorted_list:
    print(column)  # Output will be [1, 1, 2, 3, 4, 5, 9]



Adjuvant Postoperative Pharmaceutical Therapy Administered Indicator
American Joint Committee on Cancer Lymph Node Stage Code
American Joint Committee on Cancer Lymph Node Stage Code.1
American Joint Committee on Cancer Metastasis Stage Code
American Joint Committee on Cancer Publication Version Type
American Joint Committee on Cancer Tumor Stage Code
Cancer Type
Cancer Type Detailed
Days to Sample Collection.
Days to post orchi serum test
Days to pre orchi serum test
Diagnosis Age
Did patient start adjuvant postoperative radiotherapy?
Disease Free (Months)
Disease Free Status
Disease code
Ethnicity Category
Family History Cancer Relationship
Family history other cancer
Family history testicular cancer
First Pathologic Diagnosis Biospecimen Acquisition Method Type
First treatment success
Form completion date
Fraction Genome Altered
Histologic diagnosis percent
History fertility
History hypospadias
History of undescended testis
ICD-10 Classification
Igcccg stage
Informed consent verifie

In [72]:
application_df['Overall Survival Status'].value_counts()
print(application_df['Overall Survival Status'].dtypes)
application_df['Overall Survival Status'] = application_df['Overall Survival Status'].replace('0:LIVING', '0')
application_df['Overall Survival Status'] = application_df['Overall Survival Status'].replace('1:DECEASED', '1')
print("&&&&&&&&&&&&&&&")
print(application_df['Overall Survival Status'].loc[(application_df['Overall Survival Status']).isna()  == True])
print("****************")
application_df = application_df.loc[(application_df['Overall Survival Status']).isna()  == False]

print("&&&&&&&&&&&&&&&")
print(application_df['Overall Survival Status'].loc[(application_df['Overall Survival Status']).isna()  == True])
print("****************")
application_df['Overall Survival Status'] = application_df['Overall Survival Status'].astype('int64')
print(application_df['Overall Survival Status'].dtypes)


application_df['Overall Survival Status'].value_counts()


object
&&&&&&&&&&&&&&&
66    NaN
67    NaN
70    NaN
71    NaN
72    NaN
73    NaN
75    NaN
76    NaN
77    NaN
78    NaN
79    NaN
80    NaN
81    NaN
82    NaN
83    NaN
84    NaN
85    NaN
Name: Overall Survival Status, dtype: object
****************
&&&&&&&&&&&&&&&
Series([], Name: Overall Survival Status, dtype: object)
****************
int64


0    135
1      4
Name: Overall Survival Status, dtype: int64

In [73]:
# Extract X and y from the given dataframe
def extract_X_y(df, y_column_name):
    # Split our preprocessed data into our features and target arrays
    # print(a2pplication_df.columns)
    X = df.drop(columns=[y_column_name])
    y = df[y_column_name]
    return (X, y)

In [74]:
# Find and return X_train_scaled and X_test_scaled
def scale_it(X_train, X_test):
    # Create a StandardScaler instances
    scaler = StandardScaler()

    # Fit the StandardScaler
    X_scaler = scaler.fit(X_train)

    # Scale the data
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    return X_train_scaled, X_test_scaled

In [75]:
# Remove outliers from a given column when a value's z-score for that column is greater than a given limit
def remove_outliers(df, threshold, columns_to_remove_outliers_from):
    if len(columns_to_remove_outliers_from) == 0:
        print('hey')
        return [], df
    # Calculate Z-scores only for specified columns
    z_scores = np.abs(stats.zscore(df[columns_to_remove_outliers_from]))

    # Create a mask for outliers
    outliers = (z_scores > threshold)

    # Create a DataFrame with outliers for further inspection if needed
    outlier_rows = df[df.index.isin(df[columns_to_remove_outliers_from].index[outliers.any(axis=1)])]

    # Remove rows with outliers in the specified columns
    df_cleaned = df[~df.index.isin(df[columns_to_remove_outliers_from].index[outliers.any(axis=1)])]

    print("Outlier rows:")
    print(outlier_rows.index)
    
    return outlier_rows, df_cleaned

In [76]:
# Create a specific model instead of rely on keras_tuner.hyperband
def create_specific_model(list_of_activations, num_layers, list_of_number_of_nodes_hidden_layer, input_dim, output_activation, filepath_to_model_weights):

    nn = tf.keras.models.Sequential()

    for layer_number in range(0, num_layers):
        if list_of_activations[layer_number] == 'leaky_relu':
            activation = tf.keras.layers.LeakyReLU(alpha=0.01)
        else:
            activation = list_of_activations[layer_number]
        if layer_number == 0:
            # Input layer which requires input_dim
            nn.add(tf.keras.layers.Dense(units = list_of_number_of_nodes_hidden_layer[layer_number], activation = activation, input_dim = input_dim))
        else:
            nn.add(tf.keras.layers.Dense(units = list_of_number_of_nodes_hidden_layer[layer_number], activation = activation))


    # Output layer
    nn.add(tf.keras.layers.Dense(units = 1, activation = output_activation))
    
    if (filepath_to_model_weights != ''):
        nn.load_weights(filepath_to_model_weights)

    # Check the structure of the model
    print(nn.summary())
    return nn

In [77]:
# Recreate as X, y, X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled
# np.random.seed(42)
# tf.random.set_seed(42)
df_new_removed_column = application_df.copy()
df_new_removed_column = pd.get_dummies(df_new_removed_column)
z_threshold = 8
columns_to_remove_outliers_from = []
outlier_rows, no_outliers_df = remove_outliers(df_new_removed_column, z_threshold, columns_to_remove_outliers_from)


# Split our preprocessed data into our features and target array
X, y = extract_X_y(no_outliers_df, 'Overall Survival Status')

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=3,
                                                    stratify=y)

# Standardize X_train and X_test
X_train_scaled, X_test_scaled = scale_it(X_train, X_test)

hey


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [78]:
num_layers = 6
input_dim = X.shape[1]
list_of_activations = ['tanh', 'leaky_relu', 'leaky_relu', 'relu', 'relu', 'leaky_relu']
list_of_number_of_nodes_hidden_layer = [74, 50, 28, 17, 48, 41]
output_activation = 'sigmoid'
filepath_to_model_weights = ''

replicated_model = create_specific_model(list_of_activations, num_layers, list_of_number_of_nodes_hidden_layer, input_dim, output_activation, filepath_to_model_weights)
replicated_model

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_14 (Dense)            (None, 74)                89022     
                                                                 
 dense_15 (Dense)            (None, 50)                3750      
                                                                 
 dense_16 (Dense)            (None, 28)                1428      
                                                                 
 dense_17 (Dense)            (None, 17)                493       
                                                                 
 dense_18 (Dense)            (None, 48)                864       
                                                                 
 dense_19 (Dense)            (None, 41)                2009      
                                                                 
 dense_20 (Dense)            (None, 1)                

<keras.src.engine.sequential.Sequential at 0x29e8c3186d0>

In [79]:
lr = 0.0005293391761557029
replicated_model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=lr), metrics=['accuracy'])

fit_model = replicated_model.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [80]:
# Evaluate the model for accuracy and loss
model_loss, model_accuracy = replicated_model.evaluate(X_test_scaled,y_test,verbose=2)

2/2 - 0s - loss: nan - accuracy: 0.9714 - 105ms/epoch - 52ms/step


In [81]:
# application_df.corr().to_csv('testicular_correlation.csv')