In [None]:
## DATA PREPROCESSING

import pandas as pd
import scipy
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('Simulated_Data.csv', sep=";" , skiprows=1 , decimal=",", encoding="utf-8" , usecols=lambda col: col != 'Patient_ID')
df.info()
## Dropping null values from the dataset
df = df.dropna(how="all")
df.isnull().sum()
## Presenting data
df.describe()

## Checking for outliers
numeric_df = df.select_dtypes(include=['number'])
fig, axs = plt.subplots(len(numeric_df.columns), figsize=(10 , 5 *len(numeric_df.columns)))

## Handle the case where there is only one column
if len(numeric_df.columns) == 1:
    axs = [axs]

i = 0
for i, col in enumerate(numeric_df.columns):
    axs[i].boxplot(numeric_df[col].dropna(), vert=False)
    axs[i].set_title(col)
    i+=1

plt.tight_layout()
plt.show()

## After the above runs, we have only two outliers within Total_Sperm_Count (million)  
# The approach here is to keep these outliers and see what happens if it skews the model we will either 
# remove or cap them to the nearest non-outlier value

## In the below, we are checking which features correlate with each other
corr = numeric_df.corr()
plt.figure(figsize=(18, 15), dpi=130)
sns.heatmap(numeric_df.corr(), annot=True, fmt= '.2f')
plt.savefig("correlation_heatmap.png", dpi=300)
plt.show()

##Based on the heatmap above, the first test would be to keep all features 
#Second test we're going to run is dropping one of the features
#that are correlated to non-target variable features 
#This is done before we do any other column manipulation 

###columns_to_drop = df.drop(columns=["Immotile_Sperm (%)"])

## Based on the above heatmap, we can see that there aren't any strong correlations 
# between any of the features and the target values Fert and Blast, 

#after the above we then separate the targets from input features 
x = df.drop(columns=["Fertilization Rate (%)" , "Blastulation_Rate (%)"])
y = df[["Fertilization Rate (%)","Blastulation_Rate (%)"]]

## In the below we encode our categorical columns 
le_col= {}
for col in x.columns:
    if x[col].dtype == 'object':
        le = LabelEncoder()
        x[col] = le.fit_transform(x[col].astype(str))
        le_col[col] = le

## Convert encoded columns to dataframes to inspect 
# Making sure all the data is correct and no nulls for input features 
encoded_df = x.copy()
encoded_df.head()
print(encoded_df.dtypes)
print(encoded_df.describe())
print(encoded_df.isnull().sum())

## Making sure target variables are correct with no nulls 
print(y.head())          # sample values
print(y.dtypes)          # ensure numeric
print(y.nunique())
print(y.isnull().sum())

In [34]:
## Binning the target variables for classification
import pandas as pd
import numpy as np

##In the below we define the edges for the classification and make them equal widths
# the below can change based on domain knowledge 
fert_bin = [40,50,76,100] # Low: 40-50% Medium: 50-70% High: 70-100%
blast_bin = [30,50,70,100]  # Low: 0-33% Medium: 33-66% High: 66-100%

#Creating the labels for our classification problem 
fert_labels = ['Low','Meduim','High']
blast_labels = ['Low', 'Meduim','High']

# Binning the target variables
y_fert_binned = pd.cut(y["Fertilization Rate (%)"] , bins=fert_bin , labels=fert_labels, include_lowest=True)
y_blast_binned = pd.cut(y["Blastulation_Rate (%)"] , bins=blast_bin , labels=blast_labels, include_lowest=True)

# Convert to categorical codes for the neural network 
y_fert_codes = y_fert_binned.cat.codes
y_blast_codes = y_blast_binned.cat.codes

# NOW check the distributions (after they're defined)
print("Current fertilization distribution:")
print(y_fert_codes.value_counts().sort_index())

print("\nCurrent blastulation distribution:")
print(y_blast_codes.value_counts().sort_index())

print("\nAfter re-binning:")
print("Fertilization distribution:")
print(y_fert_codes.value_counts().sort_index())
print("Number of fertilization classes:", len(y_fert_codes.unique()))

print("\nBlastulation distribution:")
print(y_blast_codes.value_counts().sort_index())
print("Number of blastulation classes:", len(y_blast_codes.unique()))

Current fertilization distribution:
-1     28
 0    245
 1    659
 2    568
Name: count, dtype: int64

Current blastulation distribution:
-1     22
 0    488
 1    509
 2    481
Name: count, dtype: int64

After re-binning:
Fertilization distribution:
-1     28
 0    245
 1    659
 2    568
Name: count, dtype: int64
Number of fertilization classes: 4

Blastulation distribution:
-1     22
 0    488
 1    509
 2    481
Name: count, dtype: int64
Number of blastulation classes: 4


In [31]:
#SPLITTING UP DATA 
from sklearn.model_selection import train_test_split

# 1. FIRST SPLIT: Create temp and holdout sets
x_temp, x_holdout, y_fert_temp, y_fert_holdout, y_blast_temp, y_blast_holdout = train_test_split(
    encoded_df, y_fert_codes, y_blast_codes, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_fert_codes  # Stratify on one target (both should have similar distributions)
)

# 2. SECOND SPLIT: Create train and test sets from temp
x_train, x_test, y_fert_train, y_fert_test, y_blast_train, y_blast_test = train_test_split(
    x_temp, y_fert_temp, y_blast_temp,
    test_size=0.3, 
    random_state=42, 
    stratify=y_fert_temp
)

# 3. THIRD SPLIT: Create train and validation sets from train
x_train, x_val, y_fert_train, y_fert_val, y_blast_train, y_blast_val = train_test_split(
    x_train, y_fert_train, y_blast_train,
    test_size=0.2, 
    random_state=42, 
    stratify=y_fert_train
)

print("Data shapes after splitting:")
print("x_train shape:", x_train.shape)
print("x_val shape:", x_val.shape)
print("x_test shape:", x_test.shape)
print("x_holdout shape:", x_holdout.shape)

#Checking the distribution of data
print("\nFertilization - Train distribution:", np.bincount(y_fert_train))
print("Fertilization - Validation distribution:", np.bincount(y_fert_val))
print("Fertilization - Test distribution:", np.bincount(y_fert_test))
print("Fertilization - Holdout distribution:", np.bincount(y_fert_holdout))

print("\nBlastulation - Train distribution:", np.bincount(y_blast_train))
print("Blastulation - Validation distribution:", np.bincount(y_blast_val))
print("Blastulation - Test distribution:", np.bincount(y_blast_test))
print("Blastulation - Holdout distribution:", np.bincount(y_blast_holdout))

Data shapes after splitting:
x_train shape: (672, 39)
x_val shape: (168, 39)
x_test shape: (360, 39)
x_holdout shape: (300, 39)

Fertilization - Train distribution: [  0 294 378]
Fertilization - Validation distribution: [ 0 73 95]
Fertilization - Test distribution: [  0 158 202]
Fertilization - Holdout distribution: [  0 131 169]

Blastulation - Train distribution: [ 46 368 258]
Blastulation - Validation distribution: [10 92 66]
Blastulation - Test distribution: [ 20 201 139]
Blastulation - Holdout distribution: [ 16 174 110]


In [None]:
##Converting Panda series to numpy arrays
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

y_fert_train = y_fert_train.to_numpy().astype('int32')
y_fert_val   = y_fert_val.to_numpy().astype('int32')
y_fert_test  = y_fert_test.to_numpy().astype('int32')
y_fert_holdout = y_fert_holdout.to_numpy().astype('int32')

y_blast_train = y_blast_train.to_numpy().astype('int32')
y_blast_val   = y_blast_val.to_numpy().astype('int32')
y_blast_test  = y_blast_test.to_numpy().astype('int32')
y_blast_holdout = y_blast_holdout.to_numpy().astype('int32')

y_fert_train = y_fert_train - y_fert_train.min()
y_fert_val   = y_fert_val - y_fert_val.min()

y_blast_train = y_blast_train - y_blast_train.min()
y_blast_val   = y_blast_val - y_blast_val.min()

In [None]:
##SCALING MY DATA 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

## Initialising standard Scaler and scaling our input features for model training
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
x_val_scaled = scaler.transform(x_val)
x_holdout_scaled = scaler.transform(x_holdout)

## Visualising data before and after Scaling 
x_train_scaled_df = pd.DataFrame(x_train_scaled, columns=x_train.columns)
print("Before Scaling:\n", x_train.head())
print("\nAfter Scaling:\n", x_train_scaled_df.head())

assert x_train_scaled.shape[0] == y_fert_train.shape[0] == y_blast_train.shape[0]
assert x_val_scaled.shape[0] == y_fert_val.shape[0] == y_blast_val.shape[0]

print("✓ All data shapes match!")

# Plotting 1 Feature to test Scaling
feature = "Sperm_Morphology (%)"
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
plt.hist(x_train[feature] , bins=30)
plt.title(f"{feature} Before Scaling")

plt.subplot(1,2,2)
plt.hist(x_train_scaled_df[feature], bins=30)
plt.title(f"{feature} After Scaling")
plt.show()

print('x_train_scaled shape: ', x_train_scaled.shape)
print('y_train_scaled shape: ', y_fert_train.shape)
print('y_train_scaled shape: ', y_blast_train.shape)
print(y_fert_train.shape, y_fert_train[:10], y_fert_train.dtype)
print(y_blast_train.shape, y_blast_train[:10], y_blast_train.dtype)

In [None]:
## Handling class Imbalance 
from sklearn.utils.class_weight import compute_class_weight

fert_class_weights = compute_class_weight(
    'balanced', 
    classes=np.unique(y_fert_train),
    y=y_fert_train
)
blast_class_weights = compute_class_weight(
    'balanced', 
    classes=np.unique(y_blast_train),
    y=y_blast_train
)

fert_class_weight_dict = dict(enumerate(fert_class_weights))
blast_class_weight_dict = dict(enumerate(blast_class_weights))

print("Fertilization class weights:", fert_class_weight_dict)
print("Blastulation class weights:", blast_class_weight_dict)

In [None]:
#BUILD NEURAL NETWORK 
import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras import layers, Sequential, optimizers
from tensorflow.keras.metrics import AUC
from sklearn.metrics import classification_report, confusion_matrix , roc_curve
from sklearn.utils.class_weight import compute_class_weight

#Clear any previous sessions 
tf.keras.backend.clear_session()

def create_classification_model(input_dim, model_name, num_classes):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,) , name=f'{model_name}_input'), 
        layers.Dense(64, activation='relu', name=f'{model_name}_hidden1'),
        layers.Dropout(0.3),
        layers.Dense(32 , activation='relu', name=f'{model_name}_hidden2' ),
        layers.Dropout(0.2),
        layers.Dense(num_classes, activation='softmax', name=f'{model_name}_output')
])
    model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
    )
    
    return model 

# Get number of classes
num_fert_classes = len(np.unique(y_fert_train))
num_blast_classes = len(np.unique(y_blast_train))

print(f"Number of fertilization classes: {num_fert_classes}")
print(f"Number of blastulation classes: {num_blast_classes}")

#Setting the training data to the input for our MLP model,
input_dim = x_train_scaled.shape[1]

fert_model = create_classification_model(input_dim, "fert", num_fert_classes)
blast_model = create_classification_model(input_dim, "blast", num_blast_classes)

#Outputting what the model looks like and what is going to be used in training
print("Fertilization Model Summary:")
fert_model.summary()
print("\nBlastulation Model Summary:")
blast_model.summary()


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

##Implementing EarlyStopping and ModelCheckpoint
# For us to save the best model and stop the model when it stops improving
fert_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1),
    ModelCheckpoint('best_fert_model.keras', save_best_only=True, monitor='val_loss'),
]

blast_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1),
    ModelCheckpoint('best_blast_model.keras', save_best_only=True, monitor='val_loss')
]

In [None]:
##Training the models 

print("Training Fertilization Model...")
fert_history = fert_model.fit(
    x_train_scaled, y_fert_train,
    validation_data=(x_val_scaled, y_fert_val),
    epochs=100,
    batch_size=32,
    class_weight=fert_class_weight_dict,
    callbacks=fert_callbacks,
    verbose=1
)

print("\nTraining Blastulation Model...")
blast_history = blast_model.fit(
    x_train_scaled, y_blast_train,
    validation_data=(x_val_scaled, y_blast_val),
    epochs=100,
    batch_size=32,
    class_weight=blast_class_weight_dict,
    callbacks=blast_callbacks,
    verbose=1
)