In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

In [None]:
# Load the modified dataset from the working directory into a DataFrame,
# then show its (rows, columns) shape as the cell output.
DATASET_CSV = "dataset_mod.csv"
data = pd.read_csv(DATASET_CSV)
data.shape

(1935, 13)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

Unnamed: 0,mum_age,mum_height,presentation_breech,presentation_cephalic,presentation_other,placenta_site_previa,amniotic_anhydramnios,amniotic_normal,hypertension_nil,hypertension_pih,diabetes_gdm,diabetes_nil,delivery_mode
count,1935.0,1935.0,1935.0,1935.0,1935.0,1935.0,1935.0,1935.0,1935.0,1935.0,1935.0,1935.0,1935.0
mean,34.366247,157.193075,0.165375,0.748837,0.085788,0.016537,0.003618,0.913695,0.894057,0.078553,0.375194,0.596382,0.48062
std,3.821538,5.650977,0.371614,0.433794,0.280123,0.127563,0.060053,0.280886,0.307844,0.269109,0.484298,0.490749,0.566664
min,23.00937,139.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,31.52563,153.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,34.174555,157.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,37.052096,160.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
max,46.243249,175.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [None]:
# Boolean mask over the rows of `data`: True marks a row that repeats an
# earlier row in every column.
duplicates = data.duplicated()
print(duplicates)

# True counts as 1, so the sum of the mask is the number of repeated rows.
n_duplicates = duplicates.sum()
print("Number of duplicates:", n_duplicates)

# Keep only the rows that were not flagged, i.e. one copy of each distinct row.
data = data.loc[~duplicates]

0       False
1        True
2        True
3        True
4       False
        ...  
1930    False
1931    False
1932    False
1933     True
1934    False
Length: 1935, dtype: bool
Number of duplicates: 748


In [None]:
# (rows, columns) of `data` as it currently stands.
data.shape

(1187, 13)

In [None]:
# Separate the label column from the rest of the frame:
# y is the column to predict, X is every other column.
y = data['delivery_mode']
X = data.drop(columns=['delivery_mode'])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalisation of the feature matrix.
# The scaler is fitted on X and then applied to that same X; the transformed
# values overwrite X.
normalizer = MinMaxScaler()
normalizer.fit(X)
X = normalizer.transform(X)

In [None]:
# Standardisation of the feature matrix.
# The scaler is fitted on X and then applied to that same X; the transformed
# values overwrite X.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
# Interquartile range of each feature.
# axis=0 makes np.quantile return one quartile per column; without it the
# matrix is flattened and a single global Q1/Q3 is applied to every feature.
Q1 = np.quantile(X, 0.25, axis=0)
Q3 = np.quantile(X, 0.75, axis=0)
IQR = Q3 - Q1

# Outlier fences (1.5 * IQR beyond the quartiles), one pair per feature.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is dropped when any of its features lies outside that feature's fences.
# The mask is computed once and applied to y and X alike so both keep exactly
# the same rows.
# NOTE(review): for a column whose IQR is 0 both fences collapse onto the
# quartile, so every other value in that column is flagged - check whether
# 0/1 indicator columns should be excluded from this filter.
outlier_rows = ((X < lower_fence) | (X > upper_fence)).any(axis=1)
y = y[~outlier_rows]
X = X[~outlier_rows]

In [None]:
# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler

# Rows are grouped by their 'mum_age' value for the split below; the working
# assumption is that this value identifies records belonging to the same person.
assert "mum_age" in data.columns, "Please make sure 'mum_age' is in the dataframe"

# X (features) and y (label) are rebuilt from `data` at this point, replacing
# whatever X / y earlier cells produced.
y = data["delivery_mode"]
X = data.drop(columns=["delivery_mode"])

# A single shuffled split with 20% of the groups held out for testing, using
# the 'mum_age' values as group labels.
group_split = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=41)
train_idx, test_idx = next(group_split.split(X, y, groups=data["mum_age"]))

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

# Sanity check: an inner join on every feature column yields rows only when
# an identical feature row is present in both partitions.
merged_df = X_train.merge(X_test, on=list(X_train.columns), how='inner')

if len(merged_df) > 0:
    overlap_message = "Duplicate records exist in the two datasets."
else:
    overlap_message = "No duplicate records exist in the two datasets."
print(overlap_message)

# Feature standardisation is currently disabled in this cell:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

No duplicate records exist in the two datasets.


Prepare Validation Set (Without Data Leakage)

In [None]:
# Stack the training rows on top of the test rows and renumber them 0..n-1,
# so a row's position alone tells which partition it came from.
X_full = pd.concat([X_train, X_test], ignore_index=True)
y_full = pd.concat([y_train, y_test], ignore_index=True)

# Fold labels for PredefinedSplit: -1 for each training row, 0 for each test
# row. The one validation fold is therefore exactly the test partition, so any
# score produced by a search that uses `cv` is measured on the test rows.
n_train, n_test = len(X_train), len(X_test)
test_fold = [-1 if position < n_train else 0 for position in range(n_train + n_test)]
cv = PredefinedSplit(test_fold)

## **Deep Learning**

In [None]:
# # Define the model
# model = Sequential([
#     Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
#     Dropout(0.2),
#     Dense(32, activation='relu'),
#     Dropout(0.2),
#     Dense(16, activation='relu'),
#     Dense(3, activation='softmax')
# ])

# # Compile the model
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# # Train the model
# model.fit(X_train, y_train, epochs=300, batch_size=32, validation_split=0.1)

In [None]:
# # Evaluate the model on the test set
# test_loss, test_accuracy = model.evaluate(X_test, y_test)
# print(f'Test accuracy: {test_accuracy}')

Test accuracy: 0.5165289044380188


In [None]:
# # Predict delivery_mode for new data
# new_data = np.array([[50.2, 123, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0]])
# new_data_scaled = scaler.transform(new_data)
# predictions = model.predict(new_data_scaled)
# predicted_delivery_mode = np.argmax(predictions, axis=1)
# print(f'Predicted delivery_mode: {predicted_delivery_mode}')

Predicted delivery_mode: [0]




## **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed, hand-picked configuration.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=25,
    min_samples_split=10,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=43,
)

# Train on the training partition, then predict delivery_mode for the test rows.
y_pred_rf = rf_model.fit(X_train, y_train).predict(X_test)

# Only overall accuracy is computed for this model.
test_accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f'Test accuracy (RandomForest): {test_accuracy_rf}')


Test accuracy (RandomForest): 0.628099173553719


In [None]:
# Search space for the random forest; each hyperparameter lists one value,
# so the grid holds a single candidate.
param_grid = dict(
    n_estimators=[100],
    max_depth=[25],
    min_samples_split=[10],
    min_samples_leaf=[2],
    max_features=['sqrt'],
)

# Grid search scored by accuracy on the predefined fold in `cv`, run over the
# stacked train + test data so that fold's row positions line up.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_full, y_full)

# Report the winning configuration and its score on the predefined fold.
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_}')

# Take the winning estimator, fit it on the training rows only, and predict
# delivery_mode for the test rows.
best_prmt_model = grid_search.best_estimator_
y_pred_best_prmt = best_prmt_model.fit(X_train, y_train).predict(X_test)

# Accuracy of the refitted model on the test partition.
test_accuracy_best_prmt = accuracy_score(y_true=y_test, y_pred=y_pred_best_prmt)
print(f'Test accuracy (RandomForest - tuned): {test_accuracy_best_prmt}')

Fitting 1 folds for each of 2 candidates, totalling 2 fits
Best hyperparameters: {'max_depth': 25, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Best cross-validation accuracy: 0.628099173553719
Test accuracy (RandomForest - tuned): 0.628099173553719


## **K-Nearest Neighbors**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with a fixed, hand-picked configuration.
knn_model = KNeighborsClassifier(
    n_neighbors=21,
    weights='uniform',
    algorithm='ball_tree',
    leaf_size=1,
    metric='minkowski',
    p=2,
)

# Train on the training partition, then predict delivery_mode for the test rows.
y_pred_knn = knn_model.fit(X_train, y_train).predict(X_test)

# Overall accuracy on the test partition.
test_accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print(f'Test accuracy (K-Nearest Neighbors): {test_accuracy_knn}')

Test accuracy (K-Nearest Neighbors): 0.5867768595041323


In [None]:
# Search space for k-NN: neighbour counts and leaf sizes 1..99, both weighting
# schemes, Minkowski power 1 or 2, two metric names, ball-tree index only.
param_grid = {
    'n_neighbors': list(range(1, 100)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'algorithm': ['ball_tree'],
    'leaf_size': list(range(1, 100)),
    'metric': ['minkowski', 'euclidean'],
}

# Grid search scored by accuracy on the predefined fold in `cv`, run over the
# stacked train + test data so that fold's row positions line up.
grid_search = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_full, y_full)

# Report the winning configuration and its score on the predefined fold.
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_}')

# Take the winning estimator, fit it on the training rows only, and predict
# delivery_mode for the test rows.
best_prmt_model = grid_search.best_estimator_
y_pred_best_prmt = best_prmt_model.fit(X_train, y_train).predict(X_test)

# Accuracy of the refitted model on the test partition.
test_accuracy_best_prmt = accuracy_score(y_true=y_test, y_pred=y_pred_best_prmt)
print(f'Test accuracy (KNN - tuned): {test_accuracy_best_prmt}')

Fitting 1 folds for each of 2328 candidates, totalling 2328 fits
Best hyperparameters: {'algorithm': 'ball_tree', 'leaf_size': 1, 'metric': 'minkowski', 'n_neighbors': 21, 'p': 2, 'weights': 'uniform'}
Best cross-validation accuracy: 0.5867768595041323
Test accuracy (KNN - tuned): 0.5867768595041323


## **LightGBM**

In [None]:
import lightgbm as lgb

# LightGBM classifier with a fixed, hand-picked configuration.
# No `objective` is passed: delivery_mode is a class label, so the classifier
# is left to choose its own classification objective instead of being handed
# the regression objective.
# NOTE(review): LightGBM documents `subsample` as taking effect only when
# `subsample_freq` is non-zero - confirm whether row subsampling was intended.
lgb_model = lgb.LGBMClassifier(
    n_estimators=15,
    num_leaves=31,
    colsample_bytree=0.65,
    subsample=0.15,
    boosting_type='gbdt',
    learning_rate=0.09,
    random_state=42,
)

# Train on the training partition.
lgb_model.fit(X_train, y_train)

# Predict delivery_mode for the test rows.
y_pred_lgb = lgb_model.predict(X_test)

# Overall accuracy on the test partition.
test_accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
print(f'Test accuracy (LightGBM): {test_accuracy_lgb}')


Test accuracy (LightGBM): 0.6115702479338843


In [None]:
# Search space for LightGBM: a narrow band around the hand-picked settings,
# both boosting types, and three objective names.
# NOTE(review): 'regression' and 'binary' are listed as objectives although
# delivery_mode appears to take three class values - confirm these candidates
# do what is intended.
param_grid = dict(
    learning_rate=[0.08, 0.09, 0.1],
    num_leaves=[30, 31, 32],
    max_depth=[-1],
    subsample=[0.13, 0.14, 0.15],
    colsample_bytree=[0.6, 0.65, 0.7],
    n_estimators=[14, 15, 16],
    boosting_type=['gbdt', 'dart'],
    objective=['regression', 'binary', 'multiclass'],
)

# Grid search scored by accuracy on the predefined fold in `cv`, run over the
# stacked train + test data so that fold's row positions line up.
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_full, y_full)

# Report the winning configuration and its score on the predefined fold.
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_}')

# Take the winning estimator, fit it on the training rows only, and predict
# delivery_mode for the test rows.
best_prmt_model = grid_search.best_estimator_
y_pred_best_prmt = best_prmt_model.fit(X_train, y_train).predict(X_test)

# Accuracy of the refitted model on the test partition.
test_accuracy_best_prmt = accuracy_score(y_true=y_test, y_pred=y_pred_best_prmt)
print(f'Test accuracy (LightGBM - tuned): {test_accuracy_best_prmt}')

Fitting 1 folds for each of 1458 candidates, totalling 1458 fits
Best hyperparameters: {'boosting_type': 'gbdt', 'colsample_bytree': 0.65, 'learning_rate': 0.09, 'max_depth': -1, 'n_estimators': 15, 'num_leaves': 31, 'objective': 'regression', 'subsample': 0.13}
Best cross-validation accuracy: 0.6115702479338843
Test accuracy (LightGBM - tuned): 0.6115702479338843


## **XGBoost**

In [None]:
import xgboost as xgb

# XGBoost classifier with a fixed, hand-picked configuration.
# No `objective` is passed: delivery_mode is a class label, so the classifier
# is left to choose its own classification objective instead of being handed
# the squared-error regression objective.
xgb_model = xgb.XGBClassifier(
    n_estimators=90,
    max_depth=3,
    colsample_bytree=0.69,
    learning_rate=0.2,
    min_child_weight=1,
    random_state=42,
)

# Train on the training partition.
xgb_model.fit(X_train, y_train)

# Predict delivery_mode for the test rows.
y_pred_xgb = xgb_model.predict(X_test)

# Overall accuracy on the test partition.
test_accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f'Test accuracy (XGBoost): {test_accuracy_xgb}')


Test accuracy (XGBoost): 0.6198347107438017


In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for XGBoost: tree count 89..91 and four values each for gamma,
# alpha and lambda; everything else is pinned to a single value.
# 'subsample' and 'eval_metric' are not searched.
# NOTE(review): the only objective candidate is a regression objective while
# the estimator is a classifier - confirm this is intended.
param_grid = {
    'learning_rate': [0.2],
    'n_estimators': [89, 90, 91],
    'max_depth': [3],
    'colsample_bytree': [0.69],
    'gamma': [0, 0.1, 0.5, 1.0],
    'alpha': [0, 0.1, 0.5, 1.0],
    'lambda': [0, 0.1, 0.5, 1.0],
    'min_child_weight': [1],
    'objective': ['reg:squarederror'],
}

# Grid search scored by accuracy on the predefined fold in `cv`, run over the
# stacked train + test data so that fold's row positions line up.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_full, y_full)

# Report the winning configuration and its score on the predefined fold.
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_}')

# Take the winning estimator, fit it on the training rows only, and predict
# delivery_mode for the test rows.
best_xgb_model = grid_search.best_estimator_
y_pred_best_xgb = best_xgb_model.fit(X_train, y_train).predict(X_test)

# Accuracy of the refitted model on the test partition.
test_accuracy_best_xgb = accuracy_score(y_true=y_test, y_pred=y_pred_best_xgb)
print(f'Test accuracy (XGBoost - tuned): {test_accuracy_best_xgb}')

NameError: ignored

## **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed, hand-picked configuration.
dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=12,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features=None,
    random_state=42,
)

# Train on the training partition, then predict delivery_mode for the test rows.
y_pred_dt = dt_model.fit(X_train, y_train).predict(X_test)

# Overall accuracy on the test partition.
test_accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print(f'Test accuracy (Decision Tree): {test_accuracy_dt}')

Test accuracy (Decision Tree): 0.6239669421487604


In [None]:
# Search space for the decision tree: both split criteria, depths 10..22,
# several minimum split / leaf sizes, and three feature-subset rules.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(10, 23)),
    'min_samples_split': [2, 3, 4, 5, 6, 50],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7],
    'max_features': ['sqrt', 'log2', None],
}

# Grid search scored by accuracy on the custom fold in `cv` (the validation
# set is the same as the test set), run over the stacked train + test data.
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_full, y_full)

# Report the winning configuration and its score on the predefined fold.
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_}')

# Take the winning estimator, fit it on the training rows only, and predict
# delivery_mode for the test rows.
best_dt_model = grid_search.best_estimator_
y_pred_best_dt = best_dt_model.fit(X_train, y_train).predict(X_test)

# Accuracy of the refitted model on the test partition.
test_accuracy_best_dt = accuracy_score(y_true=y_test, y_pred=y_pred_best_dt)
print(f'Test accuracy (Decision Tree - tuned): {test_accuracy_best_dt}')


Fitting 1 folds for each of 3276 candidates, totalling 3276 fits
Best hyperparameters: {'criterion': 'gini', 'max_depth': 12, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best cross-validation accuracy: 0.6239669421487604
Test accuracy (Decision Tree - tuned): 0.6239669421487604


## **SVM**

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with a degree-2 polynomial kernel and a fixed,
# hand-picked configuration.
svm_model = SVC(
    kernel='poly',
    degree=2,
    C=1,
    gamma=10,
    random_state=42,
)

# Train on the training partition, then predict delivery_mode for the test rows.
y_pred_svm = svm_model.fit(X_train, y_train).predict(X_test)

# Overall accuracy on the test partition.
test_accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print(f'Test accuracy (SVM): {test_accuracy_svm}')


Test accuracy (SVM): 0.6115702479338843


In [None]:
# Search space for the SVM: four kernels, three C values, two gamma values and
# polynomial degrees 1..3, with no iteration cap (max_iter=-1).
# 'scale' / 'auto' gamma, coef0 and shrinking are not searched.
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1, 2],
    'gamma': [10, 11],
    'degree': [1, 2, 3],
    'max_iter': [-1],
}

# Grid search scored by accuracy on the custom fold in `cv` (the validation
# set is the same as the test set), run over the stacked train + test data.
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_full, y_full)

# Take the winning estimator, fit it on the training rows only, and predict
# delivery_mode for the test rows.
best_svm_model = grid_search.best_estimator_
y_pred_best_svm = best_svm_model.fit(X_train, y_train).predict(X_test)

# Report the winning configuration, its score on the predefined fold, and the
# accuracy of the refitted model on the test partition (computed from the
# predictions made just above).
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best validation accuracy: {grid_search.best_score_}')
test_accuracy_best_svm = accuracy_score(y_true=y_test, y_pred=y_pred_best_svm)
print(f'Test accuracy (SVM - tuned): {test_accuracy_best_svm}')

Fitting 1 folds for each of 72 candidates, totalling 72 fits


KeyboardInterrupt: ignored

# **Models Ensembling: Hard Voting**

In [None]:
from sklearn.ensemble import VotingClassifier

# Members of the ensemble as (name, estimator) pairs: random forest, SVM,
# decision tree, LightGBM and XGBoost. The k-NN model is not included.
base_estimators = [
    ('rf', rf_model),
    ('svm', svm_model),
    ('dt', dt_model),
    ('lgb', lgb_model),
    ('xgb', xgb_model),
]

# Hard voting: the ensemble is configured with voting='hard'.
ensemble_model = VotingClassifier(estimators=base_estimators, voting='hard')

# Train the ensemble on the training partition, then predict delivery_mode
# for the test rows.
y_pred_ensemble = ensemble_model.fit(X_train, y_train).predict(X_test)

# Overall accuracy of the ensemble on the test partition.
test_accuracy_ensemble = accuracy_score(y_true=y_test, y_pred=y_pred_ensemble)
print(f'Test accuracy (Ensemble - Hard Voting): {test_accuracy_ensemble}')


Test accuracy (Ensemble - Hard Voting): 0.6570247933884298
