# Import Libraries (All Features)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve

# Importing the Dataset

In [None]:
# Load the fetal-health records from the Excel workbook in the working directory.
df = pd.read_excel(io="fetal_health.xlsx")

# EDA and Visualization

In [None]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

In [None]:
# Check the number of rows and columns in the dataset.
# `shape` is a (n_rows, n_columns) tuple.
df.shape

In [None]:
# Check the data type and non-null count of every column.
df.info()

In [None]:
# Look for feature columns whose values are identical to an earlier column.
# Transposing turns columns into rows so that `duplicated()` can compare them.
transposed = df.T
duplicate_columns = transposed.loc[transposed.duplicated()].T

# Names of the columns that repeat an earlier column
dupes = duplicate_columns.columns.tolist()

if not dupes:
    print("✅ No duplicate feature columns found.")
else:
    print("Duplicate feature columns found:")
    print("\n".join(f"- {col}" for col in dupes))


In [None]:
# Count the missing entries in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Summary statistics of the dataset, one column of statistics per numeric feature.
df.describe()

In [None]:
# List the distinct values present in the target column `fetal_health`.
df.fetal_health.unique()


In [None]:
# Count how many rows fall into each `fetal_health` class.
df.fetal_health.value_counts()


In [None]:
# Distribution of the target variable
class_colors = ['#1f77b4', '#ff7f0e', 'green']  # blue, orange, green: one colour per class
ax = sns.countplot(data=df, x='fetal_health', palette=class_colors)
ax.set_xlabel("fetal_health")
ax.set_ylabel("Count")
ax.set_title("Fetal Health Class Distribution")

# Build the legend by hand: one swatch per class, coloured like the first three bars drawn.
class_names = ['Normal', 'Suspect', 'Pathological']
swatches = [
    plt.Rectangle((0, 0), 1, 1, color=bar.get_facecolor())
    for bar in ax.patches[:3]
]
ax.legend(swatches, class_names)
plt.show()


In [None]:
# Per-class density histograms (with KDE overlay) for every numeric feature
numeric_cols = (
    df.drop(columns=['fetal_health'])
    .select_dtypes(include=['int64', 'float64'])
    .columns
)
n_plot_cols = 3
n_plot_rows = (len(numeric_cols) + 2) // 3  # ceiling of len / 3

plt.figure(figsize=(16, 12))
for position, col_name in enumerate(numeric_cols, start=1):
    plt.subplot(n_plot_rows, n_plot_cols, position)
    sns.histplot(
        data=df,
        x=col_name,
        hue='fetal_health',
        kde=True,
        element='step',
        stat='density',
        common_norm=False,  # normalise each class separately so small classes stay visible
    )
    plt.title(f'{col_name} Distribution by fetal_health')
    plt.xlabel(col_name)
    plt.ylabel('Density')

plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlation of every column, drawn as an annotated heatmap
correlations = df.corr()
plt.figure(figsize=(18, 16))  # large canvas so the per-cell annotations stay legible
sns.heatmap(data=correlations, cmap='coolwarm', annot=True, fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Drop highly correlated features.
# Correlations are computed on the predictors only: if the target were included,
# `fetal_health` itself (or a feature that merely tracks the target) could end up
# in the drop list.
CORRELATION_THRESHOLD = 0.9
corr_matrix = df.drop(columns=['fetal_health']).corr().abs()

# Keep only the strict upper triangle so each feature pair is inspected once and a
# column is flagged only for correlating with a column that precedes it.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
to_drop = [
    column
    for column in upper.columns
    if (upper[column] > CORRELATION_THRESHOLD).any()
]

# Drop correlated features
df = df.drop(columns=to_drop)
df

# Data Preprocessing


In [None]:
# Separate the predictors (X) from the target column (y).
y = df.loc[:, 'fetal_health']
X = df.drop(columns=['fetal_health'])
for caption, part in (("Shape of X (features):", X), ("Shape of y (target):", y)):
    print(caption, part.shape)

# Train-Test Split

In [None]:
# Hold out 20% of the rows for testing. Stratify on y so both splits keep the class
# proportions (the classes are imbalanced); the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print("Matrices of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
# Report the target splits as well (the feature shapes used to be printed twice).
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

In [None]:
# Feature scaling: learn the scaling statistics on the training split only, then
# apply that same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Inspect the scaled training features.
X_train

In [None]:
# Inspect the training labels.
y_train

# Baseline Model

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline 1 (most frequent): always predict the most common training class.
baseline_mf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
y_pred_mf = baseline_mf.predict(X_test)
acc_mf = accuracy_score(y_true=y_test, y_pred=y_pred_mf)
report_mf = classification_report(y_test, y_pred_mf)
print("Baseline Model - Most Frequent")
print("Accuracy:", acc_mf)
print("Classification Report:\n", report_mf)

In [None]:
# Baseline 2 (stratified): random predictions that follow the training class distribution.
baseline_strat = DummyClassifier(strategy="stratified", random_state=42).fit(X_train, y_train)
y_pred_strat = baseline_strat.predict(X_test)
acc_strat = accuracy_score(y_true=y_test, y_pred=y_pred_strat)
report_strat = classification_report(y_test, y_pred_strat)
print("Baseline Model - Stratified")
print("Accuracy:", acc_strat)
print("Classification Report:\n", report_strat)

In [None]:
# Baseline 3 (uniform): random predictions with every class equally likely.
baseline_uniform = DummyClassifier(strategy="uniform", random_state=42).fit(X_train, y_train)
y_pred_uniform = baseline_uniform.predict(X_test)
acc_uniform = accuracy_score(y_true=y_test, y_pred=y_pred_uniform)
report_uniform = classification_report(y_test, y_pred_uniform)
print("Baseline Model - Uniform")
print("Accuracy:", acc_uniform)
print("Classification Report:\n", report_uniform)

# Build and Train Multiple Models

In [None]:
# Candidate classifiers. Every estimator with a stochastic component is seeded so the
# comparison (and anything later derived from `models`) is reproducible, in line with
# the fixed seeds used for the split, the dummy baselines, the CV folds and the grid
# search.
RANDOM_STATE = 42
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=RANDOM_STATE),
    "K-Nearest Neighbors": KNeighborsClassifier(),  # no class_weight support
    # probability=True enables predict_proba (needed for ROC AUC); the probability
    # calibration it performs is randomised, hence the seed.
    "Support Vector Machine": SVC(probability=True, class_weight='balanced', random_state=RANDOM_STATE),
    "Naive Bayes": GaussianNB(),  # no class_weight parameter; class priors are estimated from the data
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),  # no native class_weight support
}

results = []

# roc_auc_score takes the full probability matrix (plus multi_class=...) for more than
# two classes, but only the positive-class column for a binary target.
num_classes = len(np.unique(y_test))
is_multiclass = num_classes > 2

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Estimators without predict_proba (e.g. SVC without probability=True) get no AUC.
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)

    # ROC AUC calculation (stays None when unavailable or when the computation fails)
    auc = None
    if y_proba is not None:
        try:
            if is_multiclass:
                auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            else:
                auc = roc_auc_score(y_test, y_proba[:, 1])
        except Exception as e:
            # Best effort: report the failure and keep evaluating the remaining models.
            print(f"⚠️ ROC AUC calculation failed for {name}: {e}")

    print(f"\n{name}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Accuracy:", acc)
    if auc is not None:
        print("ROC AUC Score:", auc)
    else:
        print("ROC AUC Score: Not available")

    results.append((name, acc, auc))

# Put the baseline accuracies (computed in the baseline cells) at the top of the list;
# no ROC AUC is recorded for them.
results = [
    ("Baseline (Most Frequent)", acc_mf, None),
    ("Baseline (Stratified)", acc_strat, None),
    ("Baseline (Uniform)", acc_uniform, None),
] + results


# Model Comparison Table

In [None]:
# Collect the per-model scores into a table and show it ranked by ROC AUC, best first.
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "ROC AUC"])
ranked_by_auc = results_df.sort_values("ROC AUC", ascending=False)
print("\nModel Comparison:")
print(ranked_by_auc)


In [None]:
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import label_binarize

# One-vs-rest indicator matrix of the test labels: one column per class.
y_test_binarized = label_binarize(y_test, classes=[1, 2, 3])
n_classes = y_test_binarized.shape[1]

# Common false-positive-rate grid on which the per-class curves are averaged.
fpr_grid = np.linspace(0.0, 1.0, 1000)

plt.figure(figsize=(10, 8))
for name, model in models.items():
    if not hasattr(model, "predict_proba"):
        continue

    y_score = model.predict_proba(X_test)
    try:
        auc_score = roc_auc_score(y_test_binarized, y_score, average='macro', multi_class='ovr')

        # Macro-average ROC curve: interpolate each class's one-vs-rest curve onto the
        # shared grid and take the unweighted mean. (Flattening the indicator and score
        # matrices would give the micro-average curve, which does not correspond to the
        # macro AUC shown in the legend.)
        mean_tpr = np.zeros_like(fpr_grid)
        for class_idx in range(n_classes):
            fpr_k, tpr_k, _ = roc_curve(y_test_binarized[:, class_idx], y_score[:, class_idx])
            mean_tpr += np.interp(fpr_grid, fpr_k, tpr_k)
        mean_tpr /= n_classes
        mean_tpr[0] = 0.0  # anchor the averaged curve at the origin

        plt.plot(fpr_grid, mean_tpr, label=f"{name} (Macro AUC = {auc_score:.2f})")
    except Exception as e:
        # Best effort: report the failure and keep plotting the remaining models.
        print(f"{name}: {e}")

# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.title("ROC Curve (Macro-Average for Multi-class Models)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.grid(True)
plt.show()


# Feature Importance

In [None]:
# Bar chart of the Random Forest's feature importances, one bar per predictor column.
rf_model = models["Random Forest"]
features = df.columns.drop("fetal_health")
importances = rf_model.feature_importances_

plt.figure(figsize=(10, 6))
sns.barplot(x=importances, y=features)
plt.title("Feature Importance (Random Forest)")
plt.show()

# Cross validation

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# 5-fold stratified cross-validation (shuffled, fixed seed) of every candidate model
# on the training split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("Cross-Validation Results:")
for name, model in models.items():
    fold_acc = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='accuracy', cv=cv)
    mean_acc, std_acc = fold_acc.mean(), fold_acc.std()
    print(f"{name}: {mean_acc:.4f} ± {std_acc:.4f}")

In [None]:
# Show the cross-validation accuracy of each model as a table.
fold_scores = {
    name: cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    for name, model in models.items()
}
results = [
    {'Model': name, 'Mean Accuracy': scores.mean(), 'Std Dev': scores.std()}
    for name, scores in fold_scores.items()
]

# Best mean accuracy first, rounded to four decimals for display.
results_df = (
    pd.DataFrame(results)
    .sort_values(by='Mean Accuracy', ascending=False)
    .round(4)
)
print("Cross-Validation Results:")
display(results_df)

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over 4 * 4 * 3 * 3 = 144 Random Forest configurations.
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Every configuration is fitted once per fold of `cv`, so the search is expensive;
# n_jobs=-1 spreads the fits over all available CPU cores. The forest is seeded, so
# parallel execution does not change which parameters are selected.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters (Random Forest):", grid_search.best_params_)
print("Best Cross-Validated Accuracy:", grid_search.best_score_)

In [None]:
# Score the best Random Forest found by the grid search on the held-out test split.
best_rf = grid_search.best_estimator_
y_rf_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_rf_pred)
test_report = classification_report(y_test, y_rf_pred)
print("Test Accuracy (Best RF):", test_accuracy)
print("Classification Report (Best RF):\n", test_report)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the best Random Forest on the test split.
cm_display = ConfusionMatrixDisplay.from_estimator(
    estimator=best_rf,
    X=X_test,
    y=y_test,
    cmap='Blues',
)
cm_display.ax_.set_title("Confusion Matrix - Best RF Model")
plt.show()

# Save the Best Model

In [None]:
import pickle

# Column names (in training order) of the feature matrix, saved so that code loading
# the model can arrange its inputs the same way.
feature_list = X.columns.tolist()

# Save the best model: the tuned Random Forest selected by the grid search, rather
# than the untuned "Random Forest" entry of `models`.
best_model = grid_search.best_estimator_
with open('Fetal_Health_Classification_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

with open('feature_list.pkl', 'wb') as f:
    pickle.dump(feature_list, f)

# Save the scaler (the fitted scaler must be applied to new data before predicting)
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# NOTE: pickle files can execute arbitrary code when loaded; only load these
# artifacts from a trusted location.
